* [PATCH v6][RFC]xen: sched: convert RTDS from time to event driven model
@ 2016-02-25 20:05 Tianyang Chen
2016-02-25 23:31 ` Dario Faggioli
0 siblings, 1 reply; 8+ messages in thread
From: Tianyang Chen @ 2016-02-25 20:05 UTC (permalink / raw)
To: xen-devel
Cc: dario.faggioli, Tianyang Chen, george.dunlap, Dagaen Golomb,
Meng Xu
changes since v5:
removed unnecessary vcpu_on_replq() checks
deadline_queue_insert() returns a flag to indicate if it's
needed to re-program the timer
removed unnecessary timer checks
added helper functions to remove vcpus from queues
coding style
Changes since v4:
removed unnecessary replenishment queue checks in vcpu_wake()
extended replq_remove() to all cases in vcpu_sleep()
used _deadline_queue_insert() helper function for both queues
_replq_insert() and _replq_remove() program timer internally
Changes since v3:
removed running queue.
added repl queue to keep track of repl events.
timer is now per scheduler.
timer is init on a valid cpu in a cpupool.
Budget replenishment and enforcement are separated by adding
a replenishment timer, which fires at the next most imminent
release time of all runnable vcpus.
A new runningq has been added to keep track of all vcpus that
are on pcpus.
The following functions have major changes to manage the runningq
and replenishment
repl_handler(): a timer handler which is re-programmed
to fire at the nearest vcpu deadline and replenishes vcpus on the runq,
depletedq and runningq. When the replenishment is done, each
replenished vcpu should tickle a pcpu to see if it needs to
preempt any running vcpus.
rt_schedule(): picks the highest-priority runnable vcpu based on cpu
affinity, and ret.time is passed to schedule(). If an idle
vcpu is picked, -1 is returned to avoid busy-waiting. repl_update()
has been removed.
rt_vcpu_wake(): when a vcpu wakes up, it tickles with the waking vcpu
directly instead of picking one from the runq. A waking vcpu may also
re-program the timer if it has an earlier release time.
rt_context_saved(): when a context switch is finished, the
preempted vcpu will be put back into the runq. The runningq is
updated, and picking from the runq and tickling have been removed.
Simplified functional graph:
schedule.c SCHEDULE_SOFTIRQ:
rt_schedule():
[spin_lock]
burn_budget(scurr)
snext = runq_pick()
[spin_unlock]
sched_rt.c TIMER_SOFTIRQ
replenishment_timer_handler()
[spin_lock]
<for_each_vcpu_on_q(i)> {
replenish(i)
runq_tickle(i)
}>
program_timer()
[spin_unlock]
Signed-off-by: Tianyang Chen <tiche@seas.upenn.edu>
Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu>
---
xen/common/sched_rt.c | 328 ++++++++++++++++++++++++++++++++++++-------------
1 file changed, 244 insertions(+), 84 deletions(-)
diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
index 2e5430f..16f77f9 100644
--- a/xen/common/sched_rt.c
+++ b/xen/common/sched_rt.c
@@ -16,6 +16,7 @@
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
+#include <xen/timer.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
@@ -87,7 +88,7 @@
#define RTDS_DEFAULT_BUDGET (MICROSECS(4000))
#define UPDATE_LIMIT_SHIFT 10
-#define MAX_SCHEDULE (MILLISECS(1))
+
/*
* Flags
*/
@@ -142,6 +143,9 @@ static cpumask_var_t *_cpumask_scratch;
*/
static unsigned int nr_rt_ops;
+/* handler for the replenishment timer */
+static void repl_handler(void *data);
+
/*
* Systme-wide private data, include global RunQueue/DepletedQ
* Global lock is referenced by schedule_data.schedule_lock from all
@@ -152,7 +156,9 @@ struct rt_private {
struct list_head sdom; /* list of availalbe domains, used for dump */
struct list_head runq; /* ordered list of runnable vcpus */
struct list_head depletedq; /* unordered list of depleted vcpus */
+ struct list_head replq; /* ordered list of vcpus that need replenishment */
cpumask_t tickled; /* cpus been tickled */
+ struct timer *repl_timer; /* replenishment timer */
};
/*
@@ -160,6 +166,7 @@ struct rt_private {
*/
struct rt_vcpu {
struct list_head q_elem; /* on the runq/depletedq list */
+ struct list_head replq_elem;/* on the repl event list */
/* Up-pointers */
struct rt_dom *sdom;
@@ -213,8 +220,14 @@ static inline struct list_head *rt_depletedq(const struct scheduler *ops)
return &rt_priv(ops)->depletedq;
}
+static inline struct list_head *rt_replq(const struct scheduler *ops)
+{
+ return &rt_priv(ops)->replq;
+}
+
/*
- * Queue helper functions for runq and depletedq
+ * Queue helper functions for runq, depletedq
+ * and replenishment event queue
*/
static int
__vcpu_on_q(const struct rt_vcpu *svc)
@@ -228,6 +241,18 @@ __q_elem(struct list_head *elem)
return list_entry(elem, struct rt_vcpu, q_elem);
}
+static struct rt_vcpu *
+__replq_elem(struct list_head *elem)
+{
+ return list_entry(elem, struct rt_vcpu, replq_elem);
+}
+
+static int
+__vcpu_on_replq(const struct rt_vcpu *svc)
+{
+ return !list_empty(&svc->replq_elem);
+}
+
/*
* Debug related code, dump vcpu/cpu information
*/
@@ -288,7 +313,7 @@ rt_dump_pcpu(const struct scheduler *ops, int cpu)
static void
rt_dump(const struct scheduler *ops)
{
- struct list_head *runq, *depletedq, *iter;
+ struct list_head *runq, *depletedq, *replq, *iter;
struct rt_private *prv = rt_priv(ops);
struct rt_vcpu *svc;
struct rt_dom *sdom;
@@ -301,6 +326,7 @@ rt_dump(const struct scheduler *ops)
runq = rt_runq(ops);
depletedq = rt_depletedq(ops);
+ replq = rt_replq(ops);
printk("Global RunQueue info:\n");
list_for_each( iter, runq )
@@ -316,6 +342,13 @@ rt_dump(const struct scheduler *ops)
rt_dump_vcpu(ops, svc);
}
+ printk("Global Replenishment Event info:\n");
+ list_for_each( iter, replq )
+ {
+ svc = __replq_elem(iter);
+ rt_dump_vcpu(ops, svc);
+ }
+
printk("Domain info:\n");
list_for_each( iter, &prv->sdom )
{
@@ -380,11 +413,74 @@ rt_update_deadline(s_time_t now, struct rt_vcpu *svc)
return;
}
+/* a helper function that only removes a vcpu from a queue */
+static inline void
+deadline_queue_remove(struct list_head *elem)
+{
+ list_del_init(elem);
+}
+
static inline void
__q_remove(struct rt_vcpu *svc)
{
if ( __vcpu_on_q(svc) )
- list_del_init(&svc->q_elem);
+ deadline_queue_remove(&svc->q_elem);
+}
+
+/*
+ * Removing a vcpu from the replenishment queue could
+ * re-program the timer for the next replenishment event
+ * if there is any on the list
+ */
+static inline void
+__replq_remove(const struct scheduler *ops, struct rt_vcpu *svc)
+{
+ struct rt_private *prv = rt_priv(ops);
+ struct list_head *replq = rt_replq(ops);
+ struct timer* repl_timer = prv->repl_timer;
+
+ /*
+ * Disarm the timer before re-programming it.
+ * It doesn't matter if the vcpu to be removed
+ * is on top of the list or not because the timer
+ * is stopped and needs to be re-programmed anyways
+ */
+ stop_timer(repl_timer);
+
+ deadline_queue_remove(&svc->replq_elem);
+
+ /* re-arm the timer for the next replenishment event */
+ if( !list_empty(replq) )
+ {
+ struct rt_vcpu *svc_next = __replq_elem(replq->next);
+ set_timer(repl_timer, svc_next->cur_deadline);
+ }
+}
+
+/*
+ * A utility function that inserts a vcpu into a
+ * queue in deadline (EDF) order. The return
+ * value is 1 if the vcpu has been inserted at the
+ * front of the list
+ */
+static int
+deadline_queue_insert(struct rt_vcpu * (*_get_q_elem)(struct list_head *elem),
+ struct rt_vcpu *svc, struct list_head *elem, struct list_head *queue)
+{
+ struct list_head *iter;
+ int pos = 0;
+
+ list_for_each(iter, queue)
+ {
+ struct rt_vcpu * iter_svc = (*_get_q_elem)(iter);
+ if ( svc->cur_deadline <= iter_svc->cur_deadline )
+ break;
+
+ pos++;
+ }
+
+ list_add_tail(elem, iter);
+ return !pos;
}
/*
@@ -397,7 +493,6 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
{
struct rt_private *prv = rt_priv(ops);
struct list_head *runq = rt_runq(ops);
- struct list_head *iter;
ASSERT( spin_is_locked(&prv->lock) );
@@ -405,22 +500,38 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
/* add svc to runq if svc still has budget */
if ( svc->cur_budget > 0 )
- {
- list_for_each(iter, runq)
- {
- struct rt_vcpu * iter_svc = __q_elem(iter);
- if ( svc->cur_deadline <= iter_svc->cur_deadline )
- break;
- }
- list_add_tail(&svc->q_elem, iter);
- }
+ deadline_queue_insert(&__q_elem, svc, &svc->q_elem, runq);
else
{
list_add(&svc->q_elem, &prv->depletedq);
+ ASSERT( __vcpu_on_replq(svc) );
}
}
/*
+ * Insert svc into the replenishment event list
+ * in replenishment time order.
+ * vcpus that need to be replenished earlier go first.
+ * The timer may be re-programmed if svc is inserted
+ * at the front of the event list.
+ */
+static void
+__replq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
+{
+ struct list_head *replq = rt_replq(ops);
+ struct rt_private *prv = rt_priv(ops);
+ struct timer *repl_timer = prv->repl_timer;
+ int set;
+
+ ASSERT( !__vcpu_on_replq(svc) );
+
+ set = deadline_queue_insert(&__replq_elem, svc, &svc->replq_elem, replq);
+
+ if( set )
+ set_timer(repl_timer,svc->cur_deadline);
+}
+
+/*
* Init/Free related code
*/
static int
@@ -449,11 +560,18 @@ rt_init(struct scheduler *ops)
INIT_LIST_HEAD(&prv->sdom);
INIT_LIST_HEAD(&prv->runq);
INIT_LIST_HEAD(&prv->depletedq);
+ INIT_LIST_HEAD(&prv->replq);
cpumask_clear(&prv->tickled);
ops->sched_data = prv;
+ /*
+ * The timer initialization will happen later when
+ * the first pcpu is added to this pool in alloc_pdata
+ */
+ prv->repl_timer = NULL;
+
return 0;
no_mem:
@@ -473,6 +591,10 @@ rt_deinit(const struct scheduler *ops)
xfree(_cpumask_scratch);
_cpumask_scratch = NULL;
}
+
+ kill_timer(prv->repl_timer);
+ xfree(prv->repl_timer);
+
xfree(prv);
}
@@ -493,6 +615,17 @@ rt_alloc_pdata(const struct scheduler *ops, int cpu)
if ( !alloc_cpumask_var(&_cpumask_scratch[cpu]) )
return NULL;
+ if( prv->repl_timer == NULL )
+ {
+ /* allocate the timer on the first cpu of this pool */
+ prv->repl_timer = xzalloc(struct timer);
+
+ if(prv->repl_timer == NULL )
+ return NULL;
+
+ init_timer(prv->repl_timer, repl_handler, (void *)ops, cpu);
+ }
+
/* 1 indicates alloc. succeed in schedule.c */
return (void *)1;
}
@@ -586,6 +719,7 @@ rt_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
return NULL;
INIT_LIST_HEAD(&svc->q_elem);
+ INIT_LIST_HEAD(&svc->replq_elem);
svc->flags = 0U;
svc->sdom = dd;
svc->vcpu = vc;
@@ -609,7 +743,8 @@ rt_free_vdata(const struct scheduler *ops, void *priv)
}
/*
- * This function is called in sched_move_domain() in schedule.c
+ * It is called in sched_move_domain() and sched_init_vcpu
+ * in schedule.c
* When move a domain to a new cpupool.
* It inserts vcpus of moving domain to the scheduler's RunQ in
* dest. cpupool.
@@ -651,6 +786,10 @@ rt_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
lock = vcpu_schedule_lock_irq(vc);
if ( __vcpu_on_q(svc) )
__q_remove(svc);
+
+ if( __vcpu_on_replq(svc) )
+ __replq_remove(ops,svc);
+
vcpu_schedule_unlock_irq(lock, vc);
}
@@ -785,44 +924,6 @@ __runq_pick(const struct scheduler *ops, const cpumask_t *mask)
}
/*
- * Update vcpu's budget and
- * sort runq by insert the modifed vcpu back to runq
- * lock is grabbed before calling this function
- */
-static void
-__repl_update(const struct scheduler *ops, s_time_t now)
-{
- struct list_head *runq = rt_runq(ops);
- struct list_head *depletedq = rt_depletedq(ops);
- struct list_head *iter;
- struct list_head *tmp;
- struct rt_vcpu *svc = NULL;
-
- list_for_each_safe(iter, tmp, runq)
- {
- svc = __q_elem(iter);
- if ( now < svc->cur_deadline )
- break;
-
- rt_update_deadline(now, svc);
- /* reinsert the vcpu if its deadline is updated */
- __q_remove(svc);
- __runq_insert(ops, svc);
- }
-
- list_for_each_safe(iter, tmp, depletedq)
- {
- svc = __q_elem(iter);
- if ( now >= svc->cur_deadline )
- {
- rt_update_deadline(now, svc);
- __q_remove(svc); /* remove from depleted queue */
- __runq_insert(ops, svc); /* add to runq */
- }
- }
-}
-
-/*
* schedule function for rt scheduler.
* The lock is already grabbed in schedule.c, no need to lock here
*/
@@ -841,7 +942,6 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
/* burn_budget would return for IDLE VCPU */
burn_budget(ops, scurr, now);
- __repl_update(ops, now);
if ( tasklet_work_scheduled )
{
@@ -868,6 +968,8 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
set_bit(__RTDS_delayed_runq_add, &scurr->flags);
snext->last_start = now;
+
+ ret.time = -1; /* if an idle vcpu is picked */
if ( !is_idle_vcpu(snext->vcpu) )
{
if ( snext != scurr )
@@ -880,9 +982,11 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
snext->vcpu->processor = cpu;
ret.migrated = 1;
}
+
+ ret.time = snext->budget; /* invoke the scheduler next time */
+
}
- ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */
ret.task = snext->vcpu;
/* TRACE */
@@ -924,6 +1028,8 @@ rt_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
__q_remove(svc);
else if ( svc->flags & RTDS_delayed_runq_add )
clear_bit(__RTDS_delayed_runq_add, &svc->flags);
+
+ __replq_remove(ops, svc);
}
/*
@@ -1026,10 +1132,6 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
{
struct rt_vcpu * const svc = rt_vcpu(vc);
s_time_t now = NOW();
- struct rt_private *prv = rt_priv(ops);
- struct rt_vcpu *snext = NULL; /* highest priority on RunQ */
- struct rt_dom *sdom = NULL;
- cpumask_t *online;
BUG_ON( is_idle_vcpu(vc) );
@@ -1051,6 +1153,22 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
else
SCHED_STAT_CRANK(vcpu_wake_not_runnable);
+ /*
+ * If a deadline passed while svc was asleep/blocked, we need new
+ * scheduling parameters ( a new deadline and full budget), and
+ * also a new replenishment event
+ */
+ if ( now >= svc->cur_deadline)
+ {
+ rt_update_deadline(now, svc);
+ __replq_remove(ops, svc);
+ }
+
+ if( !__vcpu_on_replq(svc) )
+ {
+ __replq_insert(ops, svc);
+ ASSERT( vcpu_runnable(vc) );
+ }
/* If context hasn't been saved for this vcpu yet, we can't put it on
* the Runqueue/DepletedQ. Instead, we set a flag so that it will be
* put on the Runqueue/DepletedQ after the context has been saved.
@@ -1061,22 +1179,10 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
return;
}
- if ( now >= svc->cur_deadline)
- rt_update_deadline(now, svc);
-
/* insert svc to runq/depletedq because svc is not in queue now */
__runq_insert(ops, svc);
- __repl_update(ops, now);
-
- ASSERT(!list_empty(&prv->sdom));
- sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
- online = cpupool_domain_cpumask(sdom->dom);
- snext = __runq_pick(ops, online); /* pick snext from ALL valid cpus */
-
- runq_tickle(ops, snext);
-
- return;
+ runq_tickle(ops, svc);
}
/*
@@ -1087,10 +1193,6 @@ static void
rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
{
struct rt_vcpu *svc = rt_vcpu(vc);
- struct rt_vcpu *snext = NULL;
- struct rt_dom *sdom = NULL;
- struct rt_private *prv = rt_priv(ops);
- cpumask_t *online;
spinlock_t *lock = vcpu_schedule_lock_irq(vc);
clear_bit(__RTDS_scheduled, &svc->flags);
@@ -1102,14 +1204,7 @@ rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
likely(vcpu_runnable(vc)) )
{
__runq_insert(ops, svc);
- __repl_update(ops, NOW());
-
- ASSERT(!list_empty(&prv->sdom));
- sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
- online = cpupool_domain_cpumask(sdom->dom);
- snext = __runq_pick(ops, online); /* pick snext from ALL cpus */
-
- runq_tickle(ops, snext);
+ runq_tickle(ops, svc);
}
out:
vcpu_schedule_unlock_irq(lock, vc);
@@ -1168,6 +1263,71 @@ rt_dom_cntl(
return rc;
}
+/*
+ * The replenishment timer handler picks vcpus
+ * from the replq and does the actual replenishment
+ */
+static void repl_handler(void *data){
+ unsigned long flags;
+ s_time_t now = NOW();
+ struct scheduler *ops = data;
+ struct rt_private *prv = rt_priv(ops);
+ struct list_head *replq = rt_replq(ops);
+ struct timer *repl_timer = prv->repl_timer;
+ struct list_head *iter, *tmp;
+ struct rt_vcpu *svc = NULL;
+
+ spin_lock_irqsave(&prv->lock, flags);
+
+ stop_timer(repl_timer);
+
+ list_for_each_safe(iter, tmp, replq)
+ {
+ svc = __replq_elem(iter);
+
+ if ( now < svc->cur_deadline )
+ break;
+
+ rt_update_deadline(now, svc);
+
+ /*
+ * when the replenishment happens
+ * svc is either on a pcpu or on
+ * runq/depletedq
+ */
+ if( __vcpu_on_q(svc) )
+ {
+ /* put back to runq */
+ __q_remove(svc);
+ __runq_insert(ops, svc);
+ }
+
+ /*
+ * tickle regardless where it's at
+ * because a running vcpu could have
+ * a later deadline than others after
+ * replenishment
+ */
+ runq_tickle(ops, svc);
+
+ /*
+ * update replenishment event queue
+ * without reprogramming the timer
+ */
+ deadline_queue_remove(&svc->replq_elem);
+ deadline_queue_insert(&__replq_elem, svc, &svc->replq_elem, replq);
+ }
+
+ /*
+ * use the vcpu that's on the top
+ * or else don't program the timer
+ */
+ if( !list_empty(replq) )
+ set_timer(repl_timer, __replq_elem(replq->next)->cur_deadline);
+
+ spin_unlock_irqrestore(&prv->lock, flags);
+}
+
static struct rt_private _rt_priv;
static const struct scheduler sched_rtds_def = {
--
1.7.9.5
* Re: [PATCH v6][RFC]xen: sched: convert RTDS from time to event driven model
2016-02-25 20:05 [PATCH v6][RFC]xen: sched: convert RTDS from time to event driven model Tianyang Chen
@ 2016-02-25 23:31 ` Dario Faggioli
2016-02-26 5:15 ` Tianyang Chen
0 siblings, 1 reply; 8+ messages in thread
From: Dario Faggioli @ 2016-02-25 23:31 UTC (permalink / raw)
To: Tianyang Chen, xen-devel; +Cc: george.dunlap, Dagaen Golomb, Meng Xu
Hey again,
Thanks for turning this around so quickly.
We are getting closer and closer, although (of course :-)) I have some
more comments.
However, is there a particular reason why you are keeping the RFC tag?
As long as it's there, it's like saying that you are chasing feedback, but
that you yourself do not think the patch should be considered for being
upstreamed. As far as my opinion goes, this patch is not ready to go in
right now (as I said, I've got questions and comments), but its status
is way past RFC.
On Thu, 2016-02-25 at 15:05 -0500, Tianyang Chen wrote:
> changes since v5:
> removed unnecessary vcpu_on_replq() checks
> deadline_queue_insert() returns a flag to indicate if it's
> needed to re-program the timer
> removed unnecessary timer checks
> added helper functions to remove vcpus from queues
> coding style
>
> Changes since v4:
> removed unnecessary replenishment queue checks in vcpu_wake()
> extended replq_remove() to all cases in vcpu_sleep()
> used _deadline_queue_insert() helper function for both queues
> _replq_insert() and _replq_remove() program timer internally
>
> Changes since v3:
> removed running queue.
> added repl queue to keep track of repl events.
> timer is now per scheduler.
> timer is init on a valid cpu in a cpupool.
>
So, this does not belong here. It is ok to have it in this part of the
email, but it should not land in the actual commit changelog once the
patch is committed into Xen's git tree.
The way to achieve that is to put this summary of changes below
the actual changelog, and below the Signed-off-by lines, after a marker
that looks like this: "---".
> Budget replenishment and enforcement are separated by adding
> a replenishment timer, which fires at the next most imminent
> release time of all runnable vcpus.
>
> A new runningq has been added to keep track of all vcpus that
> are on pcpus.
>
Mmm.. Is this the proper changelog? runningq is something we discussed,
and that appeared in v2, but is certainly no longer here... :-O
> diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
> index 2e5430f..16f77f9 100644
> --- a/xen/common/sched_rt.c
> +++ b/xen/common/sched_rt.c
>
> @@ -213,8 +220,14 @@ static inline struct list_head
> *rt_depletedq(const struct scheduler *ops)
> return &rt_priv(ops)->depletedq;
> }
>
> +static inline struct list_head *rt_replq(const struct scheduler
> *ops)
> +{
> + return &rt_priv(ops)->replq;
> +}
> +
> /*
> - * Queue helper functions for runq and depletedq
> + * Queue helper functions for runq, depletedq
> + * and replenishment event queue
>
Full stop. :-)
In any case, I'd change this into something like:
"Helper functions for manipulating the runqueue, the depleted queue,
and the replenishment events queue."
> @@ -228,6 +241,18 @@ __q_elem(struct list_head *elem)
> return list_entry(elem, struct rt_vcpu, q_elem);
> }
>
> +static struct rt_vcpu *
> +__replq_elem(struct list_head *elem)
> +{
> + return list_entry(elem, struct rt_vcpu, replq_elem);
> +}
> +
> +static int
> +__vcpu_on_replq(const struct rt_vcpu *svc)
> +{
> + return !list_empty(&svc->replq_elem);
> +}
> +
>
Ok, sorry for changing my mind again, but I really can't stand seeing
these underscores. Please, rename these as replq_elem() and
vcpu_on_replq(). There is no sensible reason why we should prefix
these with '__'.
I know that will create some amount of inconsistency, but:
- there is inconsistency already (here and in other sched_* files)
- not introducing more __-prefixed functions is a step in the right
direction; we'll convert the ones that are already there with time.
> + * Removing a vcpu from the replenishment queue could
> + * re-program the timer for the next replenishment event
> + * if there is any on the list
>
> + */
> +static inline void
> +__replq_remove(const struct scheduler *ops, struct rt_vcpu *svc)
> +{
> + struct rt_private *prv = rt_priv(ops);
> + struct list_head *replq = rt_replq(ops);
> + struct timer* repl_timer = prv->repl_timer;
> +
> + /*
> + * Disarm the timer before re-programming it.
> + * It doesn't matter if the vcpu to be removed
> + * is on top of the list or not because the timer
> + * is stopped and needs to be re-programmed anyways
> + */
> + stop_timer(repl_timer);
> +
> + deadline_queue_remove(&svc->replq_elem);
> +
> + /* re-arm the timer for the next replenishment event */
> + if( !list_empty(replq) )
> + {
> + struct rt_vcpu *svc_next = __replq_elem(replq->next);
> + set_timer(repl_timer, svc_next->cur_deadline);
> + }
>
Wait, maybe you misunderstood and/or I did not make myself clear enough
(in which case, sorry). I never meant to say "always stop the timer".
Actually, in one of my last emails I said the opposite, and I think that
would be the best thing to do.
Do you think there is any specific reason why we need to always stop
and restart it? If not, I think we can:
- have deadline_queue_remove() also return whether the element
removed was the first one (i.e., the one the timer was programmed
after);
- if it was, re-program the timer after the new front of the queue;
- if it wasn't, nothing to do.
Thoughts?
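Just to show what I have in mind, something like the below (a rough,
totally untested sketch; note that it assumes we also change
deadline_queue_remove() to take the queue head, which is not what the
patch does right now):

static inline int
deadline_queue_remove(struct list_head *queue, struct list_head *elem)
{
    int first = (queue->next == elem);

    list_del_init(elem);
    return first;
}

static inline void
__replq_remove(const struct scheduler *ops, struct rt_vcpu *svc)
{
    struct rt_private *prv = rt_priv(ops);
    struct list_head *replq = rt_replq(ops);

    /* Re-program the timer only if we removed the head of the queue */
    if ( deadline_queue_remove(replq, &svc->replq_elem) )
    {
        if ( !list_empty(replq) )
            set_timer(prv->repl_timer, __replq_elem(replq->next)->cur_deadline);
        else
            stop_timer(prv->repl_timer);
    }
}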
> +/*
> + * An utility function that inserts a vcpu to a
> + * queue based on certain order (EDF). The returned
> + * value is 1 if a vcpu has been inserted to the
> + * front of a list
> + */
> +static int
> +deadline_queue_insert(struct rt_vcpu * (*_get_q_elem)(struct
> list_head *elem),
> + struct rt_vcpu *svc, struct list_head *elem, struct list_head
> *queue)
> +{
> + struct list_head *iter;
> + int pos = 0;
> +
> + list_for_each(iter, queue)
> + {
> + struct rt_vcpu * iter_svc = (*_get_q_elem)(iter);
> + if ( svc->cur_deadline <= iter_svc->cur_deadline )
> + break;
> +
> + pos++;
> + }
> +
> + list_add_tail(elem, iter);
> + return !pos;
> }
>
Ok.
> @@ -405,22 +500,38 @@ __runq_insert(const struct scheduler *ops,
> struct rt_vcpu *svc)
>
> /* add svc to runq if svc still has budget */
> if ( svc->cur_budget > 0 )
> - {
> - list_for_each(iter, runq)
> - {
> - struct rt_vcpu * iter_svc = __q_elem(iter);
> - if ( svc->cur_deadline <= iter_svc->cur_deadline )
> - break;
> - }
> - list_add_tail(&svc->q_elem, iter);
> - }
> + deadline_queue_insert(&__q_elem, svc, &svc->q_elem, runq);
> else
> {
> list_add(&svc->q_elem, &prv->depletedq);
> + ASSERT( __vcpu_on_replq(svc) );
>
So, by doing this, you're telling me that, if the vcpu is added to the
depleted queue, there must be a replenishment planned for it (or the
ASSERT() would fail).
But if we are adding it to the runq, there may or may not be a
replenishment planned for it.
Is this what we want? Why must there not be a replenishment planned
already for a vcpu going into the runq (to happen at its next
deadline)?
> /*
> + * Insert svc into the replenishment event list
> + * in replenishment time order.
> + * vcpus that need to be replished earlier go first.
> + * The timer may be re-programmed if svc is inserted
> + * at the front of the event list.
> + */
> +static void
> +__replq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
> +{
> + struct list_head *replq = rt_replq(ops);
> + struct rt_private *prv = rt_priv(ops);
> + struct timer *repl_timer = prv->repl_timer;
> + int set;
> +
> + ASSERT( !__vcpu_on_replq(svc) );
> +
> + set = deadline_queue_insert(&__replq_elem, svc, &svc-
> >replq_elem, replq);
> +
> + if( set )
> + set_timer(repl_timer,svc->cur_deadline);
> +}
A matter of taste, mostly, but I'd avoid the local variable (if this
function will at some point become more complex, then we'll see, but
for now, I think it's ok to just use the return value of
deadline_queue_insert() inside the if()).
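I.e., something like this (again, untested, just to show what I mean):

    if ( deadline_queue_insert(&__replq_elem, svc, &svc->replq_elem, replq) )
        set_timer(repl_timer, svc->cur_deadline);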
> @@ -868,6 +968,8 @@ rt_schedule(const struct scheduler *ops, s_time_t
> now, bool_t tasklet_work_sched
> set_bit(__RTDS_delayed_runq_add, &scurr->flags);
>
> snext->last_start = now;
> +
>
You need to add neither this blank line...
> + ret.time = -1; /* if an idle vcpu is picked */
> if ( !is_idle_vcpu(snext->vcpu) )
> {
> if ( snext != scurr )
> @@ -880,9 +982,11 @@ rt_schedule(const struct scheduler *ops,
> s_time_t now, bool_t tasklet_work_sched
> snext->vcpu->processor = cpu;
> ret.migrated = 1;
> }
> +
> + ret.time = snext->budget; /* invoke the scheduler next time
> */
> +
...nor this one.
> }
> @@ -924,6 +1028,8 @@ rt_vcpu_sleep(const struct scheduler *ops,
> struct vcpu *vc)
> __q_remove(svc);
> else if ( svc->flags & RTDS_delayed_runq_add )
> clear_bit(__RTDS_delayed_runq_add, &svc->flags);
> +
> + __replq_remove(ops, svc);
>
What I said in my last email is that you probably can get rid of this
(see below, when commenting on context_saved()).
> @@ -1051,6 +1153,22 @@ rt_vcpu_wake(const struct scheduler *ops,
> struct vcpu *vc)
> else
> SCHED_STAT_CRANK(vcpu_wake_not_runnable);
>
> + /*
> + * If a deadline passed while svc was asleep/blocked, we need
> new
> + * scheduling parameters ( a new deadline and full budget), and
> + * also a new replenishment event
> + */
> + if ( now >= svc->cur_deadline)
> + {
> + rt_update_deadline(now, svc);
> + __replq_remove(ops, svc);
> + }
> +
> + if( !__vcpu_on_replq(svc) )
> + {
> + __replq_insert(ops, svc);
> + ASSERT( vcpu_runnable(vc) );
>
Mmm... What's this assert about?
> @@ -1087,10 +1193,6 @@ static void
> rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
> {
> struct rt_vcpu *svc = rt_vcpu(vc);
> - struct rt_vcpu *snext = NULL;
> - struct rt_dom *sdom = NULL;
> - struct rt_private *prv = rt_priv(ops);
> - cpumask_t *online;
> spinlock_t *lock = vcpu_schedule_lock_irq(vc);
>
> clear_bit(__RTDS_scheduled, &svc->flags);
> @@ -1102,14 +1204,7 @@ rt_context_saved(const struct scheduler *ops,
> struct vcpu *vc)
> likely(vcpu_runnable(vc)) )
> {
> __runq_insert(ops, svc);
> - __repl_update(ops, NOW());
> -
> - ASSERT(!list_empty(&prv->sdom));
> - sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
> - online = cpupool_domain_cpumask(sdom->dom);
> - snext = __runq_pick(ops, online); /* pick snext from ALL
> cpus */
> -
> - runq_tickle(ops, snext);
> + runq_tickle(ops, svc);
> }
>
So, here we are.
What I meant was to make this function look more or less like this:
static void
rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
{
struct rt_vcpu *svc = rt_vcpu(vc);
spinlock_t *lock = vcpu_schedule_lock_irq(vc);
clear_bit(__RTDS_scheduled, &svc->flags);
/* not insert idle vcpu to runq */
if ( is_idle_vcpu(vc) )
goto out;
if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
likely(vcpu_runnable(vc)) )
{
__runq_insert(ops, svc);
runq_tickle(ops, svc);
}
else
__replq_remove(ops, svc);
out:
vcpu_schedule_unlock_irq(lock, vc);
}
And, as said above, if you do this, try also removing the
__replq_remove() call from rt_vcpu_sleep(), this one should cover for
that (and, actually, more than just that!).
After all this, check whether you still have the assert in
__replq_insert() triggering and let me know.
Thanks and Regards,
Dario
--
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
* Re: [PATCH v6][RFC]xen: sched: convert RTDS from time to event driven model
2016-02-25 23:31 ` Dario Faggioli
@ 2016-02-26 5:15 ` Tianyang Chen
2016-02-26 9:11 ` Dario Faggioli
0 siblings, 1 reply; 8+ messages in thread
From: Tianyang Chen @ 2016-02-26 5:15 UTC (permalink / raw)
To: Dario Faggioli, xen-devel; +Cc: george.dunlap, Dagaen Golomb, Meng Xu
On 2/25/2016 6:31 PM, Dario Faggioli wrote:
> Hey again,
>
> Thanks for turning up so quickly.
>
> We are getting closer and closer, although (of course :-)) I have some
> more comments.
>
> However, is there a particular reason why you are keeping the RFC tag?
> Until you do that, it's like saying that you are chasing feedback, but
> you do not think yourself the patch should be considered for being
> upstreamed. As far as my opinion goes, this patch is not ready to go in
> right now (as I said, I've got questions and comments), but its status
> is way past RFC.
>
Oh OK, I had the impression that, since RFC means "request for comments", it
should always be used because, indeed, I am asking for comments.
> On Thu, 2016-02-25 at 15:05 -0500, Tianyang Chen wrote:
>> changes since v5:
>> removed unnecessary vcpu_on_replq() checks
>> deadline_queue_insert() returns a flag to indicate if it's
>> needed to re-program the timer
>> removed unnecessary timer checks
>> added helper functions to remove vcpus from queues
>> coding style
>>
>> Changes since v4:
>> removed unnecessary replenishment queue checks in vcpu_wake()
>> extended replq_remove() to all cases in vcpu_sleep()
>> used _deadline_queue_insert() helper function for both queues
>> _replq_insert() and _replq_remove() program timer internally
>>
>> Changes since v3:
>> removed running queue.
>> added repl queue to keep track of repl events.
>> timer is now per scheduler.
>> timer is init on a valid cpu in a cpupool.
>>
> So, this does not belong here. It is ok to have it in this part of the
> email, but it should not land in the actual commit changelog, once the
> patch will be committed into Xen's git tree.
>
> The way to achieve the above is to put this summary of changes below
> the actual changelog, and below the Signed-of-by lines, after a marker
> that looks like this "---".
>
>> Budget replenishment and enforcement are separated by adding
>> a replenishment timer, which fires at the next most imminent
>> release time of all runnable vcpus.
>>
>> A new runningq has been added to keep track of all vcpus that
>> are on pcpus.
>>
> Mmm.. Is this the proper changelog? runningq is something we discussed,
> and that appeared in v2, but is certainly no longer here... :-O
>
oops...
> Wait, maybe you misunderstood and/or I did not make myself clear enough
> (in which case, sorry). I never meant to say "always stop the timer".
> Atually, in one of my last email I said the opposite, and I think that
> would be the best thing to do.
>
> Do you think there is any specific reason why we need to always stop
> and restart it? If not, I think we can:
> - have deadline_queue_remove() also return whether the element
> removed was the first one (i.e., the one the timer was programmed
> after);
> - if it was, re-program the timer after the new front of the queue;
> - if it wasn't, nothing to do.
>
> Thoughts?
>
My original thought was that the timer only needs to be re-programmed
when the top vcpu is taken off. So did you mean that, when I manipulated
repl_timer->expires manually, the timer should have been stopped using the
proper timer API? That manipulation is gone now. Also, set_timer() internally
disables the timer, so I assume it's safe to just call set_timer()
directly, right?
>> @@ -405,22 +500,38 @@ __runq_insert(const struct scheduler *ops,
>> struct rt_vcpu *svc)
>>
>> /* add svc to runq if svc still has budget */
>> if ( svc->cur_budget > 0 )
>> - {
>> - list_for_each(iter, runq)
>> - {
>> - struct rt_vcpu * iter_svc = __q_elem(iter);
>> - if ( svc->cur_deadline <= iter_svc->cur_deadline )
>> - break;
>> - }
>> - list_add_tail(&svc->q_elem, iter);
>> - }
>> + deadline_queue_insert(&__q_elem, svc, &svc->q_elem, runq);
>> else
>> {
>> list_add(&svc->q_elem, &prv->depletedq);
>> + ASSERT( __vcpu_on_replq(svc) );
>>
> So, by doing this, you're telling me that, if the vcpu is added to the
> depleted queue, there must be a replenishment planned for it (or the
> ASSERT() would fail).
>
> But if we are adding it to the runq, there may or may not be a
> replenishment planned for it.
>
> Is this what we want? Why there must not be a replenishment planned
> already for a vcpu going into the runq (to happen at its next
> deadline)?
>
It looks like the current code doesn't add a vcpu to the replenishment
queue when vcpu_insert() is called. When the scheduler is initialized,
all the vcpus are added to the replenishment queue only after waking up
from sleep. This needs to be changed (add the vcpu to the replq in
vcpu_insert()) to make it consistent, in the sense that whenever
rt_vcpu_insert() is called, the vcpu needs to have a corresponding
replenishment event queued.
This way the ASSERT() covers both cases in __runq_insert(), enforcing
the fact that "when a vcpu is inserted to the runq/depletedq, a
replenishment event is waiting for it".
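Concretely, what I have in mind is something like this in __runq_insert()
(just a sketch; the exact placement of the assertion is of course up for
discussion):

     ASSERT( __vcpu_on_replq(svc) );

     /* add svc to runq if svc still has budget */
     if ( svc->cur_budget > 0 )
         deadline_queue_insert(&__q_elem, svc, &svc->q_elem, runq);
     else
         list_add(&svc->q_elem, &prv->depletedq);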
>> @@ -1087,10 +1193,6 @@ static void
>> rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
>> {
>> struct rt_vcpu *svc = rt_vcpu(vc);
>> - struct rt_vcpu *snext = NULL;
>> - struct rt_dom *sdom = NULL;
>> - struct rt_private *prv = rt_priv(ops);
>> - cpumask_t *online;
>> spinlock_t *lock = vcpu_schedule_lock_irq(vc);
>>
>> clear_bit(__RTDS_scheduled, &svc->flags);
>> @@ -1102,14 +1204,7 @@ rt_context_saved(const struct scheduler *ops,
>> struct vcpu *vc)
>> likely(vcpu_runnable(vc)) )
>> {
>> __runq_insert(ops, svc);
>> - __repl_update(ops, NOW());
>> -
>> - ASSERT(!list_empty(&prv->sdom));
>> - sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
>> - online = cpupool_domain_cpumask(sdom->dom);
>> - snext = __runq_pick(ops, online); /* pick snext from ALL
>> cpus */
>> -
>> - runq_tickle(ops, snext);
>> + runq_tickle(ops, svc);
>> }
>>
> So, here we are.
>
> What I meant was to make this function look more or less like this:
>
> static void
> rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
> {
> struct rt_vcpu *svc = rt_vcpu(vc);
> spinlock_t *lock = vcpu_schedule_lock_irq(vc);
>
> clear_bit(__RTDS_scheduled, &svc->flags);
> /* not insert idle vcpu to runq */
> if ( is_idle_vcpu(vc) )
> goto out;
>
> if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
> likely(vcpu_runnable(vc)) )
> {
> __runq_insert(ops, svc);
> runq_tickle(ops, snext);
> }
> else
> __replq_remove(ops, svc);
> out:
> vcpu_schedule_unlock_irq(lock, vc);
> }
>
> And, as said above, if you do this, try also removing the
> __replq_remove() call from rt_vcpu_sleep(), this one should cover for
> that (and, actually, more than just that!).
>
> After all this, check whether you still have the assert in
> __replq_insert() triggering and let me know
So after moving the __replq_remove() to rt_context_saved(), the assert
in __replq_insert() still fails when dom0 boots up.
(XEN) Assertion '!vcpu_on_replq(svc)' failed at sched_rt.c:524
(XEN) ----[ Xen-4.7-unstable x86_64 debug=y Tainted: C ]----
(XEN) CPU: 0
(XEN) RIP: e008:[<ffff82d080128c07>] sched_rt.c#__replq_insert+0x2b/0x64
(XEN) RFLAGS: 0000000000010002 CONTEXT: hypervisor (d0v3)
(XEN) rax: 0000000000000001 rbx: ffff83023b522940 rcx: 0000000000000001
(XEN) rdx: 00000031bb1b9980 rsi: ffff83023b486760 rdi: ffff83023b486760
(XEN) rbp: ffff8300bfcffd48 rsp: ffff8300bfcffd28 r8: 0000000000000004
(XEN) r9: 00000000deadbeef r10: ffff82d08025f5a0 r11: 0000000000000206
(XEN) r12: ffff83023b486760 r13: ffff83023b522d80 r14: ffff83023b4b5000
(XEN) r15: 000000023a6e774b cr0: 0000000080050033 cr4: 00000000000406a0
(XEN) cr3: 0000000231c0d000 cr2: ffff8802200d81f8
(XEN) ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: e010 cs: e008
(XEN) Xen stack trace from rsp=ffff8300bfcffd28:
(XEN) ffff82d08019642e ffff83023b486760 ffff8300bfd47000 ffff82d080299b00
(XEN) ffff8300bfcffd88 ffff82d08012a072 ffff8300bfcffd70 ffff8300bfd47000
(XEN) 000000023a6e75c0 ffff83023b522940 ffff82d08032bc00 0000000000000282
(XEN) ffff8300bfcffdd8 ffff82d08012be2c ffff83023b4b5000 ffff83023b4f1000
(XEN) ffff8300bfd44000 ffff8300bfd47000 0000000000000001 ffff83023b4b40c0
(XEN) ffff880230c14440 0000000000000000 ffff8300bfcffde8 ffff82d08012c347
(XEN) ffff8300bfcffe08 ffff82d080169cea ffff83023b4b5000 0000000000000003
(XEN) ffff8300bfcffe18 ffff82d080169d65 ffff8300bfcffe38 ffff82d08010762a
(XEN) ffff83023b4b40c0 ffff83023b4b5000 ffff8300bfcffe68 ffff82d08010822a
(XEN) ffff8300bfcffe68 fffffffffffffff2 ffff88022061dcb4 0000000000000000
(XEN) ffff8300bfcffef8 ffff82d0801096fc 0000000000000001 ffff8300bfcfff18
(XEN) ffff8300bfcffef8 ffff82d080240e85 ffff8300bfcf8000 0000000000000000
(XEN) 0000000000000246 ffffffff810013aa 0000000000000003 ffffffff810013aa
(XEN) ffff8300bfcffee8 ffff8300bfd44000 ffff8802205e8000 0000000000000000
(XEN) ffff880230c14440 0000000000000000 00007cff403000c7 ffff82d0802439e2
(XEN) ffffffff8100140a 0000000000000020 0000000000000003 ffff880230c71900
(XEN) ffff8802206584d0 ffff880220658000 ffff88022061dcb8 0000000000000000
(XEN) 0000000000000206 0000000000000000 ffff880223000168 ffff880223408e00
(XEN) 0000000000000020 ffffffff8100140a 0000000000000000 ffff88022061dcb4
(XEN) 0000000000000004 0001010000000000 ffffffff8100140a 000000000000e033
(XEN) Xen call trace:
(XEN) [<ffff82d080128c07>] sched_rt.c#__replq_insert+0x2b/0x64
(XEN) [<ffff82d08012a072>] sched_rt.c#rt_vcpu_wake+0xf2/0x12c
(XEN) [<ffff82d08012be2c>] vcpu_wake+0x213/0x3d4
(XEN) [<ffff82d08012c347>] vcpu_unblock+0x4b/0x4d
(XEN) [<ffff82d080169cea>] vcpu_kick+0x20/0x6f
(XEN) [<ffff82d080169d65>] vcpu_mark_events_pending+0x2c/0x2f
(XEN) [<ffff82d08010762a>] event_2l.c#evtchn_2l_set_pending+0xa9/0xb9
(XEN) [<ffff82d08010822a>] evtchn_send+0x158/0x183
(XEN) [<ffff82d0801096fc>] do_event_channel_op+0xe21/0x147d
(XEN) [<ffff82d0802439e2>] lstar_enter+0xe2/0x13c
(XEN)
(XEN)
(XEN) ****************************************
(XEN) Panic on CPU 0:
(XEN) Assertion '!vcpu_on_replq(svc)' failed at sched_rt.c:524
(XEN) ****************************************
Thanks,
Tianyang
* Re: [PATCH v6][RFC]xen: sched: convert RTDS from time to event driven model
2016-02-26 5:15 ` Tianyang Chen
@ 2016-02-26 9:11 ` Dario Faggioli
2016-02-26 17:28 ` Tianyang Chen
0 siblings, 1 reply; 8+ messages in thread
From: Dario Faggioli @ 2016-02-26 9:11 UTC (permalink / raw)
To: Tianyang Chen, xen-devel; +Cc: george.dunlap, Dagaen Golomb, Meng Xu
On Fri, 2016-02-26 at 00:15 -0500, Tianyang Chen wrote:
> On 2/25/2016 6:31 PM, Dario Faggioli wrote:
> > As far as my opinion goes, this patch is not ready to
> > go in
> > right now (as I said, I've got questions and comments), but its
> > status
> > is way past RFC.
> >
> Oh OK, I had the impression that RFC means request for comments and
> it
> should always be used because indeed, I'm asking for comments.
>
Exactly. Everyone is asking for comments when sending out a patch
series, and that's why it's not necessary to state it with the tag...
it's always the case, so no need to say it every time!
And, for the same reason, this means that, when the tag is there,
you're not really asking for comments and/or, if everything is ok,
inclusion; you're asking for "just some feedback on a draft", and,
as I said, we're beyond that phase. :-)
> > Wait, maybe you misunderstood and/or I did not make myself clear
> > enough
> > (in which case, sorry). I never meant to say "always stop the
> > timer".
> > Atually, in one of my last email I said the opposite, and I think
> > that
> > would be the best thing to do.
> >
> > Do you think there is any specific reason why we need to always
> > stop
> > and restart it? If not, I think we can:
> > - have deadline_queue_remove() also return whether the element
> > removed was the first one (i.e., the one the timer was
> > programmed
> > after);
> > - if it was, re-program the timer after the new front of the
> > queue;
> > - if it wasn't, nothing to do.
> >
> > Thoughts?
> >
> It was my thought originally that the timer needs to be re-
> programmed
> only when the top vcpu is taken off. So did you mean when I
> manipulated
> repl_timer->expires manually, the timer should be stopped using
> proper
> timer API? The manipulation is gone now.
>
I know... This is mostly due to my fault commenting on this in two
different emails.
So, basically, yes, I meant that, if you want to fiddle with the timer
--like you were doing with those 'repl_timer->expires = 0'-- you need
to do it properly, with the proper API, locking, etc.
Then, in a subsequent email, I said that you only need to do that
in a subset of the cases when this function is called.
Of course, the desired result is the combination of the two above
considerations, i.e.:
- only stop/restart the timer when necessary,
- if necessary, do it properly.
> Also, set_timer internally
> disables the timer so I assume it's safe just to call set_timer()
> directly, right?
>
Yes, it is.
> > > @@ -405,22 +500,38 @@ __runq_insert(const struct scheduler *ops,
> > > struct rt_vcpu *svc)
> > >
> > > /* add svc to runq if svc still has budget */
> > > if ( svc->cur_budget > 0 )
> > > - {
> > > - list_for_each(iter, runq)
> > > - {
> > > - struct rt_vcpu * iter_svc = __q_elem(iter);
> > > - if ( svc->cur_deadline <= iter_svc->cur_deadline )
> > > - break;
> > > - }
> > > - list_add_tail(&svc->q_elem, iter);
> > > - }
> > > + deadline_queue_insert(&__q_elem, svc, &svc->q_elem,
> > > runq);
> > > else
> > > {
> > > list_add(&svc->q_elem, &prv->depletedq);
> > > + ASSERT( __vcpu_on_replq(svc) );
> > >
> > So, by doing this, you're telling me that, if the vcpu is added to
> > the
> > depleted queue, there must be a replenishment planned for it (or
> > the
> > ASSERT() would fail).
> >
> > But if we are adding it to the runq, there may or may not be a
> > replenishment planned for it.
> >
> > Is this what we want? Why there must not be a replenishment planned
> > already for a vcpu going into the runq (to happen at its next
> > deadline)?
> >
> It looks like the current code doesn't add a vcpu to the
> replenishment
> queue when vcpu_insert() is called. When the scheduler is
> initialized,
> all the vcpus are added to the replenishment queue after waking up
> from
> sleep. This needs to be changed (add vcpu to replq in vcpu_insert())
> to
> make it consistent in a sense that when rt_vcpu_insert() is called,
> it
> needs to have a corresponding replenishment event queued.
>
> This way the ASSERT() is for both cases in __runq_insert() to
> enforce
> the fact that "when a vcpu is inserted to runq/depletedq, a
> replenishment event is waiting for it".
>
I am ok with this (calling replq_insert() in rt_vcpu_insert()). Not
just doing that unconditionally though, as a vcpu can, e.g., be paused
when the insert_vcpu hook is called, and in that case, I don't think we
want to enqueue the replenishment event, do we?
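To be a bit more concrete, in rt_vcpu_insert() I'd expect something like
this (just a sketch; whether vcpu_runnable(vc) is the exact right check
for the 'paused vcpu' case is something to double check):

    /* with the proper scheduler lock held */
    if ( vcpu_runnable(vc) ) /* e.g., don't plan a replenishment for a paused vcpu */
        __replq_insert(ops, svc);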
> > static void
> > rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
> > {
> > struct rt_vcpu *svc = rt_vcpu(vc);
> > spinlock_t *lock = vcpu_schedule_lock_irq(vc);
> >
> > clear_bit(__RTDS_scheduled, &svc->flags);
> > /* not insert idle vcpu to runq */
> > if ( is_idle_vcpu(vc) )
> > goto out;
> >
> > if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags)
> > &&
> > likely(vcpu_runnable(vc)) )
> > {
> > __runq_insert(ops, svc);
> > runq_tickle(ops, svc);
> > }
> > else
> > __replq_remove(ops, svc);
> > out:
> > vcpu_schedule_unlock_irq(lock, vc);
> > }
> >
> > And, as said above, if you do this, try also removing the
> > __replq_remove() call from rt_vcpu_sleep(), this one should cover
> > for
> > that (and, actually, more than just that!).
> >
> > After all this, check whether you still have the assert in
> > __replq_insert() triggering and let me know
> So after moving the __replq_remove() to rt_context_saved(), the
> assert
> in __replq_insert() still fails when dom0 boots up.
>
Well, maybe removing __replq_remove() from rt_vcpu_sleep() is not
entirely ok, as if we do that we fail to deal with the case where,
still in rt_vcpu_sleep(), __vcpu_on_q(svc) is true.
So, I'd say, do as I said above wrt rt_context_saved(). For
rt_vcpu_sleep(), you can try something like this:
static void
rt_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
{
struct rt_vcpu * const svc = rt_vcpu(vc);
BUG_ON( is_idle_vcpu(vc) );
SCHED_STAT_CRANK(vcpu_sleep);
if ( curr_on_cpu(vc->processor) == vc )
cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
else if ( __vcpu_on_q(svc) )
{
__q_remove(svc);
__replq_remove(ops, svc); <=== *** LOOK HERE ***
}
else if ( svc->flags & RTDS_delayed_runq_add )
clear_bit(__RTDS_delayed_runq_add, &svc->flags);
}
In fact, in both the first and the third case, we will at some point
pass through rt_context_saved(), and hit the __replq_remove() that I
made you put there.
In the middle case, since the vcpu was just queued and putting it to
sleep only requires removing it from the runq (or depletedq, in the
case of this scheduler), we won't go through
rt_schedule()-->rt_context_saved(), and hence the __replq_remove()
there won't be called.
Sorry for the oversight; can you try this?
That being said...
> (XEN) Assertion '!vcpu_on_replq(svc)' failed at sched_rt.c:524
> (XEN) ----[ Xen-4.7-unstable x86_64 debug=y Tainted: C ]----
>
> (XEN) Xen call trace:
> (XEN) [<ffff82d080128c07>] sched_rt.c#__replq_insert+0x2b/0x64
> (XEN) [<ffff82d08012a072>] sched_rt.c#rt_vcpu_wake+0xf2/0x12c
> (XEN) [<ffff82d08012be2c>] vcpu_wake+0x213/0x3d4
> (XEN) [<ffff82d08012c347>] vcpu_unblock+0x4b/0x4d
> (XEN) [<ffff82d080169cea>] vcpu_kick+0x20/0x6f
> (XEN) [<ffff82d080169d65>] vcpu_mark_events_pending+0x2c/0x2f
> (XEN) [<ffff82d08010762a>]
> event_2l.c#evtchn_2l_set_pending+0xa9/0xb9
> (XEN) [<ffff82d08010822a>] evtchn_send+0x158/0x183
> (XEN) [<ffff82d0801096fc>] do_event_channel_op+0xe21/0x147d
> (XEN) [<ffff82d0802439e2>] lstar_enter+0xe2/0x13c
> (XEN)
>
... This says that we call __replq_insert() from rt_vcpu_wake() and
find out in there that vcpu_on_replq() is true.
However, in v6 code for rt_vcpu_wake(), I can see this:
+ if( !__vcpu_on_replq(svc) )
+ {
+ __replq_insert(ops, svc);
+ ASSERT( vcpu_runnable(vc) );
+ }
which would make me think that, if vcpu_on_replq() is true, we
shouldn't be calling __replq_insert() in the first place.
So, have you made other changes wrt v6 when trying this?
Regards,
Dario
--
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
* Re: [PATCH v6][RFC]xen: sched: convert RTDS from time to event driven model
2016-02-26 9:11 ` Dario Faggioli
@ 2016-02-26 17:28 ` Tianyang Chen
2016-02-26 18:09 ` Dario Faggioli
0 siblings, 1 reply; 8+ messages in thread
From: Tianyang Chen @ 2016-02-26 17:28 UTC (permalink / raw)
To: Dario Faggioli, xen-devel; +Cc: george.dunlap, Dagaen Golomb, Meng Xu
On 2/26/2016 4:11 AM, Dario Faggioli wrote:
>> It looks like the current code doesn't add a vcpu to the
>> replenishment
>> queue when vcpu_insert() is called. When the scheduler is
>> initialized,
>> all the vcpus are added to the replenishment queue after waking up
>> from
>> sleep. This needs to be changed (add vcpu to replq in vcpu_insert())
>> to
>> make it consistent in a sense that when rt_vcpu_insert() is called,
>> it
>> needs to have a corresponding replenishment event queued.
>>
>> This way the ASSERT() is for both cases in __runq_insert() to
>> enforce
>> the fact that "when a vcpu is inserted to runq/depletedq, a
>> replenishment event is waiting for it".
>>
> I am ok with this (calling replq_insert() in rt_vcpu_insert()). Not
> just doing that unconditionally though, as a vcpu can, e.g., be paused
> when the insert_vcpu hook is called, and in that case, I don't think we
> want to enqueue the replenishment event, do we?
>
Yes.
>>> static void
>>> rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
>>> {
>>> struct rt_vcpu *svc = rt_vcpu(vc);
>>> spinlock_t *lock = vcpu_schedule_lock_irq(vc);
>>>
>>> clear_bit(__RTDS_scheduled, &svc->flags);
>>> /* not insert idle vcpu to runq */
>>> if ( is_idle_vcpu(vc) )
>>> goto out;
>>>
>>> if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags)
>>> &&
>>> likely(vcpu_runnable(vc)) )
>>> {
>>> __runq_insert(ops, svc);
>>> runq_tickle(ops, svc);
>>> }
>>> else
>>> __replq_remove(ops, svc);
>>> out:
>>> vcpu_schedule_unlock_irq(lock, vc);
>>> }
>>>
>>> And, as said above, if you do this, try also removing the
>>> __replq_remove() call from rt_vcpu_sleep(), this one should cover
>>> for
>>> that (and, actually, more than just that!).
>>>
>>> After all this, check whether you still have the assert in
>>> __replq_insert() triggering and let me know
>> So after moving the __replq_remove() to rt_context_saved(), the
>> assert
>> in __replq_insert() still fails when dom0 boots up.
>>
> Well, maybe removing __replq_remove() from rt_vcpu_sleep() is not
> entirely ok, as if we do that we fail to deal with the case of when
> (still in rt_vcpu_sleep()), __vcpu_on_q(svc) is true.
>
> So, I'd say, do as I said above wrt rt_context_saved(). For
> rt_vcpu_sleep(), you can try something like this:
>
> static void
> rt_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
> {
> struct rt_vcpu * const svc = rt_vcpu(vc);
>
> BUG_ON( is_idle_vcpu(vc) );
> SCHED_STAT_CRANK(vcpu_sleep);
>
> if ( curr_on_cpu(vc->processor) == vc )
> cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
> else if ( __vcpu_on_q(svc) )
> {
> __q_remove(svc);
> __replq_remove(ops, svc); <=== *** LOOK HERE ***
> }
> else if ( svc->flags & RTDS_delayed_runq_add )
> clear_bit(__RTDS_delayed_runq_add, &svc->flags);
> }
>
> In fact, in both the first and the third case, we go will at some point
> pass through rt_context_switch(), and hit the __replq_remove() that I
> made you put there.
>
> In the case in the middle, as the vcpu was just queued, and for making
> it go to sleep, it is enough to remove it from the runq (or depletedq,
> in the case of this scheduler), we won't go through
> rt_schedule()-->rt_context_saved(), and hence the __replq_remove()
> won't be called.
>
> Sorry for the overlook, can you try this.
>
> That being said...
>
>> (XEN) Assertion '!vcpu_on_replq(svc)' failed at sched_rt.c:524
>> (XEN) ----[ Xen-4.7-unstable x86_64 debug=y Tainted: C ]----
>>
>> (XEN) Xen call trace:
>> (XEN) [<ffff82d080128c07>] sched_rt.c#__replq_insert+0x2b/0x64
>> (XEN) [<ffff82d08012a072>] sched_rt.c#rt_vcpu_wake+0xf2/0x12c
>> (XEN) [<ffff82d08012be2c>] vcpu_wake+0x213/0x3d4
>> (XEN) [<ffff82d08012c347>] vcpu_unblock+0x4b/0x4d
>> (XEN) [<ffff82d080169cea>] vcpu_kick+0x20/0x6f
>> (XEN) [<ffff82d080169d65>] vcpu_mark_events_pending+0x2c/0x2f
>> (XEN) [<ffff82d08010762a>]
>> event_2l.c#evtchn_2l_set_pending+0xa9/0xb9
>> (XEN) [<ffff82d08010822a>] evtchn_send+0x158/0x183
>> (XEN) [<ffff82d0801096fc>] do_event_channel_op+0xe21/0x147d
>> (XEN) [<ffff82d0802439e2>] lstar_enter+0xe2/0x13c
>> (XEN)
>>
> ... This says that the we call __replq_insert() from rt_vcpu_wake() and
> find out in there that vcpu_on_replq() is true.
>
> However, in v6 code for rt_vcpu_wake(), I can see this:
>
> + if( !__vcpu_on_replq(svc) )
> + {
> + __replq_insert(ops, svc);
> + ASSERT( vcpu_runnable(vc) );
> + }
>
> which would make me think that, if vcpu_on_replq() is true, we
> shouldn't be calling __replq_insert() in the first place.
>
> So, have you made other changes wrt v6 when trying this?
The v6 I submitted doesn't have the if statement commented out.
But when I tried commenting it out, the assertion failed.
It fails in V6 with:
rt_vcpu_sleep(): removing replenishment event for all cases
rt_context_saved(): not removing replenishment events
rt_vcpu_wake(): not checking if the event is already queued.
or with:
rt_vcpu_sleep(): not removing replenishment event at all
rt_context_saved(): removing replenishment events if not runnable
rt_vcpu_wake(): not checking if the event is already queued.
Also with:
rt_vcpu_sleep(): removing replenishment event if the vcpu is on
runq/depletedq
rt_context_saved(): removing replenishment events if not runnable
rt_vcpu_wake(): not checking if the event is already queued.
I added debug prints in all these functions and noticed that the failure
could be caused by a race between spurious wakeups and context switching.
See the following events for the last configuration above:
(XEN) cpu1 picked idle
(XEN) d0 attempted to change d0v1's CR4 flags 00000620 -> 00040660
(XEN) cpu2 picked idle
(XEN) vcpu1 sleeps on cpu
(XEN) cpu0 picked idle
(XEN) vcpu1 context saved not runnable
(XEN) vcpu1 wakes up nowhere
(XEN) cpu0 picked vcpu1
(XEN) vcpu1 sleeps on cpu
(XEN) cpu0 picked idle
(XEN) vcpu1 context saved not runnable
(XEN) vcpu1 wakes up nowhere
(XEN) cpu0 picked vcpu1
(XEN) cpu0 picked idle
(XEN) vcpu1 context saved not runnable
(XEN) cpu0 picked vcpu0
(XEN) vcpu1 wakes up nowhere
(XEN) cpu1 picked vcpu1 *** vcpu1 is on a cpu
(XEN) cpu1 picked idle *** vcpu1 is waiting to be context switched
(XEN) vcpu2 wakes up nowhere
(XEN) cpu0 picked vcpu0
(XEN) cpu2 picked vcpu2
(XEN) cpu0 picked vcpu0
(XEN) cpu0 picked vcpu0
(XEN) d0 attempted to change d0v2's CR4 flags 00000620 -> 00040660
(XEN) cpu0 picked vcpu0
(XEN) vcpu2 sleeps on cpu
(XEN) vcpu1 wakes up nowhere *** vcpu1 wakes up without sleep?
(XEN) Assertion '!__vcpu_on_replq(svc)' failed at sched_rt.c:526
(XEN) ----[ Xen-4.7-unstable x86_64 debug=y Tainted: C ]----
(XEN) CPU: 0
(XEN) RIP: e008:[<ffff82d08012a151>] sched_rt.c#rt_vcpu_wake+0x11f/0x17b
...
(XEN) Xen call trace:
(XEN) [<ffff82d08012a151>] sched_rt.c#rt_vcpu_wake+0x11f/0x17b
(XEN) [<ffff82d08012bf2c>] vcpu_wake+0x213/0x3d4
(XEN) [<ffff82d08012c447>] vcpu_unblock+0x4b/0x4d
(XEN) [<ffff82d080169cea>] vcpu_kick+0x20/0x6f
(XEN) [<ffff82d080169d65>] vcpu_mark_events_pending+0x2c/0x2f
(XEN) [<ffff82d08010762a>] event_2l.c#evtchn_2l_set_pending+0xa9/0xb9
(XEN) [<ffff82d080108312>] send_guest_vcpu_virq+0x9d/0xba
(XEN) [<ffff82d080196cbe>] send_timer_event+0xe/0x10
(XEN) [<ffff82d08012a7b5>] schedule.c#vcpu_singleshot_timer_fn+0x9/0xb
(XEN) [<ffff82d080131978>] timer.c#execute_timer+0x4e/0x6c
(XEN) [<ffff82d080131ab9>] timer.c#timer_softirq_action+0xdd/0x213
(XEN) [<ffff82d08012df32>] softirq.c#__do_softirq+0x82/0x8d
(XEN) [<ffff82d08012df8a>] do_softirq+0x13/0x15
(XEN) [<ffff82d080243ad1>] cpufreq.c#process_softirqs+0x21/0x30
So, it looks like a spurious wakeup for vcpu1 happens before it is
completely context-switched off a cpu. But rt_vcpu_wake() didn't see it
on a cpu via curr_on_cpu(), so it fell through the first two returns.
I guess the replenishment queue check is necessary for this situation?
Thanks,
Tianyang
* Re: [PATCH v6][RFC]xen: sched: convert RTDS from time to event driven model
2016-02-26 17:28 ` Tianyang Chen
@ 2016-02-26 18:09 ` Dario Faggioli
2016-02-26 18:33 ` Chen, Tianyang
0 siblings, 1 reply; 8+ messages in thread
From: Dario Faggioli @ 2016-02-26 18:09 UTC (permalink / raw)
To: Tianyang Chen, xen-devel; +Cc: george.dunlap, Dagaen Golomb, Meng Xu
[-- Attachment #1.1: Type: text/plain, Size: 3866 bytes --]
On Fri, 2016-02-26 at 12:28 -0500, Tianyang Chen wrote:
> > So, have you made other changes wrt v6 when trying this?
> The v6 doesn't have the if statement commented out when I submitted
> it.
> But I tried commenting it out, the assertion failed.
>
Ok, thanks for these tests. Can you send (just quick-&-dirtily, as an
attachment to a reply to this email, no need of a proper re-submission
of a new version) the patch that does this:
> rt_vcpu_sleep(): removing replenishment event if the vcpu is on
> runq/depletedq
> rt_context_saved(): removing replenishment events if not runnable
> rt_vcpu_wake(): not checking if the event is already queued.
>
> I added debug prints in all these functions and noticed that it could
> be
> caused by racing between spurious wakeups and context switching.
>
And the code that produces this debug output as well?
> (XEN) cpu1 picked idle
> (XEN) d0 attempted to change d0v1's CR4 flags 00000620 -> 00040660
> (XEN) cpu2 picked idle
> (XEN) vcpu1 sleeps on cpu
> (XEN) cpu0 picked idle
> (XEN) vcpu1 context saved not runnable
> (XEN) vcpu1 wakes up nowhere
> (XEN) cpu0 picked vcpu1
> (XEN) vcpu1 sleeps on cpu
> (XEN) cpu0 picked idle
> (XEN) vcpu1 context saved not runnable
> (XEN) vcpu1 wakes up nowhere
> (XEN) cpu0 picked vcpu1
> (XEN) cpu0 picked idle
> (XEN) vcpu1 context saved not runnable
> (XEN) cpu0 picked vcpu0
> (XEN) vcpu1 wakes up nowhere
> (XEN) cpu1 picked vcpu1 *** vcpu1 is on a cpu
> (XEN) cpu1 picked idle *** vcpu1 is waiting to be context
> switched
> (XEN) vcpu2 wakes up nowhere
> (XEN) cpu0 picked vcpu0
> (XEN) cpu2 picked vcpu2
> (XEN) cpu0 picked vcpu0
> (XEN) cpu0 picked vcpu0
> (XEN) d0 attempted to change d0v2's CR4 flags 00000620 -> 00040660
> (XEN) cpu0 picked vcpu0
> (XEN) vcpu2 sleeps on cpu
> (XEN) vcpu1 wakes up nowhere *** vcpu1 wakes up without sleep?
>
> (XEN) Assertion '!__vcpu_on_replq(svc)' failed at sched_rt.c:526
> (XEN) ----[ Xen-4.7-unstable x86_64 debug=y Tainted: C ]----
> (XEN) CPU: 0
> (XEN) RIP: e008:[<ffff82d08012a151>]
> sched_rt.c#rt_vcpu_wake+0x11f/0x17b
> ...
> (XEN) Xen call trace:
> (XEN) [<ffff82d08012a151>] sched_rt.c#rt_vcpu_wake+0x11f/0x17b
> (XEN) [<ffff82d08012bf2c>] vcpu_wake+0x213/0x3d4
> (XEN) [<ffff82d08012c447>] vcpu_unblock+0x4b/0x4d
> (XEN) [<ffff82d080169cea>] vcpu_kick+0x20/0x6f
> (XEN) [<ffff82d080169d65>] vcpu_mark_events_pending+0x2c/0x2f
> (XEN) [<ffff82d08010762a>]
> event_2l.c#evtchn_2l_set_pending+0xa9/0xb9
> (XEN) [<ffff82d080108312>] send_guest_vcpu_virq+0x9d/0xba
> (XEN) [<ffff82d080196cbe>] send_timer_event+0xe/0x10
> (XEN) [<ffff82d08012a7b5>]
> schedule.c#vcpu_singleshot_timer_fn+0x9/0xb
> (XEN) [<ffff82d080131978>] timer.c#execute_timer+0x4e/0x6c
> (XEN) [<ffff82d080131ab9>] timer.c#timer_softirq_action+0xdd/0x213
> (XEN) [<ffff82d08012df32>] softirq.c#__do_softirq+0x82/0x8d
> (XEN) [<ffff82d08012df8a>] do_softirq+0x13/0x15
> (XEN) [<ffff82d080243ad1>] cpufreq.c#process_softirqs+0x21/0x30
>
>
> So, it looks like spurious wakeup for vcpu1 happens before it was
> completely context switched off a cpu. But rt_vcpu_wake() didn't see
> it
> on cpu with curr_on_cpu() so it fell through the first two RETURNs.
>
> I guess the replenishment queue check is necessary for this
> situation?
>
Perhaps, but I first want to make sure we understand what is really
happening.
Regards,
Dario
--
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]
* Re: [PATCH v6][RFC]xen: sched: convert RTDS from time to event driven model
2016-02-26 18:09 ` Dario Faggioli
@ 2016-02-26 18:33 ` Chen, Tianyang
2016-03-04 16:34 ` Dario Faggioli
0 siblings, 1 reply; 8+ messages in thread
From: Chen, Tianyang @ 2016-02-26 18:33 UTC (permalink / raw)
To: Dario Faggioli; +Cc: xen-devel, george.dunlap, Dagaen Golomb, Meng Xu
[-- Attachment #1: Type: text/plain, Size: 3680 bytes --]
Attached.
Tianyang
On 2016-02-26 13:09, Dario Faggioli wrote:
> On Fri, 2016-02-26 at 12:28 -0500, Tianyang Chen wrote:
>> > So, have you made other changes wrt v6 when trying this?
>> The v6 doesn't have the if statement commented out when I submitted
>> it.
>> But I tried commenting it out, the assertion failed.
>>
> Ok, thanks for these tests. Can you send (just quick-&-dirtily, as an
> attached to a replay to this email, no need of a proper re-submission
> of a new version) the patch that does this:
>
>> rt_vcpu_sleep(): removing replenishment event if the vcpu is on
>> runq/depletedq
>> rt_context_saved(): removing replenishment events if not runnable
>> rt_vcpu_wake(): not checking if the event is already queued.
>>
>> I added debug prints in all these functions and noticed that it could
>> be
>> caused by racing between spurious wakeups and context switching.
>>
> And the code that produces these debug output as well?
>
>> (XEN) cpu1 picked idle
>> (XEN) d0 attempted to change d0v1's CR4 flags 00000620 -> 00040660
>> (XEN) cpu2 picked idle
>> (XEN) vcpu1 sleeps on cpu
>> (XEN) cpu0 picked idle
>> (XEN) vcpu1 context saved not runnable
>> (XEN) vcpu1 wakes up nowhere
>> (XEN) cpu0 picked vcpu1
>> (XEN) vcpu1 sleeps on cpu
>> (XEN) cpu0 picked idle
>> (XEN) vcpu1 context saved not runnable
>> (XEN) vcpu1 wakes up nowhere
>> (XEN) cpu0 picked vcpu1
>> (XEN) cpu0 picked idle
>> (XEN) vcpu1 context saved not runnable
>> (XEN) cpu0 picked vcpu0
>> (XEN) vcpu1 wakes up nowhere
>> (XEN) cpu1 picked vcpu1 *** vcpu1 is on a cpu
>> (XEN) cpu1 picked idle *** vcpu1 is waiting to be context
>> switched
>> (XEN) vcpu2 wakes up nowhere
>> (XEN) cpu0 picked vcpu0
>> (XEN) cpu2 picked vcpu2
>> (XEN) cpu0 picked vcpu0
>> (XEN) cpu0 picked vcpu0
>> (XEN) d0 attempted to change d0v2's CR4 flags 00000620 -> 00040660
>> (XEN) cpu0 picked vcpu0
>> (XEN) vcpu2 sleeps on cpu
>> (XEN) vcpu1 wakes up nowhere *** vcpu1 wakes up without sleep?
>>
>> (XEN) Assertion '!__vcpu_on_replq(svc)' failed at sched_rt.c:526
>> (XEN) ----[ Xen-4.7-unstable x86_64 debug=y Tainted: C ]----
>> (XEN) CPU: 0
>> (XEN) RIP: e008:[<ffff82d08012a151>]
>> sched_rt.c#rt_vcpu_wake+0x11f/0x17b
>> ...
>> (XEN) Xen call trace:
>> (XEN) [<ffff82d08012a151>] sched_rt.c#rt_vcpu_wake+0x11f/0x17b
>> (XEN) [<ffff82d08012bf2c>] vcpu_wake+0x213/0x3d4
>> (XEN) [<ffff82d08012c447>] vcpu_unblock+0x4b/0x4d
>> (XEN) [<ffff82d080169cea>] vcpu_kick+0x20/0x6f
>> (XEN) [<ffff82d080169d65>] vcpu_mark_events_pending+0x2c/0x2f
>> (XEN) [<ffff82d08010762a>]
>> event_2l.c#evtchn_2l_set_pending+0xa9/0xb9
>> (XEN) [<ffff82d080108312>] send_guest_vcpu_virq+0x9d/0xba
>> (XEN) [<ffff82d080196cbe>] send_timer_event+0xe/0x10
>> (XEN) [<ffff82d08012a7b5>]
>> schedule.c#vcpu_singleshot_timer_fn+0x9/0xb
>> (XEN) [<ffff82d080131978>] timer.c#execute_timer+0x4e/0x6c
>> (XEN) [<ffff82d080131ab9>] timer.c#timer_softirq_action+0xdd/0x213
>> (XEN) [<ffff82d08012df32>] softirq.c#__do_softirq+0x82/0x8d
>> (XEN) [<ffff82d08012df8a>] do_softirq+0x13/0x15
>> (XEN) [<ffff82d080243ad1>] cpufreq.c#process_softirqs+0x21/0x30
>>
>>
>> So, it looks like spurious wakeup for vcpu1 happens before it was
>> completely context switched off a cpu. But rt_vcpu_wake() didn't see
>> it
>> on cpu with curr_on_cpu() so it fell through the first two RETURNs.
>>
>> I guess the replenishment queue check is necessary for this
>> situation?
>>
> Perhaps, but I first want to make sure we understand what is really
> happening.
>
> Regards,
> Dario
[-- Attachment #2: sched_rt.c --]
[-- Type: text/x-c; name=sched_rt.c, Size: 37986 bytes --]
/*****************************************************************************
* Preemptive Global Earliest Deadline First (EDF) scheduler for Xen
* EDF scheduling is a real-time scheduling algorithm used in the embedded field.
*
* by Sisu Xi, 2013, Washington University in Saint Louis
* and Meng Xu, 2014, University of Pennsylvania
*
* based on the code of the credit scheduler
*/
#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/timer.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <asm/atomic.h>
#include <xen/errno.h>
#include <xen/trace.h>
#include <xen/cpu.h>
#include <xen/keyhandler.h>
#include <xen/trace.h>
#include <xen/guest_access.h>
/*
* TODO:
*
* Migration compensation and resistance, like credit2, to make better use of cache;
* Lock Holder Problem, using yield?
* Self switch problem: VCPUs of the same domain may preempt each other;
*/
/*
* Design:
*
* This scheduler follows the Preemptive Global Earliest Deadline First (EDF)
* theory from the real-time field.
* At any scheduling point, the VCPU with earlier deadline has higher priority.
* The scheduler always picks highest priority VCPU to run on a feasible PCPU.
* A PCPU is feasible if the VCPU can run on this PCPU and (the PCPU is idle or
* has a lower-priority VCPU running on it.)
*
* Each VCPU has a dedicated period and budget.
* The deadline of a VCPU is at the end of each period;
* A VCPU has its budget replenished at the beginning of each period;
* While scheduled, a VCPU burns its budget.
* The VCPU needs to finish its budget before its deadline in each period;
* The VCPU discards its unused budget at the end of each period.
* If a VCPU runs out of budget in a period, it has to wait until next period.
*
* Each VCPU is implemented as a deferrable server.
* When a VCPU has a task running on it, its budget is continuously burned;
* When a VCPU has no task but with budget left, its budget is preserved.
*
* Queue scheme:
* A global runqueue and a global depletedqueue for each CPU pool.
* The runqueue holds all runnable VCPUs with budget, sorted by deadline;
* The depletedqueue holds all VCPUs without budget, unsorted;
*
* Note: cpumask and cpupool are supported.
*/
/*
* Locking:
* A global system lock is used to protect the RunQ and DepletedQ.
* The global lock is referenced by schedule_data.schedule_lock
* from all physical cpus.
*
* The lock is already grabbed when calling the wake/sleep/schedule functions
* in schedule.c
*
* The functions that involve the RunQ and need to grab the lock are:
* vcpu_insert, vcpu_remove, context_saved, __runq_insert
*/
/*
* Default parameters:
* The default period and budget are 10 and 4 ms, respectively
*/
#define RTDS_DEFAULT_PERIOD (MICROSECS(10000))
#define RTDS_DEFAULT_BUDGET (MICROSECS(4000))
#define UPDATE_LIMIT_SHIFT 10
/*
* Flags
*/
/*
* RTDS_scheduled: Is this vcpu either running on, or context-switching off,
* a physical cpu?
* + Accessed only with global lock held.
* + Set when chosen as next in rt_schedule().
* + Cleared after context switch has been saved in rt_context_saved()
* + Checked in vcpu_wake to see if we can add to the Runqueue, or if we should
* set RTDS_delayed_runq_add
* + Checked to be false in runq_insert.
*/
#define __RTDS_scheduled 1
#define RTDS_scheduled (1<<__RTDS_scheduled)
/*
* RTDS_delayed_runq_add: Do we need to add this to the RunQ/DepletedQ
* once it's done being context switching out?
* + Set when scheduling out in rt_schedule() if prev is runnable
* + Set in rt_vcpu_wake if it finds RTDS_scheduled set
* + Read in rt_context_saved(). If set, it adds prev to the Runqueue/DepletedQ
* and clears the bit.
*/
#define __RTDS_delayed_runq_add 2
#define RTDS_delayed_runq_add (1<<__RTDS_delayed_runq_add)
/*
* rt tracing events ("only" 512 available!). Check
* include/public/trace.h for more details.
*/
#define TRC_RTDS_TICKLE TRC_SCHED_CLASS_EVT(RTDS, 1)
#define TRC_RTDS_RUNQ_PICK TRC_SCHED_CLASS_EVT(RTDS, 2)
#define TRC_RTDS_BUDGET_BURN TRC_SCHED_CLASS_EVT(RTDS, 3)
#define TRC_RTDS_BUDGET_REPLENISH TRC_SCHED_CLASS_EVT(RTDS, 4)
#define TRC_RTDS_SCHED_TASKLET TRC_SCHED_CLASS_EVT(RTDS, 5)
/*
* Useful to avoid too many cpumask_var_t on the stack.
*/
static cpumask_var_t *_cpumask_scratch;
#define cpumask_scratch _cpumask_scratch[smp_processor_id()]
/*
* We want to only allocate the _cpumask_scratch array the first time an
* instance of this scheduler is used, and avoid reallocating and leaking
* the old one when more instance are activated inside new cpupools. We
* also want to get rid of it when the last instance is de-inited.
*
* So we (sort of) reference count the number of initialized instances. This
* does not need to happen via atomic_t refcounters, as it only happens either
* during boot, or under the protection of the cpupool_lock spinlock.
*/
static unsigned int nr_rt_ops;
/* handler for the replenishment timer */
static void repl_handler(void *data);
/*
* System-wide private data, including the global RunQueue/DepletedQ
* Global lock is referenced by schedule_data.schedule_lock from all
* physical cpus. It can be grabbed via vcpu_schedule_lock_irq()
*/
struct rt_private {
spinlock_t lock; /* the global coarse-grained lock */
struct list_head sdom; /* list of available domains, used for dump */
struct list_head runq; /* ordered list of runnable vcpus */
struct list_head depletedq; /* unordered list of depleted vcpus */
struct list_head replq; /* ordered list of vcpus that need replenishment */
cpumask_t tickled; /* cpus been tickled */
struct timer *repl_timer; /* replenishment timer */
};
/*
* Virtual CPU
*/
struct rt_vcpu {
struct list_head q_elem; /* on the runq/depletedq list */
struct list_head replq_elem;/* on the repl event list */
/* Up-pointers */
struct rt_dom *sdom;
struct vcpu *vcpu;
/* VCPU parameters, in nanoseconds */
s_time_t period;
s_time_t budget;
/* VCPU current information, in nanoseconds */
s_time_t cur_budget; /* current budget */
s_time_t last_start; /* last start time */
s_time_t cur_deadline; /* current deadline for EDF */
unsigned flags; /* mark __RTDS_scheduled, etc.. */
};
/*
* Domain
*/
struct rt_dom {
struct list_head sdom_elem; /* link list on rt_priv */
struct domain *dom; /* pointer to upper domain */
};
/*
* Useful inline functions
*/
static inline struct rt_private *rt_priv(const struct scheduler *ops)
{
return ops->sched_data;
}
static inline struct rt_vcpu *rt_vcpu(const struct vcpu *vcpu)
{
return vcpu->sched_priv;
}
static inline struct rt_dom *rt_dom(const struct domain *dom)
{
return dom->sched_priv;
}
static inline struct list_head *rt_runq(const struct scheduler *ops)
{
return &rt_priv(ops)->runq;
}
static inline struct list_head *rt_depletedq(const struct scheduler *ops)
{
return &rt_priv(ops)->depletedq;
}
static inline struct list_head *rt_replq(const struct scheduler *ops)
{
return &rt_priv(ops)->replq;
}
/*
* Queue helper functions for runq, depletedq
* and replenishment event queue
*/
static int
__vcpu_on_q(const struct rt_vcpu *svc)
{
return !list_empty(&svc->q_elem);
}
static struct rt_vcpu *
__q_elem(struct list_head *elem)
{
return list_entry(elem, struct rt_vcpu, q_elem);
}
static struct rt_vcpu *
__replq_elem(struct list_head *elem)
{
return list_entry(elem, struct rt_vcpu, replq_elem);
}
static int
__vcpu_on_replq(const struct rt_vcpu *svc)
{
return !list_empty(&svc->replq_elem);
}
/*
* Debug related code, dump vcpu/cpu information
*/
static void
rt_dump_vcpu(const struct scheduler *ops, const struct rt_vcpu *svc)
{
cpumask_t *cpupool_mask, *mask;
ASSERT(svc != NULL);
/* idle vcpu */
if( svc->sdom == NULL )
{
printk("\n");
return;
}
/*
* We can't just use 'cpumask_scratch' because the dumping can
* happen from a pCPU outside of this scheduler's cpupool, and
* hence it's not right to use the pCPU's scratch mask (which
* may even not exist!). On the other hand, it is safe to use
* svc->vcpu->processor's own scratch space, since we hold the
* runqueue lock.
*/
mask = _cpumask_scratch[svc->vcpu->processor];
cpupool_mask = cpupool_domain_cpumask(svc->vcpu->domain);
cpumask_and(mask, cpupool_mask, svc->vcpu->cpu_hard_affinity);
cpulist_scnprintf(keyhandler_scratch, sizeof(keyhandler_scratch), mask);
printk("[%5d.%-2u] cpu %u, (%"PRI_stime", %"PRI_stime"),"
" cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime"\n"
" \t\t onQ=%d runnable=%d flags=%x effective hard_affinity=%s\n",
svc->vcpu->domain->domain_id,
svc->vcpu->vcpu_id,
svc->vcpu->processor,
svc->period,
svc->budget,
svc->cur_budget,
svc->cur_deadline,
svc->last_start,
__vcpu_on_q(svc),
vcpu_runnable(svc->vcpu),
svc->flags,
keyhandler_scratch);
}
static void
rt_dump_pcpu(const struct scheduler *ops, int cpu)
{
struct rt_private *prv = rt_priv(ops);
unsigned long flags;
spin_lock_irqsave(&prv->lock, flags);
rt_dump_vcpu(ops, rt_vcpu(curr_on_cpu(cpu)));
spin_unlock_irqrestore(&prv->lock, flags);
}
static void
rt_dump(const struct scheduler *ops)
{
struct list_head *runq, *depletedq, *replq, *iter;
struct rt_private *prv = rt_priv(ops);
struct rt_vcpu *svc;
struct rt_dom *sdom;
unsigned long flags;
spin_lock_irqsave(&prv->lock, flags);
if ( list_empty(&prv->sdom) )
goto out;
runq = rt_runq(ops);
depletedq = rt_depletedq(ops);
replq = rt_replq(ops);
printk("Global RunQueue info:\n");
list_for_each( iter, runq )
{
svc = __q_elem(iter);
rt_dump_vcpu(ops, svc);
}
printk("Global DepletedQueue info:\n");
list_for_each( iter, depletedq )
{
svc = __q_elem(iter);
rt_dump_vcpu(ops, svc);
}
printk("Global Replenishment Event info:\n");
list_for_each( iter, replq )
{
svc = __replq_elem(iter);
rt_dump_vcpu(ops, svc);
}
printk("Domain info:\n");
list_for_each( iter, &prv->sdom )
{
struct vcpu *v;
sdom = list_entry(iter, struct rt_dom, sdom_elem);
printk("\tdomain: %d\n", sdom->dom->domain_id);
for_each_vcpu ( sdom->dom, v )
{
svc = rt_vcpu(v);
rt_dump_vcpu(ops, svc);
}
}
out:
spin_unlock_irqrestore(&prv->lock, flags);
}
/*
* update deadline and budget when now >= cur_deadline
* it needs to be updated to the deadline of the current period
*/
static void
rt_update_deadline(s_time_t now, struct rt_vcpu *svc)
{
ASSERT(now >= svc->cur_deadline);
ASSERT(svc->period != 0);
if ( svc->cur_deadline + (svc->period << UPDATE_LIMIT_SHIFT) > now )
{
do
svc->cur_deadline += svc->period;
while ( svc->cur_deadline <= now );
}
else
{
long count = ((now - svc->cur_deadline) / svc->period) + 1;
svc->cur_deadline += count * svc->period;
}
svc->cur_budget = svc->budget;
/* TRACE */
{
struct {
unsigned dom:16,vcpu:16;
unsigned cur_deadline_lo, cur_deadline_hi;
unsigned cur_budget_lo, cur_budget_hi;
} d;
d.dom = svc->vcpu->domain->domain_id;
d.vcpu = svc->vcpu->vcpu_id;
d.cur_deadline_lo = (unsigned) svc->cur_deadline;
d.cur_deadline_hi = (unsigned) (svc->cur_deadline >> 32);
d.cur_budget_lo = (unsigned) svc->cur_budget;
d.cur_budget_hi = (unsigned) (svc->cur_budget >> 32);
trace_var(TRC_RTDS_BUDGET_REPLENISH, 1,
sizeof(d),
(unsigned char *) &d);
}
return;
}
/* a helper function that only removes a vcpu from a queue */
static inline void
deadline_queue_remove(struct list_head *elem)
{
list_del_init(elem);
}
static inline void
__q_remove(struct rt_vcpu *svc)
{
if ( __vcpu_on_q(svc) )
deadline_queue_remove(&svc->q_elem);
}
/*
* Removing a vcpu from the replenishment queue could
* re-program the timer for the next replenishment event
* if there is any on the list
*/
static inline void
__replq_remove(const struct scheduler *ops, struct rt_vcpu *svc)
{
struct rt_private *prv = rt_priv(ops);
struct list_head *replq = rt_replq(ops);
struct timer* repl_timer = prv->repl_timer;
/*
* Disarm the timer before re-programming it.
* It doesn't matter if the vcpu to be removed
* is on top of the list or not because the timer
* is stopped and needs to be re-programmed anyway
*/
stop_timer(repl_timer);
deadline_queue_remove(&svc->replq_elem);
/* re-arm the timer for the next replenishment event */
if( !list_empty(replq) )
{
struct rt_vcpu *svc_next = __replq_elem(replq->next);
set_timer(repl_timer, svc_next->cur_deadline);
}
}
/*
* A utility function that inserts a vcpu into a
* queue based on a certain order (EDF). The returned
* value is 1 if the vcpu has been inserted at the
* front of the list
*/
static int
deadline_queue_insert(struct rt_vcpu * (*_get_q_elem)(struct list_head *elem),
struct rt_vcpu *svc, struct list_head *elem, struct list_head *queue)
{
struct list_head *iter;
int pos = 0;
list_for_each(iter, queue)
{
struct rt_vcpu * iter_svc = (*_get_q_elem)(iter);
if ( svc->cur_deadline <= iter_svc->cur_deadline )
break;
pos++;
}
list_add_tail(elem, iter);
return !pos;
}
/*
* Insert svc with budget in RunQ according to EDF:
* vcpus with smaller deadlines go first.
* Insert svc without budget in DepletedQ unsorted;
*/
static void
__runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
{
struct rt_private *prv = rt_priv(ops);
struct list_head *runq = rt_runq(ops);
ASSERT( spin_is_locked(&prv->lock) );
ASSERT( !__vcpu_on_q(svc) );
/* add svc to runq if svc still has budget */
if ( svc->cur_budget > 0 )
deadline_queue_insert(&__q_elem, svc, &svc->q_elem, runq);
else
{
list_add(&svc->q_elem, &prv->depletedq);
ASSERT( __vcpu_on_replq(svc) );
}
}
/*
* Insert svc into the replenishment event list
* in replenishment time order.
* vcpus that need to be replenished earlier go first.
* The timer may be re-programmed if svc is inserted
* at the front of the event list.
*/
static void
__replq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
{
struct list_head *replq = rt_replq(ops);
struct rt_private *prv = rt_priv(ops);
struct timer *repl_timer = prv->repl_timer;
int set;
ASSERT( !__vcpu_on_replq(svc) );
set = deadline_queue_insert(&__replq_elem, svc, &svc->replq_elem, replq);
if( set )
set_timer(repl_timer,svc->cur_deadline);
}
/*
* Init/Free related code
*/
static int
rt_init(struct scheduler *ops)
{
struct rt_private *prv = xzalloc(struct rt_private);
printk("Initializing RTDS scheduler\n"
"WARNING: This is experimental software in development.\n"
"Use at your own risk.\n");
if ( prv == NULL )
return -ENOMEM;
ASSERT( _cpumask_scratch == NULL || nr_rt_ops > 0 );
if ( !_cpumask_scratch )
{
_cpumask_scratch = xmalloc_array(cpumask_var_t, nr_cpu_ids);
if ( !_cpumask_scratch )
goto no_mem;
}
nr_rt_ops++;
spin_lock_init(&prv->lock);
INIT_LIST_HEAD(&prv->sdom);
INIT_LIST_HEAD(&prv->runq);
INIT_LIST_HEAD(&prv->depletedq);
INIT_LIST_HEAD(&prv->replq);
cpumask_clear(&prv->tickled);
ops->sched_data = prv;
/*
* The timer initialization will happen later when
* the first pcpu is added to this pool in alloc_pdata
*/
prv->repl_timer = NULL;
return 0;
no_mem:
xfree(prv);
return -ENOMEM;
}
static void
rt_deinit(const struct scheduler *ops)
{
struct rt_private *prv = rt_priv(ops);
ASSERT( _cpumask_scratch && nr_rt_ops > 0 );
if ( (--nr_rt_ops) == 0 )
{
xfree(_cpumask_scratch);
_cpumask_scratch = NULL;
}
kill_timer(prv->repl_timer);
xfree(prv->repl_timer);
xfree(prv);
}
/*
* Point per_cpu spinlock to the global system lock;
* All cpus have the same global system lock
*/
static void *
rt_alloc_pdata(const struct scheduler *ops, int cpu)
{
struct rt_private *prv = rt_priv(ops);
unsigned long flags;
spin_lock_irqsave(&prv->lock, flags);
per_cpu(schedule_data, cpu).schedule_lock = &prv->lock;
spin_unlock_irqrestore(&prv->lock, flags);
if ( !alloc_cpumask_var(&_cpumask_scratch[cpu]) )
return NULL;
if( prv->repl_timer == NULL )
{
/* allocate the timer on the first cpu of this pool */
prv->repl_timer = xzalloc(struct timer);
if(prv->repl_timer == NULL )
return NULL;
init_timer(prv->repl_timer, repl_handler, (void *)ops, cpu);
}
/* 1 indicates alloc. succeed in schedule.c */
return (void *)1;
}
static void
rt_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
{
struct rt_private *prv = rt_priv(ops);
struct schedule_data *sd = &per_cpu(schedule_data, cpu);
unsigned long flags;
spin_lock_irqsave(&prv->lock, flags);
/* Move spinlock back to the default lock */
ASSERT(sd->schedule_lock == &prv->lock);
ASSERT(!spin_is_locked(&sd->_lock));
sd->schedule_lock = &sd->_lock;
spin_unlock_irqrestore(&prv->lock, flags);
free_cpumask_var(_cpumask_scratch[cpu]);
}
static void *
rt_alloc_domdata(const struct scheduler *ops, struct domain *dom)
{
unsigned long flags;
struct rt_dom *sdom;
struct rt_private * prv = rt_priv(ops);
sdom = xzalloc(struct rt_dom);
if ( sdom == NULL )
return NULL;
INIT_LIST_HEAD(&sdom->sdom_elem);
sdom->dom = dom;
/* spinlock here to insert the dom */
spin_lock_irqsave(&prv->lock, flags);
list_add_tail(&sdom->sdom_elem, &(prv->sdom));
spin_unlock_irqrestore(&prv->lock, flags);
return sdom;
}
static void
rt_free_domdata(const struct scheduler *ops, void *data)
{
unsigned long flags;
struct rt_dom *sdom = data;
struct rt_private *prv = rt_priv(ops);
spin_lock_irqsave(&prv->lock, flags);
list_del_init(&sdom->sdom_elem);
spin_unlock_irqrestore(&prv->lock, flags);
xfree(data);
}
static int
rt_dom_init(const struct scheduler *ops, struct domain *dom)
{
struct rt_dom *sdom;
/* IDLE Domain does not link on rt_private */
if ( is_idle_domain(dom) )
return 0;
sdom = rt_alloc_domdata(ops, dom);
if ( sdom == NULL )
return -ENOMEM;
dom->sched_priv = sdom;
return 0;
}
static void
rt_dom_destroy(const struct scheduler *ops, struct domain *dom)
{
rt_free_domdata(ops, rt_dom(dom));
}
static void *
rt_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
{
struct rt_vcpu *svc;
/* Allocate per-VCPU info */
svc = xzalloc(struct rt_vcpu);
if ( svc == NULL )
return NULL;
INIT_LIST_HEAD(&svc->q_elem);
INIT_LIST_HEAD(&svc->replq_elem);
svc->flags = 0U;
svc->sdom = dd;
svc->vcpu = vc;
svc->last_start = 0;
svc->period = RTDS_DEFAULT_PERIOD;
if ( !is_idle_vcpu(vc) )
svc->budget = RTDS_DEFAULT_BUDGET;
SCHED_STAT_CRANK(vcpu_alloc);
return svc;
}
static void
rt_free_vdata(const struct scheduler *ops, void *priv)
{
struct rt_vcpu *svc = priv;
xfree(svc);
}
/*
* It is called in sched_move_domain() and sched_init_vcpu
* in schedule.c,
* when moving a domain to a new cpupool.
* It inserts the vcpus of the moving domain into the scheduler's RunQ
* in the destination cpupool.
*/
static void
rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
{
struct rt_vcpu *svc = rt_vcpu(vc);
s_time_t now = NOW();
spinlock_t *lock;
BUG_ON( is_idle_vcpu(vc) );
lock = vcpu_schedule_lock_irq(vc);
if ( now >= svc->cur_deadline )
rt_update_deadline(now, svc);
if ( !__vcpu_on_q(svc) && vcpu_runnable(vc) && !vc->is_running )
{
__runq_insert(ops, svc);
printk("vcpu%d inserted to run queue\n",vc->vcpu_id);
}
vcpu_schedule_unlock_irq(lock, vc);
SCHED_STAT_CRANK(vcpu_insert);
}
/*
* Remove rt_vcpu svc from the old scheduler in source cpupool.
*/
static void
rt_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
{
struct rt_vcpu * const svc = rt_vcpu(vc);
struct rt_dom * const sdom = svc->sdom;
spinlock_t *lock;
SCHED_STAT_CRANK(vcpu_remove);
BUG_ON( sdom == NULL );
lock = vcpu_schedule_lock_irq(vc);
if ( __vcpu_on_q(svc) )
__q_remove(svc);
if( __vcpu_on_replq(svc) )
__replq_remove(ops,svc);
vcpu_schedule_unlock_irq(lock, vc);
}
/*
* Pick a valid CPU for the vcpu vc
* A valid CPU of a vcpu is the intersection of the vcpu's affinity
* and available cpus
*/
static int
rt_cpu_pick(const struct scheduler *ops, struct vcpu *vc)
{
cpumask_t cpus;
cpumask_t *online;
int cpu;
online = cpupool_domain_cpumask(vc->domain);
cpumask_and(&cpus, online, vc->cpu_hard_affinity);
cpu = cpumask_test_cpu(vc->processor, &cpus)
? vc->processor
: cpumask_cycle(vc->processor, &cpus);
ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) );
return cpu;
}
/*
* Burn budget in nanosecond granularity
*/
static void
burn_budget(const struct scheduler *ops, struct rt_vcpu *svc, s_time_t now)
{
s_time_t delta;
/* don't burn budget for idle VCPU */
if ( is_idle_vcpu(svc->vcpu) )
return;
/* burn at nanoseconds level */
delta = now - svc->last_start;
/*
* delta < 0 only happens in nested virtualization;
* TODO: how should we handle delta < 0 in a better way?
*/
if ( delta < 0 )
{
printk("%s, ATTENTION: now is behind last_start! delta=%"PRI_stime"\n",
__func__, delta);
svc->last_start = now;
return;
}
svc->cur_budget -= delta;
if ( svc->cur_budget < 0 )
svc->cur_budget = 0;
/* TRACE */
{
struct {
unsigned dom:16, vcpu:16;
unsigned cur_budget_lo;
unsigned cur_budget_hi;
int delta;
} d;
d.dom = svc->vcpu->domain->domain_id;
d.vcpu = svc->vcpu->vcpu_id;
d.cur_budget_lo = (unsigned) svc->cur_budget;
d.cur_budget_hi = (unsigned) (svc->cur_budget >> 32);
d.delta = delta;
trace_var(TRC_RTDS_BUDGET_BURN, 1,
sizeof(d),
(unsigned char *) &d);
}
}
/*
* RunQ is sorted. Pick first one within cpumask. If no one, return NULL
* lock is grabbed before calling this function
*/
static struct rt_vcpu *
__runq_pick(const struct scheduler *ops, const cpumask_t *mask)
{
struct list_head *runq = rt_runq(ops);
struct list_head *iter;
struct rt_vcpu *svc = NULL;
struct rt_vcpu *iter_svc = NULL;
cpumask_t cpu_common;
cpumask_t *online;
list_for_each(iter, runq)
{
iter_svc = __q_elem(iter);
/* mask cpu_hard_affinity & cpupool & mask */
online = cpupool_domain_cpumask(iter_svc->vcpu->domain);
cpumask_and(&cpu_common, online, iter_svc->vcpu->cpu_hard_affinity);
cpumask_and(&cpu_common, mask, &cpu_common);
if ( cpumask_empty(&cpu_common) )
continue;
ASSERT( iter_svc->cur_budget > 0 );
svc = iter_svc;
break;
}
/* TRACE */
{
if( svc != NULL )
{
struct {
unsigned dom:16, vcpu:16;
unsigned cur_deadline_lo, cur_deadline_hi;
unsigned cur_budget_lo, cur_budget_hi;
} d;
d.dom = svc->vcpu->domain->domain_id;
d.vcpu = svc->vcpu->vcpu_id;
d.cur_deadline_lo = (unsigned) svc->cur_deadline;
d.cur_deadline_hi = (unsigned) (svc->cur_deadline >> 32);
d.cur_budget_lo = (unsigned) svc->cur_budget;
d.cur_budget_hi = (unsigned) (svc->cur_budget >> 32);
trace_var(TRC_RTDS_RUNQ_PICK, 1,
sizeof(d),
(unsigned char *) &d);
}
else
trace_var(TRC_RTDS_RUNQ_PICK, 1, 0, NULL);
}
return svc;
}
/*
* schedule function for rt scheduler.
* The lock is already grabbed in schedule.c, no need to lock here
*/
static struct task_slice
rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_scheduled)
{
const int cpu = smp_processor_id();
struct rt_private *prv = rt_priv(ops);
struct rt_vcpu *const scurr = rt_vcpu(current);
struct rt_vcpu *snext = NULL;
struct task_slice ret = { .migrated = 0 };
/* clear the tickled bit now that we've been scheduled */
cpumask_clear_cpu(cpu, &prv->tickled);
/* burn_budget would return for IDLE VCPU */
burn_budget(ops, scurr, now);
if ( tasklet_work_scheduled )
{
snext = rt_vcpu(idle_vcpu[cpu]);
}
else
{
snext = __runq_pick(ops, cpumask_of(cpu));
if ( snext == NULL )
snext = rt_vcpu(idle_vcpu[cpu]);
/* if scurr has higher priority and budget, still pick scurr */
if ( !is_idle_vcpu(current) &&
vcpu_runnable(current) &&
scurr->cur_budget > 0 &&
( is_idle_vcpu(snext->vcpu) ||
scurr->cur_deadline <= snext->cur_deadline ) )
snext = scurr;
}
if ( snext != scurr &&
!is_idle_vcpu(current) &&
vcpu_runnable(current) )
set_bit(__RTDS_delayed_runq_add, &scurr->flags);
snext->last_start = now;
ret.time = -1; /* if an idle vcpu is picked */
if ( !is_idle_vcpu(snext->vcpu) )
{
if ( snext != scurr )
{
__q_remove(snext);
set_bit(__RTDS_scheduled, &snext->flags);
}
if ( snext->vcpu->processor != cpu )
{
snext->vcpu->processor = cpu;
ret.migrated = 1;
}
ret.time = snext->budget; /* invoke the scheduler next time */
}
ret.task = snext->vcpu;
if( is_idle_vcpu(snext->vcpu))
{
printk("cpu%d picked idle\n",cpu);
}
else
printk("cpu%d picked vcpu%d\n",cpu, snext->vcpu->vcpu_id);
/* TRACE */
{
struct {
unsigned dom:16,vcpu:16;
unsigned cur_deadline_lo, cur_deadline_hi;
unsigned cur_budget_lo, cur_budget_hi;
} d;
d.dom = snext->vcpu->domain->domain_id;
d.vcpu = snext->vcpu->vcpu_id;
d.cur_deadline_lo = (unsigned) snext->cur_deadline;
d.cur_deadline_hi = (unsigned) (snext->cur_deadline >> 32);
d.cur_budget_lo = (unsigned) snext->cur_budget;
d.cur_budget_hi = (unsigned) (snext->cur_budget >> 32);
trace_var(TRC_RTDS_SCHED_TASKLET, 1,
sizeof(d),
(unsigned char *)&d);
}
return ret;
}
/*
* Remove VCPU from RunQ
* The lock is already grabbed in schedule.c, no need to lock here
*/
static void
rt_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
{
struct rt_vcpu * const svc = rt_vcpu(vc);
BUG_ON( is_idle_vcpu(vc) );
SCHED_STAT_CRANK(vcpu_sleep);
if ( curr_on_cpu(vc->processor) == vc )
{
cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
printk("vcpu%d sleeps on cpu\n",vc->vcpu_id);
}
else if ( __vcpu_on_q(svc) )
{
printk("vcpu%d sleeps on queue\n",vc->vcpu_id);
__q_remove(svc);
__replq_remove(ops, svc);
}
else if ( svc->flags & RTDS_delayed_runq_add )
{
clear_bit(__RTDS_delayed_runq_add, &svc->flags);
printk("vcpu%d sleeps before context switch finished\n",vc->vcpu_id);
}
}
/*
* Pick a cpu where to run a vcpu,
* possibly kicking out the vcpu running there
* Called by wake() and context_saved()
* We have a running candidate here, the kick logic is:
* Among all the cpus that are within the cpu affinity
* 1) if the new->cpu is idle, kick it. This could benefit cache hit
* 2) if there are any idle vcpu, kick it.
* 3) now all pcpus are busy;
* among all the running vcpus, pick lowest priority one
* if snext has higher priority, kick it.
*
* TODO:
* 1) what if these two vcpus belong to the same domain?
* replace a vcpu belonging to the same domain introduces more overhead
*
* lock is grabbed before calling this function
*/
static void
runq_tickle(const struct scheduler *ops, struct rt_vcpu *new)
{
struct rt_private *prv = rt_priv(ops);
struct rt_vcpu *latest_deadline_vcpu = NULL; /* lowest priority */
struct rt_vcpu *iter_svc;
struct vcpu *iter_vc;
int cpu = 0, cpu_to_tickle = 0;
cpumask_t not_tickled;
cpumask_t *online;
if ( new == NULL || is_idle_vcpu(new->vcpu) )
return;
online = cpupool_domain_cpumask(new->vcpu->domain);
cpumask_and(¬_tickled, online, new->vcpu->cpu_hard_affinity);
cpumask_andnot(¬_tickled, ¬_tickled, &prv->tickled);
/* 1) if new's previous cpu is idle, kick it for cache benefit */
if ( is_idle_vcpu(curr_on_cpu(new->vcpu->processor)) )
{
cpu_to_tickle = new->vcpu->processor;
goto out;
}
/* 2) if there are any idle pcpu, kick it */
/* The same loop also finds the one with the lowest priority */
for_each_cpu(cpu, &not_tickled)
{
iter_vc = curr_on_cpu(cpu);
if ( is_idle_vcpu(iter_vc) )
{
cpu_to_tickle = cpu;
goto out;
}
iter_svc = rt_vcpu(iter_vc);
if ( latest_deadline_vcpu == NULL ||
iter_svc->cur_deadline > latest_deadline_vcpu->cur_deadline )
latest_deadline_vcpu = iter_svc;
}
/* 3) candidate has higher priority, kick out the lowest-priority vcpu */
if ( latest_deadline_vcpu != NULL &&
new->cur_deadline < latest_deadline_vcpu->cur_deadline )
{
cpu_to_tickle = latest_deadline_vcpu->vcpu->processor;
goto out;
}
/* didn't tickle any cpu */
SCHED_STAT_CRANK(tickle_idlers_none);
return;
out:
/* TRACE */
{
struct {
unsigned cpu:16, pad:16;
} d;
d.cpu = cpu_to_tickle;
d.pad = 0;
trace_var(TRC_RTDS_TICKLE, 0,
sizeof(d),
(unsigned char *)&d);
}
cpumask_set_cpu(cpu_to_tickle, &prv->tickled);
SCHED_STAT_CRANK(tickle_idlers_some);
cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ);
return;
}
/*
* Should always wake up runnable vcpu, put it back to RunQ.
* Check priority to raise interrupt
* The lock is already grabbed in schedule.c, no need to lock here
* TODO: what if these two vcpus belong to the same domain?
*/
static void
rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
{
struct rt_vcpu * const svc = rt_vcpu(vc);
s_time_t now = NOW();
BUG_ON( is_idle_vcpu(vc) );
if ( unlikely(curr_on_cpu(vc->processor) == vc) )
{
printk("vcpu%d wakes up on cpu\n",vc->vcpu_id);
SCHED_STAT_CRANK(vcpu_wake_running);
return;
}
/* on RunQ/DepletedQ, just update info is ok */
if ( unlikely(__vcpu_on_q(svc)) )
{
printk("vcpu%d wakes up on queue\n",vc->vcpu_id);
SCHED_STAT_CRANK(vcpu_wake_onrunq);
return;
}
printk("vcpu%d wakes up nowhere\n",vc->vcpu_id);
if ( likely(vcpu_runnable(vc)) )
SCHED_STAT_CRANK(vcpu_wake_runnable);
else
SCHED_STAT_CRANK(vcpu_wake_not_runnable);
/*
* If a deadline passed while svc was asleep/blocked, we need new
* scheduling parameters (a new deadline and full budget), and
* also a new replenishment event
*/
if ( now >= svc->cur_deadline)
{
rt_update_deadline(now, svc);
__replq_remove(ops, svc);
}
// if( !__vcpu_on_replq(svc) )
__replq_insert(ops, svc);
/* If context hasn't been saved for this vcpu yet, we can't put it on
* the Runqueue/DepletedQ. Instead, we set a flag so that it will be
* put on the Runqueue/DepletedQ after the context has been saved.
*/
if ( unlikely(svc->flags & RTDS_scheduled) )
{
set_bit(__RTDS_delayed_runq_add, &svc->flags);
return;
}
/* insert svc to runq/depletedq because svc is not in queue now */
__runq_insert(ops, svc);
runq_tickle(ops, svc);
}
/*
* scurr has finished context switch, insert it back to the RunQ,
* and then pick the highest priority vcpu from runq to run
*/
static void
rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
{
struct rt_vcpu *svc = rt_vcpu(vc);
spinlock_t *lock = vcpu_schedule_lock_irq(vc);
clear_bit(__RTDS_scheduled, &svc->flags);
/* do not insert the idle vcpu into the runq */
if ( is_idle_vcpu(vc) )
goto out;
if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
likely(vcpu_runnable(vc)) )
{
printk("vcpu%d context saved and added back to queue\n",vc->vcpu_id);
__runq_insert(ops, svc);
runq_tickle(ops, svc);
}
else
{
__replq_remove(ops, svc);
printk("vcpu%d context saved not runnable\n",vc->vcpu_id);
}
out:
vcpu_schedule_unlock_irq(lock, vc);
}
/*
* set/get each vcpu info of each domain
*/
static int
rt_dom_cntl(
const struct scheduler *ops,
struct domain *d,
struct xen_domctl_scheduler_op *op)
{
struct rt_private *prv = rt_priv(ops);
struct rt_vcpu *svc;
struct vcpu *v;
unsigned long flags;
int rc = 0;
switch ( op->cmd )
{
case XEN_DOMCTL_SCHEDOP_getinfo:
if ( d->max_vcpus > 0 )
{
spin_lock_irqsave(&prv->lock, flags);
svc = rt_vcpu(d->vcpu[0]);
op->u.rtds.period = svc->period / MICROSECS(1);
op->u.rtds.budget = svc->budget / MICROSECS(1);
spin_unlock_irqrestore(&prv->lock, flags);
}
else
{
/* If we don't have vcpus yet, let's just return the defaults. */
op->u.rtds.period = RTDS_DEFAULT_PERIOD;
op->u.rtds.budget = RTDS_DEFAULT_BUDGET;
}
break;
case XEN_DOMCTL_SCHEDOP_putinfo:
if ( op->u.rtds.period == 0 || op->u.rtds.budget == 0 )
{
rc = -EINVAL;
break;
}
spin_lock_irqsave(&prv->lock, flags);
for_each_vcpu ( d, v )
{
svc = rt_vcpu(v);
svc->period = MICROSECS(op->u.rtds.period); /* transfer to nanosec */
svc->budget = MICROSECS(op->u.rtds.budget);
}
spin_unlock_irqrestore(&prv->lock, flags);
break;
}
return rc;
}
/*
* The replenishment timer handler picks vcpus
* from the replq and does the actual replenishment
*/
static void repl_handler(void *data)
{
unsigned long flags;
s_time_t now = NOW();
struct scheduler *ops = data;
struct rt_private *prv = rt_priv(ops);
struct list_head *replq = rt_replq(ops);
struct timer *repl_timer = prv->repl_timer;
struct list_head *iter, *tmp;
struct rt_vcpu *svc = NULL;
spin_lock_irqsave(&prv->lock, flags);
stop_timer(repl_timer);
list_for_each_safe(iter, tmp, replq)
{
svc = __replq_elem(iter);
if ( now < svc->cur_deadline )
break;
rt_update_deadline(now, svc);
/*
* when the replenishment happens
* svc is either on a pcpu or on
* runq/depletedq
*/
if( __vcpu_on_q(svc) )
{
/* put back to runq */
__q_remove(svc);
__runq_insert(ops, svc);
}
/*
* tickle regardless of where it is,
* because a running vcpu could have
* a later deadline than others after
* replenishment
*/
runq_tickle(ops, svc);
/*
* update replenishment event queue
* without reprogramming the timer
*/
deadline_queue_remove(&svc->replq_elem);
deadline_queue_insert(&__replq_elem, svc, &svc->replq_elem, replq);
}
/*
* use the vcpu that's on the top
* or else don't program the timer
*/
if( !list_empty(replq) )
set_timer(repl_timer, __replq_elem(replq->next)->cur_deadline);
spin_unlock_irqrestore(&prv->lock, flags);
}
static struct rt_private _rt_priv;
static const struct scheduler sched_rtds_def = {
.name = "SMP RTDS Scheduler",
.opt_name = "rtds",
.sched_id = XEN_SCHEDULER_RTDS,
.sched_data = &_rt_priv,
.dump_cpu_state = rt_dump_pcpu,
.dump_settings = rt_dump,
.init = rt_init,
.deinit = rt_deinit,
.alloc_pdata = rt_alloc_pdata,
.free_pdata = rt_free_pdata,
.alloc_domdata = rt_alloc_domdata,
.free_domdata = rt_free_domdata,
.init_domain = rt_dom_init,
.destroy_domain = rt_dom_destroy,
.alloc_vdata = rt_alloc_vdata,
.free_vdata = rt_free_vdata,
.insert_vcpu = rt_vcpu_insert,
.remove_vcpu = rt_vcpu_remove,
.adjust = rt_dom_cntl,
.pick_cpu = rt_cpu_pick,
.do_schedule = rt_schedule,
.sleep = rt_vcpu_sleep,
.wake = rt_vcpu_wake,
.context_saved = rt_context_saved,
};
REGISTER_SCHEDULER(sched_rtds_def);
* Re: [PATCH v6][RFC]xen: sched: convert RTDS from time to event driven model
2016-02-26 18:33 ` Chen, Tianyang
@ 2016-03-04 16:34 ` Dario Faggioli
0 siblings, 0 replies; 8+ messages in thread
From: Dario Faggioli @ 2016-03-04 16:34 UTC (permalink / raw)
To: Chen, Tianyang; +Cc: xen-devel, george.dunlap, Dagaen Golomb, Meng Xu
[-- Attachment #1.1.1: Type: text/plain, Size: 4389 bytes --]
On Fri, 2016-02-26 at 13:33 -0500, Chen, Tianyang wrote:
> Attached.
>
Hey,
here I am... sorry it took a bit.
I've had a look, and I've been able to come up with some code that I at
least do not dislike too much! ;-P
Have a look at the attached patch (you should apply it on top of the
sched_rt.c file that you sent me in attachment).
About what was happening...
> On 2016-02-26 13:09, Dario Faggioli wrote:
> > On Fri, 2016-02-26 at 12:28 -0500, Tianyang Chen wrote:
> > >
> > > I added debug prints in all these functions and noticed that it
> > > could
> > > be
> > > caused by racing between spurious wakeups and context switching.
> > >
It's not really a spurious wakeup, it's a regular wakeup happening
immediately after the vcpu blocked, when we still haven't been able to
execute rt_context_saved().
It's not really a race, and in fact we have the _RTDS_scheduled and
_RTDS_delayed_runq_add flags that are meant to deal with exactly these
situations.
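In code terms, the wake-side half of that handshake is (condensed from
your sched_rt.c, just to fix ideas; nothing new here):

    /*
     * rt_vcpu_wake(): the vcpu has left the pcpu but rt_context_saved()
     * has not run yet, so don't queue it anywhere; just mark it so that
     * rt_context_saved() will add it back to the runq/depletedq.
     */
    if ( unlikely(svc->flags & RTDS_scheduled) )
    {
        set_bit(__RTDS_delayed_runq_add, &svc->flags);
        return;
    }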
In fact (I added some more debug printk):
> > > (XEN) vcpu1 wakes up nowhere
> > > (XEN) cpu1 picked vcpu1 *** vcpu1 is on a cpu
> > > (XEN) cpu1 picked idle *** vcpu1 is waiting to be context
> > > switched
> > > (XEN) vcpu2 wakes up nowhere
> > > (XEN) cpu0 picked vcpu0
> > > (XEN) cpu2 picked vcpu2
> > > (XEN) cpu0 picked vcpu0
> > > (XEN) cpu0 picked vcpu0
> > > (XEN) d0 attempted to change d0v2's CR4 flags 00000620 ->
> > > 00040660
> > > (XEN) cpu0 picked vcpu0
> > > (XEN) vcpu2 sleeps on cpu
> > > (XEN) vcpu1 wakes up nowhere *** vcpu1 wakes up without
> > > sleep?
> > >
> > > (XEN) Assertion '!__vcpu_on_replq(svc)' failed at sched_rt.c:526
> > > (XEN) ----[ Xen-4.7-unstable x86_64 debug=y Tainted: C ]---
> > > -
>
(XEN) [ 56.113897] vcpu13 wakes up nowhere
(XEN) [ 56.113906] cpu4 picked vcpu13
(XEN) [ 56.113962] vcpu13 blocks
(XEN) [ 56.113965] cpu4 picked idle
(XEN) [ 56.113969] vcpu13 unblocks
(XEN) [ 56.113972] vcpu13 wakes up nowhere
(XEN) [ 56.113980] vcpu13 woken while still in scheduling tail
(XEN) [ 56.113985] vcpu13 context saved and added back to queue
(XEN) [ 56.113992] cpu4 picked vcpu13
So, as you can see, at 56.113962 vcpu13 blocks (note, _blocks_ !=
_sleeps_), and cpu4 goes idle. Ideally, rt_context_saved() would run
and remove the replenishment event of vcpu13 from the replenishment
queue.
But, at 56.113969, vcpu13 wakes up already, before rt_context_saved()
had a chance to run (which happens at 56.113985). It is not a spurious
wakeup, as the vcpu was actually blocked and is being unblocked.
Since rt_context_saved() hasn't run yet, the replenishment event is
still in the queue, and hence any ASSERT asking for
!__vcpu_on_replq(vcpu13) is doomed to make Xen explode! :-/
That does not happen in the execution trace above, because that was
collected with my patch applied, where I either leave the replenishment
event alone, if it is still valid (i.e., no deadline miss happened during
the blocked period) or, if a new one needs to be enqueued, I dequeue
the old one first.
The end result is not particularly pretty, but I am up for doing even
worse, for the sake of keeping things like this:
ASSERT( !__vcpu_on_replq(svc) );
at the beginning of replq_insert() (and its appropriate counterpart at
the beginning of replq_remove()).
In fact, I consider them really really helpful, when reasoning and
trying to figure out how the code works... There is nothing that I hate
more than an 'enqueue' function for which you don't know if it is ok to
call it when the entity being enqueued is in the queue already (and
what actually happens if it is).
These ASSERTs greatly help, from that point of view, clearly stating
that, _no_, in this case it's absolutely not right to call the enqueue
function if the event is in the queue already. :-)
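Just to make that concrete, the replq_reinsert() helper hinted at in the
XXX note of the attached patch could look something like the sketch below.
It is built only out of the deadline_queue_* helpers and __replq_elem()
already in your sched_rt.c; this is just an illustration of the idea, not
what the attached patch actually does:

    /*
     * Sketch: svc is already on the replenishment queue, but its
     * deadline has changed; move it to its new position and only
     * reprogram the timer if the head of the queue changes.
     */
    static void
    replq_reinsert(const struct scheduler *ops, struct rt_vcpu *svc)
    {
        struct list_head *replq = rt_replq(ops);
        struct rt_private *prv = rt_priv(ops);
        int was_first, is_first;

        ASSERT( __vcpu_on_replq(svc) );

        /* Is svc the event the timer is currently armed for? */
        was_first = (replq->next == &svc->replq_elem);

        deadline_queue_remove(&svc->replq_elem);
        is_first = deadline_queue_insert(&__replq_elem, svc,
                                         &svc->replq_elem, replq);

        /*
         * (Re)arm the timer only if the front of the queue changed:
         * either svc left the front (arm for the new front element),
         * or it moved to the front (arm for svc's new deadline).
         */
        if ( was_first || is_first )
            set_timer(prv->repl_timer,
                      __replq_elem(replq->next)->cur_deadline);
    }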
Hope I made myself clear.
I gave some testing to the attached patch, and it seems to work to me,
but I'd appreciate if you could do more of that.
Regards,
Dario
--
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
[-- Attachment #1.1.2: sched_rt_wakeup_schedtail.patch --]
[-- Type: text/x-patch, Size: 2990 bytes --]
--- /home/dario/Desktop/sched_rt.c 2016-03-04 15:47:14.901787122 +0100
+++ xen/common/sched_rt.c 2016-03-04 16:55:06.746641424 +0100
@@ -439,6 +439,8 @@
struct list_head *replq = rt_replq(ops);
struct timer* repl_timer = prv->repl_timer;
+ ASSERT( __vcpu_on_replq(svc) );
+
/*
* Disarm the timer before re-programming it.
* It doesn't matter if the vcpu to be removed
@@ -580,7 +582,7 @@
}
static void
-rt_deinit(const struct scheduler *ops)
+rt_deinit(struct scheduler *ops)
{
struct rt_private *prv = rt_priv(ops);
@@ -1153,6 +1155,7 @@
{
struct rt_vcpu * const svc = rt_vcpu(vc);
s_time_t now = NOW();
+ bool_t missed;
BUG_ON( is_idle_vcpu(vc) );
@@ -1181,28 +1184,42 @@
/*
* If a deadline passed while svc was asleep/blocked, we need new
- * scheduling parameters ( a new deadline and full budget), and
- * also a new replenishment event
+ * scheduling parameters (a new deadline and full budget).
*/
- if ( now >= svc->cur_deadline)
- {
+ if ( (missed = now >= svc->cur_deadline) )
rt_update_deadline(now, svc);
- __replq_remove(ops, svc);
- }
-
- // if( !__vcpu_on_replq(svc) )
- __replq_insert(ops, svc);
- /* If context hasn't been saved for this vcpu yet, we can't put it on
- * the Runqueue/DepletedQ. Instead, we set a flag so that it will be
- * put on the Runqueue/DepletedQ after the context has been saved.
+ /*
+ * If context hasn't been saved for this vcpu yet, we can't put it on
+ * the run-queue/depleted-queue. Instead, we set the appropriate flag,
+ * and the vcpu will be put back on queue after the context has been saved
+ * (in rt_context_save()).
*/
if ( unlikely(svc->flags & RTDS_scheduled) )
{
+ printk("vcpu%d woken while still in scheduling tail\n", vc->vcpu_id);
set_bit(__RTDS_delayed_runq_add, &svc->flags);
+ /*
+ * The vcpu is waking up already, and we didn't even have the time to
+ * remove its next replenishment event from the replenishment queue
+ * when it blocked! No big deal. If we did not miss the deadline in
+ * the meantime, let's just leave it there. If we did, let's remove it
+ * and queue a new one (to occur at our new deadline).
+ */
+ if ( missed )
+ {
+ // XXX You may want to implement something like replq_reinsert(),
+ // for dealing with this case, and calling that one from here
+ // (of course!). I'd do that by merging _remove and _insert
+ // and killing the duplicated bits.
+ __replq_remove(ops, svc);
+ __replq_insert(ops, svc);
+ }
return;
}
+ /* Replenishment event got cancelled when we blocked. Add it back. */
+ __replq_insert(ops, svc);
/* insert svc to runq/depletedq because svc is not in queue now */
__runq_insert(ops, svc);
[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]
end of thread
Thread overview: 8+ messages
2016-02-25 20:05 [PATCH v6][RFC]xen: sched: convert RTDS from time to event driven model Tianyang Chen
2016-02-25 23:31 ` Dario Faggioli
2016-02-26 5:15 ` Tianyang Chen
2016-02-26 9:11 ` Dario Faggioli
2016-02-26 17:28 ` Tianyang Chen
2016-02-26 18:09 ` Dario Faggioli
2016-02-26 18:33 ` Chen, Tianyang
2016-03-04 16:34 ` Dario Faggioli