On 02/08/14 16:31, Meng Xu wrote:

> +/*
> + * Init/Free related code
> + */
> +static int
> +rt_init(struct scheduler *ops)
> +{
> +    struct rt_private *prv = xzalloc(struct rt_private);
> +
> +    if ( prv == NULL )
> +        return -ENOMEM;

Newline in here.

​Changed, thanks!

> +    ops->sched_data = prv;

Is it safe to set ops->sched_data with a half constructed rt_private?  I
suspect this wants to be the very last (non-debug) action in this function.

​I think it should be fine. I double checked the _init function in sched_credit2.c. It has the similar operation: first assign prv to ops->sched_data and then set the value of prv. 
Of course, I can switch ​them. But I'm not sure if that really matter. :-)

It means that anyone else who peaks at ops->sched_data will see a partially constructed rt_private object.  This can be a source of subtle bugs, so it is better to code appropriately, rather than relying on an assumption that noone would go peaking before this function returns.


> +static void *
> +rt_alloc_domdata(const struct scheduler *ops, struct domain *dom)
> +{
> +    unsigned long flags;
> +    struct rt_dom *sdom;
> +    struct rt_private * prv = RT_PRIV(ops);
> +
> +    printtime();
> +    printk("dom=%d\n", dom->domain_id);
> +
> +    sdom = xzalloc(struct rt_dom);
> +    if ( sdom == NULL )
> +    {
> +        printk("%s, xzalloc failed\n", __func__);
> +        return NULL;
> +    }
> +
> +    INIT_LIST_HEAD(&sdom->vcpu);
> +    INIT_LIST_HEAD(&sdom->sdom_elem);
> +    sdom->dom = dom;
> +
> +    /* spinlock here to insert the dom */
> +    spin_lock_irqsave(&prv->lock, flags);
> +    list_add_tail(&sdom->sdom_elem, &(prv->sdom));
> +    spin_unlock_irqrestore(&prv->lock, flags);
> +
> +    return (void *)sdom;

Bogus cast.

​I think we have to cast it to void * ​because the definition of this function asks the return type to be void *. In addition, the credit2 scheduler also did the same cast in  this _alloc_domdata function. So I guess this should be fine?


In C, all pointers are implicitly castable to/from void*.  This is not the case in C++.  (which suggests to me that George learnt C++ before C, or is at least more familiar with C++?)

The act of using type punning means that you are trying to do something which the C type system doesn't like.  This in turn requires more thought from people reading the code to work out whether it is actually safe or not.  As a result, we strive for as few type overrides as possible.

Please don't fall into the trap of assuming the credit2 code, or indeed any of the other code in Xen, is perfect.  None of it is, although most of it is good.

 
> +/*
> + * set/get each vcpu info of each domain
> + */
> +static int
> +rt_dom_cntl(
> +    const struct scheduler *ops,
> +    struct domain *d,
> +    struct xen_domctl_scheduler_op *op)
> +{
> +    xen_domctl_sched_rt_params_t *local_sched;
> +    struct rt_dom * const sdom = RT_DOM(d);
> +    struct list_head *iter;
> +    int vcpu_index = 0;
> +    int rc = -EINVAL;
> +
> +    switch ( op->cmd )
> +    {
> +    case XEN_DOMCTL_SCHEDOP_getnumvcpus:
> +        op->u.rt.nr_vcpus = 0;
> +        list_for_each( iter, &sdom->vcpu )
> +            vcpu_index++;
> +        op->u.rt.nr_vcpus = vcpu_index;
> +        rc = 0;
> +        break;
> +    case XEN_DOMCTL_SCHEDOP_getinfo:
> +        /* for debug use, whenever adjust Dom0 parameter, do global dump */
> +        if ( d->domain_id == 0 )
> +            rt_dump(ops);
> +
> +        op->u.rt.nr_vcpus = 0;
> +        list_for_each( iter, &sdom->vcpu )
> +            vcpu_index++;
> +        op->u.rt.nr_vcpus = vcpu_index;
> +        local_sched = xzalloc_array(xen_domctl_sched_rt_params_t, vcpu_index);
> +        vcpu_index = 0;
> +        list_for_each( iter, &sdom->vcpu )
> +        {
> +            struct rt_vcpu * svc = list_entry(iter, struct rt_vcpu, sdom_elem);
> +
> +            local_sched[vcpu_index].budget = svc->budget;
> +            local_sched[vcpu_index].period = svc->period;
> +            local_sched[vcpu_index].index = vcpu_index;
> +            vcpu_index++;
> +        }
> +        copy_to_guest(op->u.rt.vcpu, local_sched, vcpu_index);

This will clobber guest heap if vcpu_index is greater than the allocated
space.
 
​This is a good point! I will pass the size of the array to the kernel and check that the number of the array's elements is not smaller than the number of vcpus.
 
 You also unconditionally leak local_sched, but there is no need
for an allocation in the first place.

​I will add the xfree() after copy_to_guest(). 
I have a question: how can I avoid allocating the local_sched?

unsigned i = 0;

list_for_each( iter, &sdom->vcpu )
{
    xen_domctl_sched_rt_params_t local_sched;

    if ( i >= op->u.rt.nr_vcpus )
        break;

    /* Fill local_sched */

    if ( copy_to_guest_offset(op->u.rt.vcpu, i, &local_sched, 1) )
    {
        ret = -EFAULT;
        break;
    }
}



 
> +        rc = 0;
> +        break;
> +    case XEN_DOMCTL_SCHEDOP_putinfo:
> +        list_for_each( iter, &sdom->vcpu )
> +        {
> +            struct rt_vcpu * svc = list_entry(iter, struct rt_vcpu, sdom_elem);
> +
> +            /* adjust per VCPU parameter */
> +            if ( op->u.rt.vcpu_index == svc->vcpu->vcpu_id )
> +            {
> +                vcpu_index = op->u.rt.vcpu_index;
> +
> +                if ( vcpu_index < 0 )
> +                    printk("XEN_DOMCTL_SCHEDOP_putinfo: vcpu_index=%d\n",
> +                            vcpu_index);
> +                else
> +                    printk("XEN_DOMCTL_SCHEDOP_putinfo: "
> +                            "vcpu_index=%d, period=%"PRId64", budget=%"PRId64"\n",
> +                            vcpu_index, op->u.rt.period, op->u.rt.budget);
> +
> +                svc->period = op->u.rt.period;
> +                svc->budget = op->u.rt.budget;
> +
> +                break;
> +            }
> +        }
> +        rc = 0;
> +        break;
> +    }
> +
> +    return rc;
> +}
> +
> +static struct rt_private _rt_priv;
> +
> +const struct scheduler sched_rt_def = {
> +    .name           = "SMP RT Scheduler",
> +    .opt_name       = "rt",

Should these names reflect RT_DS as opposed to simply RT?

​DS (Deferrable Server) is just one kind of server mechanisms for global Earliest Deadline First scheduling. ​We can add other server mechanisms in the same file sched_rt.c to extend this Real Time scheduler. But we don't want to change/affect user's interface when we add more server mechanisms.
The .opt_name will affect the user's interface when user choose the rt scheduler, If we change it to rt_ds, we will have to change it to rt again when we have more server mechanisms implemented. Then users will have to change their configuration (i.e., the command line value sched=) to the new name rt. Because this could potentially affect users' interface, I think it's better to use rt here. What do you think?


Reusing the rt.c infrastructure is fine, but something other than DS will necessarily be a different scheduler as far as the tools are concerned.  The user should expect to have to change their parameters if they want to use a new scheduler.  The other point of view is that the user should expect their scheduler not to change under their feet if they continue using the same parameters as before.

 

> +    .sched_id       = XEN_SCHEDULER_RT_DS,
> +    .sched_data     = &_rt_priv,
> +
> +    .dump_cpu_state = rt_dump_pcpu,
> +    .dump_settings  = rt_dump,
> +    .init           = rt_init,
> +    .deinit         = rt_deinit,
> +    .alloc_pdata    = rt_alloc_pdata,
> +    .free_pdata     = rt_free_pdata,
> +    .alloc_domdata  = rt_alloc_domdata,
> +    .free_domdata   = rt_free_domdata,
> +    .init_domain    = rt_dom_init,
> +    .destroy_domain = rt_dom_destroy,
> +    .alloc_vdata    = rt_alloc_vdata,
> +    .free_vdata     = rt_free_vdata,
> +    .insert_vcpu    = rt_vcpu_insert,
> +    .remove_vcpu    = rt_vcpu_remove,
> +
> +    .adjust         = rt_dom_cntl,
> +
> +    .pick_cpu       = rt_cpu_pick,
> +    .do_schedule    = rt_schedule,
> +    .sleep          = rt_vcpu_sleep,
> +    .wake           = rt_vcpu_wake,
> +    .context_saved  = rt_context_saved,
> +};
> diff --git a/xen/common/schedule.c b/xen/common/schedule.c
> index e9eb0bc..f2df400 100644
> --- a/xen/common/schedule.c
> +++ b/xen/common/schedule.c
> @@ -68,6 +68,7 @@ static const struct scheduler *schedulers[] = {
>      &sched_sedf_def,
>      &sched_credit_def,
>      &sched_credit2_def,
> +    &sched_rt_def,
>      &sched_arinc653_def,
>  };
>
> @@ -1092,7 +1093,8 @@ long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
>
>      if ( (op->sched_id != DOM2OP(d)->sched_id) ||
>           ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
> -          (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
> +          (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo) &&
> +          (op->cmd != XEN_DOMCTL_SCHEDOP_getnumvcpus)) )
>          return -EINVAL;
>
>      /* NB: the pluggable scheduler code needs to take care
> diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
> index 5b11bbf..8d4b973 100644
> --- a/xen/include/public/domctl.h
> +++ b/xen/include/public/domctl.h
> @@ -339,6 +339,18 @@ struct xen_domctl_max_vcpus {
>  typedef struct xen_domctl_max_vcpus xen_domctl_max_vcpus_t;
>  DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_vcpus_t);
>
> +/*
> + * This structure is used to pass to rt scheduler from a
> + * privileged domain to Xen
> + */
> +struct xen_domctl_sched_rt_params {
> +    /* get vcpus' info */
> +    int64_t period; /* s_time_t type */
> +    int64_t budget;
> +    int     index;

Index is clearly an unsigned quantity.  For alignment and compatibility,
uint64_t would make the most sense.  Alternatively, uint32_t and an
explicit uint32_t pad field.

​Agree. I have changed it to uint16_t because the vcpu_index is uint16_t in the tool stack.

Thank you very much for your comments and suggestions! Looking forward to your reply! ;-)

Best,

Meng​

If you make its size a multiple of 8, then Xen doesn't need a compat shim for converting hypercalls from 32bit guests.

~Andrew