Re: [PATCH v11 17/17]add two new ioctls for mp device.

All of lore.kernel.org
 help / color / mirror / Atom feed

From: "Michael S. Tsirkin" <mst@redhat.com>
To: xiaohui.xin@intel.com
Cc: netdev@vger.kernel.org, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, mingo@elte.hu, davem@davemloft.net,
	herbert@gondor.hengli.com.au, jdike@linux.intel.com
Subject: Re: [PATCH v11 17/17]add two new ioctls for mp device.
Date: Tue, 28 Sep 2010 11:48:00 +0200	[thread overview]
Message-ID: <20100928094800.GG12472@redhat.com> (raw)
In-Reply-To: <b74412a16bc32fe2550461f25156e5b2563c5a2c.1285385607.git.xiaohui.xin@intel.com>

On Sat, Sep 25, 2010 at 12:27:35PM +0800, xiaohui.xin@intel.com wrote:
> From: Xin Xiaohui <xiaohui.xin@intel.com>
> 
> The patch adds two ioctls for the mp device.
> One is for userspace to query how much memory must be locked to make the
> mp device run smoothly. The other is for userspace to set how much memory
> it really wants locked.
> 
> ---
>  drivers/vhost/mpassthru.c |  103 +++++++++++++++++++++++----------------------
>  include/linux/mpassthru.h |    2 +
>  2 files changed, 54 insertions(+), 51 deletions(-)
> 
> diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
> index d86d94c..e3a0199 100644
> --- a/drivers/vhost/mpassthru.c
> +++ b/drivers/vhost/mpassthru.c
> @@ -67,6 +67,8 @@ static int debug;
>  #define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
>  #define COPY_HDR_LEN   (L1_CACHE_BYTES < 64 ? 64 : L1_CACHE_BYTES)
>  
> +#define DEFAULT_NEED	((8192*2*2)*4096)
> +
>  struct frag {
>  	u16     offset;
>  	u16     size;
> @@ -110,7 +112,8 @@ struct page_ctor {
>  	int			rq_len;
>  	spinlock_t		read_lock;

try documenting what fields are protected by which lock, btw.

>  	/* record the locked pages */
> -	int			lock_pages;
> +	int			locked_pages;
> +	int			cur_pages;
>  	struct rlimit		o_rlim;

unused now?

>  	struct net_device	*dev;
>  	struct mpassthru_port	port;

This structure name should start with mp_ to avoid namespace pollution.
Also ctor implies a constructor function: see pgtable_page_ctor - it is
not a very good name for a structure.


> @@ -122,6 +125,7 @@ struct mp_struct {
>  	struct net_device       *dev;
>  	struct page_ctor	*ctor;
>  	struct socket           socket;
> +	struct task_struct	*user;
>  
>  #ifdef MPASSTHRU_DEBUG
>  	int debug;
> @@ -231,7 +235,8 @@ static int page_ctor_attach(struct mp_struct *mp)
>  	ctor->port.ctor = page_ctor;
>  	ctor->port.sock = &mp->socket;
>  	ctor->port.hash = mp_lookup;
> -	ctor->lock_pages = 0;
> +	ctor->locked_pages = 0;
> +	ctor->cur_pages = 0;
>  
>  	/* locked by mp_mutex */
>  	dev->mp_port = &ctor->port;
> @@ -264,37 +269,6 @@ struct page_info *info_dequeue(struct page_ctor *ctor)
>  	return info;
>  }
>  
> -static int set_memlock_rlimit(struct page_ctor *ctor, int resource,
> -			      unsigned long cur, unsigned long max)
> -{
> -	struct rlimit new_rlim, *old_rlim;
> -	int retval;
> -
> -	if (resource != RLIMIT_MEMLOCK)
> -		return -EINVAL;
> -	new_rlim.rlim_cur = cur;
> -	new_rlim.rlim_max = max;
> -
> -	old_rlim = current->signal->rlim + resource;
> -
> -	/* remember the old rlimit value when backend enabled */
> -	ctor->o_rlim.rlim_cur = old_rlim->rlim_cur;
> -	ctor->o_rlim.rlim_max = old_rlim->rlim_max;
> -
> -	if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
> -			!capable(CAP_SYS_RESOURCE))
> -		return -EPERM;
> -
> -	retval = security_task_setrlimit(resource, &new_rlim);
> -	if (retval)
> -		return retval;
> -
> -	task_lock(current->group_leader);
> -	*old_rlim = new_rlim;
> -	task_unlock(current->group_leader);
> -	return 0;
> -}
> -
>  static void relinquish_resource(struct page_ctor *ctor)
>  {
>  	if (!(ctor->dev->flags & IFF_UP) &&
> @@ -323,7 +297,7 @@ static void mp_ki_dtor(struct kiocb *iocb)
>  	} else
>  		info->ctor->wq_len--;
>  	/* Decrement the number of locked pages */
> -	info->ctor->lock_pages -= info->pnum;
> +	info->ctor->cur_pages -= info->pnum;
>  	kmem_cache_free(ext_page_info_cache, info);
>  	relinquish_resource(info->ctor);
>  
> @@ -357,6 +331,7 @@ static int page_ctor_detach(struct mp_struct *mp)
>  {
>  	struct page_ctor *ctor;
>  	struct page_info *info;
> +	struct task_struct *tsk = mp->user;
>  	int i;
>  
>  	/* locked by mp_mutex */
> @@ -375,9 +350,9 @@ static int page_ctor_detach(struct mp_struct *mp)
>  
>  	relinquish_resource(ctor);
>  
> -	set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
> -			   ctor->o_rlim.rlim_cur,
> -			   ctor->o_rlim.rlim_max);
> +	down_write(&tsk->mm->mmap_sem);
> +	tsk->mm->locked_vm -= ctor->locked_pages;
> +	up_write(&tsk->mm->mmap_sem);
>  
>  	/* locked by mp_mutex */
>  	ctor->dev->mp_port = NULL;
> @@ -514,7 +489,6 @@ static struct page_info *mp_hash_delete(struct page_ctor *ctor,
>  {
>  	key_mp_t key = mp_hash(info->pages[0], HASH_BUCKETS);
>  	struct page_info *tmp = NULL;
> -	int i;
>  
>  	tmp = ctor->hash_table[key];
>  	while (tmp) {
> @@ -565,14 +539,11 @@ static struct page_info *alloc_page_info(struct page_ctor *ctor,
>  	int rc;
>  	int i, j, n = 0;
>  	int len;
> -	unsigned long base, lock_limit;
> +	unsigned long base;
>  	struct page_info *info = NULL;
>  
> -	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
> -	lock_limit >>= PAGE_SHIFT;
> -
> -	if (ctor->lock_pages + count > lock_limit && npages) {
> -		printk(KERN_INFO "exceed the locked memory rlimit.");
> +	if (ctor->cur_pages + count > ctor->locked_pages) {
> +		printk(KERN_INFO "Exceed memory lock rlimt.");
>  		return NULL;
>  	}
>  
> @@ -634,7 +605,7 @@ static struct page_info *alloc_page_info(struct page_ctor *ctor,
>  			mp_hash_insert(ctor, info->pages[i], info);
>  	}
>  	/* increment the number of locked pages */
> -	ctor->lock_pages += j;
> +	ctor->cur_pages += j;
>  	return info;
>  
>  failed:
> @@ -1006,12 +977,6 @@ proceed:
>  		count--;
>  	}
>  
> -	if (!ctor->lock_pages || !ctor->rq_len) {
> -		set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
> -				iocb->ki_user_data * 4096 * 2,
> -				iocb->ki_user_data * 4096 * 2);
> -	}
> -
>  	/* Translate address to kernel */
>  	info = alloc_page_info(ctor, iocb, iov, count, frags, npages, 0);
>  	if (!info)
> @@ -1115,8 +1080,10 @@ static long mp_chr_ioctl(struct file *file, unsigned int cmd,
>  	struct mp_struct *mp;
>  	struct net_device *dev;
>  	void __user* argp = (void __user *)arg;
> +	unsigned long  __user *limitp = argp;
>  	struct ifreq ifr;
>  	struct sock *sk;
> +	unsigned long limit, locked, lock_limit;
>  	int ret;
>  
>  	ret = -EINVAL;
> @@ -1152,6 +1119,7 @@ static long mp_chr_ioctl(struct file *file, unsigned int cmd,
>  			goto err_dev_put;
>  		}
>  		mp->dev = dev;
> +		mp->user = current;

This is unsafe: task might go away but fd will still exist.
You should get the mm like vhost does, then you can keep
a reference until release.

>  		ret = -ENOMEM;
>  
>  		sk = sk_alloc(mfile->net, AF_UNSPEC, GFP_KERNEL, &mp_proto);
> @@ -1193,6 +1161,39 @@ err_dev_put:
>  		ret = do_unbind(mfile);
>  		break;
>  
> +	case MPASSTHRU_SET_MEM_LOCKED:
> +		ret = copy_from_user(&limit, limitp, sizeof limit);
> +		if (ret < 0)
> +			return ret;
> +
> +		mp = mp_get(mfile);
> +		if (!mp)
> +			return -ENODEV;
> +
> +		limit = PAGE_ALIGN(limit) >> PAGE_SHIFT;
> +		down_write(&current->mm->mmap_sem);

So here, current might be different from mp->user:
many processes might share an fd. The result
will be that you will subtract locked_vm from A but add it to B.

The right thing to do IMO is to store mm on SET_MEM_LOCKED.
Also be careful about multiple callers etc.


> +		locked = limit + current->mm->locked_vm;
> +		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +
> +		if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
> +			up_write(&current->mm->mmap_sem);
> +			mp_put(mfile);
> +			return -ENOMEM;
> +		}
> +		current->mm->locked_vm = locked;
> +		up_write(&current->mm->mmap_sem);
> +
> +		mutex_lock(&mp_mutex);
> +		mp->ctor->locked_pages = limit;

What if a process calls SET_MEM_LOCKED multiple times
(or many processes do)? What if it is called when
some pages are already locked?
I suggested one way to handle that in one of the previous messages,
but you must think through these scenarios when you invent a new
ioctl.


> +		mutex_unlock(&mp_mutex);
> +
> +		mp_put(mfile);
> +		return 0;
> +
> +	case MPASSTHRU_GET_MEM_LOCKED_NEED:
> +		limit = DEFAULT_NEED;
> +		return copy_to_user(limitp, &limit, sizeof limit);
> +
>  	default:
>  		break;
>  	}
> diff --git a/include/linux/mpassthru.h b/include/linux/mpassthru.h
> index ba8f320..083e9f7 100644
> --- a/include/linux/mpassthru.h
> +++ b/include/linux/mpassthru.h
> @@ -7,6 +7,8 @@
>  /* ioctl defines */
>  #define MPASSTHRU_BINDDEV      _IOW('M', 213, int)
>  #define MPASSTHRU_UNBINDDEV    _IO('M', 214)
> +#define MPASSTHRU_SET_MEM_LOCKED	_IOW('M', 215, unsigned long)
> +#define MPASSTHRU_GET_MEM_LOCKED_NEED	_IOR('M', 216, unsigned long)
>  
>  #ifdef __KERNEL__
>  #if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
> -- 
> 1.7.3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

next prev parent reply	other threads:[~2010-09-28  9:54 UTC|newest]

Thread overview: 39+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-09-25  4:27 [PATCH v11 00/17] Provide a zero-copy method on KVM virtio-net xiaohui.xin
2010-09-25  4:27 ` [PATCH v11 01/17] Add a new structure for skb buffer from external xiaohui.xin
2010-09-25  4:27   ` [PATCH v11 02/17] Add a new struct for device to manipulate external buffer xiaohui.xin
2010-09-27 13:41     ` Ben Hutchings
2010-09-25  4:27   ` [PATCH v11 03/17] Add a ndo_mp_port_prep pointer to net_device_ops xiaohui.xin
2010-09-27 13:42     ` Ben Hutchings
2010-09-29 13:41       ` Xin, Xiaohui
2010-09-25  4:27   ` [PATCH v11 04/17]Add a function make external buffer owner to query capability xiaohui.xin
2010-09-27 13:45     ` Ben Hutchings
2010-09-25  4:27   ` [PATCH v11 05/17] Add a function to indicate if device use external buffer xiaohui.xin
2010-09-25  4:27   ` [PATCH v11 06/17]Use callback to deal with skb_release_data() specially xiaohui.xin
2010-09-25  4:27   ` [PATCH v11 07/17] Modify netdev_alloc_page() to get external buffer xiaohui.xin
2010-09-25  4:27   ` [PATCH v11 08/17] Modify netdev_free_page() to release " xiaohui.xin
2010-09-25  4:27   ` [PATCH v11 09/17] Don't do skb recycle, if device use " xiaohui.xin
2010-09-25  4:27   ` [PATCH v11 10/17] Add a hook to intercept external buffers from NIC driver xiaohui.xin
2010-09-25  4:27   ` [PATCH v11 11/17] Add header file for mp device xiaohui.xin
2010-09-27 13:55     ` Ben Hutchings
2010-09-25  4:27   ` [PATCH v11 12/17] Add a kconfig entry and make entry " xiaohui.xin
2010-09-27 13:56     ` Ben Hutchings
2010-09-29 13:39       ` Xin, Xiaohui
2010-09-25  4:27   ` [PATCH v11 13/17] Add mp(mediate passthru) device xiaohui.xin
2010-09-27 21:23     ` Ben Hutchings
2010-09-28 13:06       ` Michael S. Tsirkin
2010-09-28 14:39         ` Arnd Bergmann
2010-09-28 14:43           ` Michael S. Tsirkin
2010-09-28 15:18             ` Arnd Bergmann
2010-09-28 18:48               ` Sridhar Samudrala
2010-09-29 13:38       ` Xin, Xiaohui
2010-09-25  4:27   ` [PATCH v11 14/17]Provides multiple submits and asynchronous notifications xiaohui.xin
2010-09-25  4:27   ` [PATCH v11 15/17]An example how to modifiy NIC driver to use napi_gro_frags() interface xiaohui.xin
2010-09-25  4:27   ` [PATCH v11 16/17]An example how to alloc user buffer based on " xiaohui.xin
2010-09-25  4:27   ` [PATCH v11 17/17]add two new ioctls for mp device xiaohui.xin
2010-09-27 21:36     ` Ben Hutchings
2010-09-28 13:09       ` Michael S. Tsirkin
2010-09-28  9:48     ` Michael S. Tsirkin [this message]
2010-09-29  9:36       ` xiaohui.xin
2010-09-26 17:01 ` [PATCH v11 00/17] Provide a zero-copy method on KVM virtio-net Michael S. Tsirkin
2010-09-27  0:44   ` Xin, Xiaohui
2010-09-28  1:25     ` Xin, Xiaohui

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100928094800.GG12472@redhat.com \
    --to=mst@redhat.com \
    --cc=davem@davemloft.net \
    --cc=herbert@gondor.hengli.com.au \
    --cc=jdike@linux.intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=netdev@vger.kernel.org \
    --cc=xiaohui.xin@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.