From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1753937AbYLPUzs (ORCPT ); Tue, 16 Dec 2008 15:55:48 -0500 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1753598AbYLPUzX (ORCPT ); Tue, 16 Dec 2008 15:55:23 -0500 Received: from smtp1.linux-foundation.org ([140.211.169.13]:40098 "EHLO smtp1.linux-foundation.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752689AbYLPUzU (ORCPT ); Tue, 16 Dec 2008 15:55:20 -0500 Date: Tue, 16 Dec 2008 12:54:25 -0800 From: Andrew Morton To: Jeremy Fitzhardinge Cc: mingo@elte.hu, viro@ftp.linux.org.uk, linux-kernel@vger.kernel.org, xen-devel@lists.xensource.com, alex.zeffertt@eu.citrix.com, Ian.Campbell@citrix.com Subject: Re: [PATCH] xen: add xenfs to allow usermode <-> Xen interaction Message-Id: <20081216125425.30524946.akpm@linux-foundation.org> In-Reply-To: <49480F39.2010102@goop.org> References: <49480F39.2010102@goop.org> X-Mailer: Sylpheed version 2.2.4 (GTK+ 2.8.20; i486-pc-linux-gnu) Mime-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Tue, 16 Dec 2008 12:27:37 -0800 Jeremy Fitzhardinge wrote: > [ Reviewers: This is in drivers/xen to keep it close to the code it is and will be using. Would people > prefer to see it in fs/xenfs? -J ] Well it would be nice to keep filesystems under ./fs, but `grep -rl register_filesystem .' says we screwed that pooch. > From: Alex Zeffertt > > The xenfs filesystem exports various interfaces to usermode. Initially > this exports a file to allow usermode to interact with xenbus/xenstore. > > Traditionally this appeared in /proc/xen. Rather than extending procfs, > this patch adds a backward-compat mountpoint on /proc/xen, and provides > a xenfs filesystem which can be mounted there. > > > ... > > --- /dev/null > +++ b/drivers/xen/xenfs/super.c > @@ -0,0 +1,65 @@ > +/* > + * xenfs.c - a filesystem for passing info between the a domain and > + * the hypervisor. > + * > + * 2008-10-07 Alex Zeffertt Replaced /proc/xen/xenbus with xenfs filesystem > + * and /proc/xen compatibility mount point. > + * Turned xenfs into a loadable module. > + */ > + > +#include > +#include > +#include > +#include > + > +#include "xenfs.h" > + > +#include > + > +#define XENFS_MAGIC 0xabba1974 Move to include/linux/magic.h, please. > +MODULE_DESCRIPTION("Xen filesystem"); > +MODULE_LICENSE("GPL"); MODULE_AUTHOR()? ./MAINTAINERS? > +static int xenfs_fill_super(struct super_block *sb, void *data, int silent) > +{ > + static struct tree_descr xenfs_files[] = { > + [2] = {"xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR}, > + {""}, > + }; > + > + return simple_fill_super(sb, XENFS_MAGIC, xenfs_files); > +} > + > > ... > > +struct xenbus_transaction_holder { > + struct list_head list; > + struct xenbus_transaction handle; > +}; > + > +struct read_buffer { > + struct list_head list; > + unsigned int cons; > + unsigned int len; > + char msg[]; > +}; > > +struct xenbus_file_priv { > + /* In-progress transaction. */ > + struct list_head transactions; > + > + /* Active watches. */ > + struct list_head watches; > + > + /* Partial request. */ > + unsigned int len; > + union { > + struct xsd_sockmsg msg; > + char buffer[PAGE_SIZE]; > + } u; > + > + /* Response queue. */ > + struct list_head read_buffers; > + wait_queue_head_t read_waitq; > + > + struct mutex reply_mutex; > +}; It would be useful to document the locking for the list_head's (at least). > + > +static ssize_t xenbus_file_read(struct file *filp, > + char __user *ubuf, > + size_t len, loff_t *ppos) > +{ > + struct xenbus_file_priv *u = filp->private_data; > + struct read_buffer *rb; > + int i, ret; > + > + mutex_lock(&u->reply_mutex); > + while (list_empty(&u->read_buffers)) { > + mutex_unlock(&u->reply_mutex); > + ret = wait_event_interruptible(u->read_waitq, > + !list_empty(&u->read_buffers)); > + if (ret) > + return ret; > + mutex_lock(&u->reply_mutex); > + } > + > + rb = list_entry(u->read_buffers.next, struct read_buffer, list); > + for (i = 0; i < len;) { > + ret = put_user(rb->msg[rb->cons], ubuf + i); So mmap_sem nests inside ->reply_mutex. Has this all been carefully tested with lockdep? > + if (ret) { > + mutex_unlock(&u->reply_mutex); > + return ret; > + } > + i++; > + rb->cons++; > + if (rb->cons == rb->len) { > + list_del(&rb->list); > + kfree(rb); > + if (list_empty(&u->read_buffers)) > + break; > + rb = list_entry(u->read_buffers.next, > + struct read_buffer, list); > + } > + } This code will misbehave if it ever receives a read_buffer with len==0. > + mutex_unlock(&u->reply_mutex); > + > + return i; > +} > + > +static int queue_reply(struct list_head *list, char *data, unsigned int len) > +{ > + struct read_buffer *rb; > + > + if (len == 0) > + return 0; That should fix it. > + rb = kmalloc(sizeof(*rb) + len, GFP_KERNEL); > + if (rb == NULL) > + return -ENOMEM; > + > + rb->cons = 0; > + rb->len = len; > + > + memcpy(rb->msg, data, len); > + > + list_add_tail(&rb->list, list); So the caller of this function must hold ->reply_mutex. You thought that was secret but I found it out! > + return 0; > +} > + > +/* Free all the read_buffer s on a list. > + * Caller must have sole reference to list. > + */ > +static void queue_cleanup(struct list_head *list) > +{ > + struct read_buffer *rb; > + > + while (!list_empty(list)) { > + rb = list_entry(list->next, struct read_buffer, list); > + list_del(list->next); > + kfree(rb); > + } > +} > + > +struct watch_adapter { > + struct list_head list; locking is? > + struct xenbus_watch watch; > + struct xenbus_file_priv *dev_data; > + char *token; > +}; > + > > ... > > +static LIST_HEAD(watch_list); > + > +static ssize_t xenbus_file_write(struct file *filp, > + const char __user *ubuf, > + size_t len, loff_t *ppos) > +{ > + struct xenbus_file_priv *u = filp->private_data; > + struct xenbus_transaction_holder *trans = NULL; > + uint32_t msg_type; > + void *reply; > + char *path, *token; > + int err, rc = len; > + int ret; > + LIST_HEAD(staging_q); > + > + if ((len + u->len) > sizeof(u->u.buffer)) { > + rc = -EINVAL; > + goto out; > + } Now what's this doing? One would expect a large write to get chunked up into smaller writes by the kernel. This code is poorly documented. > + if (copy_from_user(u->u.buffer + u->len, ubuf, len) != 0) { > + rc = -EFAULT; > + goto out; > + } > + > + u->len += len; > + if ((u->len < sizeof(u->u.msg)) || > + (u->len < (sizeof(u->u.msg) + u->u.msg.len))) > + return rc; > + > + msg_type = u->u.msg.type; > + > + switch (msg_type) { > + case XS_TRANSACTION_START: > + case XS_TRANSACTION_END: > + case XS_DIRECTORY: > + case XS_READ: > + case XS_GET_PERMS: > + case XS_RELEASE: > + case XS_GET_DOMAIN_PATH: > + case XS_WRITE: > + case XS_MKDIR: > + case XS_RM: > + case XS_SET_PERMS: > + if (msg_type == XS_TRANSACTION_START) { > + trans = kmalloc(sizeof(*trans), GFP_KERNEL); > + if (!trans) { > + rc = -ENOMEM; > + goto out; > + } > + } > + > + reply = xenbus_dev_request_and_reply(&u->u.msg); > + if (IS_ERR(reply)) { > + kfree(trans); > + rc = PTR_ERR(reply); > + goto out; > + } > + > + if (msg_type == XS_TRANSACTION_START) { > + trans->handle.id = simple_strtoul(reply, NULL, 0); > + list_add(&trans->list, &u->transactions); > + } else if (msg_type == XS_TRANSACTION_END) { > + list_for_each_entry(trans, &u->transactions, list) > + if (trans->handle.id == u->u.msg.tx_id) > + break; > + BUG_ON(&trans->list == &u->transactions); > + list_del(&trans->list); > + kfree(trans); > + } > + mutex_lock(&u->reply_mutex); > + ret = queue_reply(&staging_q, > + (char *)&u->u.msg, sizeof(u->u.msg)); > + if (!ret) > + ret = queue_reply(&staging_q, > + (char *)reply, u->u.msg.len); > + if (!ret) { > + list_splice_tail(&staging_q, &u->read_buffers); > + wake_up(&u->read_waitq); > + } else { > + queue_cleanup(&staging_q); > + rc = ret; > + } > + mutex_unlock(&u->reply_mutex); > + kfree(reply); > + break; > + > + case XS_WATCH: > + case XS_UNWATCH: { > + struct watch_adapter *watch, *tmp_watch; > + static const char XS_RESP[] = "OK"; > + struct xsd_sockmsg hdr; > + > + path = u->u.buffer + sizeof(u->u.msg); > + token = memchr(path, 0, u->u.msg.len); > + if (token == NULL) { > + rc = -EILSEQ; > + goto out; > + } > + token++; > + > + if (msg_type == XS_WATCH) { > + watch = kzalloc(sizeof(*watch), GFP_KERNEL); > + if (watch == NULL) { > + rc = -ENOMEM; > + goto out; > + } > + watch->watch.node = > + kmalloc(strlen(path)+1, GFP_KERNEL); > + if (watch->watch.node == NULL) { > + kfree(watch); > + rc = -ENOMEM; > + goto out; > + } > + strcpy((char *)watch->watch.node, path); > + watch->watch.callback = watch_fired; > + watch->token = > + kmalloc(strlen(token)+1, GFP_KERNEL); > + if (watch->token == NULL) { > + kfree(watch->watch.node); > + kfree(watch); > + rc = -ENOMEM; > + goto out; > + } > + strcpy(watch->token, token); > + watch->dev_data = u; > + > + err = register_xenbus_watch(&watch->watch); > + if (err) { > + free_watch_adapter(watch); > + rc = err; > + goto out; > + } > + > + list_add(&watch->list, &u->watches); > + } else { > + list_for_each_entry_safe(watch, tmp_watch, > + &u->watches, list) { > + if (!strcmp(watch->token, token) && > + !strcmp(watch->watch.node, path)) { > + unregister_xenbus_watch(&watch->watch); > + list_del(&watch->list); > + free_watch_adapter(watch); > + break; > + } > + } > + } > + > + hdr.type = msg_type; > + hdr.len = sizeof(XS_RESP); > + mutex_lock(&u->reply_mutex); > + ret = queue_reply(&staging_q, (char *)&hdr, sizeof(hdr)); > + if (!ret) > + ret = queue_reply(&staging_q, (char *)XS_RESP, hdr.len); > + if (!ret) { > + list_splice_tail(&staging_q, &u->read_buffers); > + wake_up(&u->read_waitq); > + } else { > + queue_cleanup(&staging_q); > + rc = ret; > + } > + mutex_unlock(&u->reply_mutex); > + break; > + } > + > + default: > + rc = -EINVAL; > + break; > + } > + > + out: > + u->len = 0; > + return rc; > +} This function implements a new kernel ABI. How are reviewers to review the design of this proposed ABI? > > ... > > +static unsigned int xenbus_file_poll(struct file *file, poll_table *wait) > +{ > + struct xenbus_file_priv *u = file->private_data; > + > + poll_wait(file, &u->read_waitq, wait); > + if (!list_empty(&u->read_buffers)) > + return POLLIN | POLLRDNORM; I wonder if we needed the lock or some open-coded barrier to safely run list_empty(). > + return 0; > +} > + > > ... >