Netdev List
 help / color / mirror / Atom feed
* [PATCH 6/6] SUNRPC: split SUNRPC PipeFS dentry and private pipe data creation
From: Stanislav Kinsbursky @ 2011-11-22 15:42 UTC (permalink / raw)
  To: Trond.Myklebust
  Cc: linux-nfs, xemul, neilb, netdev, linux-kernel, jbottomley,
	bfields, davem, devel
In-Reply-To: <20111122134514.479.9848.stgit@localhost6.localdomain6>

This patch is the final step towards removing PipeFS inode references from
kernel code other than PipeFS itself. It makes all kernel SUNRPC PipeFS users
depend on pipe private data, whose state depends on their specific operations,
etc.
This patch completes the SUNRPC PipeFS preparations and allows pipe private
data and PipeFS dentries to be created independently.
The next step will be to have SUNRPC PipeFS dentries allocated by network
namespace aware SUNRPC PipeFS routines.

Signed-off-by: Stanislav Kinsbursky <skinsbursky@parallels.com>

---
 fs/nfs/blocklayout/blocklayout.c    |   16 ++++++++--
 fs/nfs/blocklayout/blocklayout.h    |    2 +
 fs/nfs/blocklayout/blocklayoutdev.c |    2 +
 fs/nfs/blocklayout/blocklayoutdm.c  |    2 +
 fs/nfs/idmap.c                      |   28 +++++++++++++-----
 include/linux/sunrpc/rpc_pipe_fs.h  |    7 +++--
 net/sunrpc/auth_gss/auth_gss.c      |   54 +++++++++++++++++++++++------------
 net/sunrpc/rpc_pipe.c               |   54 ++++++++++++++++++++---------------
 8 files changed, 107 insertions(+), 58 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 9561c8f..c26633e 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -46,7 +46,7 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
 MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
 
-struct dentry *bl_device_pipe;
+struct rpc_pipe *bl_device_pipe;
 wait_queue_head_t bl_wq;
 
 static void print_page(struct page *page)
@@ -991,15 +991,22 @@ static int __init nfs4blocklayout_init(void)
 	if (ret)
 		goto out_remove;
 
-	bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL,
-				    &bl_upcall_ops, 0);
+	bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
 	if (IS_ERR(bl_device_pipe)) {
 		ret = PTR_ERR(bl_device_pipe);
 		goto out_remove;
 	}
+	bl_device_pipe->dentry = rpc_mkpipe_dentry(path.dentry, "blocklayout",
+						   NULL, bl_device_pipe);
+	if (IS_ERR(bl_device_pipe->dentry)) {
+		ret = PTR_ERR(bl_device_pipe->dentry);
+		goto out_destroy_pipe;
+	}
 out:
 	return ret;
 
+out_destroy_pipe:
+	rpc_destroy_pipe_data(bl_device_pipe);
 out_remove:
 	pnfs_unregister_layoutdriver(&blocklayout_type);
 	return ret;
@@ -1011,7 +1018,8 @@ static void __exit nfs4blocklayout_exit(void)
 	       __func__);
 
 	pnfs_unregister_layoutdriver(&blocklayout_type);
-	rpc_unlink(bl_device_pipe);
+	rpc_unlink(bl_device_pipe->dentry);
+	rpc_destroy_pipe_data(bl_device_pipe);
 }
 
 MODULE_ALIAS("nfs-layouttype4-3");
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index f27d827..5f30941 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -159,7 +159,7 @@ struct bl_msg_hdr {
 	u16 totallen; /* length of entire message, including hdr itself */
 };
 
-extern struct dentry *bl_device_pipe;
+extern struct rpc_pipe *bl_device_pipe;
 extern wait_queue_head_t bl_wq;
 
 #define BL_DEVICE_UMOUNT               0x0 /* Umount--delete devices */
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index 44dc348..79f4752 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -168,7 +168,7 @@ nfs4_blk_decode_device(struct nfs_server *server,
 
 	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
 	add_wait_queue(&bl_wq, &wq);
-	if (rpc_queue_upcall(RPC_I(bl_device_pipe->d_inode)->pipe, &msg) < 0) {
+	if (rpc_queue_upcall(bl_device_pipe, &msg) < 0) {
 		remove_wait_queue(&bl_wq, &wq);
 		goto out;
 	}
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
index 3c38244..631f254 100644
--- a/fs/nfs/blocklayout/blocklayoutdm.c
+++ b/fs/nfs/blocklayout/blocklayoutdm.c
@@ -66,7 +66,7 @@ static void dev_remove(dev_t dev)
 	msg.len = sizeof(bl_msg) + bl_msg.totallen;
 
 	add_wait_queue(&bl_wq, &wq);
-	if (rpc_queue_upcall(RPC_I(bl_device_pipe->d_inode)->pipe, &msg) < 0) {
+	if (rpc_queue_upcall(bl_device_pipe, &msg) < 0) {
 		remove_wait_queue(&bl_wq, &wq);
 		goto out;
 	}
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 7e3d8dd..b09a7f1 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -327,7 +327,7 @@ struct idmap_hashtable {
 };
 
 struct idmap {
-	struct dentry		*idmap_dentry;
+	struct rpc_pipe		*idmap_pipe;
 	wait_queue_head_t	idmap_wq;
 	struct idmap_msg	idmap_im;
 	struct mutex		idmap_lock;	/* Serializes upcalls */
@@ -354,6 +354,7 @@ int
 nfs_idmap_new(struct nfs_client *clp)
 {
 	struct idmap *idmap;
+	struct rpc_pipe *pipe;
 	int error;
 
 	BUG_ON(clp->cl_idmap != NULL);
@@ -362,14 +363,23 @@ nfs_idmap_new(struct nfs_client *clp)
 	if (idmap == NULL)
 		return -ENOMEM;
 
-	idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry,
-			"idmap", idmap, &idmap_upcall_ops, 0);
-	if (IS_ERR(idmap->idmap_dentry)) {
-		error = PTR_ERR(idmap->idmap_dentry);
+	pipe = rpc_mkpipe_data(&idmap_upcall_ops, 0);
+	if (IS_ERR(pipe)) {
+		error = PTR_ERR(pipe);
 		kfree(idmap);
 		return error;
 	}
 
+	if (clp->cl_rpcclient->cl_path.dentry)
+		pipe->dentry = rpc_mkpipe_dentry(clp->cl_rpcclient->cl_path.dentry,
+				"idmap", idmap, pipe);
+	if (IS_ERR(pipe->dentry)) {
+		error = PTR_ERR(pipe->dentry);
+		rpc_destroy_pipe_data(pipe);
+		kfree(idmap);
+		return error;
+	}
+	idmap->idmap_pipe = pipe;
 	mutex_init(&idmap->idmap_lock);
 	mutex_init(&idmap->idmap_im_lock);
 	init_waitqueue_head(&idmap->idmap_wq);
@@ -387,7 +397,9 @@ nfs_idmap_delete(struct nfs_client *clp)
 
 	if (!idmap)
 		return;
-	rpc_unlink(idmap->idmap_dentry);
+	if (idmap->idmap_pipe->dentry)
+		rpc_unlink(idmap->idmap_pipe->dentry);
+	rpc_destroy_pipe_data(idmap->idmap_pipe);
 	clp->cl_idmap = NULL;
 	kfree(idmap);
 }
@@ -508,7 +520,7 @@ nfs_idmap_id(struct idmap *idmap, struct idmap_hashtable *h,
 	msg.len = sizeof(*im);
 
 	add_wait_queue(&idmap->idmap_wq, &wq);
-	if (rpc_queue_upcall(RPC_I(idmap->idmap_dentry->d_inode)->pipe, &msg) < 0) {
+	if (rpc_queue_upcall(idmap->idmap_pipe, &msg) < 0) {
 		remove_wait_queue(&idmap->idmap_wq, &wq);
 		goto out;
 	}
@@ -569,7 +581,7 @@ nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h,
 
 	add_wait_queue(&idmap->idmap_wq, &wq);
 
-	if (rpc_queue_upcall(RPC_I(idmap->idmap_dentry->d_inode)->pipe, &msg) < 0) {
+	if (rpc_queue_upcall(idmap->idmap_pipe, &msg) < 0) {
 		remove_wait_queue(&idmap->idmap_wq, &wq);
 		goto out;
 	}
diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h
index ad78bea..0808ed2 100644
--- a/include/linux/sunrpc/rpc_pipe_fs.h
+++ b/include/linux/sunrpc/rpc_pipe_fs.h
@@ -34,6 +34,7 @@ struct rpc_pipe {
 	struct delayed_work queue_timeout;
 	const struct rpc_pipe_ops *ops;
 	spinlock_t lock;
+	struct dentry *dentry;
 };
 
 struct rpc_inode {
@@ -77,8 +78,10 @@ extern struct dentry *rpc_create_cache_dir(struct dentry *,
 					   struct cache_detail *);
 extern void rpc_remove_cache_dir(struct dentry *);
 
-extern struct dentry *rpc_mkpipe(struct dentry *, const char *, void *,
-				 const struct rpc_pipe_ops *, int flags);
+struct rpc_pipe *rpc_mkpipe_data(const struct rpc_pipe_ops *ops, int flags);
+void rpc_destroy_pipe_data(struct rpc_pipe *pipe);
+extern struct dentry *rpc_mkpipe_dentry(struct dentry *, const char *, void *,
+					struct rpc_pipe *);
 extern int rpc_unlink(struct dentry *);
 extern struct vfsmount *rpc_get_mount(void);
 extern void rpc_put_mount(void);
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 15fd9fe..2b25a7b 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -81,7 +81,7 @@ struct gss_auth {
 	 * mechanism (for example, "krb5") and exists for
 	 * backwards-compatibility with older gssd's.
 	 */
-	struct dentry *dentry[2];
+	struct rpc_pipe *pipe[2];
 };
 
 /* pipe_version >= 0 if and only if someone has a pipe open. */
@@ -451,7 +451,7 @@ gss_alloc_msg(struct gss_auth *gss_auth, uid_t uid, struct rpc_clnt *clnt,
 		kfree(gss_msg);
 		return ERR_PTR(vers);
 	}
-	gss_msg->pipe = RPC_I(gss_auth->dentry[vers]->d_inode)->pipe;
+	gss_msg->pipe = gss_auth->pipe[vers];
 	INIT_LIST_HEAD(&gss_msg->list);
 	rpc_init_wait_queue(&gss_msg->rpc_waitqueue, "RPCSEC_GSS upcall waitq");
 	init_waitqueue_head(&gss_msg->waitqueue);
@@ -821,21 +821,33 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
 	 * that we supported only the old pipe.  So we instead create
 	 * the new pipe first.
 	 */
-	gss_auth->dentry[1] = rpc_mkpipe(clnt->cl_path.dentry,
-					 "gssd",
-					 clnt, &gss_upcall_ops_v1,
-					 RPC_PIPE_WAIT_FOR_OPEN);
-	if (IS_ERR(gss_auth->dentry[1])) {
-		err = PTR_ERR(gss_auth->dentry[1]);
+	gss_auth->pipe[1] = rpc_mkpipe_data(&gss_upcall_ops_v1,
+					    RPC_PIPE_WAIT_FOR_OPEN);
+	if (IS_ERR(gss_auth->pipe[1])) {
+		err = PTR_ERR(gss_auth->pipe[1]);
 		goto err_put_mech;
 	}
 
-	gss_auth->dentry[0] = rpc_mkpipe(clnt->cl_path.dentry,
-					 gss_auth->mech->gm_name,
-					 clnt, &gss_upcall_ops_v0,
-					 RPC_PIPE_WAIT_FOR_OPEN);
-	if (IS_ERR(gss_auth->dentry[0])) {
-		err = PTR_ERR(gss_auth->dentry[0]);
+	gss_auth->pipe[0] = rpc_mkpipe_data(&gss_upcall_ops_v0,
+					    RPC_PIPE_WAIT_FOR_OPEN);
+	if (IS_ERR(gss_auth->pipe[0])) {
+		err = PTR_ERR(gss_auth->pipe[0]);
+		goto err_destroy_pipe_1;
+	}
+
+	gss_auth->pipe[1]->dentry = rpc_mkpipe_dentry(clnt->cl_path.dentry,
+						      "gssd",
+						      clnt, gss_auth->pipe[1]);
+	if (IS_ERR(gss_auth->pipe[1]->dentry)) {
+		err = PTR_ERR(gss_auth->pipe[1]->dentry);
+		goto err_destroy_pipe_0;
+	}
+
+	gss_auth->pipe[0]->dentry = rpc_mkpipe_dentry(clnt->cl_path.dentry,
+						      gss_auth->mech->gm_name,
+						      clnt, gss_auth->pipe[0]);
+	if (IS_ERR(gss_auth->pipe[0]->dentry)) {
+		err = PTR_ERR(gss_auth->pipe[0]->dentry);
 		goto err_unlink_pipe_1;
 	}
 	err = rpcauth_init_credcache(auth);
@@ -844,9 +856,13 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
 
 	return auth;
 err_unlink_pipe_0:
-	rpc_unlink(gss_auth->dentry[0]);
+	rpc_unlink(gss_auth->pipe[0]->dentry);
 err_unlink_pipe_1:
-	rpc_unlink(gss_auth->dentry[1]);
+	rpc_unlink(gss_auth->pipe[1]->dentry);
+err_destroy_pipe_0:
+	rpc_destroy_pipe_data(gss_auth->pipe[0]);
+err_destroy_pipe_1:
+	rpc_destroy_pipe_data(gss_auth->pipe[1]);
 err_put_mech:
 	gss_mech_put(gss_auth->mech);
 err_free:
@@ -859,8 +875,10 @@ out_dec:
 static void
 gss_free(struct gss_auth *gss_auth)
 {
-	rpc_unlink(gss_auth->dentry[1]);
-	rpc_unlink(gss_auth->dentry[0]);
+	rpc_unlink(gss_auth->pipe[0]->dentry);
+	rpc_unlink(gss_auth->pipe[1]->dentry);
+	rpc_destroy_pipe_data(gss_auth->pipe[0]);
+	rpc_destroy_pipe_data(gss_auth->pipe[1]);
 	gss_mech_put(gss_auth->mech);
 
 	kfree(gss_auth);
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 0eed975..8e59580 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -187,7 +187,6 @@ rpc_i_callback(struct rcu_head *head)
 {
 	struct inode *inode = container_of(head, struct inode, i_rcu);
 	INIT_LIST_HEAD(&inode->i_dentry);
-	kfree(RPC_I(inode)->pipe);
 	kmem_cache_free(rpc_inode_cachep, RPC_I(inode));
 }
 
@@ -556,34 +555,44 @@ init_pipe(struct rpc_pipe *pipe)
 			    rpc_timeout_upcall_queue);
 	pipe->ops = NULL;
 	spin_lock_init(&pipe->lock);
+	pipe->dentry = NULL;
+}
 
+void rpc_destroy_pipe_data(struct rpc_pipe *pipe)
+{
+	kfree(pipe);
 }
+EXPORT_SYMBOL_GPL(rpc_destroy_pipe_data);
 
-static int __rpc_mkpipe(struct inode *dir, struct dentry *dentry,
-			umode_t mode,
-			const struct file_operations *i_fop,
-			void *private,
-			const struct rpc_pipe_ops *ops,
-			int flags)
+struct rpc_pipe *rpc_mkpipe_data(const struct rpc_pipe_ops *ops, int flags)
 {
 	struct rpc_pipe *pipe;
-	struct rpc_inode *rpci;
-	int err;
 
 	pipe = kzalloc(sizeof(struct rpc_pipe), GFP_KERNEL);
 	if (!pipe)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 	init_pipe(pipe);
+	pipe->ops = ops;
+	pipe->flags = flags;
+	return pipe;
+}
+EXPORT_SYMBOL_GPL(rpc_mkpipe_data);
+
+static int __rpc_mkpipe_dentry(struct inode *dir, struct dentry *dentry,
+			       umode_t mode,
+			       const struct file_operations *i_fop,
+			       void *private,
+			       struct rpc_pipe *pipe)
+{
+	struct rpc_inode *rpci;
+	int err;
+
 	err = __rpc_create_common(dir, dentry, S_IFIFO | mode, i_fop, private);
-	if (err) {
-		kfree(pipe);
+	if (err)
 		return err;
-	}
 	rpci = RPC_I(dentry->d_inode);
 	rpci->private = private;
 	rpci->pipe = pipe;
-	rpci->pipe->flags = flags;
-	rpci->pipe->ops = ops;
 	fsnotify_create(dir, dentry);
 	return 0;
 }
@@ -800,9 +809,8 @@ static int rpc_rmdir_depopulate(struct dentry *dentry,
  * The @private argument passed here will be available to all these methods
  * from the file pointer, via RPC_I(file->f_dentry->d_inode)->private.
  */
-struct dentry *rpc_mkpipe(struct dentry *parent, const char *name,
-			  void *private, const struct rpc_pipe_ops *ops,
-			  int flags)
+struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name,
+				 void *private, struct rpc_pipe *pipe)
 {
 	struct dentry *dentry;
 	struct inode *dir = parent->d_inode;
@@ -810,9 +818,9 @@ struct dentry *rpc_mkpipe(struct dentry *parent, const char *name,
 	struct qstr q;
 	int err;
 
-	if (ops->upcall == NULL)
+	if (pipe->ops->upcall == NULL)
 		umode &= ~S_IRUGO;
-	if (ops->downcall == NULL)
+	if (pipe->ops->downcall == NULL)
 		umode &= ~S_IWUGO;
 
 	q.name = name;
@@ -823,8 +831,8 @@ struct dentry *rpc_mkpipe(struct dentry *parent, const char *name,
 	dentry = __rpc_lookup_create_exclusive(parent, &q);
 	if (IS_ERR(dentry))
 		goto out;
-	err = __rpc_mkpipe(dir, dentry, umode, &rpc_pipe_fops,
-			   private, ops, flags);
+	err = __rpc_mkpipe_dentry(dir, dentry, umode, &rpc_pipe_fops,
+				  private, pipe);
 	if (err)
 		goto out_err;
 out:
@@ -837,7 +845,7 @@ out_err:
 			err);
 	goto out;
 }
-EXPORT_SYMBOL_GPL(rpc_mkpipe);
+EXPORT_SYMBOL_GPL(rpc_mkpipe_dentry);
 
 /**
  * rpc_unlink - remove a pipe

^ permalink raw reply related

* [PATCH 5/6] SUNRPC: cleanup GSS pipes usage
From: Stanislav Kinsbursky @ 2011-11-22 15:42 UTC (permalink / raw)
  To: Trond.Myklebust
  Cc: linux-nfs, xemul, neilb, netdev, linux-kernel, jbottomley,
	bfields, davem, devel
In-Reply-To: <20111122134514.479.9848.stgit@localhost6.localdomain6>

Currently gss auth holds an RPC inode pointer, which is now redundant since it
only requires pipe operations, which take private pipe data as an argument.
Thus this code can be cleaned up and all references to the RPC inode can be
replaced with private pipe data references.

Signed-off-by: Stanislav Kinsbursky <skinsbursky@parallels.com>

---
 net/sunrpc/auth_gss/auth_gss.c |   76 ++++++++++++++++++++--------------------
 1 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 40227ef..15fd9fe 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -112,7 +112,7 @@ gss_put_ctx(struct gss_cl_ctx *ctx)
 /* gss_cred_set_ctx:
  * called by gss_upcall_callback and gss_create_upcall in order
  * to set the gss context. The actual exchange of an old context
- * and a new one is protected by the rpci->pipe->lock.
+ * and a new one is protected by the pipe->lock.
  */
 static void
 gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx)
@@ -251,7 +251,7 @@ struct gss_upcall_msg {
 	struct rpc_pipe_msg msg;
 	struct list_head list;
 	struct gss_auth *auth;
-	struct rpc_inode *inode;
+	struct rpc_pipe *pipe;
 	struct rpc_wait_queue rpc_waitqueue;
 	wait_queue_head_t waitqueue;
 	struct gss_cl_ctx *ctx;
@@ -294,10 +294,10 @@ gss_release_msg(struct gss_upcall_msg *gss_msg)
 }
 
 static struct gss_upcall_msg *
-__gss_find_upcall(struct rpc_inode *rpci, uid_t uid)
+__gss_find_upcall(struct rpc_pipe *pipe, uid_t uid)
 {
 	struct gss_upcall_msg *pos;
-	list_for_each_entry(pos, &rpci->pipe->in_downcall, list) {
+	list_for_each_entry(pos, &pipe->in_downcall, list) {
 		if (pos->uid != uid)
 			continue;
 		atomic_inc(&pos->count);
@@ -315,17 +315,17 @@ __gss_find_upcall(struct rpc_inode *rpci, uid_t uid)
 static inline struct gss_upcall_msg *
 gss_add_msg(struct gss_upcall_msg *gss_msg)
 {
-	struct rpc_inode *rpci = gss_msg->inode;
+	struct rpc_pipe *pipe = gss_msg->pipe;
 	struct gss_upcall_msg *old;
 
-	spin_lock(&rpci->pipe->lock);
-	old = __gss_find_upcall(rpci, gss_msg->uid);
+	spin_lock(&pipe->lock);
+	old = __gss_find_upcall(pipe, gss_msg->uid);
 	if (old == NULL) {
 		atomic_inc(&gss_msg->count);
-		list_add(&gss_msg->list, &rpci->pipe->in_downcall);
+		list_add(&gss_msg->list, &pipe->in_downcall);
 	} else
 		gss_msg = old;
-	spin_unlock(&rpci->pipe->lock);
+	spin_unlock(&pipe->lock);
 	return gss_msg;
 }
 
@@ -341,14 +341,14 @@ __gss_unhash_msg(struct gss_upcall_msg *gss_msg)
 static void
 gss_unhash_msg(struct gss_upcall_msg *gss_msg)
 {
-	struct rpc_inode *rpci = gss_msg->inode;
+	struct rpc_pipe *pipe = gss_msg->pipe;
 
 	if (list_empty(&gss_msg->list))
 		return;
-	spin_lock(&rpci->pipe->lock);
+	spin_lock(&pipe->lock);
 	if (!list_empty(&gss_msg->list))
 		__gss_unhash_msg(gss_msg);
-	spin_unlock(&rpci->pipe->lock);
+	spin_unlock(&pipe->lock);
 }
 
 static void
@@ -375,11 +375,11 @@ gss_upcall_callback(struct rpc_task *task)
 	struct gss_cred *gss_cred = container_of(task->tk_rqstp->rq_cred,
 			struct gss_cred, gc_base);
 	struct gss_upcall_msg *gss_msg = gss_cred->gc_upcall;
-	struct rpc_inode *rpci = gss_msg->inode;
+	struct rpc_pipe *pipe = gss_msg->pipe;
 
-	spin_lock(&rpci->pipe->lock);
+	spin_lock(&pipe->lock);
 	gss_handle_downcall_result(gss_cred, gss_msg);
-	spin_unlock(&rpci->pipe->lock);
+	spin_unlock(&pipe->lock);
 	task->tk_status = gss_msg->msg.errno;
 	gss_release_msg(gss_msg);
 }
@@ -451,7 +451,7 @@ gss_alloc_msg(struct gss_auth *gss_auth, uid_t uid, struct rpc_clnt *clnt,
 		kfree(gss_msg);
 		return ERR_PTR(vers);
 	}
-	gss_msg->inode = RPC_I(gss_auth->dentry[vers]->d_inode);
+	gss_msg->pipe = RPC_I(gss_auth->dentry[vers]->d_inode)->pipe;
 	INIT_LIST_HEAD(&gss_msg->list);
 	rpc_init_wait_queue(&gss_msg->rpc_waitqueue, "RPCSEC_GSS upcall waitq");
 	init_waitqueue_head(&gss_msg->waitqueue);
@@ -475,7 +475,7 @@ gss_setup_upcall(struct rpc_clnt *clnt, struct gss_auth *gss_auth, struct rpc_cr
 		return gss_new;
 	gss_msg = gss_add_msg(gss_new);
 	if (gss_msg == gss_new) {
-		int res = rpc_queue_upcall(gss_new->inode->pipe, &gss_new->msg);
+		int res = rpc_queue_upcall(gss_new->pipe, &gss_new->msg);
 		if (res) {
 			gss_unhash_msg(gss_new);
 			gss_msg = ERR_PTR(res);
@@ -506,7 +506,7 @@ gss_refresh_upcall(struct rpc_task *task)
 	struct gss_cred *gss_cred = container_of(cred,
 			struct gss_cred, gc_base);
 	struct gss_upcall_msg *gss_msg;
-	struct rpc_inode *rpci;
+	struct rpc_pipe *pipe;
 	int err = 0;
 
 	dprintk("RPC: %5u gss_refresh_upcall for uid %u\n", task->tk_pid,
@@ -524,8 +524,8 @@ gss_refresh_upcall(struct rpc_task *task)
 		err = PTR_ERR(gss_msg);
 		goto out;
 	}
-	rpci = gss_msg->inode;
-	spin_lock(&rpci->pipe->lock);
+	pipe = gss_msg->pipe;
+	spin_lock(&pipe->lock);
 	if (gss_cred->gc_upcall != NULL)
 		rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL);
 	else if (gss_msg->ctx == NULL && gss_msg->msg.errno >= 0) {
@@ -538,7 +538,7 @@ gss_refresh_upcall(struct rpc_task *task)
 		gss_handle_downcall_result(gss_cred, gss_msg);
 		err = gss_msg->msg.errno;
 	}
-	spin_unlock(&rpci->pipe->lock);
+	spin_unlock(&pipe->lock);
 	gss_release_msg(gss_msg);
 out:
 	dprintk("RPC: %5u gss_refresh_upcall for uid %u result %d\n",
@@ -549,7 +549,7 @@ out:
 static inline int
 gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred)
 {
-	struct rpc_inode *rpci;
+	struct rpc_pipe *pipe;
 	struct rpc_cred *cred = &gss_cred->gc_base;
 	struct gss_upcall_msg *gss_msg;
 	DEFINE_WAIT(wait);
@@ -573,14 +573,14 @@ retry:
 		err = PTR_ERR(gss_msg);
 		goto out;
 	}
-	rpci = gss_msg->inode;
+	pipe = gss_msg->pipe;
 	for (;;) {
 		prepare_to_wait(&gss_msg->waitqueue, &wait, TASK_KILLABLE);
-		spin_lock(&rpci->pipe->lock);
+		spin_lock(&pipe->lock);
 		if (gss_msg->ctx != NULL || gss_msg->msg.errno < 0) {
 			break;
 		}
-		spin_unlock(&rpci->pipe->lock);
+		spin_unlock(&pipe->lock);
 		if (fatal_signal_pending(current)) {
 			err = -ERESTARTSYS;
 			goto out_intr;
@@ -591,7 +591,7 @@ retry:
 		gss_cred_set_ctx(cred, gss_msg->ctx);
 	else
 		err = gss_msg->msg.errno;
-	spin_unlock(&rpci->pipe->lock);
+	spin_unlock(&pipe->lock);
 out_intr:
 	finish_wait(&gss_msg->waitqueue, &wait);
 	gss_release_msg(gss_msg);
@@ -629,7 +629,7 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 	const void *p, *end;
 	void *buf;
 	struct gss_upcall_msg *gss_msg;
-	struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode);
+	struct rpc_pipe *pipe = RPC_I(filp->f_dentry->d_inode)->pipe;
 	struct gss_cl_ctx *ctx;
 	uid_t uid;
 	ssize_t err = -EFBIG;
@@ -659,14 +659,14 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 
 	err = -ENOENT;
 	/* Find a matching upcall */
-	spin_lock(&rpci->pipe->lock);
-	gss_msg = __gss_find_upcall(rpci, uid);
+	spin_lock(&pipe->lock);
+	gss_msg = __gss_find_upcall(pipe, uid);
 	if (gss_msg == NULL) {
-		spin_unlock(&rpci->pipe->lock);
+		spin_unlock(&pipe->lock);
 		goto err_put_ctx;
 	}
 	list_del_init(&gss_msg->list);
-	spin_unlock(&rpci->pipe->lock);
+	spin_unlock(&pipe->lock);
 
 	p = gss_fill_context(p, end, ctx, gss_msg->auth->mech);
 	if (IS_ERR(p)) {
@@ -694,9 +694,9 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 	err = mlen;
 
 err_release_msg:
-	spin_lock(&rpci->pipe->lock);
+	spin_lock(&pipe->lock);
 	__gss_unhash_msg(gss_msg);
-	spin_unlock(&rpci->pipe->lock);
+	spin_unlock(&pipe->lock);
 	gss_release_msg(gss_msg);
 err_put_ctx:
 	gss_put_ctx(ctx);
@@ -742,23 +742,23 @@ static int gss_pipe_open_v1(struct inode *inode)
 static void
 gss_pipe_release(struct inode *inode)
 {
-	struct rpc_inode *rpci = RPC_I(inode);
+	struct rpc_pipe *pipe = RPC_I(inode)->pipe;
 	struct gss_upcall_msg *gss_msg;
 
 restart:
-	spin_lock(&rpci->pipe->lock);
-	list_for_each_entry(gss_msg, &rpci->pipe->in_downcall, list) {
+	spin_lock(&pipe->lock);
+	list_for_each_entry(gss_msg, &pipe->in_downcall, list) {
 
 		if (!list_empty(&gss_msg->msg.list))
 			continue;
 		gss_msg->msg.errno = -EPIPE;
 		atomic_inc(&gss_msg->count);
 		__gss_unhash_msg(gss_msg);
-		spin_unlock(&rpci->pipe->lock);
+		spin_unlock(&pipe->lock);
 		gss_release_msg(gss_msg);
 		goto restart;
 	}
-	spin_unlock(&rpci->pipe->lock);
+	spin_unlock(&pipe->lock);
 
 	put_pipe_version();
 }

^ permalink raw reply related

* [PATCH 4/6] SUNRPC: cleanup RPC PipeFS pipes upcall interface
From: Stanislav Kinsbursky @ 2011-11-22 15:41 UTC (permalink / raw)
  To: Trond.Myklebust
  Cc: linux-nfs, xemul, neilb, netdev, linux-kernel, jbottomley,
	bfields, davem, devel
In-Reply-To: <20111122134514.479.9848.stgit@localhost6.localdomain6>

RPC pipe upcall requires only private pipe data. Thus RPC inode
references in this code can be removed.

Signed-off-by: Stanislav Kinsbursky <skinsbursky@parallels.com>

---
 fs/nfs/blocklayout/blocklayoutdev.c |    2 +-
 fs/nfs/blocklayout/blocklayoutdm.c  |    2 +-
 fs/nfs/idmap.c                      |    4 ++--
 include/linux/sunrpc/rpc_pipe_fs.h  |    2 +-
 net/sunrpc/auth_gss/auth_gss.c      |    3 +--
 net/sunrpc/rpc_pipe.c               |    3 +--
 6 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index a83b393..44dc348 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -168,7 +168,7 @@ nfs4_blk_decode_device(struct nfs_server *server,
 
 	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
 	add_wait_queue(&bl_wq, &wq);
-	if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) {
+	if (rpc_queue_upcall(RPC_I(bl_device_pipe->d_inode)->pipe, &msg) < 0) {
 		remove_wait_queue(&bl_wq, &wq);
 		goto out;
 	}
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
index d055c75..3c38244 100644
--- a/fs/nfs/blocklayout/blocklayoutdm.c
+++ b/fs/nfs/blocklayout/blocklayoutdm.c
@@ -66,7 +66,7 @@ static void dev_remove(dev_t dev)
 	msg.len = sizeof(bl_msg) + bl_msg.totallen;
 
 	add_wait_queue(&bl_wq, &wq);
-	if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) {
+	if (rpc_queue_upcall(RPC_I(bl_device_pipe->d_inode)->pipe, &msg) < 0) {
 		remove_wait_queue(&bl_wq, &wq);
 		goto out;
 	}
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index f20801a..7e3d8dd 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -508,7 +508,7 @@ nfs_idmap_id(struct idmap *idmap, struct idmap_hashtable *h,
 	msg.len = sizeof(*im);
 
 	add_wait_queue(&idmap->idmap_wq, &wq);
-	if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) {
+	if (rpc_queue_upcall(RPC_I(idmap->idmap_dentry->d_inode)->pipe, &msg) < 0) {
 		remove_wait_queue(&idmap->idmap_wq, &wq);
 		goto out;
 	}
@@ -569,7 +569,7 @@ nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h,
 
 	add_wait_queue(&idmap->idmap_wq, &wq);
 
-	if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) {
+	if (rpc_queue_upcall(RPC_I(idmap->idmap_dentry->d_inode)->pipe, &msg) < 0) {
 		remove_wait_queue(&idmap->idmap_wq, &wq);
 		goto out;
 	}
diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h
index c2fa330..ad78bea 100644
--- a/include/linux/sunrpc/rpc_pipe_fs.h
+++ b/include/linux/sunrpc/rpc_pipe_fs.h
@@ -64,7 +64,7 @@ extern void rpc_put_sb_net(const struct net *net);
 
 extern ssize_t rpc_pipe_generic_upcall(struct file *, struct rpc_pipe_msg *,
 				       char __user *, size_t);
-extern int rpc_queue_upcall(struct inode *, struct rpc_pipe_msg *);
+extern int rpc_queue_upcall(struct rpc_pipe *, struct rpc_pipe_msg *);
 
 struct rpc_clnt;
 extern struct dentry *rpc_create_client_dir(struct dentry *, struct qstr *, struct rpc_clnt *);
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 70a7953..40227ef 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -475,8 +475,7 @@ gss_setup_upcall(struct rpc_clnt *clnt, struct gss_auth *gss_auth, struct rpc_cr
 		return gss_new;
 	gss_msg = gss_add_msg(gss_new);
 	if (gss_msg == gss_new) {
-		struct inode *inode = &gss_new->inode->vfs_inode;
-		int res = rpc_queue_upcall(inode, &gss_new->msg);
+		int res = rpc_queue_upcall(gss_new->inode->pipe, &gss_new->msg);
 		if (res) {
 			gss_unhash_msg(gss_new);
 			gss_msg = ERR_PTR(res);
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index edf140a..0eed975 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -110,9 +110,8 @@ rpc_timeout_upcall_queue(struct work_struct *work)
  * initialize the fields of @msg (other than @msg->list) appropriately.
  */
 int
-rpc_queue_upcall(struct inode *inode, struct rpc_pipe_msg *msg)
+rpc_queue_upcall(struct rpc_pipe *pipe, struct rpc_pipe_msg *msg)
 {
-	struct rpc_pipe *pipe = RPC_I(inode)->pipe;
 	int res = -EPIPE;
 
 	spin_lock(&pipe->lock);

^ permalink raw reply related

* [PATCH 3/6] SUNRPC: cleanup PipeFS redundant RPC inode usage
From: Stanislav Kinsbursky @ 2011-11-22 15:41 UTC (permalink / raw)
  To: Trond.Myklebust
  Cc: linux-nfs, xemul, neilb, netdev, linux-kernel, jbottomley,
	bfields, davem, devel
In-Reply-To: <20111122134514.479.9848.stgit@localhost6.localdomain6>

This patch removes redundant RPC inode references from PipeFS. These are
the places where pipe operations are actually performed.

Signed-off-by: Stanislav Kinsbursky <skinsbursky@parallels.com>

---
 net/sunrpc/rpc_pipe.c |   93 ++++++++++++++++++++++++-------------------------
 1 files changed, 46 insertions(+), 47 deletions(-)

diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index a95ba18..edf140a 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -112,28 +112,28 @@ rpc_timeout_upcall_queue(struct work_struct *work)
 int
 rpc_queue_upcall(struct inode *inode, struct rpc_pipe_msg *msg)
 {
-	struct rpc_inode *rpci = RPC_I(inode);
+	struct rpc_pipe *pipe = RPC_I(inode)->pipe;
 	int res = -EPIPE;
 
-	spin_lock(&rpci->pipe->lock);
-	if (rpci->pipe->ops == NULL)
+	spin_lock(&pipe->lock);
+	if (pipe->ops == NULL)
 		goto out;
-	if (rpci->pipe->nreaders) {
-		list_add_tail(&msg->list, &rpci->pipe->pipe);
-		rpci->pipe->pipelen += msg->len;
+	if (pipe->nreaders) {
+		list_add_tail(&msg->list, &pipe->pipe);
+		pipe->pipelen += msg->len;
 		res = 0;
-	} else if (rpci->pipe->flags & RPC_PIPE_WAIT_FOR_OPEN) {
-		if (list_empty(&rpci->pipe->pipe))
+	} else if (pipe->flags & RPC_PIPE_WAIT_FOR_OPEN) {
+		if (list_empty(&pipe->pipe))
 			queue_delayed_work(rpciod_workqueue,
-					&rpci->pipe->queue_timeout,
+					&pipe->queue_timeout,
 					RPC_UPCALL_TIMEOUT);
-		list_add_tail(&msg->list, &rpci->pipe->pipe);
-		rpci->pipe->pipelen += msg->len;
+		list_add_tail(&msg->list, &pipe->pipe);
+		pipe->pipelen += msg->len;
 		res = 0;
 	}
 out:
-	spin_unlock(&rpci->pipe->lock);
-	wake_up(&rpci->pipe->waitq);
+	spin_unlock(&pipe->lock);
+	wake_up(&pipe->waitq);
 	return res;
 }
 EXPORT_SYMBOL_GPL(rpc_queue_upcall);
@@ -201,23 +201,23 @@ rpc_destroy_inode(struct inode *inode)
 static int
 rpc_pipe_open(struct inode *inode, struct file *filp)
 {
-	struct rpc_inode *rpci = RPC_I(inode);
+	struct rpc_pipe *pipe = RPC_I(inode)->pipe;
 	int first_open;
 	int res = -ENXIO;
 
 	mutex_lock(&inode->i_mutex);
-	if (rpci->pipe->ops == NULL)
+	if (pipe->ops == NULL)
 		goto out;
-	first_open = rpci->pipe->nreaders == 0 && rpci->pipe->nwriters == 0;
-	if (first_open && rpci->pipe->ops->open_pipe) {
-		res = rpci->pipe->ops->open_pipe(inode);
+	first_open = pipe->nreaders == 0 && pipe->nwriters == 0;
+	if (first_open && pipe->ops->open_pipe) {
+		res = pipe->ops->open_pipe(inode);
 		if (res)
 			goto out;
 	}
 	if (filp->f_mode & FMODE_READ)
-		rpci->pipe->nreaders++;
+		pipe->nreaders++;
 	if (filp->f_mode & FMODE_WRITE)
-		rpci->pipe->nwriters++;
+		pipe->nwriters++;
 	res = 0;
 out:
 	mutex_unlock(&inode->i_mutex);
@@ -268,39 +268,39 @@ static ssize_t
 rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset)
 {
 	struct inode *inode = filp->f_path.dentry->d_inode;
-	struct rpc_inode *rpci = RPC_I(inode);
+	struct rpc_pipe *pipe = RPC_I(inode)->pipe;
 	struct rpc_pipe_msg *msg;
 	int res = 0;
 
 	mutex_lock(&inode->i_mutex);
-	if (rpci->pipe->ops == NULL) {
+	if (pipe->ops == NULL) {
 		res = -EPIPE;
 		goto out_unlock;
 	}
 	msg = filp->private_data;
 	if (msg == NULL) {
-		spin_lock(&rpci->pipe->lock);
-		if (!list_empty(&rpci->pipe->pipe)) {
-			msg = list_entry(rpci->pipe->pipe.next,
+		spin_lock(&pipe->lock);
+		if (!list_empty(&pipe->pipe)) {
+			msg = list_entry(pipe->pipe.next,
 					struct rpc_pipe_msg,
 					list);
-			list_move(&msg->list, &rpci->pipe->in_upcall);
-			rpci->pipe->pipelen -= msg->len;
+			list_move(&msg->list, &pipe->in_upcall);
+			pipe->pipelen -= msg->len;
 			filp->private_data = msg;
 			msg->copied = 0;
 		}
-		spin_unlock(&rpci->pipe->lock);
+		spin_unlock(&pipe->lock);
 		if (msg == NULL)
 			goto out_unlock;
 	}
 	/* NOTE: it is up to the callback to update msg->copied */
-	res = rpci->pipe->ops->upcall(filp, msg, buf, len);
+	res = pipe->ops->upcall(filp, msg, buf, len);
 	if (res < 0 || msg->len == msg->copied) {
 		filp->private_data = NULL;
-		spin_lock(&rpci->pipe->lock);
+		spin_lock(&pipe->lock);
 		list_del_init(&msg->list);
-		spin_unlock(&rpci->pipe->lock);
-		rpci->pipe->ops->destroy_msg(msg);
+		spin_unlock(&pipe->lock);
+		pipe->ops->destroy_msg(msg);
 	}
 out_unlock:
 	mutex_unlock(&inode->i_mutex);
@@ -311,13 +311,13 @@ static ssize_t
 rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *offset)
 {
 	struct inode *inode = filp->f_path.dentry->d_inode;
-	struct rpc_inode *rpci = RPC_I(inode);
+	struct rpc_pipe *pipe = RPC_I(inode)->pipe;
 	int res;
 
 	mutex_lock(&inode->i_mutex);
 	res = -EPIPE;
-	if (rpci->pipe->ops != NULL)
-		res = rpci->pipe->ops->downcall(filp, buf, len);
+	if (pipe->ops != NULL)
+		res = pipe->ops->downcall(filp, buf, len);
 	mutex_unlock(&inode->i_mutex);
 	return res;
 }
@@ -325,16 +325,15 @@ rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *of
 static unsigned int
 rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait)
 {
-	struct rpc_inode *rpci;
+	struct rpc_pipe *pipe = RPC_I(filp->f_path.dentry->d_inode)->pipe;
 	unsigned int mask = 0;
 
-	rpci = RPC_I(filp->f_path.dentry->d_inode);
-	poll_wait(filp, &rpci->pipe->waitq, wait);
+	poll_wait(filp, &pipe->waitq, wait);
 
 	mask = POLLOUT | POLLWRNORM;
-	if (rpci->pipe->ops == NULL)
+	if (pipe->ops == NULL)
 		mask |= POLLERR | POLLHUP;
-	if (filp->private_data || !list_empty(&rpci->pipe->pipe))
+	if (filp->private_data || !list_empty(&pipe->pipe))
 		mask |= POLLIN | POLLRDNORM;
 	return mask;
 }
@@ -343,23 +342,23 @@ static long
 rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = filp->f_path.dentry->d_inode;
-	struct rpc_inode *rpci = RPC_I(inode);
+	struct rpc_pipe *pipe = RPC_I(inode)->pipe;
 	int len;
 
 	switch (cmd) {
 	case FIONREAD:
-		spin_lock(&rpci->pipe->lock);
-		if (rpci->pipe->ops == NULL) {
-			spin_unlock(&rpci->pipe->lock);
+		spin_lock(&pipe->lock);
+		if (pipe->ops == NULL) {
+			spin_unlock(&pipe->lock);
 			return -EPIPE;
 		}
-		len = rpci->pipe->pipelen;
+		len = pipe->pipelen;
 		if (filp->private_data) {
 			struct rpc_pipe_msg *msg;
 			msg = filp->private_data;
 			len += msg->len - msg->copied;
 		}
-		spin_unlock(&rpci->pipe->lock);
+		spin_unlock(&pipe->lock);
 		return put_user(len, (int __user *)arg);
 	default:
 		return -EINVAL;
@@ -789,7 +788,7 @@ static int rpc_rmdir_depopulate(struct dentry *dentry,
  * @private: private data to associate with the pipe, for the caller's use
  * @ops: operations defining the behavior of the pipe: upcall, downcall,
  *	release_pipe, open_pipe, and destroy_msg.
- * @flags: rpc_inode flags
+ * @flags: rpc_pipe flags
  *
  * Data is made available for userspace to read by calls to
  * rpc_queue_upcall().  The actual reads will result in calls to

^ permalink raw reply related

* [PATCH 2/6] SUNRPC: split SUNPRC PipeFS pipe data and inode creation
From: Stanislav Kinsbursky @ 2011-11-22 15:41 UTC (permalink / raw)
  To: Trond.Myklebust
  Cc: linux-nfs, xemul, neilb, netdev, linux-kernel, jbottomley,
	bfields, davem, devel
In-Reply-To: <20111122134514.479.9848.stgit@localhost6.localdomain6>

Generally, pipe data is used only for pipes, and thus allocating space for it
on every RPC inode allocation is redundant. This patch splits the private
SUNRPC PipeFS pipe data from the inode, so that pipe data is allocated only
for pipe inodes. This patch is also the next step towards removing PipeFS
inode references from kernel code other than PipeFS itself.

Signed-off-by: Stanislav Kinsbursky <skinsbursky@parallels.com>

---
 include/linux/sunrpc/rpc_pipe_fs.h |   10 +-
 net/sunrpc/auth_gss/auth_gss.c     |   46 ++++----
 net/sunrpc/rpc_pipe.c              |  208 +++++++++++++++++++-----------------
 3 files changed, 142 insertions(+), 122 deletions(-)

diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h
index 8c51471..c2fa330 100644
--- a/include/linux/sunrpc/rpc_pipe_fs.h
+++ b/include/linux/sunrpc/rpc_pipe_fs.h
@@ -21,9 +21,7 @@ struct rpc_pipe_ops {
 	void (*destroy_msg)(struct rpc_pipe_msg *);
 };
 
-struct rpc_inode {
-	struct inode vfs_inode;
-	void *private;
+struct rpc_pipe {
 	struct list_head pipe;
 	struct list_head in_upcall;
 	struct list_head in_downcall;
@@ -38,6 +36,12 @@ struct rpc_inode {
 	spinlock_t lock;
 };
 
+struct rpc_inode {
+	struct inode vfs_inode;
+	void *private;
+	struct rpc_pipe *pipe;
+};
+
 static inline struct rpc_inode *
 RPC_I(struct inode *inode)
 {
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 6ba2784..70a7953 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -112,7 +112,7 @@ gss_put_ctx(struct gss_cl_ctx *ctx)
 /* gss_cred_set_ctx:
  * called by gss_upcall_callback and gss_create_upcall in order
  * to set the gss context. The actual exchange of an old context
- * and a new one is protected by the rpci->lock.
+ * and a new one is protected by the rpci->pipe->lock.
  */
 static void
 gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx)
@@ -297,7 +297,7 @@ static struct gss_upcall_msg *
 __gss_find_upcall(struct rpc_inode *rpci, uid_t uid)
 {
 	struct gss_upcall_msg *pos;
-	list_for_each_entry(pos, &rpci->in_downcall, list) {
+	list_for_each_entry(pos, &rpci->pipe->in_downcall, list) {
 		if (pos->uid != uid)
 			continue;
 		atomic_inc(&pos->count);
@@ -318,14 +318,14 @@ gss_add_msg(struct gss_upcall_msg *gss_msg)
 	struct rpc_inode *rpci = gss_msg->inode;
 	struct gss_upcall_msg *old;
 
-	spin_lock(&rpci->lock);
+	spin_lock(&rpci->pipe->lock);
 	old = __gss_find_upcall(rpci, gss_msg->uid);
 	if (old == NULL) {
 		atomic_inc(&gss_msg->count);
-		list_add(&gss_msg->list, &rpci->in_downcall);
+		list_add(&gss_msg->list, &rpci->pipe->in_downcall);
 	} else
 		gss_msg = old;
-	spin_unlock(&rpci->lock);
+	spin_unlock(&rpci->pipe->lock);
 	return gss_msg;
 }
 
@@ -345,10 +345,10 @@ gss_unhash_msg(struct gss_upcall_msg *gss_msg)
 
 	if (list_empty(&gss_msg->list))
 		return;
-	spin_lock(&rpci->lock);
+	spin_lock(&rpci->pipe->lock);
 	if (!list_empty(&gss_msg->list))
 		__gss_unhash_msg(gss_msg);
-	spin_unlock(&rpci->lock);
+	spin_unlock(&rpci->pipe->lock);
 }
 
 static void
@@ -377,9 +377,9 @@ gss_upcall_callback(struct rpc_task *task)
 	struct gss_upcall_msg *gss_msg = gss_cred->gc_upcall;
 	struct rpc_inode *rpci = gss_msg->inode;
 
-	spin_lock(&rpci->lock);
+	spin_lock(&rpci->pipe->lock);
 	gss_handle_downcall_result(gss_cred, gss_msg);
-	spin_unlock(&rpci->lock);
+	spin_unlock(&rpci->pipe->lock);
 	task->tk_status = gss_msg->msg.errno;
 	gss_release_msg(gss_msg);
 }
@@ -526,7 +526,7 @@ gss_refresh_upcall(struct rpc_task *task)
 		goto out;
 	}
 	rpci = gss_msg->inode;
-	spin_lock(&rpci->lock);
+	spin_lock(&rpci->pipe->lock);
 	if (gss_cred->gc_upcall != NULL)
 		rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL);
 	else if (gss_msg->ctx == NULL && gss_msg->msg.errno >= 0) {
@@ -539,7 +539,7 @@ gss_refresh_upcall(struct rpc_task *task)
 		gss_handle_downcall_result(gss_cred, gss_msg);
 		err = gss_msg->msg.errno;
 	}
-	spin_unlock(&rpci->lock);
+	spin_unlock(&rpci->pipe->lock);
 	gss_release_msg(gss_msg);
 out:
 	dprintk("RPC: %5u gss_refresh_upcall for uid %u result %d\n",
@@ -577,11 +577,11 @@ retry:
 	rpci = gss_msg->inode;
 	for (;;) {
 		prepare_to_wait(&gss_msg->waitqueue, &wait, TASK_KILLABLE);
-		spin_lock(&rpci->lock);
+		spin_lock(&rpci->pipe->lock);
 		if (gss_msg->ctx != NULL || gss_msg->msg.errno < 0) {
 			break;
 		}
-		spin_unlock(&rpci->lock);
+		spin_unlock(&rpci->pipe->lock);
 		if (fatal_signal_pending(current)) {
 			err = -ERESTARTSYS;
 			goto out_intr;
@@ -592,7 +592,7 @@ retry:
 		gss_cred_set_ctx(cred, gss_msg->ctx);
 	else
 		err = gss_msg->msg.errno;
-	spin_unlock(&rpci->lock);
+	spin_unlock(&rpci->pipe->lock);
 out_intr:
 	finish_wait(&gss_msg->waitqueue, &wait);
 	gss_release_msg(gss_msg);
@@ -660,14 +660,14 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 
 	err = -ENOENT;
 	/* Find a matching upcall */
-	spin_lock(&rpci->lock);
+	spin_lock(&rpci->pipe->lock);
 	gss_msg = __gss_find_upcall(rpci, uid);
 	if (gss_msg == NULL) {
-		spin_unlock(&rpci->lock);
+		spin_unlock(&rpci->pipe->lock);
 		goto err_put_ctx;
 	}
 	list_del_init(&gss_msg->list);
-	spin_unlock(&rpci->lock);
+	spin_unlock(&rpci->pipe->lock);
 
 	p = gss_fill_context(p, end, ctx, gss_msg->auth->mech);
 	if (IS_ERR(p)) {
@@ -695,9 +695,9 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 	err = mlen;
 
 err_release_msg:
-	spin_lock(&rpci->lock);
+	spin_lock(&rpci->pipe->lock);
 	__gss_unhash_msg(gss_msg);
-	spin_unlock(&rpci->lock);
+	spin_unlock(&rpci->pipe->lock);
 	gss_release_msg(gss_msg);
 err_put_ctx:
 	gss_put_ctx(ctx);
@@ -747,19 +747,19 @@ gss_pipe_release(struct inode *inode)
 	struct gss_upcall_msg *gss_msg;
 
 restart:
-	spin_lock(&rpci->lock);
-	list_for_each_entry(gss_msg, &rpci->in_downcall, list) {
+	spin_lock(&rpci->pipe->lock);
+	list_for_each_entry(gss_msg, &rpci->pipe->in_downcall, list) {
 
 		if (!list_empty(&gss_msg->msg.list))
 			continue;
 		gss_msg->msg.errno = -EPIPE;
 		atomic_inc(&gss_msg->count);
 		__gss_unhash_msg(gss_msg);
-		spin_unlock(&rpci->lock);
+		spin_unlock(&rpci->pipe->lock);
 		gss_release_msg(gss_msg);
 		goto restart;
 	}
-	spin_unlock(&rpci->lock);
+	spin_unlock(&rpci->pipe->lock);
 
 	put_pipe_version();
 }
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index d0ffdf4..a95ba18 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -61,7 +61,7 @@ void rpc_pipefs_notifier_unregister(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(rpc_pipefs_notifier_unregister);
 
-static void rpc_purge_list(struct rpc_inode *rpci, struct list_head *head,
+static void rpc_purge_list(struct rpc_pipe *pipe, struct list_head *head,
 		void (*destroy_msg)(struct rpc_pipe_msg *), int err)
 {
 	struct rpc_pipe_msg *msg;
@@ -74,29 +74,29 @@ static void rpc_purge_list(struct rpc_inode *rpci, struct list_head *head,
 		msg->errno = err;
 		destroy_msg(msg);
 	} while (!list_empty(head));
-	wake_up(&rpci->waitq);
+	wake_up(&pipe->waitq);
 }
 
 static void
 rpc_timeout_upcall_queue(struct work_struct *work)
 {
 	LIST_HEAD(free_list);
-	struct rpc_inode *rpci =
-		container_of(work, struct rpc_inode, queue_timeout.work);
+	struct rpc_pipe *pipe =
+		container_of(work, struct rpc_pipe, queue_timeout.work);
 	void (*destroy_msg)(struct rpc_pipe_msg *);
 
-	spin_lock(&rpci->lock);
-	if (rpci->ops == NULL) {
-		spin_unlock(&rpci->lock);
+	spin_lock(&pipe->lock);
+	if (pipe->ops == NULL) {
+		spin_unlock(&pipe->lock);
 		return;
 	}
-	destroy_msg = rpci->ops->destroy_msg;
-	if (rpci->nreaders == 0) {
-		list_splice_init(&rpci->pipe, &free_list);
-		rpci->pipelen = 0;
+	destroy_msg = pipe->ops->destroy_msg;
+	if (pipe->nreaders == 0) {
+		list_splice_init(&pipe->pipe, &free_list);
+		pipe->pipelen = 0;
 	}
-	spin_unlock(&rpci->lock);
-	rpc_purge_list(rpci, &free_list, destroy_msg, -ETIMEDOUT);
+	spin_unlock(&pipe->lock);
+	rpc_purge_list(pipe, &free_list, destroy_msg, -ETIMEDOUT);
 }
 
 /**
@@ -115,25 +115,25 @@ rpc_queue_upcall(struct inode *inode, struct rpc_pipe_msg *msg)
 	struct rpc_inode *rpci = RPC_I(inode);
 	int res = -EPIPE;
 
-	spin_lock(&rpci->lock);
-	if (rpci->ops == NULL)
+	spin_lock(&rpci->pipe->lock);
+	if (rpci->pipe->ops == NULL)
 		goto out;
-	if (rpci->nreaders) {
-		list_add_tail(&msg->list, &rpci->pipe);
-		rpci->pipelen += msg->len;
+	if (rpci->pipe->nreaders) {
+		list_add_tail(&msg->list, &rpci->pipe->pipe);
+		rpci->pipe->pipelen += msg->len;
 		res = 0;
-	} else if (rpci->flags & RPC_PIPE_WAIT_FOR_OPEN) {
-		if (list_empty(&rpci->pipe))
+	} else if (rpci->pipe->flags & RPC_PIPE_WAIT_FOR_OPEN) {
+		if (list_empty(&rpci->pipe->pipe))
 			queue_delayed_work(rpciod_workqueue,
-					&rpci->queue_timeout,
+					&rpci->pipe->queue_timeout,
 					RPC_UPCALL_TIMEOUT);
-		list_add_tail(&msg->list, &rpci->pipe);
-		rpci->pipelen += msg->len;
+		list_add_tail(&msg->list, &rpci->pipe->pipe);
+		rpci->pipe->pipelen += msg->len;
 		res = 0;
 	}
 out:
-	spin_unlock(&rpci->lock);
-	wake_up(&rpci->waitq);
+	spin_unlock(&rpci->pipe->lock);
+	wake_up(&rpci->pipe->waitq);
 	return res;
 }
 EXPORT_SYMBOL_GPL(rpc_queue_upcall);
@@ -147,27 +147,27 @@ rpc_inode_setowner(struct inode *inode, void *private)
 static void
 rpc_close_pipes(struct inode *inode)
 {
-	struct rpc_inode *rpci = RPC_I(inode);
+	struct rpc_pipe *pipe = RPC_I(inode)->pipe;
 	const struct rpc_pipe_ops *ops;
 	int need_release;
 
 	mutex_lock(&inode->i_mutex);
-	ops = rpci->ops;
+	ops = pipe->ops;
 	if (ops != NULL) {
 		LIST_HEAD(free_list);
-		spin_lock(&rpci->lock);
-		need_release = rpci->nreaders != 0 || rpci->nwriters != 0;
-		rpci->nreaders = 0;
-		list_splice_init(&rpci->in_upcall, &free_list);
-		list_splice_init(&rpci->pipe, &free_list);
-		rpci->pipelen = 0;
-		rpci->ops = NULL;
-		spin_unlock(&rpci->lock);
-		rpc_purge_list(rpci, &free_list, ops->destroy_msg, -EPIPE);
-		rpci->nwriters = 0;
+		spin_lock(&pipe->lock);
+		need_release = pipe->nreaders != 0 || pipe->nwriters != 0;
+		pipe->nreaders = 0;
+		list_splice_init(&pipe->in_upcall, &free_list);
+		list_splice_init(&pipe->pipe, &free_list);
+		pipe->pipelen = 0;
+		pipe->ops = NULL;
+		spin_unlock(&pipe->lock);
+		rpc_purge_list(pipe, &free_list, ops->destroy_msg, -EPIPE);
+		pipe->nwriters = 0;
 		if (need_release && ops->release_pipe)
 			ops->release_pipe(inode);
-		cancel_delayed_work_sync(&rpci->queue_timeout);
+		cancel_delayed_work_sync(&pipe->queue_timeout);
 	}
 	rpc_inode_setowner(inode, NULL);
 	mutex_unlock(&inode->i_mutex);
@@ -188,6 +188,7 @@ rpc_i_callback(struct rcu_head *head)
 {
 	struct inode *inode = container_of(head, struct inode, i_rcu);
 	INIT_LIST_HEAD(&inode->i_dentry);
+	kfree(RPC_I(inode)->pipe);
 	kmem_cache_free(rpc_inode_cachep, RPC_I(inode));
 }
 
@@ -205,18 +206,18 @@ rpc_pipe_open(struct inode *inode, struct file *filp)
 	int res = -ENXIO;
 
 	mutex_lock(&inode->i_mutex);
-	if (rpci->ops == NULL)
+	if (rpci->pipe->ops == NULL)
 		goto out;
-	first_open = rpci->nreaders == 0 && rpci->nwriters == 0;
-	if (first_open && rpci->ops->open_pipe) {
-		res = rpci->ops->open_pipe(inode);
+	first_open = rpci->pipe->nreaders == 0 && rpci->pipe->nwriters == 0;
+	if (first_open && rpci->pipe->ops->open_pipe) {
+		res = rpci->pipe->ops->open_pipe(inode);
 		if (res)
 			goto out;
 	}
 	if (filp->f_mode & FMODE_READ)
-		rpci->nreaders++;
+		rpci->pipe->nreaders++;
 	if (filp->f_mode & FMODE_WRITE)
-		rpci->nwriters++;
+		rpci->pipe->nwriters++;
 	res = 0;
 out:
 	mutex_unlock(&inode->i_mutex);
@@ -226,38 +227,38 @@ out:
 static int
 rpc_pipe_release(struct inode *inode, struct file *filp)
 {
-	struct rpc_inode *rpci = RPC_I(inode);
+	struct rpc_pipe *pipe = RPC_I(inode)->pipe;
 	struct rpc_pipe_msg *msg;
 	int last_close;
 
 	mutex_lock(&inode->i_mutex);
-	if (rpci->ops == NULL)
+	if (pipe->ops == NULL)
 		goto out;
 	msg = filp->private_data;
 	if (msg != NULL) {
-		spin_lock(&rpci->lock);
+		spin_lock(&pipe->lock);
 		msg->errno = -EAGAIN;
 		list_del_init(&msg->list);
-		spin_unlock(&rpci->lock);
-		rpci->ops->destroy_msg(msg);
+		spin_unlock(&pipe->lock);
+		pipe->ops->destroy_msg(msg);
 	}
 	if (filp->f_mode & FMODE_WRITE)
-		rpci->nwriters --;
+		pipe->nwriters --;
 	if (filp->f_mode & FMODE_READ) {
-		rpci->nreaders --;
-		if (rpci->nreaders == 0) {
+		pipe->nreaders --;
+		if (pipe->nreaders == 0) {
 			LIST_HEAD(free_list);
-			spin_lock(&rpci->lock);
-			list_splice_init(&rpci->pipe, &free_list);
-			rpci->pipelen = 0;
-			spin_unlock(&rpci->lock);
-			rpc_purge_list(rpci, &free_list,
-					rpci->ops->destroy_msg, -EAGAIN);
+			spin_lock(&pipe->lock);
+			list_splice_init(&pipe->pipe, &free_list);
+			pipe->pipelen = 0;
+			spin_unlock(&pipe->lock);
+			rpc_purge_list(pipe, &free_list,
+					pipe->ops->destroy_msg, -EAGAIN);
 		}
 	}
-	last_close = rpci->nwriters == 0 && rpci->nreaders == 0;
-	if (last_close && rpci->ops->release_pipe)
-		rpci->ops->release_pipe(inode);
+	last_close = pipe->nwriters == 0 && pipe->nreaders == 0;
+	if (last_close && pipe->ops->release_pipe)
+		pipe->ops->release_pipe(inode);
 out:
 	mutex_unlock(&inode->i_mutex);
 	return 0;
@@ -272,34 +273,34 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset)
 	int res = 0;
 
 	mutex_lock(&inode->i_mutex);
-	if (rpci->ops == NULL) {
+	if (rpci->pipe->ops == NULL) {
 		res = -EPIPE;
 		goto out_unlock;
 	}
 	msg = filp->private_data;
 	if (msg == NULL) {
-		spin_lock(&rpci->lock);
-		if (!list_empty(&rpci->pipe)) {
-			msg = list_entry(rpci->pipe.next,
+		spin_lock(&rpci->pipe->lock);
+		if (!list_empty(&rpci->pipe->pipe)) {
+			msg = list_entry(rpci->pipe->pipe.next,
 					struct rpc_pipe_msg,
 					list);
-			list_move(&msg->list, &rpci->in_upcall);
-			rpci->pipelen -= msg->len;
+			list_move(&msg->list, &rpci->pipe->in_upcall);
+			rpci->pipe->pipelen -= msg->len;
 			filp->private_data = msg;
 			msg->copied = 0;
 		}
-		spin_unlock(&rpci->lock);
+		spin_unlock(&rpci->pipe->lock);
 		if (msg == NULL)
 			goto out_unlock;
 	}
 	/* NOTE: it is up to the callback to update msg->copied */
-	res = rpci->ops->upcall(filp, msg, buf, len);
+	res = rpci->pipe->ops->upcall(filp, msg, buf, len);
 	if (res < 0 || msg->len == msg->copied) {
 		filp->private_data = NULL;
-		spin_lock(&rpci->lock);
+		spin_lock(&rpci->pipe->lock);
 		list_del_init(&msg->list);
-		spin_unlock(&rpci->lock);
-		rpci->ops->destroy_msg(msg);
+		spin_unlock(&rpci->pipe->lock);
+		rpci->pipe->ops->destroy_msg(msg);
 	}
 out_unlock:
 	mutex_unlock(&inode->i_mutex);
@@ -315,8 +316,8 @@ rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *of
 
 	mutex_lock(&inode->i_mutex);
 	res = -EPIPE;
-	if (rpci->ops != NULL)
-		res = rpci->ops->downcall(filp, buf, len);
+	if (rpci->pipe->ops != NULL)
+		res = rpci->pipe->ops->downcall(filp, buf, len);
 	mutex_unlock(&inode->i_mutex);
 	return res;
 }
@@ -328,12 +329,12 @@ rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait)
 	unsigned int mask = 0;
 
 	rpci = RPC_I(filp->f_path.dentry->d_inode);
-	poll_wait(filp, &rpci->waitq, wait);
+	poll_wait(filp, &rpci->pipe->waitq, wait);
 
 	mask = POLLOUT | POLLWRNORM;
-	if (rpci->ops == NULL)
+	if (rpci->pipe->ops == NULL)
 		mask |= POLLERR | POLLHUP;
-	if (filp->private_data || !list_empty(&rpci->pipe))
+	if (filp->private_data || !list_empty(&rpci->pipe->pipe))
 		mask |= POLLIN | POLLRDNORM;
 	return mask;
 }
@@ -347,18 +348,18 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 
 	switch (cmd) {
 	case FIONREAD:
-		spin_lock(&rpci->lock);
-		if (rpci->ops == NULL) {
-			spin_unlock(&rpci->lock);
+		spin_lock(&rpci->pipe->lock);
+		if (rpci->pipe->ops == NULL) {
+			spin_unlock(&rpci->pipe->lock);
 			return -EPIPE;
 		}
-		len = rpci->pipelen;
+		len = rpci->pipe->pipelen;
 		if (filp->private_data) {
 			struct rpc_pipe_msg *msg;
 			msg = filp->private_data;
 			len += msg->len - msg->copied;
 		}
-		spin_unlock(&rpci->lock);
+		spin_unlock(&rpci->pipe->lock);
 		return put_user(len, (int __user *)arg);
 	default:
 		return -EINVAL;
@@ -543,6 +544,23 @@ static int __rpc_mkdir(struct inode *dir, struct dentry *dentry,
 	return 0;
 }
 
+static void
+init_pipe(struct rpc_pipe *pipe)
+{
+	pipe->nreaders = 0;
+	pipe->nwriters = 0;
+	INIT_LIST_HEAD(&pipe->in_upcall);
+	INIT_LIST_HEAD(&pipe->in_downcall);
+	INIT_LIST_HEAD(&pipe->pipe);
+	pipe->pipelen = 0;
+	init_waitqueue_head(&pipe->waitq);
+	INIT_DELAYED_WORK(&pipe->queue_timeout,
+			    rpc_timeout_upcall_queue);
+	pipe->ops = NULL;
+	spin_lock_init(&pipe->lock);
+
+}
+
 static int __rpc_mkpipe(struct inode *dir, struct dentry *dentry,
 			umode_t mode,
 			const struct file_operations *i_fop,
@@ -550,16 +568,24 @@ static int __rpc_mkpipe(struct inode *dir, struct dentry *dentry,
 			const struct rpc_pipe_ops *ops,
 			int flags)
 {
+	struct rpc_pipe *pipe;
 	struct rpc_inode *rpci;
 	int err;
 
+	pipe = kzalloc(sizeof(struct rpc_pipe), GFP_KERNEL);
+	if (!pipe)
+		return -ENOMEM;
+	init_pipe(pipe);
 	err = __rpc_create_common(dir, dentry, S_IFIFO | mode, i_fop, private);
-	if (err)
+	if (err) {
+		kfree(pipe);
 		return err;
+	}
 	rpci = RPC_I(dentry->d_inode);
 	rpci->private = private;
-	rpci->flags = flags;
-	rpci->ops = ops;
+	rpci->pipe = pipe;
+	rpci->pipe->flags = flags;
+	rpci->pipe->ops = ops;
 	fsnotify_create(dir, dentry);
 	return 0;
 }
@@ -1123,17 +1149,7 @@ init_once(void *foo)
 
 	inode_init_once(&rpci->vfs_inode);
 	rpci->private = NULL;
-	rpci->nreaders = 0;
-	rpci->nwriters = 0;
-	INIT_LIST_HEAD(&rpci->in_upcall);
-	INIT_LIST_HEAD(&rpci->in_downcall);
-	INIT_LIST_HEAD(&rpci->pipe);
-	rpci->pipelen = 0;
-	init_waitqueue_head(&rpci->waitq);
-	INIT_DELAYED_WORK(&rpci->queue_timeout,
-			    rpc_timeout_upcall_queue);
-	rpci->ops = NULL;
-	spin_lock_init(&rpci->lock);
+	rpci->pipe = NULL;
 }
 
 int register_rpc_pipefs(void)

^ permalink raw reply related

* Re: [PATCH] xfrm: optimize ipv4 selector matching
From: Alexey Dobriyan @ 2011-11-22 15:41 UTC (permalink / raw)
  To: David Laight; +Cc: Eric Dumazet, davem, netdev
In-Reply-To: <AE90C24D6B3A694183C094C60CF0A2F6D8AED9@saturn3.aculab.com>

On Tue, Nov 22, 2011 at 03:15:25PM -0000, David Laight wrote:
>  
> > Please use :
> > 
> > +static inline bool addr4_match(__be32 a1, __be32 a2, u8 prefixlen)
> > +{
> > +       /* C99 6.5.7 (3): u32 << 32 is undefined behaviour */
> > +       if (prefixlen == 0)
> > +               return true;
> > +       return !((a1 ^ a2) & htonl(0xFFFFFFFFu << (32 - prefixlen)));
> > +}
> 
> I'm not sure I'd agree about using 'u8'.
> It may well cause an unnecessary mask with 0xff.

It's u8 in all other places.

^ permalink raw reply

* [PATCH 1/6] SUNRPC: replace inode lock with pipe lock for RPC PipeFS operations
From: Stanislav Kinsbursky @ 2011-11-22 15:41 UTC (permalink / raw)
  To: Trond.Myklebust
  Cc: linux-nfs, xemul, neilb, netdev, linux-kernel, jbottomley,
	bfields, davem, devel
In-Reply-To: <20111122134514.479.9848.stgit@localhost6.localdomain6>

Currently, the inode i_lock is used to serialize concurrent access to SUNRPC
PipeFS pipes. It looks redundant, since no other use of the inode is present
in most of these places, and thus it can easily be replaced by a pipe lock,
which will allow removing most of the inode references from PipeFS code. This
is a first step towards removing PipeFS inode references from kernel code
other than PipeFS itself.

Signed-off-by: Stanislav Kinsbursky <skinsbursky@parallels.com>

---
 include/linux/sunrpc/rpc_pipe_fs.h |    1 +
 net/sunrpc/auth_gss/auth_gss.c     |   57 ++++++++++++++++++------------------
 net/sunrpc/rpc_pipe.c              |   38 ++++++++++++------------
 3 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h
index f32490c..8c51471 100644
--- a/include/linux/sunrpc/rpc_pipe_fs.h
+++ b/include/linux/sunrpc/rpc_pipe_fs.h
@@ -35,6 +35,7 @@ struct rpc_inode {
 	int flags;
 	struct delayed_work queue_timeout;
 	const struct rpc_pipe_ops *ops;
+	spinlock_t lock;
 };
 
 static inline struct rpc_inode *
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 364eb45..6ba2784 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -112,7 +112,7 @@ gss_put_ctx(struct gss_cl_ctx *ctx)
 /* gss_cred_set_ctx:
  * called by gss_upcall_callback and gss_create_upcall in order
  * to set the gss context. The actual exchange of an old context
- * and a new one is protected by the inode->i_lock.
+ * and a new one is protected by the rpci->lock.
  */
 static void
 gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx)
@@ -316,17 +316,16 @@ static inline struct gss_upcall_msg *
 gss_add_msg(struct gss_upcall_msg *gss_msg)
 {
 	struct rpc_inode *rpci = gss_msg->inode;
-	struct inode *inode = &rpci->vfs_inode;
 	struct gss_upcall_msg *old;
 
-	spin_lock(&inode->i_lock);
+	spin_lock(&rpci->lock);
 	old = __gss_find_upcall(rpci, gss_msg->uid);
 	if (old == NULL) {
 		atomic_inc(&gss_msg->count);
 		list_add(&gss_msg->list, &rpci->in_downcall);
 	} else
 		gss_msg = old;
-	spin_unlock(&inode->i_lock);
+	spin_unlock(&rpci->lock);
 	return gss_msg;
 }
 
@@ -342,14 +341,14 @@ __gss_unhash_msg(struct gss_upcall_msg *gss_msg)
 static void
 gss_unhash_msg(struct gss_upcall_msg *gss_msg)
 {
-	struct inode *inode = &gss_msg->inode->vfs_inode;
+	struct rpc_inode *rpci = gss_msg->inode;
 
 	if (list_empty(&gss_msg->list))
 		return;
-	spin_lock(&inode->i_lock);
+	spin_lock(&rpci->lock);
 	if (!list_empty(&gss_msg->list))
 		__gss_unhash_msg(gss_msg);
-	spin_unlock(&inode->i_lock);
+	spin_unlock(&rpci->lock);
 }
 
 static void
@@ -376,11 +375,11 @@ gss_upcall_callback(struct rpc_task *task)
 	struct gss_cred *gss_cred = container_of(task->tk_rqstp->rq_cred,
 			struct gss_cred, gc_base);
 	struct gss_upcall_msg *gss_msg = gss_cred->gc_upcall;
-	struct inode *inode = &gss_msg->inode->vfs_inode;
+	struct rpc_inode *rpci = gss_msg->inode;
 
-	spin_lock(&inode->i_lock);
+	spin_lock(&rpci->lock);
 	gss_handle_downcall_result(gss_cred, gss_msg);
-	spin_unlock(&inode->i_lock);
+	spin_unlock(&rpci->lock);
 	task->tk_status = gss_msg->msg.errno;
 	gss_release_msg(gss_msg);
 }
@@ -508,7 +507,7 @@ gss_refresh_upcall(struct rpc_task *task)
 	struct gss_cred *gss_cred = container_of(cred,
 			struct gss_cred, gc_base);
 	struct gss_upcall_msg *gss_msg;
-	struct inode *inode;
+	struct rpc_inode *rpci;
 	int err = 0;
 
 	dprintk("RPC: %5u gss_refresh_upcall for uid %u\n", task->tk_pid,
@@ -526,8 +525,8 @@ gss_refresh_upcall(struct rpc_task *task)
 		err = PTR_ERR(gss_msg);
 		goto out;
 	}
-	inode = &gss_msg->inode->vfs_inode;
-	spin_lock(&inode->i_lock);
+	rpci = gss_msg->inode;
+	spin_lock(&rpci->lock);
 	if (gss_cred->gc_upcall != NULL)
 		rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL);
 	else if (gss_msg->ctx == NULL && gss_msg->msg.errno >= 0) {
@@ -540,7 +539,7 @@ gss_refresh_upcall(struct rpc_task *task)
 		gss_handle_downcall_result(gss_cred, gss_msg);
 		err = gss_msg->msg.errno;
 	}
-	spin_unlock(&inode->i_lock);
+	spin_unlock(&rpci->lock);
 	gss_release_msg(gss_msg);
 out:
 	dprintk("RPC: %5u gss_refresh_upcall for uid %u result %d\n",
@@ -551,7 +550,7 @@ out:
 static inline int
 gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred)
 {
-	struct inode *inode;
+	struct rpc_inode *rpci;
 	struct rpc_cred *cred = &gss_cred->gc_base;
 	struct gss_upcall_msg *gss_msg;
 	DEFINE_WAIT(wait);
@@ -575,14 +574,14 @@ retry:
 		err = PTR_ERR(gss_msg);
 		goto out;
 	}
-	inode = &gss_msg->inode->vfs_inode;
+	rpci = gss_msg->inode;
 	for (;;) {
 		prepare_to_wait(&gss_msg->waitqueue, &wait, TASK_KILLABLE);
-		spin_lock(&inode->i_lock);
+		spin_lock(&rpci->lock);
 		if (gss_msg->ctx != NULL || gss_msg->msg.errno < 0) {
 			break;
 		}
-		spin_unlock(&inode->i_lock);
+		spin_unlock(&rpci->lock);
 		if (fatal_signal_pending(current)) {
 			err = -ERESTARTSYS;
 			goto out_intr;
@@ -593,7 +592,7 @@ retry:
 		gss_cred_set_ctx(cred, gss_msg->ctx);
 	else
 		err = gss_msg->msg.errno;
-	spin_unlock(&inode->i_lock);
+	spin_unlock(&rpci->lock);
 out_intr:
 	finish_wait(&gss_msg->waitqueue, &wait);
 	gss_release_msg(gss_msg);
@@ -631,7 +630,7 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 	const void *p, *end;
 	void *buf;
 	struct gss_upcall_msg *gss_msg;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode);
 	struct gss_cl_ctx *ctx;
 	uid_t uid;
 	ssize_t err = -EFBIG;
@@ -661,14 +660,14 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 
 	err = -ENOENT;
 	/* Find a matching upcall */
-	spin_lock(&inode->i_lock);
-	gss_msg = __gss_find_upcall(RPC_I(inode), uid);
+	spin_lock(&rpci->lock);
+	gss_msg = __gss_find_upcall(rpci, uid);
 	if (gss_msg == NULL) {
-		spin_unlock(&inode->i_lock);
+		spin_unlock(&rpci->lock);
 		goto err_put_ctx;
 	}
 	list_del_init(&gss_msg->list);
-	spin_unlock(&inode->i_lock);
+	spin_unlock(&rpci->lock);
 
 	p = gss_fill_context(p, end, ctx, gss_msg->auth->mech);
 	if (IS_ERR(p)) {
@@ -696,9 +695,9 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 	err = mlen;
 
 err_release_msg:
-	spin_lock(&inode->i_lock);
+	spin_lock(&rpci->lock);
 	__gss_unhash_msg(gss_msg);
-	spin_unlock(&inode->i_lock);
+	spin_unlock(&rpci->lock);
 	gss_release_msg(gss_msg);
 err_put_ctx:
 	gss_put_ctx(ctx);
@@ -748,7 +747,7 @@ gss_pipe_release(struct inode *inode)
 	struct gss_upcall_msg *gss_msg;
 
 restart:
-	spin_lock(&inode->i_lock);
+	spin_lock(&rpci->lock);
 	list_for_each_entry(gss_msg, &rpci->in_downcall, list) {
 
 		if (!list_empty(&gss_msg->msg.list))
@@ -756,11 +755,11 @@ restart:
 		gss_msg->msg.errno = -EPIPE;
 		atomic_inc(&gss_msg->count);
 		__gss_unhash_msg(gss_msg);
-		spin_unlock(&inode->i_lock);
+		spin_unlock(&rpci->lock);
 		gss_release_msg(gss_msg);
 		goto restart;
 	}
-	spin_unlock(&inode->i_lock);
+	spin_unlock(&rpci->lock);
 
 	put_pipe_version();
 }
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index a1f23c4..d0ffdf4 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -83,12 +83,11 @@ rpc_timeout_upcall_queue(struct work_struct *work)
 	LIST_HEAD(free_list);
 	struct rpc_inode *rpci =
 		container_of(work, struct rpc_inode, queue_timeout.work);
-	struct inode *inode = &rpci->vfs_inode;
 	void (*destroy_msg)(struct rpc_pipe_msg *);
 
-	spin_lock(&inode->i_lock);
+	spin_lock(&rpci->lock);
 	if (rpci->ops == NULL) {
-		spin_unlock(&inode->i_lock);
+		spin_unlock(&rpci->lock);
 		return;
 	}
 	destroy_msg = rpci->ops->destroy_msg;
@@ -96,7 +95,7 @@ rpc_timeout_upcall_queue(struct work_struct *work)
 		list_splice_init(&rpci->pipe, &free_list);
 		rpci->pipelen = 0;
 	}
-	spin_unlock(&inode->i_lock);
+	spin_unlock(&rpci->lock);
 	rpc_purge_list(rpci, &free_list, destroy_msg, -ETIMEDOUT);
 }
 
@@ -116,7 +115,7 @@ rpc_queue_upcall(struct inode *inode, struct rpc_pipe_msg *msg)
 	struct rpc_inode *rpci = RPC_I(inode);
 	int res = -EPIPE;
 
-	spin_lock(&inode->i_lock);
+	spin_lock(&rpci->lock);
 	if (rpci->ops == NULL)
 		goto out;
 	if (rpci->nreaders) {
@@ -133,7 +132,7 @@ rpc_queue_upcall(struct inode *inode, struct rpc_pipe_msg *msg)
 		res = 0;
 	}
 out:
-	spin_unlock(&inode->i_lock);
+	spin_unlock(&rpci->lock);
 	wake_up(&rpci->waitq);
 	return res;
 }
@@ -156,14 +155,14 @@ rpc_close_pipes(struct inode *inode)
 	ops = rpci->ops;
 	if (ops != NULL) {
 		LIST_HEAD(free_list);
-		spin_lock(&inode->i_lock);
+		spin_lock(&rpci->lock);
 		need_release = rpci->nreaders != 0 || rpci->nwriters != 0;
 		rpci->nreaders = 0;
 		list_splice_init(&rpci->in_upcall, &free_list);
 		list_splice_init(&rpci->pipe, &free_list);
 		rpci->pipelen = 0;
 		rpci->ops = NULL;
-		spin_unlock(&inode->i_lock);
+		spin_unlock(&rpci->lock);
 		rpc_purge_list(rpci, &free_list, ops->destroy_msg, -EPIPE);
 		rpci->nwriters = 0;
 		if (need_release && ops->release_pipe)
@@ -236,10 +235,10 @@ rpc_pipe_release(struct inode *inode, struct file *filp)
 		goto out;
 	msg = filp->private_data;
 	if (msg != NULL) {
-		spin_lock(&inode->i_lock);
+		spin_lock(&rpci->lock);
 		msg->errno = -EAGAIN;
 		list_del_init(&msg->list);
-		spin_unlock(&inode->i_lock);
+		spin_unlock(&rpci->lock);
 		rpci->ops->destroy_msg(msg);
 	}
 	if (filp->f_mode & FMODE_WRITE)
@@ -248,10 +247,10 @@ rpc_pipe_release(struct inode *inode, struct file *filp)
 		rpci->nreaders --;
 		if (rpci->nreaders == 0) {
 			LIST_HEAD(free_list);
-			spin_lock(&inode->i_lock);
+			spin_lock(&rpci->lock);
 			list_splice_init(&rpci->pipe, &free_list);
 			rpci->pipelen = 0;
-			spin_unlock(&inode->i_lock);
+			spin_unlock(&rpci->lock);
 			rpc_purge_list(rpci, &free_list,
 					rpci->ops->destroy_msg, -EAGAIN);
 		}
@@ -279,7 +278,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset)
 	}
 	msg = filp->private_data;
 	if (msg == NULL) {
-		spin_lock(&inode->i_lock);
+		spin_lock(&rpci->lock);
 		if (!list_empty(&rpci->pipe)) {
 			msg = list_entry(rpci->pipe.next,
 					struct rpc_pipe_msg,
@@ -289,7 +288,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset)
 			filp->private_data = msg;
 			msg->copied = 0;
 		}
-		spin_unlock(&inode->i_lock);
+		spin_unlock(&rpci->lock);
 		if (msg == NULL)
 			goto out_unlock;
 	}
@@ -297,9 +296,9 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset)
 	res = rpci->ops->upcall(filp, msg, buf, len);
 	if (res < 0 || msg->len == msg->copied) {
 		filp->private_data = NULL;
-		spin_lock(&inode->i_lock);
+		spin_lock(&rpci->lock);
 		list_del_init(&msg->list);
-		spin_unlock(&inode->i_lock);
+		spin_unlock(&rpci->lock);
 		rpci->ops->destroy_msg(msg);
 	}
 out_unlock:
@@ -348,9 +347,9 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 
 	switch (cmd) {
 	case FIONREAD:
-		spin_lock(&inode->i_lock);
+		spin_lock(&rpci->lock);
 		if (rpci->ops == NULL) {
-			spin_unlock(&inode->i_lock);
+			spin_unlock(&rpci->lock);
 			return -EPIPE;
 		}
 		len = rpci->pipelen;
@@ -359,7 +358,7 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			msg = filp->private_data;
 			len += msg->len - msg->copied;
 		}
-		spin_unlock(&inode->i_lock);
+		spin_unlock(&rpci->lock);
 		return put_user(len, (int __user *)arg);
 	default:
 		return -EINVAL;
@@ -1134,6 +1133,7 @@ init_once(void *foo)
 	INIT_DELAYED_WORK(&rpci->queue_timeout,
 			    rpc_timeout_upcall_queue);
 	rpci->ops = NULL;
+	spin_lock_init(&rpci->lock);
 }
 
 int register_rpc_pipefs(void)

^ permalink raw reply related

* [PATCH 0/6] SUNPRC: cleanup PipeFS for network-namespace-aware users
From: Stanislav Kinsbursky @ 2011-11-22 15:41 UTC (permalink / raw)
  To: Trond.Myklebust
  Cc: linux-nfs, xemul, neilb, netdev, linux-kernel, jbottomley,
	bfields, davem, devel

This patch-set was created in context of clone of git
branch: git://git.linux-nfs.org/projects/trondmy/nfs-2.6.git.
tag: v3.1

This is a cleanup precursor patch set. It is required to ease the later
implementation of network-namespace-aware SUNRPC PipeFS pipe creators.
Generally, this patch set splits the SUNRPC PipeFS logic into two parts: working
with pipes (new rpc_pipe structure with a link to the PipeFS dentry) and
creating/destroying PipeFS dentries (old rpc_inode structure with a link to
rpc_pipe).
With this patch set, kernel users of PipeFS pipes first create the
rpc_pipe data and then create the PipeFS dentries. Later these dentries will be
created in notifier callbacks on the PipeFS mount event from user space and in
network namespace operations for modules such as nfs and blocklayout.

The following series consists of:

---

Stanislav Kinsbursky (6):
      SUNRPC: replace inode lock with pipe lock for RPC PipeFS operations
      SUNRPC: split SUNPRC PipeFS pipe data and inode creation
      SUNRPC: cleanup PipeFS redundant RPC inode usage
      SUNPRC: cleanup RPC PipeFS pipes upcall interface
      SUNRPC: cleanup GSS pipes usage
      SUNRPC: split SUNPRC PipeFS dentry and private pipe data creation


 fs/nfs/blocklayout/blocklayout.c    |   16 ++
 fs/nfs/blocklayout/blocklayout.h    |    2 
 fs/nfs/blocklayout/blocklayoutdev.c |    2 
 fs/nfs/blocklayout/blocklayoutdm.c  |    2 
 fs/nfs/idmap.c                      |   28 +++-
 include/linux/sunrpc/rpc_pipe_fs.h  |   20 ++-
 net/sunrpc/auth_gss/auth_gss.c      |  130 ++++++++++--------
 net/sunrpc/rpc_pipe.c               |  258 +++++++++++++++++++----------------
 8 files changed, 262 insertions(+), 196 deletions(-)

-- 
Signature

^ permalink raw reply

* [GIT PULL] Support for Cadence GEM in the MACB driver (Take 2)
From: Jamie Iles @ 2011-11-22 15:25 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Jamie Iles, Arnd Bergmann, Nicolas Ferre, netdev,
	linux-arm-kernel, davem, Jean-Christophe PLAGNIOL-VILLARD
In-Reply-To: <20111122145922.GM21480@game.jcrosoft.org>

Hi Arnd,

I've updated the branch with Jean-Christophe's version of the first 
patch which he sent some time ago and I managed to drop.  Apologies all!

Thanks,

Jamie

The following changes since commit cfcfc9eca2bcbd26a8e206baeb005b055dbf8e37:

  Linux 3.2-rc2 (2011-11-15 15:02:59 -0200)

are available in the git repository at:
  git://github.com/jamieiles/linux-2.6-ji.git macb-gem

Jamie Iles (9):
      at91: provide macb clks with "pclk" and "hclk" name
      macb: remove conditional clk handling
      macb: unify at91 and avr32 platform data
      macb: convert printk to netdev_ and friends
      macb: initial support for Cadence GEM
      macb: support higher rate GEM MDIO clock divisors
      macb: support statistics for GEM devices
      macb: support DMA bus widths > 32 bits
      macb: allow GEM to have configurable receive buffer size

 arch/arm/mach-at91/at91cap9.c               |    4 +-
 arch/arm/mach-at91/at91cap9_devices.c       |    6 +-
 arch/arm/mach-at91/at91rm9200_devices.c     |    6 +-
 arch/arm/mach-at91/at91sam9260.c            |    4 +-
 arch/arm/mach-at91/at91sam9260_devices.c    |    6 +-
 arch/arm/mach-at91/at91sam9263.c            |    4 +-
 arch/arm/mach-at91/at91sam9263_devices.c    |    6 +-
 arch/arm/mach-at91/at91sam9g45.c            |    4 +-
 arch/arm/mach-at91/at91sam9g45_devices.c    |    6 +-
 arch/arm/mach-at91/board-1arm.c             |    2 +-
 arch/arm/mach-at91/board-afeb-9260v1.c      |    2 +-
 arch/arm/mach-at91/board-cam60.c            |    2 +-
 arch/arm/mach-at91/board-cap9adk.c          |    2 +-
 arch/arm/mach-at91/board-carmeva.c          |    2 +-
 arch/arm/mach-at91/board-cpu9krea.c         |    2 +-
 arch/arm/mach-at91/board-cpuat91.c          |    2 +-
 arch/arm/mach-at91/board-csb337.c           |    2 +-
 arch/arm/mach-at91/board-csb637.c           |    2 +-
 arch/arm/mach-at91/board-eb9200.c           |    2 +-
 arch/arm/mach-at91/board-ecbat91.c          |    2 +-
 arch/arm/mach-at91/board-eco920.c           |    2 +-
 arch/arm/mach-at91/board-foxg20.c           |    2 +-
 arch/arm/mach-at91/board-gsia18s.c          |    2 +-
 arch/arm/mach-at91/board-kafa.c             |    2 +-
 arch/arm/mach-at91/board-kb9202.c           |    2 +-
 arch/arm/mach-at91/board-neocore926.c       |    2 +-
 arch/arm/mach-at91/board-pcontrol-g20.c     |    2 +-
 arch/arm/mach-at91/board-picotux200.c       |    2 +-
 arch/arm/mach-at91/board-qil-a9260.c        |    2 +-
 arch/arm/mach-at91/board-rm9200dk.c         |    2 +-
 arch/arm/mach-at91/board-rm9200ek.c         |    2 +-
 arch/arm/mach-at91/board-rsi-ews.c          |    2 +-
 arch/arm/mach-at91/board-sam9-l9260.c       |    2 +-
 arch/arm/mach-at91/board-sam9260ek.c        |    2 +-
 arch/arm/mach-at91/board-sam9263ek.c        |    2 +-
 arch/arm/mach-at91/board-sam9g20ek.c        |    2 +-
 arch/arm/mach-at91/board-sam9m10g45ek.c     |    2 +-
 arch/arm/mach-at91/board-snapper9260.c      |    2 +-
 arch/arm/mach-at91/board-stamp9g20.c        |    2 +-
 arch/arm/mach-at91/board-usb-a926x.c        |    2 +-
 arch/arm/mach-at91/board-yl-9200.c          |    2 +-
 arch/arm/mach-at91/include/mach/board.h     |   14 +-
 arch/avr32/boards/atngw100/setup.c          |    2 +-
 arch/avr32/boards/atstk1000/atstk1002.c     |    2 +-
 arch/avr32/boards/favr-32/setup.c           |    2 +-
 arch/avr32/boards/hammerhead/setup.c        |    2 +-
 arch/avr32/boards/merisc/setup.c            |    2 +-
 arch/avr32/boards/mimc200/setup.c           |    2 +-
 arch/avr32/mach-at32ap/at32ap700x.c         |    8 +-
 arch/avr32/mach-at32ap/include/mach/board.h |    7 +-
 drivers/net/ethernet/Makefile               |    2 +-
 drivers/net/ethernet/cadence/Kconfig        |   16 +-
 drivers/net/ethernet/cadence/at91_ether.c   |    3 +-
 drivers/net/ethernet/cadence/at91_ether.h   |    4 +-
 drivers/net/ethernet/cadence/macb.c         |  344 +++++++++++++++++----------
 drivers/net/ethernet/cadence/macb.h         |  150 ++++++++++++-
 include/linux/platform_data/macb.h          |   17 ++
 57 files changed, 476 insertions(+), 211 deletions(-)
 create mode 100644 include/linux/platform_data/macb.h

^ permalink raw reply

* RE: [PATCH] xfrm: optimize ipv4 selector matching
From: David Laight @ 2011-11-22 15:15 UTC (permalink / raw)
  To: Eric Dumazet, Alexey Dobriyan; +Cc: davem, netdev
In-Reply-To: <1321974652.2474.27.camel@edumazet-HP-Compaq-6005-Pro-SFF-PC>

 
> Please use :
> 
> +static inline bool addr4_match(__be32 a1, __be32 a2, u8 prefixlen)
> +{
> +       /* C99 6.5.7 (3): u32 << 32 is undefined behaviour */
> +       if (prefixlen == 0)
> +               return true;
> +       return !((a1 ^ a2) & htonl(0xFFFFFFFFu << (32 - prefixlen)));
> +}

I'm not sure I'd agree about using 'u8'.
It may well cause an unnecessary mask with 0xff.

	David

^ permalink raw reply

* [PATCH 2/2] net: add documentation for net_prio cgroups (v4)
From: Neil Horman @ 2011-11-22 15:10 UTC (permalink / raw)
  To: netdev; +Cc: Neil Horman, John Fastabend, Robert Love, David S. Miller
In-Reply-To: <1321974652-8318-1-git-send-email-nhorman@tuxdriver.com>

Add the requisite documentation to explain to new users how net_prio cgroups work

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
CC: Robert Love <robert.w.love@intel.com>
CC: "David S. Miller" <davem@davemloft.net>
---
 Documentation/cgroups/net_prio.txt |   53 ++++++++++++++++++++++++++++++++++++
 1 files changed, 53 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/cgroups/net_prio.txt

diff --git a/Documentation/cgroups/net_prio.txt b/Documentation/cgroups/net_prio.txt
new file mode 100644
index 0000000..01b3226
--- /dev/null
+++ b/Documentation/cgroups/net_prio.txt
@@ -0,0 +1,53 @@
+Network priority cgroup
+-------------------------
+
+The Network priority cgroup provides an interface to allow an administrator to
+dynamically set the priority of network traffic generated by various
+applications.
+
+Nominally, an application would set the priority of its traffic via the
+SO_PRIORITY socket option.  This, however, is not always possible because:
+
+1) The application may not have been coded to set this value
+2) The priority of application traffic is often a site-specific administrative
+   decision rather than an application defined one.
+
+This cgroup allows an administrator to assign a process to a group which defines
+the priority of egress traffic on a given interface. Network priority groups can
+be created by first mounting the cgroup filesystem.
+
+# mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio
+
+With the above step, the initial group acting as the parent accounting group
+becomes visible at '/sys/fs/cgroup/net_prio'.  This group includes all tasks in
+the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup.
+
+Each net_prio cgroup contains two files that are subsystem specific:
+
+net_prio.prioidx
+This file is read-only, and is simply informative.  It contains a unique integer
+value that the kernel uses as an internal representation of this cgroup.
+
+net_prio.ifpriomap
+This file contains a map of the priorities assigned to traffic originating from
+processes in this group and egressing the system on various interfaces. It
+contains a list of tuples in the form <ifname priority>.  Contents of this file
+can be modified by echoing a string into the file using the same tuple format.
+For example:
+
+echo "eth0 5" > /sys/fs/cgroup/net_prio/iscsi/net_prio.ifpriomap
+
+This command would force any traffic originating from processes belonging to the
+iscsi net_prio cgroup and egressing on interface eth0 to have the priority of
+said traffic set to the value 5. The parent accounting group also has a
+writeable 'net_prio.ifpriomap' file that can be used to set a system default
+priority.
+
+Priorities are set immediately prior to queueing a frame to the device
+queueing discipline (qdisc) so priorities will be assigned prior to the hardware
+queue selection being made.
+
+One usage for the net_prio cgroup is with mqprio qdisc allowing application
+traffic to be steered to hardware/driver based traffic classes. These mappings
+can then be managed by administrators or other networking protocols such as
+DCBX.
-- 
1.7.6.4

^ permalink raw reply related

* [PATCH 1/2] net: add network priority cgroup infrastructure (v4)
From: Neil Horman @ 2011-11-22 15:10 UTC (permalink / raw)
  To: netdev; +Cc: Neil Horman, John Fastabend, Robert Love, David S. Miller
In-Reply-To: <1321974652-8318-1-git-send-email-nhorman@tuxdriver.com>

This patch adds in the infrastructure code to create the network priority
cgroup.  The cgroup, in addition to the standard processes file creates two
control files:

1) prioidx - This is a read-only file that exports the index of this cgroup.
This is a value that is both arbitrary and unique to a cgroup in this subsystem,
and is used to index the per-device priority map

2) ifpriomap - This is a writeable file.  On read it reports a table of 2-tuples
<name:priority> where name is the name of a network interface and priority
indicates the priority assigned to frames egressing on the named interface and
originating from a pid in this cgroup

This cgroup allows for skb priority to be set prior to a root qdisc getting
selected. This is beneficial for DCB enabled systems, in that it allows for any
application to use dcb configured priorities without application modification

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
CC: Robert Love <robert.w.love@intel.com>
CC: "David S. Miller" <davem@davemloft.net>
---
 include/linux/cgroup_subsys.h |    8 +
 include/linux/netdevice.h     |    4 +
 include/net/netprio_cgroup.h  |   65 ++++++++
 include/net/sock.h            |    3 +
 net/Kconfig                   |    7 +
 net/core/Makefile             |    1 +
 net/core/dev.c                |   14 ++
 net/core/netprio_cgroup.c     |  344 +++++++++++++++++++++++++++++++++++++++++
 net/core/sock.c               |   22 +++-
 net/socket.c                  |    2 +
 10 files changed, 469 insertions(+), 1 deletions(-)
 create mode 100644 include/net/netprio_cgroup.h
 create mode 100644 net/core/netprio_cgroup.c

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index ac663c1..0bd390c 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -59,8 +59,16 @@ SUBSYS(net_cls)
 SUBSYS(blkio)
 #endif
 
+/* */
+
 #ifdef CONFIG_CGROUP_PERF
 SUBSYS(perf)
 #endif
 
 /* */
+
+#ifdef CONFIG_NETPRIO_CGROUP
+SUBSYS(net_prio)
+#endif
+
+/* */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0db1f5f..750ea8e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -50,6 +50,7 @@
 #ifdef CONFIG_DCB
 #include <net/dcbnl.h>
 #endif
+#include <net/netprio_cgroup.h>
 
 struct vlan_group;
 struct netpoll_info;
@@ -1312,6 +1313,9 @@ struct net_device {
 	/* max exchange id for FCoE LRO by ddp */
 	unsigned int		fcoe_ddp_xid;
 #endif
+#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
+	struct netprio_map __rcu *priomap;
+#endif
 	/* phy device may attach itself for hardware timestamping */
 	struct phy_device *phydev;
 
diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h
new file mode 100644
index 0000000..8cfeb3c
--- /dev/null
+++ b/include/net/netprio_cgroup.h
@@ -0,0 +1,65 @@
+/*
+ * netprio_cgroup.h			Control Group Priority set 
+ *
+ *
+ * Authors:	Neil Horman <nhorman@tuxdriver.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#ifndef _NETPRIO_CGROUP_H
+#define _NETPRIO_CGROUP_H
+#include <linux/module.h>
+#include <linux/cgroup.h>
+#include <linux/hardirq.h>
+#include <linux/rcupdate.h>
+
+struct cgroup_netprio_state
+{
+	struct cgroup_subsys_state css;
+	u32 prioidx;
+};
+
+struct netprio_map {
+	struct rcu_head rcu;
+	u32 priomap_len;
+	u32 priomap[];
+};
+
+#ifdef CONFIG_CGROUPS
+
+#ifndef CONFIG_NETPRIO_CGROUP
+extern int net_prio_subsys_id;
+#endif
+
+extern void sock_update_netprioidx(struct sock *sk);
+
+static inline struct cgroup_netprio_state
+		*task_netprio_state(struct task_struct *p)
+{
+#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
+	return container_of(task_subsys_state(p, net_prio_subsys_id),
+			    struct cgroup_netprio_state, css);
+#else
+	return NULL;
+#endif
+}
+
+#else
+
+#define sock_update_netprioidx(sk)
+#define skb_update_prio(skb)
+
+static inline struct cgroup_netprio_state
+		*task_netprio_state(struct task_struct *p)
+{
+	return NULL;
+}
+
+#endif
+
+#endif  /* _NETPRIO_CGROUP_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index 5ac682f..87b24aa 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -321,6 +321,9 @@ struct sock {
 	unsigned short		sk_ack_backlog;
 	unsigned short		sk_max_ack_backlog;
 	__u32			sk_priority;
+#ifdef CONFIG_CGROUPS
+	__u32			sk_cgrp_prioidx;
+#endif
 	struct pid		*sk_peer_pid;
 	const struct cred	*sk_peer_cred;
 	long			sk_rcvtimeo;
diff --git a/net/Kconfig b/net/Kconfig
index a073148..63d2c5d 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -232,6 +232,13 @@ config XPS
 	depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
 	default y
 
+config NETPRIO_CGROUP
+	tristate "Network priority cgroup"
+	depends on CGROUPS
+	---help---
+	  Cgroup subsystem for use in assigning processes to network priorities on
+	  a per-interface basis
+
 config HAVE_BPF_JIT
 	bool
 
diff --git a/net/core/Makefile b/net/core/Makefile
index 0d357b1..3606d40 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -19,3 +19,4 @@ obj-$(CONFIG_FIB_RULES) += fib_rules.o
 obj-$(CONFIG_TRACEPOINTS) += net-traces.o
 obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
 obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
+obj-$(CONFIG_NETPRIO_CGROUP) += netprio_cgroup.o
diff --git a/net/core/dev.c b/net/core/dev.c
index b7ba81a..373c908 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2456,6 +2456,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 	return rc;
 }
 
+#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
+static void skb_update_prio(struct sk_buff *skb)
+{
+	struct netprio_map *map = rcu_dereference(skb->dev->priomap);
+
+	if ((!skb->priority) && (skb->sk) && map)
+		skb->priority = map->priomap[skb->sk->sk_cgrp_prioidx];
+}
+#else
+#define skb_update_prio(skb)
+#endif
+
 static DEFINE_PER_CPU(int, xmit_recursion);
 #define RECURSION_LIMIT 10
 
@@ -2496,6 +2508,8 @@ int dev_queue_xmit(struct sk_buff *skb)
 	 */
 	rcu_read_lock_bh();
 
+	skb_update_prio(skb);
+
 	txq = dev_pick_tx(dev, skb);
 	q = rcu_dereference_bh(txq->qdisc);
 
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
new file mode 100644
index 0000000..72ad0bc
--- /dev/null
+++ b/net/core/netprio_cgroup.c
@@ -0,0 +1,344 @@
+/*
+ * net/core/netprio_cgroup.c	Priority Control Group
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Neil Horman <nhorman@tuxdriver.com>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/cgroup.h>
+#include <linux/rcupdate.h>
+#include <linux/atomic.h>
+#include <net/rtnetlink.h>
+#include <net/pkt_cls.h>
+#include <net/sock.h>
+#include <net/netprio_cgroup.h>
+
+static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
+					       struct cgroup *cgrp);
+static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
+static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
+
+struct cgroup_subsys net_prio_subsys = {
+	.name		= "net_prio",
+	.create		= cgrp_create,
+	.destroy	= cgrp_destroy,
+	.populate	= cgrp_populate,
+#ifdef CONFIG_NETPRIO_CGROUP
+	.subsys_id	= net_prio_subsys_id,
+#endif
+	.module		= THIS_MODULE
+};
+
+#define PRIOIDX_SZ 128
+
+static unsigned long prioidx_map[PRIOIDX_SZ];
+static DEFINE_SPINLOCK(prioidx_map_lock);
+static atomic_t max_prioidx = ATOMIC_INIT(0);
+
+static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp)
+{
+	return container_of(cgroup_subsys_state(cgrp, net_prio_subsys_id),
+			    struct cgroup_netprio_state, css);
+}
+
+static int get_prioidx(u32 *prio)
+{
+	unsigned long flags;
+	u32 prioidx;
+
+	spin_lock_irqsave(&prioidx_map_lock, flags);
+	prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ);
+	set_bit(prioidx, prioidx_map);
+	spin_unlock_irqrestore(&prioidx_map_lock, flags);
+	if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ)
+		return -ENOSPC;
+
+	atomic_set(&max_prioidx, prioidx);
+	*prio = prioidx;
+	return 0;
+}
+
+static void put_prioidx(u32 idx)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&prioidx_map_lock, flags);
+	clear_bit(idx, prioidx_map);
+	spin_unlock_irqrestore(&prioidx_map_lock, flags);
+}
+
+static void extend_netdev_table(struct net_device *dev, u32 new_len)
+{
+	size_t new_size = sizeof(struct netprio_map) +
+			   ((sizeof(u32) * new_len));
+	struct netprio_map *new_priomap = kzalloc(new_size, GFP_KERNEL);
+	struct netprio_map *old_priomap;
+	int i;
+
+	old_priomap  = rtnl_dereference(dev->priomap);
+
+	if (!new_priomap) {
+		printk(KERN_WARNING "Unable to alloc new priomap!\n");
+		return;
+	}
+
+	for (i = 0;
+	     old_priomap && (i < old_priomap->priomap_len);
+	     i++)
+		new_priomap->priomap[i] = old_priomap->priomap[i];
+
+	new_priomap->priomap_len = new_len;
+
+	rcu_assign_pointer(dev->priomap, new_priomap);
+	if (old_priomap)
+		kfree_rcu(old_priomap, rcu);
+}
+
+static void update_netdev_tables(void)
+{
+	struct net_device *dev;
+	u32 max_len = atomic_read(&max_prioidx);
+	struct netprio_map *map;
+
+	rtnl_lock();
+	for_each_netdev(&init_net, dev) {
+		map = rtnl_dereference(dev->priomap);
+		if ((!map) ||
+		    (map->priomap_len < max_len))
+			extend_netdev_table(dev, max_len);
+	}
+	rtnl_unlock();
+}
+
+static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
+						 struct cgroup *cgrp)
+{
+	struct cgroup_netprio_state *cs;
+	int ret;
+
+	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
+	if (!cs)
+		return ERR_PTR(-ENOMEM);
+
+	if (cgrp->parent && cgrp_netprio_state(cgrp->parent)->prioidx) {
+		kfree(cs);
+		return ERR_PTR(-EINVAL);
+	}
+
+	ret = get_prioidx(&cs->prioidx);
+	if (ret != 0) {
+		printk(KERN_WARNING "No space in priority index array\n");
+		kfree(cs);
+		return ERR_PTR(ret);
+	}
+
+	return &cs->css;
+}
+
+static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	struct cgroup_netprio_state *cs;
+	struct net_device *dev;
+	struct netprio_map *map;
+
+	cs = cgrp_netprio_state(cgrp);
+	rtnl_lock();
+	for_each_netdev(&init_net, dev) {
+		map = rtnl_dereference(dev->priomap);
+		if (map)
+			map->priomap[cs->prioidx] = 0;
+	}
+	rtnl_unlock();
+	put_prioidx(cs->prioidx);
+	kfree(cs);
+}
+
+static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft)
+{
+	return (u64)cgrp_netprio_state(cgrp)->prioidx;
+}
+
+static int read_priomap(struct cgroup *cont, struct cftype *cft,
+			struct cgroup_map_cb *cb)
+{
+	struct net_device *dev;
+	u32 prioidx = cgrp_netprio_state(cont)->prioidx;
+	u32 priority;
+	struct netprio_map *map;
+
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, dev) {
+		map = rcu_dereference(dev->priomap);
+		priority = map ? map->priomap[prioidx] : 0;
+		cb->fill(cb, dev->name, priority);
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
+static int write_priomap(struct cgroup *cgrp, struct cftype *cft,
+			 const char *buffer)
+{
+	char *devname = kstrdup(buffer, GFP_KERNEL);
+	int ret = -EINVAL;
+	u32 prioidx = cgrp_netprio_state(cgrp)->prioidx;
+	unsigned long priority;
+	char *priostr;
+	struct net_device *dev;
+	struct netprio_map *map;
+
+	if (!devname)
+		return -ENOMEM;
+
+	/*
+	 * Minimally sized valid priomap string
+	 */
+	if (strlen(devname) < 3)
+		goto out_free_devname;
+
+	priostr = strstr(devname, " ");
+	if (!priostr)
+		goto out_free_devname;
+
+	/*
+	 * Separate the devname from the associated priority
+	 * and advance the priostr pointer to the priority value
+	 */
+	*priostr = '\0';
+	priostr++;
+
+	/*
+	 * If priostr points to a NUL character, we're at the end of the passed
+	 * in string, and it's not a valid write
+	 */
+	if (*priostr == '\0')
+		goto out_free_devname;
+
+	ret = kstrtoul(priostr, 10, &priority);
+	if (ret < 0)
+		goto out_free_devname;
+
+	ret = -ENODEV;
+
+	dev = dev_get_by_name(&init_net, devname);
+	if (!dev)
+		goto out_free_devname;
+
+	update_netdev_tables();
+	ret = 0;
+	rcu_read_lock();
+	map = rcu_dereference(dev->priomap);
+	if (map)
+		map->priomap[prioidx] = priority;
+	rcu_read_unlock();
+	dev_put(dev);
+
+out_free_devname:
+	kfree(devname);
+	return ret;
+}
+
+static struct cftype ss_files[] = {
+	{
+		.name = "prioidx",
+		.read_u64 = read_prioidx,
+	},
+	{
+		.name = "ifpriomap",
+		.read_map = read_priomap,
+		.write_string = write_priomap,
+	},
+};
+
+static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
+}
+
+static int netprio_device_event(struct notifier_block *unused,
+				unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct netprio_map *old;
+	u32 max_len = atomic_read(&max_prioidx);
+
+	/*
+	 * Note this is called with rtnl_lock held so we have update side
+	 * protection on our rcu assignments
+	 */
+
+	switch (event) {
+
+	case NETDEV_REGISTER:
+		if (max_len)
+			extend_netdev_table(dev, max_len);
+		break;
+	case NETDEV_UNREGISTER:
+		old = rtnl_dereference(dev->priomap);
+		rcu_assign_pointer(dev->priomap, NULL);
+		if (old)
+			kfree_rcu(old, rcu);
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block netprio_device_notifier = {
+	.notifier_call = netprio_device_event
+};
+
+static int __init init_cgroup_netprio(void)
+{
+	int ret;
+
+	ret = cgroup_load_subsys(&net_prio_subsys);
+	if (ret)
+		goto out;
+#ifndef CONFIG_NETPRIO_CGROUP
+	smp_wmb();
+	net_prio_subsys_id = net_prio_subsys.subsys_id;
+#endif
+
+	register_netdevice_notifier(&netprio_device_notifier);
+
+out:
+	return ret;
+}
+
+static void __exit exit_cgroup_netprio(void)
+{
+	struct netprio_map *old;
+	struct net_device *dev;
+
+	unregister_netdevice_notifier(&netprio_device_notifier);
+
+	cgroup_unload_subsys(&net_prio_subsys);
+
+#ifndef CONFIG_NETPRIO_CGROUP
+	net_prio_subsys_id = -1;
+	synchronize_rcu();
+#endif
+
+	rtnl_lock();
+	for_each_netdev(&init_net, dev) {
+		old = rtnl_dereference(dev->priomap);
+		rcu_assign_pointer(dev->priomap, NULL);
+		if (old)
+			kfree_rcu(old, rcu);
+	}
+	rtnl_unlock();
+}
+
+module_init(init_cgroup_netprio);
+module_exit(exit_cgroup_netprio);
+MODULE_LICENSE("GPL v2");
diff --git a/net/core/sock.c b/net/core/sock.c
index 5a08762..77a4888 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -125,6 +125,7 @@
 #include <net/xfrm.h>
 #include <linux/ipsec.h>
 #include <net/cls_cgroup.h>
+#include <net/netprio_cgroup.h>
 
 #include <linux/filter.h>
 
@@ -221,10 +222,16 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 EXPORT_SYMBOL(sysctl_optmem_max);
 
-#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP)
+#if defined(CONFIG_CGROUPS)
+#if !defined(CONFIG_NET_CLS_CGROUP)
 int net_cls_subsys_id = -1;
 EXPORT_SYMBOL_GPL(net_cls_subsys_id);
 #endif
+#if !defined(CONFIG_NETPRIO_CGROUP)
+int net_prio_subsys_id = -1;
+EXPORT_SYMBOL_GPL(net_prio_subsys_id);
+#endif
+#endif
 
 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 {
@@ -1111,6 +1118,18 @@ void sock_update_classid(struct sock *sk)
 		sk->sk_classid = classid;
 }
 EXPORT_SYMBOL(sock_update_classid);
+
+void sock_update_netprioidx(struct sock *sk)
+{
+	struct cgroup_netprio_state *state;
+	if (in_interrupt())
+		return;
+	rcu_read_lock();
+	state = task_netprio_state(current);
+	sk->sk_cgrp_prioidx = state ? state->prioidx : 0;
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(sock_update_netprioidx);
 #endif
 
 /**
@@ -1138,6 +1157,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		atomic_set(&sk->sk_wmem_alloc, 1);
 
 		sock_update_classid(sk);
+		sock_update_netprioidx(sk);
 	}
 
 	return sk;
diff --git a/net/socket.c b/net/socket.c
index 2877647..108716f 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -549,6 +549,8 @@ static inline int __sock_sendmsg_nosec(struct kiocb *iocb, struct socket *sock,
 
 	sock_update_classid(sock->sk);
 
+	sock_update_netprioidx(sock->sk);
+
 	si->sock = sock;
 	si->scm = NULL;
 	si->msg = msg;
-- 
1.7.6.4

^ permalink raw reply related

* [PATCH 0/2] net: Add network priority cgroup (v4)
From: Neil Horman @ 2011-11-22 15:10 UTC (permalink / raw)
  To: netdev; +Cc: Neil Horman, John Fastabend, Robert Love, David S. Miller
In-Reply-To: <1321476666-8225-1-git-send-email-nhorman@tuxdriver.com>


Data Center Bridging environments are currently somewhat limited in their
ability to provide a general mechanism for controlling traffic priority.
Specifically they are unable to administratively control the priority at which
various types of network traffic are sent.
 
Currently, the only ways to set the priority of a network buffer are:

1) Through the use of the SO_PRIORITY socket option
2) By using low level hooks, like a tc action

(1) is difficult from an administrative perspective because it requires the
application to be coded to not just assume the default priority is sufficient,
and to expose an administrative interface to allow priority adjustment.  Such
a solution is not scalable in a DCB environment.

(2) is also difficult, as it requires constant administrative oversight of
applications so as to build appropriate rules to match traffic belonging to
various classes, so that priority can be appropriately set. It is further
limiting when DCB enabled hardware is in use, due to the fact that tc rules are
only run after a root qdisc has been selected (DCB enabled hardware may reserve
hw queues for various traffic classes and needs the priority to be set prior to
selecting the root qdisc)


I've discussed various solutions with John Fastabend, and we saw a cgroup as
being a good general solution to this problem.  The network priority cgroup
allows for a per-interface priority map to be built per cgroup.  Any traffic
originating from an application in a cgroup, that does not explicitly set its
priority with SO_PRIORITY will have its priority assigned to the value
designated for that group on that interface.  This allows a user space daemon,
when conducting LLDP negotiation with a DCB enabled peer to create a cgroup
based on the APP_TLV value received and administratively assign applications to
that priority using the existing cgroup utility infrastructure.

Tested by John and myself, with good results

(v2)
Based on reviews from John F., Amerigo Wang and Neerav Parikh, I've cleaned up
the rcu locking, fixed a memory leak in an error path, and corrected some typos.

(v3)
Converted rcu_dereference to rtnl_dereference where appropriate as per request
from John F.

(v4)
Cleaned up some spacing issues, and optimized the skb_update_priority path as
per request from Dave M.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
CC: Robert Love <robert.w.love@intel.com>
CC: "David S. Miller" <davem@davemloft.net>

^ permalink raw reply

* Re: [PATCH] xfrm: optimize ipv4 selector matching
From: Eric Dumazet @ 2011-11-22 15:10 UTC (permalink / raw)
  To: Alexey Dobriyan; +Cc: David Laight, davem, netdev
In-Reply-To: <20111122145940.GA24909@p183.telecom.by>

Le mardi 22 novembre 2011 à 17:59 +0300, Alexey Dobriyan a écrit :
> On Tue, Nov 22, 2011 at 02:50:59PM -0000, David Laight wrote:
> >  
> > > +static inline int addr4_match(const __be32 a1, const __be32 
> > > a2, const u8 prefixlen)
> > > +{
> > > +	/* C99 6.5.7 (3): u32 << 32 is undefined behaviour */
> > > +	if (prefixlen == 0) {
> > > +		/* Matching constants result in smaller assembly. */
> > > +		return 0xFFFFFFFFu;
> > > +	}
> > > +	return !((a1 ^ a2) & htonl(0xFFFFFFFFu << (32 - prefixlen)));
> > > +}
> > > +
> > 
> > It would probably be clearer to 'return 1' when prefixlen is zero.
> 
> "return 1" results in bigger code.
> This function used only in boolean context, so exact return value doesn't matter.


That's too ugly, sorry.

Also, using "const" attributes on integral types (not pointers) makes
code less clear. Caller doesn't care if the implementation wants to
change a1 or a2 or prefixlen.

Please use :

+static inline bool addr4_match(__be32 a1, __be32 a2, u8 prefixlen)
+{
+       /* C99 6.5.7 (3): u32 << 32 is undefined behaviour */
+       if (prefixlen == 0)
+               return true;
+       return !((a1 ^ a2) & htonl(0xFFFFFFFFu << (32 - prefixlen)));
+}

^ permalink raw reply

* Re: WARNING: at mm/slub.c:3357, kernel BUG at mm/slub.c:3413
From: Christoph Lameter @ 2011-11-22 15:07 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Markus Trippelsdorf, Christian Kujau, Benjamin Herrenschmidt,
	Alex,Shi, linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	Pekka Enberg, Matt Mackall, netdev@vger.kernel.org, Tejun Heo
In-Reply-To: <alpine.DEB.2.00.1111220900330.25785@router.home>

On Tue, 22 Nov 2011, Christoph Lameter wrote:

> Not sure what the solution is but the simplest would be to disable
> validation if lockless is active.

That is already being done so you must have run the validation with
debugging on.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: WARNING: at mm/slub.c:3357, kernel BUG at mm/slub.c:3413
From: Christoph Lameter @ 2011-11-22 15:02 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Markus Trippelsdorf, Christian Kujau, Benjamin Herrenschmidt,
	Alex,Shi, linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	Pekka Enberg, Matt Mackall, netdev@vger.kernel.org, Tejun Heo
In-Reply-To: <1321973567.2474.17.camel@edumazet-HP-Compaq-6005-Pro-SFF-PC>

[-- Attachment #1: Type: TEXT/PLAIN, Size: 882 bytes --]

On Tue, 22 Nov 2011, Eric Dumazet wrote:

> Le mardi 22 novembre 2011 à 08:46 -0600, Christoph Lameter a écrit :
> > On Tue, 22 Nov 2011, Eric Dumazet wrote:
> >
> > > I trigger a bug in less than 10 secondes, with this running while a
> > > "make -j16 " kernel build is run.
> > >
> > > while :; do slabinfo -v; done
> >
> > Hmm.... Yeah there was limited testing on the debug features. There could
> > be something amiss there. Need to review the validation interactions with
> > the new code.
> >
> >
>
> Hmm... yes, we probably should take a look.

All right, there is an obvious issue if debugging is off since the lockless
paths do not take the page lock in that case but the validation logic
does. Did you run the test with debugging on or off?

Not sure what the solution is but the simplest would be to disable
validation if lockless is active.



^ permalink raw reply

* Re: [GIT PULL] Support for Cadence GEM in the MACB driver
From: Jean-Christophe PLAGNIOL-VILLARD @ 2011-11-22 14:59 UTC (permalink / raw)
  To: Jamie Iles; +Cc: Arnd Bergmann, Nicolas Ferre, netdev, linux-arm-kernel, davem
In-Reply-To: <20111122102039.GB8872@totoro>

On 10:20 Tue 22 Nov     , Jamie Iles wrote:
> Hi Jean-Christophe,
> 
> On Tue, Nov 22, 2011 at 11:00:32AM +0100, Jean-Christophe PLAGNIOL-VILLARD wrote:
> > On 10:00 Mon 21 Nov     , Jamie Iles wrote:
> > > Hi Arnd,
> > > 
> > > Please consider pulling the patches to add support for Cadence GEM to 
> > > the MACB driver.  These have been ready to go for a little while but got 
> > > held up with the rename of Ethernet drivers in the last merge window.  
> > > It would be great if we can get some exposure in -next for a little 
> > > while before the next merge window.
> > > 
> > > Thanks,
> > > 
> > > Jamie
> > > 
> > > The following changes since commit cfcfc9eca2bcbd26a8e206baeb005b055dbf8e37:
> > > 
> > >   Linux 3.2-rc2 (2011-11-15 15:02:59 -0200)
> > > 
> > > are available in the git repository at:
> > >   git://github.com/jamieiles/linux-2.6-ji.git macb-gem
> > Arnd please wait
> > 
> > please update his patch
> > 
> > "at91: provide macb clks with "pclk" and "hclk" name"
> > 
> > with the version I send to the ML
> 
> Is that a patch that you've already posted or will be posting?
http://patchwork.ozlabs.org/patch/109265/

but please specify the device too

Best Regards,
J.

^ permalink raw reply

* Re: [PATCH] xfrm: optimize ipv4 selector matching
From: Alexey Dobriyan @ 2011-11-22 14:59 UTC (permalink / raw)
  To: David Laight; +Cc: davem, netdev
In-Reply-To: <AE90C24D6B3A694183C094C60CF0A2F6D8AED7@saturn3.aculab.com>

On Tue, Nov 22, 2011 at 02:50:59PM -0000, David Laight wrote:
>  
> > +static inline int addr4_match(const __be32 a1, const __be32 
> > a2, const u8 prefixlen)
> > +{
> > +	/* C99 6.5.7 (3): u32 << 32 is undefined behaviour */
> > +	if (prefixlen == 0) {
> > +		/* Matching constants result in smaller assembly. */
> > +		return 0xFFFFFFFFu;
> > +	}
> > +	return !((a1 ^ a2) & htonl(0xFFFFFFFFu << (32 - prefixlen)));
> > +}
> > +
> 
> It would probably be clearer to 'return 1' when prefixlen is zero.

"return 1" results in bigger code.
This function is used only in boolean context, so the exact return value doesn't matter.

> If this is a common path, might be worth caching
>     htonl(0xFFFFFFFFu << (32 - prefixlen))
> in the enclosing structure.

This means one more branch, which wouldn't be a win compared
to current code.

^ permalink raw reply

* RE: [PATCH] xfrm: optimize ipv4 selector matching
From: David Laight @ 2011-11-22 14:50 UTC (permalink / raw)
  To: Alexey Dobriyan, davem; +Cc: netdev
In-Reply-To: <20111122144334.GA22824@p183.telecom.by>

 
> +static inline int addr4_match(const __be32 a1, const __be32 
> a2, const u8 prefixlen)
> +{
> +	/* C99 6.5.7 (3): u32 << 32 is undefined behaviour */
> +	if (prefixlen == 0) {
> +		/* Matching constants result in smaller assembly. */
> +		return 0xFFFFFFFFu;
> +	}
> +	return !((a1 ^ a2) & htonl(0xFFFFFFFFu << (32 - prefixlen)));
> +}
> +

It would probably be clearer to 'return 1' when prefixlen is zero.

If this is a common path, might be worth caching
    htonl(0xFFFFFFFFu << (32 - prefixlen))
in the enclosing structure.

	David

^ permalink raw reply

* Re: WARNING: at mm/slub.c:3357, kernel BUG at mm/slub.c:3413
From: Eric Dumazet @ 2011-11-22 14:52 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Markus Trippelsdorf, Christian Kujau, Benjamin Herrenschmidt,
	Alex,Shi, linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	Pekka Enberg, Matt Mackall, netdev@vger.kernel.org, Tejun Heo
In-Reply-To: <alpine.DEB.2.00.1111220844400.25785@router.home>

Le mardi 22 novembre 2011 à 08:46 -0600, Christoph Lameter a écrit :
> On Tue, 22 Nov 2011, Eric Dumazet wrote:
> 
> > I trigger a bug in less than 10 secondes, with this running while a
> > "make -j16 " kernel build is run.
> >
> > while :; do slabinfo -v; done
> 
> Hmm.... Yeah there was limited testing on the debug features. There could
> be something amiss there. Need to review the validation interactions with
> the new code.
> 
> 

Hmm... yes, we probably should take a look.



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: WARNING: at mm/slub.c:3357, kernel BUG at mm/slub.c:3413
From: Christoph Lameter @ 2011-11-22 14:50 UTC (permalink / raw)
  To: Markus Trippelsdorf
  Cc: Eric Dumazet, Christian Kujau, Benjamin Herrenschmidt, Alex,Shi,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org, Pekka Enberg,
	Matt Mackall, netdev@vger.kernel.org, Tejun Heo
In-Reply-To: <20111122112107.GA1675@x4.trippels.de>

On Tue, 22 Nov 2011, Markus Trippelsdorf wrote:

> > Given slub is now lockless, validate_slab_slab() is probably very wrong
> > these days.
>
> OK "slabinfo -v" is useless then. But that doesn't invalidate the BUGs
> that I saw during boot. They happend before I could even run slabinfo
> for the first time.

Correct. Also the use of debugging disables the use of cmpxchg_double()
but not this_cpu_cmpxchg() use. See cmpxchg_double_slab() and
kmem_cache_open()s determination of the __CMPXCHG_DOUBLE flag.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: WARNING: at mm/slub.c:3357, kernel BUG at mm/slub.c:3413
From: Christoph Lameter @ 2011-11-22 14:46 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Markus Trippelsdorf, Christian Kujau, Benjamin Herrenschmidt,
	Alex,Shi, linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	Pekka Enberg, Matt Mackall, netdev@vger.kernel.org, Tejun Heo
In-Reply-To: <1321955185.2474.6.camel@edumazet-HP-Compaq-6005-Pro-SFF-PC>

On Tue, 22 Nov 2011, Eric Dumazet wrote:

> I trigger a bug in less than 10 secondes, with this running while a
> "make -j16 " kernel build is run.
>
> while :; do slabinfo -v; done

Hmm.... Yeah there was limited testing on the debug features. There could
be something amiss there. Need to review the validation interactions with
the new code.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: WARNING: at mm/slub.c:3357, kernel BUG at mm/slub.c:3413
From: Christoph Lameter @ 2011-11-22 14:44 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Markus Trippelsdorf, Christian Kujau, Benjamin Herrenschmidt,
	Alex,Shi, linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	Pekka Enberg, Matt Mackall, netdev@vger.kernel.org, Tejun Heo
In-Reply-To: <1321954729.2474.4.camel@edumazet-HP-Compaq-6005-Pro-SFF-PC>

On Tue, 22 Nov 2011, Eric Dumazet wrote:

> Given slub is now lockless, validate_slab_slab() is probably very wrong
> these days.

Debugging reenables the use of the page lock.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH] xfrm: optimize ipv4 selector matching
From: Alexey Dobriyan @ 2011-11-22 14:43 UTC (permalink / raw)
  To: davem; +Cc: netdev

Current addr_match() is errh, under-optimized.

Compiler doesn't know that memcmp() branch doesn't trigger for IPv4.
Pass addresses by value.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---

 include/net/xfrm.h     |   10 ++++++++++
 net/xfrm/xfrm_policy.c |    4 ++--
 2 files changed, 12 insertions(+), 2 deletions(-)

--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -827,6 +827,16 @@ static inline bool addr_match(const void *token1, const void *token2,
 	return true;
 }
 
+static inline int addr4_match(const __be32 a1, const __be32 a2, const u8 prefixlen)
+{
+	/* C99 6.5.7 (3): u32 << 32 is undefined behaviour */
+	if (prefixlen == 0) {
+		/* Matching constants result in smaller assembly. */
+		return 0xFFFFFFFFu;
+	}
+	return !((a1 ^ a2) & htonl(0xFFFFFFFFu << (32 - prefixlen)));
+}
+
 static __inline__
 __be16 xfrm_flowi_sport(const struct flowi *fl, const union flowi_uli *uli)
 {
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -61,8 +61,8 @@ __xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
 {
 	const struct flowi4 *fl4 = &fl->u.ip4;
 
-	return  addr_match(&fl4->daddr, &sel->daddr, sel->prefixlen_d) &&
-		addr_match(&fl4->saddr, &sel->saddr, sel->prefixlen_s) &&
+	return  addr4_match(fl4->daddr, sel->daddr.a4, sel->prefixlen_d) &&
+		addr4_match(fl4->saddr, sel->saddr.a4, sel->prefixlen_s) &&
 		!((xfrm_flowi_dport(fl, &fl4->uli) ^ sel->dport) & sel->dport_mask) &&
 		!((xfrm_flowi_sport(fl, &fl4->uli) ^ sel->sport) & sel->sport_mask) &&
 		(fl4->flowi4_proto == sel->proto || !sel->proto) &&

^ permalink raw reply

* Re: MPLS for Linux kernel
From: Jorge Boncompte [DTI2] @ 2011-11-22 14:35 UTC (permalink / raw)
  To: netdev
In-Reply-To: <4ECBB2C4.9000505@dti2.net>

El 22/11/2011 15:33, Jorge Boncompte [DTI2] escribió:
> 
> 	(Dropped Dave from the cc: list)

	That should have been David, sorry.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox