* [PATCH 1/6] SUNRPC: handle RPC client pipefs dentries by network namespace aware routines
From: Stanislav Kinsbursky @ 2011-11-23 11:51 UTC (permalink / raw)
To: bfields, Trond.Myklebust
Cc: linux-nfs, xemul, neilb, netdev, linux-kernel, jbottomley, davem,
devel
In-Reply-To: <20111123104945.11077.10270.stgit@localhost6.localdomain6>
This patch makes RPC client PipeFS dentries get allocated in the context of the
client's owning network namespace.
RPC client pipefs dentries creation logic has been changed:
1) Creation of PipeFS dentries by sb was moved to a separate function, which will
be used for handling PipeFS mount notifications.
2) The initial value of the RPC client PipeFS dir dentry is now set to NULL.
RPC client pipefs dentries cleanup logic has been changed:
1) Cleanup is now done in a separate rpc_clnt_remove_pipedir() function, which
takes care of pipefs superblock locking.
Also, this patch removes the leading slashes from cb_program.pipe_dir_name and from
NFS_PIPE_DIRNAME to make rpc_d_lookup_sb() work. This doesn't affect
vfs_path_lookup() results in nfs4blocklayout_init(), since this slash is cut
off anyway in link_path_walk().
Signed-off-by: Stanislav Kinsbursky <skinsbursky@parallels.com>
---
fs/nfsd/nfs4callback.c | 2 +
include/linux/nfs.h | 2 +
net/sunrpc/clnt.c | 93 +++++++++++++++++++++++++++++++-----------------
3 files changed, 62 insertions(+), 35 deletions(-)
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 02eb4ed..1ac6f55 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -618,7 +618,7 @@ static struct rpc_program cb_program = {
.nrvers = ARRAY_SIZE(nfs_cb_version),
.version = nfs_cb_version,
.stats = &cb_stats,
- .pipe_dir_name = "/nfsd4_cb",
+ .pipe_dir_name = "nfsd4_cb",
};
static int max_cb_time(void)
diff --git a/include/linux/nfs.h b/include/linux/nfs.h
index 8c6ee44..6d1fb63 100644
--- a/include/linux/nfs.h
+++ b/include/linux/nfs.h
@@ -29,7 +29,7 @@
#define NFS_MNT_VERSION 1
#define NFS_MNT3_VERSION 3
-#define NFS_PIPE_DIRNAME "/nfs"
+#define NFS_PIPE_DIRNAME "nfs"
/*
* NFS stats. The good thing with these values is that NFSv3 errors are
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index c5347d2..008c755 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -93,52 +93,85 @@ static void rpc_unregister_client(struct rpc_clnt *clnt)
spin_unlock(&rpc_client_lock);
}
-static int
-rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name)
+static void __rpc_clnt_remove_pipedir(struct rpc_clnt *clnt)
+{
+ if (clnt->cl_path.dentry)
+ rpc_remove_client_dir(clnt->cl_path.dentry);
+ clnt->cl_path.dentry = NULL;
+}
+
+static void rpc_clnt_remove_pipedir(struct rpc_clnt *clnt)
+{
+ struct super_block *pipefs_sb;
+
+ pipefs_sb = rpc_get_sb_net(clnt->cl_xprt->xprt_net);
+ if (pipefs_sb) {
+ __rpc_clnt_remove_pipedir(clnt);
+ rpc_put_sb_net(clnt->cl_xprt->xprt_net);
+ }
+ rpc_put_mount();
+}
+
+static struct dentry *rpc_setup_pipedir_sb(struct super_block *sb,
+ struct rpc_clnt *clnt, char *dir_name)
{
static uint32_t clntid;
- struct path path, dir;
char name[15];
struct qstr q = {
.name = name,
};
+ struct dentry *dir, *dentry;
int error;
- clnt->cl_path.mnt = ERR_PTR(-ENOENT);
- clnt->cl_path.dentry = ERR_PTR(-ENOENT);
- if (dir_name == NULL)
- return 0;
-
- path.mnt = rpc_get_mount();
- if (IS_ERR(path.mnt))
- return PTR_ERR(path.mnt);
- error = vfs_path_lookup(path.mnt->mnt_root, path.mnt, dir_name, 0, &dir);
- if (error)
- goto err;
-
+ dir = rpc_d_lookup_sb(sb, dir_name);
+ if (dir == NULL)
+ return dir;
for (;;) {
q.len = snprintf(name, sizeof(name), "clnt%x", (unsigned int)clntid++);
name[sizeof(name) - 1] = '\0';
q.hash = full_name_hash(q.name, q.len);
- path.dentry = rpc_create_client_dir(dir.dentry, &q, clnt);
- if (!IS_ERR(path.dentry))
+ dentry = rpc_create_client_dir(dir, &q, clnt);
+ if (!IS_ERR(dentry))
break;
- error = PTR_ERR(path.dentry);
+ error = PTR_ERR(dentry);
if (error != -EEXIST) {
printk(KERN_INFO "RPC: Couldn't create pipefs entry"
" %s/%s, error %d\n",
dir_name, name, error);
- goto err_path_put;
+ break;
}
}
- path_put(&dir);
+ dput(dir);
+ return dentry;
+}
+
+static int
+rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name)
+{
+ struct super_block *pipefs_sb;
+ struct path path;
+
+ clnt->cl_path.mnt = ERR_PTR(-ENOENT);
+ clnt->cl_path.dentry = NULL;
+ if (dir_name == NULL)
+ return 0;
+
+ path.mnt = rpc_get_mount();
+ if (IS_ERR(path.mnt))
+ return PTR_ERR(path.mnt);
+ pipefs_sb = rpc_get_sb_net(clnt->cl_xprt->xprt_net);
+ if (!pipefs_sb) {
+ rpc_put_mount();
+ return -ENOENT;
+ }
+ path.dentry = rpc_setup_pipedir_sb(pipefs_sb, clnt, dir_name);
+ rpc_put_sb_net(clnt->cl_xprt->xprt_net);
+ if (IS_ERR(path.dentry)) {
+ rpc_put_mount();
+ return PTR_ERR(path.dentry);
+ }
clnt->cl_path = path;
return 0;
-err_path_put:
- path_put(&dir);
-err:
- rpc_put_mount();
- return error;
}
static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, struct rpc_xprt *xprt)
@@ -246,10 +279,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
return clnt;
out_no_auth:
- if (!IS_ERR(clnt->cl_path.dentry)) {
- rpc_remove_client_dir(clnt->cl_path.dentry);
- rpc_put_mount();
- }
+ rpc_clnt_remove_pipedir(clnt);
out_no_path:
kfree(clnt->cl_principal);
out_no_principal:
@@ -474,10 +504,7 @@ rpc_free_client(struct rpc_clnt *clnt)
{
dprintk("RPC: destroying %s client for %s\n",
clnt->cl_protname, clnt->cl_server);
- if (!IS_ERR(clnt->cl_path.dentry)) {
- rpc_remove_client_dir(clnt->cl_path.dentry);
- rpc_put_mount();
- }
+ rpc_clnt_remove_pipedir(clnt);
if (clnt->cl_parent != clnt) {
rpc_release_client(clnt->cl_parent);
goto out_free;
^ permalink raw reply related
* [PATCH 6/6] SUNRPC: remove RPC PipeFS mount point reference from RPC client
From: Stanislav Kinsbursky @ 2011-11-23 11:52 UTC (permalink / raw)
To: bfields, Trond.Myklebust
Cc: linux-nfs, xemul, neilb, netdev, linux-kernel, jbottomley, davem,
devel
In-Reply-To: <20111123104945.11077.10270.stgit@localhost6.localdomain6>
This is a cleanup patch. We don't need this reference anymore.
Signed-off-by: Stanislav Kinsbursky <skinsbursky@parallels.com>
---
fs/nfs/idmap.c | 4 ++--
include/linux/sunrpc/clnt.h | 2 +-
net/sunrpc/auth_gss/auth_gss.c | 8 ++++----
net/sunrpc/clnt.c | 21 ++++++++++-----------
4 files changed, 17 insertions(+), 18 deletions(-)
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index b09a7f1..60698a1 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -370,8 +370,8 @@ nfs_idmap_new(struct nfs_client *clp)
return error;
}
- if (clp->cl_rpcclient->cl_path.dentry)
- pipe->dentry = rpc_mkpipe_dentry(clp->cl_rpcclient->cl_path.dentry,
+ if (clp->cl_rpcclient->cl_dentry)
+ pipe->dentry = rpc_mkpipe_dentry(clp->cl_rpcclient->cl_dentry,
"idmap", idmap, pipe);
if (IS_ERR(pipe->dentry)) {
error = PTR_ERR(pipe->dentry);
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index db7bcaf..9fe39bc 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -56,7 +56,7 @@ struct rpc_clnt {
int cl_nodelen; /* nodename length */
char cl_nodename[UNX_MAXNODENAME];
- struct path cl_path;
+ struct dentry * cl_dentry;
struct rpc_clnt * cl_parent; /* Points to parent of clones */
struct rpc_rtt cl_rtt_default;
struct rpc_timeout cl_timeout_default;
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 66293dc..8673220 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -799,12 +799,12 @@ static int gss_pipes_dentries_create(struct rpc_auth *auth)
gss_auth = container_of(auth, struct gss_auth, rpc_auth);
clnt = gss_auth->client;
- gss_auth->pipe[1]->dentry = rpc_mkpipe_dentry(clnt->cl_path.dentry,
+ gss_auth->pipe[1]->dentry = rpc_mkpipe_dentry(clnt->cl_dentry,
"gssd",
clnt, gss_auth->pipe[1]);
if (IS_ERR(gss_auth->pipe[1]->dentry))
return PTR_ERR(gss_auth->pipe[1]->dentry);
- gss_auth->pipe[0]->dentry = rpc_mkpipe_dentry(clnt->cl_path.dentry,
+ gss_auth->pipe[0]->dentry = rpc_mkpipe_dentry(clnt->cl_dentry,
gss_auth->mech->gm_name,
clnt, gss_auth->pipe[0]);
if (IS_ERR(gss_auth->pipe[0]->dentry)) {
@@ -826,7 +826,7 @@ static void gss_pipes_dentries_destroy_net(struct rpc_clnt *clnt,
sb = rpc_get_sb_net(net);
if (sb) {
- if (clnt->cl_path.dentry)
+ if (clnt->cl_dentry)
gss_pipes_dentries_destroy(auth);
rpc_put_sb_net(net);
}
@@ -841,7 +841,7 @@ static int gss_pipes_dentries_create_net(struct rpc_clnt *clnt,
sb = rpc_get_sb_net(net);
if (sb) {
- if (clnt->cl_path.dentry)
+ if (clnt->cl_dentry)
err = gss_pipes_dentries_create(auth);
rpc_put_sb_net(net);
}
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index eb2595f..3971aaa 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -95,12 +95,12 @@ static void rpc_unregister_client(struct rpc_clnt *clnt)
static void __rpc_clnt_remove_pipedir(struct rpc_clnt *clnt)
{
- if (clnt->cl_path.dentry) {
+ if (clnt->cl_dentry) {
if (clnt->cl_auth && clnt->cl_auth->au_ops->pipes_destroy)
clnt->cl_auth->au_ops->pipes_destroy(clnt->cl_auth);
- rpc_remove_client_dir(clnt->cl_path.dentry);
+ rpc_remove_client_dir(clnt->cl_dentry);
}
- clnt->cl_path.dentry = NULL;
+ clnt->cl_dentry = NULL;
}
static void rpc_clnt_remove_pipedir(struct rpc_clnt *clnt)
@@ -151,20 +151,19 @@ static int
rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name)
{
struct super_block *pipefs_sb;
- struct path path;
+ struct dentry *dentry;
- clnt->cl_path.mnt = ERR_PTR(-ENOENT);
- clnt->cl_path.dentry = NULL;
+ clnt->cl_dentry = NULL;
if (dir_name == NULL)
return 0;
pipefs_sb = rpc_get_sb_net(clnt->cl_xprt->xprt_net);
if (!pipefs_sb)
return 0;
- path.dentry = rpc_setup_pipedir_sb(pipefs_sb, clnt, dir_name);
+ dentry = rpc_setup_pipedir_sb(pipefs_sb, clnt, dir_name);
rpc_put_sb_net(clnt->cl_xprt->xprt_net);
- if (IS_ERR(path.dentry))
- return PTR_ERR(path.dentry);
- clnt->cl_path = path;
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ clnt->cl_dentry = dentry;
return 0;
}
@@ -183,7 +182,7 @@ static int __rpc_pipefs_event(struct rpc_clnt *clnt, unsigned long event,
BUG_ON(dentry == NULL);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- clnt->cl_path.dentry = dentry;
+ clnt->cl_dentry = dentry;
if (clnt->cl_auth->au_ops->pipes_create) {
err = clnt->cl_auth->au_ops->pipes_create(clnt->cl_auth);
if (err)
^ permalink raw reply related
* query : unregister/register netdev
From: manoj bm @ 2011-11-23 11:14 UTC (permalink / raw)
To: netdev
Hi Guys,
WRT below mail chain link, I am facing similar issue.
http://www.spinics.net/lists/netdev/msg180176.html
Please find the usage details below:
Situation:
I have concurrent interfaces running & active doing data
transactions. (ex: interface0 & interface1)
-> whenever user want to delete activity on one interface, i have to
stop activity & hide from user (this i am trying to achieve by
unregistering the netdev but not freeing it)
-> whenever user want to create activity on interface, i have to
start activity & make it visible to user (this i am trying to achieve
by registering the netdev which would have unregistered previously)
Issues
-> First interface(0) is basic so i have allocated private driver data
area to it & other interfaces contains basic info & pointing to this
"netdev[0]->net_priv" (i.e. first interface private data area).
Because of the above requirement I can't free netdev0 until the chip's
power supply is removed.
Other options thought of but not used are:
-> I can remove other interfaces using unregister_netdev followed by
free_netdev() BUT i am afraid that during the heavy data traffic
conditions(apps continuously pumping data with high bandwidth),
allocating & registering might causes the issue of registration fail
because of resource crunch. Which we never tolerate as this a
concurrent interface solution
-> After verification found that, even ethernet also works in similar
fashion. If chip is powered then irrespective of RJ45 connected or not
interface is given & visible to the user. That's the way i also
created all of my interfaces statically(during initialization)
depending on the user input i want to hide/unhide (i.e.
register/unregister netdev) corresponding netdevice visible using
ifconfig.
Currently i am facing a issue that bug_on from kernel
--------------------------------------------------------------------------
-> Depending on application input i have to unregister & register the
same netdev without freeing
I am facing the bug_on from kernel during registration as below
BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
-> If we forcefully reset the dev->reg_state to uninitialized in our
driver, then this works fine but kernel warns already registered
device & some thing seriously wrong here (i dont want to hack netdev
structures here :))
-> Then i have tried to initialize the device after unregistration by
calling ether_setup & then again registering BUT same bug_on appeared.
My requirement is quite simple,
-> Whenever user application want to "delete the netdev interface"
then i have to make the interface disappeared for ifconfig/iwconfig,
so that user can't play with that
-> Whenever user application want to "Add the netdev interface" then i
have to make the interface appearing for ifconfig/iwconfig, so that
user can use those
The problem is i don't want to call Alloc_netdev/free_netdev based on
application input (dynamically creating/destroying), i just want to
register/unregister statically created interfaces accordingly.
Please let me know your views on it. Thanks in advance
Thanks,
Manoj
^ permalink raw reply
* [PATCH] ipv6: tcp: fix panic in SYN processing
From: Eric Dumazet @ 2011-11-23 11:33 UTC (permalink / raw)
To: David Miller; +Cc: netdev
commit 72a3effaf633bc ([NET]: Size listen hash tables using backlog
hint) added a bug allowing inet6_synq_hash() to return an out of bound
array index, because of u16 overflow.
Bug can happen if system admins set net.core.somaxconn &
net.ipv4.tcp_max_syn_backlog sysctls to values greater than 65536
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
net/ipv6/inet6_connection_sock.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index fee46d5..1567fb1 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -85,7 +85,7 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk,
* request_sock (formerly open request) hash tables.
*/
static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
- const u32 rnd, const u16 synq_hsize)
+ const u32 rnd, const u32 synq_hsize)
{
u32 c;
^ permalink raw reply related
* Re: [PATCH 0/2] net: Add network priority cgroup (v4)
From: Neil Horman @ 2011-11-23 11:49 UTC (permalink / raw)
To: Kirill Smelkov; +Cc: David Miller, netdev, john.r.fastabend, robert.w.love
In-Reply-To: <20111123101933.GA10127@tugrik.mns.mnsspb.ru>
On Wed, Nov 23, 2011 at 02:19:34PM +0400, Kirill Smelkov wrote:
> On Tue, Nov 22, 2011 at 04:00:08PM -0500, Neil Horman wrote:
> > On Tue, Nov 22, 2011 at 03:45:16PM -0500, David Miller wrote:
> > > From: Neil Horman <nhorman@tuxdriver.com>
> > > Date: Tue, 22 Nov 2011 15:39:38 -0500
> > >
> > > > On Tue, Nov 22, 2011 at 03:23:15PM -0500, David Miller wrote:
> > > >> From: Neil Horman <nhorman@tuxdriver.com>
> > > >> Date: Tue, 22 Nov 2011 10:10:50 -0500
> > > >>
> > > >> > (v2)
> > > >> > Based on reviews from John F., Amerigo Wang and Neerav Parikh, I've cleaned up
> > > >> > the rcu locking, fixed a memory leak in an error path, and corrected some typos.
> > > >> >
> > > >> > (v3)
> > > >> > Converted rcu_dereference to rntl_dereference where appropriate as per request
> > > >> > from John F.
> > > >> >
> > > >> > (v4)
> > > >> > Cleaned up some spacing issues, and optimized the skb_update_priority path as
> > > >> > per request from Dave M.
> > > >> >
> > > >> > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> > > >> > Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> > > >> > CC: Robert Love <robert.w.love@intel.com>
> > > >> > CC: "David S. Miller" <davem@davemloft.net>
> > > >>
> > > >> Applied and going through some build testing now, thanks Neil.
> > > > Thanks Dave! I tested here building separately with:
> > > > CONFIG_CGROUPS=n
> > > > CONFIG_CROUPS=y && CONFIG_NETPRIO_CGROUP=y
> > > > CONFIG_CROUPS=y && CONFIG_NETPRIO_CGROUP=m
> > > >
> > > > So you should be square, but theres rarely any valid accounting for my ability
> > > > to screw something up :)
> > >
> > > I want to be absolutely sure, because I had to apply your changes by hand,
> > > in particular the linux/netdevice.h changes had line offsets of up to 67.
> > Really? Thats odd, I've got thse changes here applied to a local copy of the
> > net-next tree. I branched from commit 8b3408f8ee994973869d8ba32c5bf482bc4ddca4
> > and it applies cleanly. Perhaps something wen't in while I was developing this?
>
> The patch broke the build with CONFIG_CGROUPS=n (testing on today's
> net-next v3.2-rc2-714-g1f2149c):
>
> kirr@mini:~/src/linux/linux-net-next$ make
> CHK include/linux/version.h
> CHK include/generated/utsrelease.h
> CALL scripts/checksyscalls.sh
> CHK include/generated/compile.h
> CC init/do_mounts.o
> In file included from include/linux/netdevice.h:53,
> from include/linux/icmpv6.h:173,
> from include/linux/ipv6.h:220,
> from include/net/ipv6.h:16,
> from include/linux/sunrpc/clnt.h:26,
> from include/linux/nfs_fs.h:50,
> from init/do_mounts.c:20:
> include/net/netprio_cgroup.h:23: error: field ‘css’ has incomplete type
> scripts/Makefile.build:305: recipe for target `init/do_mounts.o' failed
> make[1]: *** [init/do_mounts.o] Error 1
> Makefile:945: recipe for target `init' failed
> make: *** [init] Error 2
>
>
> Config attached.
>
Damn, I apologize. I had tested with CONFIG_CGROUPS=n, but when I did a make
oldconfig, since I had cgroup subsystems enabled, it re-selected CONFIG_CGROUPS,
instead of squashing the subsystems. I'll fix this immediately.
Neil
^ permalink raw reply
* [PATCH 0/6] SUNRPC: make RPC clients use network-namespace-aware PipeFS routines
From: Stanislav Kinsbursky @ 2011-11-23 11:51 UTC (permalink / raw)
To: bfields, Trond.Myklebust
Cc: linux-nfs, xemul, neilb, netdev, linux-kernel, jbottomley, davem,
devel
This patch set was created in context of clone of git
branch: git://git.linux-nfs.org/projects/trondmy/nfs-2.6.git.
tag: v3.1
This patch set depends on previous patch sets titled:
1) "SUNRPC: initial part of making pipefs work in net ns"
2) "SUNPRC: cleanup PipeFS for network-namespace-aware users"
This patch set is the first part of reworking SUNRPC PipeFS users.
It makes SUNRPC clients use PipeFS notifications for creating directory and GSS
pipe dentries. With this patch set, the RPC client and GSS auth creation
routines don't force SUNRPC PipeFS mount point creation, which actually means
that they can now work without PipeFS dentries.
The following series consists of:
---
Stanislav Kinsbursky (6):
SUNRPC: handle RPC client pipefs dentries by network namespace aware routines
SUNRPC: handle GSS AUTH pipes by network namespace aware routines
SUNRPC: subscribe RPC clients to pipefs notifications
SUNRPC: remove RPC client pipefs dentries after unregister
SUNRPC: remove RPC pipefs mount point manipulations from RPC clients code
SUNRPC: remove RPC PipeFS mount point reference from RPC client
fs/nfs/idmap.c | 4 +
fs/nfsd/nfs4callback.c | 2 -
include/linux/nfs.h | 2 -
include/linux/sunrpc/auth.h | 2 +
include/linux/sunrpc/clnt.h | 2 -
net/sunrpc/auth_gss/auth_gss.c | 101 +++++++++++++++++++++------
net/sunrpc/clnt.c | 151 +++++++++++++++++++++++++++++++---------
net/sunrpc/rpc_pipe.c | 19 +++--
net/sunrpc/sunrpc.h | 2 +
9 files changed, 218 insertions(+), 67 deletions(-)
--
Signature
^ permalink raw reply
* [PATCH 2/6] SUNRPC: handle GSS AUTH pipes by network namespace aware routines
From: Stanislav Kinsbursky @ 2011-11-23 11:51 UTC (permalink / raw)
To: bfields, Trond.Myklebust
Cc: linux-nfs, xemul, neilb, netdev, linux-kernel, jbottomley, davem,
devel
In-Reply-To: <20111123104945.11077.10270.stgit@localhost6.localdomain6>
This patch makes RPC GSS PipeFS pipes get allocated in the network namespace
context of their owning RPC client.
Pipe creation and destruction are now done in separate functions, which take
care of PipeFS superblock locking.
Signed-off-by: Stanislav Kinsbursky <skinsbursky@parallels.com>
---
net/sunrpc/auth_gss/auth_gss.c | 95 +++++++++++++++++++++++++++++++---------
1 files changed, 73 insertions(+), 22 deletions(-)
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 2b25a7b..248acd0 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -779,6 +779,73 @@ gss_pipe_destroy_msg(struct rpc_pipe_msg *msg)
}
}
+static void gss_pipes_dentries_destroy(struct rpc_auth *auth)
+{
+ struct gss_auth *gss_auth;
+
+ gss_auth = container_of(auth, struct gss_auth, rpc_auth);
+ rpc_unlink(gss_auth->pipe[0]->dentry);
+ rpc_unlink(gss_auth->pipe[1]->dentry);
+}
+
+static int gss_pipes_dentries_create(struct rpc_auth *auth)
+{
+ int err;
+ struct gss_auth *gss_auth;
+ struct rpc_clnt *clnt;
+
+ gss_auth = container_of(auth, struct gss_auth, rpc_auth);
+ clnt = gss_auth->client;
+
+ gss_auth->pipe[1]->dentry = rpc_mkpipe_dentry(clnt->cl_path.dentry,
+ "gssd",
+ clnt, gss_auth->pipe[1]);
+ if (IS_ERR(gss_auth->pipe[1]->dentry))
+ return PTR_ERR(gss_auth->pipe[1]->dentry);
+ gss_auth->pipe[0]->dentry = rpc_mkpipe_dentry(clnt->cl_path.dentry,
+ gss_auth->mech->gm_name,
+ clnt, gss_auth->pipe[0]);
+ if (IS_ERR(gss_auth->pipe[0]->dentry)) {
+ err = PTR_ERR(gss_auth->pipe[0]->dentry);
+ goto err_unlink_pipe_1;
+ }
+ return 0;
+
+err_unlink_pipe_1:
+ rpc_unlink(gss_auth->pipe[1]->dentry);
+ return err;
+}
+
+static void gss_pipes_dentries_destroy_net(struct rpc_clnt *clnt,
+ struct rpc_auth *auth)
+{
+ struct net *net = clnt->cl_xprt->xprt_net;
+ struct super_block *sb;
+
+ sb = rpc_get_sb_net(net);
+ if (sb) {
+ if (clnt->cl_path.dentry)
+ gss_pipes_dentries_destroy(auth);
+ rpc_put_sb_net(net);
+ }
+}
+
+static int gss_pipes_dentries_create_net(struct rpc_clnt *clnt,
+ struct rpc_auth *auth)
+{
+ struct net *net = clnt->cl_xprt->xprt_net;
+ struct super_block *sb;
+ int err = 0;
+
+ sb = rpc_get_sb_net(net);
+ if (sb) {
+ if (clnt->cl_path.dentry)
+ err = gss_pipes_dentries_create(auth);
+ rpc_put_sb_net(net);
+ }
+ return err;
+}
+
/*
* NOTE: we have the opportunity to use different
* parameters based on the input flavor (which must be a pseudoflavor)
@@ -834,31 +901,16 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
err = PTR_ERR(gss_auth->pipe[0]);
goto err_destroy_pipe_1;
}
-
- gss_auth->pipe[1]->dentry = rpc_mkpipe_dentry(clnt->cl_path.dentry,
- "gssd",
- clnt, gss_auth->pipe[1]);
- if (IS_ERR(gss_auth->pipe[1]->dentry)) {
- err = PTR_ERR(gss_auth->pipe[1]->dentry);
+ err = gss_pipes_dentries_create_net(clnt, auth);
+ if (err)
goto err_destroy_pipe_0;
- }
-
- gss_auth->pipe[0]->dentry = rpc_mkpipe_dentry(clnt->cl_path.dentry,
- gss_auth->mech->gm_name,
- clnt, gss_auth->pipe[0]);
- if (IS_ERR(gss_auth->pipe[0]->dentry)) {
- err = PTR_ERR(gss_auth->pipe[0]->dentry);
- goto err_unlink_pipe_1;
- }
err = rpcauth_init_credcache(auth);
if (err)
- goto err_unlink_pipe_0;
+ goto err_unlink_pipes;
return auth;
-err_unlink_pipe_0:
- rpc_unlink(gss_auth->pipe[0]->dentry);
-err_unlink_pipe_1:
- rpc_unlink(gss_auth->pipe[1]->dentry);
+err_unlink_pipes:
+ gss_pipes_dentries_destroy_net(clnt, auth);
err_destroy_pipe_0:
rpc_destroy_pipe_data(gss_auth->pipe[0]);
err_destroy_pipe_1:
@@ -875,8 +927,7 @@ out_dec:
static void
gss_free(struct gss_auth *gss_auth)
{
- rpc_unlink(gss_auth->pipe[0]->dentry);
- rpc_unlink(gss_auth->pipe[1]->dentry);
+ gss_pipes_dentries_destroy_net(gss_auth->client, &gss_auth->rpc_auth);
rpc_destroy_pipe_data(gss_auth->pipe[0]);
rpc_destroy_pipe_data(gss_auth->pipe[1]);
gss_mech_put(gss_auth->mech);
^ permalink raw reply related
* [PATCH 3/6] SUNRPC: subscribe RPC clients to pipefs notifications
From: Stanislav Kinsbursky @ 2011-11-23 11:51 UTC (permalink / raw)
To: bfields, Trond.Myklebust
Cc: linux-nfs, xemul, neilb, netdev, linux-kernel, jbottomley, davem,
devel
In-Reply-To: <20111123104945.11077.10270.stgit@localhost6.localdomain6>
This patch subscribes RPC clients to RPC pipefs notifications. The RPC clients
notifier block is registered as part of pipefs initialization during SUNRPC
module init.
This notifier callback is responsible for creating the RPC client PipeFS
directory and GSS pipes. For pipe creation and destruction, two additional
callbacks were added to struct rpc_authops.
Note that no locking is required in the notifier callback, because the PipeFS
superblock pointer is passed as an argument from its creation or destruction
routine and thus we can be sure of its validity.
Signed-off-by: Stanislav Kinsbursky <skinsbursky@parallels.com>
---
include/linux/sunrpc/auth.h | 2 +
net/sunrpc/auth_gss/auth_gss.c | 10 ++++--
net/sunrpc/clnt.c | 70 +++++++++++++++++++++++++++++++++++++++-
net/sunrpc/rpc_pipe.c | 19 +++++++----
net/sunrpc/sunrpc.h | 2 +
5 files changed, 93 insertions(+), 10 deletions(-)
diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index febc4db..83f493f 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -98,6 +98,8 @@ struct rpc_authops {
struct rpc_cred * (*lookup_cred)(struct rpc_auth *, struct auth_cred *, int);
struct rpc_cred * (*crcreate)(struct rpc_auth*, struct auth_cred *, int);
+ int (*pipes_create)(struct rpc_auth *);
+ void (*pipes_destroy)(struct rpc_auth *);
};
struct rpc_credops {
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 248acd0..66293dc 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -784,8 +784,10 @@ static void gss_pipes_dentries_destroy(struct rpc_auth *auth)
struct gss_auth *gss_auth;
gss_auth = container_of(auth, struct gss_auth, rpc_auth);
- rpc_unlink(gss_auth->pipe[0]->dentry);
- rpc_unlink(gss_auth->pipe[1]->dentry);
+ if (gss_auth->pipe[0]->dentry)
+ rpc_unlink(gss_auth->pipe[0]->dentry);
+ if (gss_auth->pipe[1]->dentry)
+ rpc_unlink(gss_auth->pipe[1]->dentry);
}
static int gss_pipes_dentries_create(struct rpc_auth *auth)
@@ -1628,7 +1630,9 @@ static const struct rpc_authops authgss_ops = {
.create = gss_create,
.destroy = gss_destroy,
.lookup_cred = gss_lookup_cred,
- .crcreate = gss_create_cred
+ .crcreate = gss_create_cred,
+ .pipes_create = gss_pipes_dentries_create,
+ .pipes_destroy = gss_pipes_dentries_destroy,
};
static const struct rpc_credops gss_credops = {
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 008c755..61c76ae 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -95,8 +95,11 @@ static void rpc_unregister_client(struct rpc_clnt *clnt)
static void __rpc_clnt_remove_pipedir(struct rpc_clnt *clnt)
{
- if (clnt->cl_path.dentry)
+ if (clnt->cl_path.dentry) {
+ if (clnt->cl_auth && clnt->cl_auth->au_ops->pipes_destroy)
+ clnt->cl_auth->au_ops->pipes_destroy(clnt->cl_auth);
rpc_remove_client_dir(clnt->cl_path.dentry);
+ }
clnt->cl_path.dentry = NULL;
}
@@ -174,6 +177,71 @@ rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name)
return 0;
}
+static int __rpc_pipefs_event(struct rpc_clnt *clnt, unsigned long event,
+ struct super_block *sb)
+{
+ struct dentry *dentry;
+ int err = 0;
+
+ switch (event) {
+ case RPC_PIPEFS_MOUNT:
+ if (clnt->cl_program->pipe_dir_name == NULL)
+ break;
+ dentry = rpc_setup_pipedir_sb(sb, clnt,
+ clnt->cl_program->pipe_dir_name);
+ BUG_ON(dentry == NULL);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ clnt->cl_path.dentry = dentry;
+ if (clnt->cl_auth->au_ops->pipes_create) {
+ err = clnt->cl_auth->au_ops->pipes_create(clnt->cl_auth);
+ if (err)
+ __rpc_clnt_remove_pipedir(clnt);
+ }
+ break;
+ case RPC_PIPEFS_UMOUNT:
+ __rpc_clnt_remove_pipedir(clnt);
+ break;
+ default:
+ printk(KERN_ERR "%s: unknown event: %ld\n", __func__, event);
+ return -ENOTSUPP;
+ }
+ return err;
+}
+
+static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ struct super_block *sb = ptr;
+ struct rpc_clnt *clnt;
+ int error = 0;
+
+ spin_lock(&rpc_client_lock);
+ list_for_each_entry(clnt, &all_clients, cl_clients) {
+ if (clnt->cl_xprt->xprt_net != sb->s_fs_info)
+ continue;
+ error = __rpc_pipefs_event(clnt, event, sb);
+ if (error)
+ break;
+ }
+ spin_unlock(&rpc_client_lock);
+ return error;
+}
+
+static struct notifier_block rpc_clients_block = {
+ .notifier_call = rpc_pipefs_event,
+};
+
+int rpc_clients_notifier_register(void)
+{
+ return rpc_pipefs_notifier_register(&rpc_clients_block);
+}
+
+void rpc_clients_notifier_unregister(void)
+{
+ return rpc_pipefs_notifier_unregister(&rpc_clients_block);
+}
+
static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, struct rpc_xprt *xprt)
{
struct rpc_program *program = args->program;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 8e59580..1ea0dcf 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -918,7 +918,7 @@ struct dentry *rpc_create_client_dir(struct dentry *dentry,
/**
* rpc_remove_client_dir - Remove a directory created with rpc_create_client_dir()
- * @dentry: directory to remove
+ * @clnt: rpc client
*/
int rpc_remove_client_dir(struct dentry *dentry)
{
@@ -1169,17 +1169,24 @@ int register_rpc_pipefs(void)
init_once);
if (!rpc_inode_cachep)
return -ENOMEM;
+ err = rpc_clients_notifier_register();
+ if (err)
+ goto err_notifier;
err = register_filesystem(&rpc_pipe_fs_type);
- if (err) {
- kmem_cache_destroy(rpc_inode_cachep);
- return err;
- }
-
+ if (err)
+ goto err_register;
return 0;
+
+err_register:
+ rpc_clients_notifier_unregister();
+err_notifier:
+ kmem_cache_destroy(rpc_inode_cachep);
+ return err;
}
void unregister_rpc_pipefs(void)
{
+ rpc_clients_notifier_unregister();
kmem_cache_destroy(rpc_inode_cachep);
unregister_filesystem(&rpc_pipe_fs_type);
}
diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h
index 90c292e..14c9f6d 100644
--- a/net/sunrpc/sunrpc.h
+++ b/net/sunrpc/sunrpc.h
@@ -47,5 +47,7 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
struct page *headpage, unsigned long headoffset,
struct page *tailpage, unsigned long tailoffset);
+int rpc_clients_notifier_register(void);
+void rpc_clients_notifier_unregister(void);
#endif /* _NET_SUNRPC_SUNRPC_H */
^ permalink raw reply related
* [PATCH 4/6] SUNRPC: remove RPC client pipefs dentries after unregister
From: Stanislav Kinsbursky @ 2011-11-23 11:51 UTC (permalink / raw)
To: bfields, Trond.Myklebust
Cc: linux-nfs, xemul, neilb, netdev, linux-kernel, jbottomley, davem,
devel
In-Reply-To: <20111123104945.11077.10270.stgit@localhost6.localdomain6>
Without this patch we have races:
rpc_fill_super rpc_free_client
rpc_pipefs_event(MOUNT) rpc_remove_pipedir
spin_lock(&rpc_client_lock);
rpc_setup_pipedir_sb
spin_unlock(&rpc_client_lock);
spin_lock(&rpc_client_lock);
(remove from list)
spin_unlock(&rpc_client_lock);
MEMORY LEAKED
Signed-off-by: Stanislav Kinsbursky <skinsbursky@parallels.com>
---
net/sunrpc/clnt.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 61c76ae..23776a4 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -572,7 +572,6 @@ rpc_free_client(struct rpc_clnt *clnt)
{
dprintk("RPC: destroying %s client for %s\n",
clnt->cl_protname, clnt->cl_server);
- rpc_clnt_remove_pipedir(clnt);
if (clnt->cl_parent != clnt) {
rpc_release_client(clnt->cl_parent);
goto out_free;
@@ -581,6 +580,7 @@ rpc_free_client(struct rpc_clnt *clnt)
kfree(clnt->cl_server);
out_free:
rpc_unregister_client(clnt);
+ rpc_clnt_remove_pipedir(clnt);
rpc_free_iostats(clnt->cl_metrics);
kfree(clnt->cl_principal);
clnt->cl_metrics = NULL;
^ permalink raw reply related
* [PATCH 5/6] SUNRPC: remove RPC pipefs mount point manipulations from RPC clients code
From: Stanislav Kinsbursky @ 2011-11-23 11:51 UTC (permalink / raw)
To: bfields, Trond.Myklebust
Cc: linux-nfs, xemul, neilb, netdev, linux-kernel, jbottomley, davem,
devel
In-Reply-To: <20111123104945.11077.10270.stgit@localhost6.localdomain6>
Now, with RPC pipefs mount notifications handling in RPC clients, we can remove
mount point creation and destruction. RPC clients dentries will be created on
PipeFS mount event and removed on umount event.
Signed-off-by: Stanislav Kinsbursky <skinsbursky@parallels.com>
---
net/sunrpc/clnt.c | 15 +++------------
1 files changed, 3 insertions(+), 12 deletions(-)
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 23776a4..eb2595f 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -112,7 +112,6 @@ static void rpc_clnt_remove_pipedir(struct rpc_clnt *clnt)
__rpc_clnt_remove_pipedir(clnt);
rpc_put_sb_net(clnt->cl_xprt->xprt_net);
}
- rpc_put_mount();
}
static struct dentry *rpc_setup_pipedir_sb(struct super_block *sb,
@@ -158,21 +157,13 @@ rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name)
clnt->cl_path.dentry = NULL;
if (dir_name == NULL)
return 0;
-
- path.mnt = rpc_get_mount();
- if (IS_ERR(path.mnt))
- return PTR_ERR(path.mnt);
pipefs_sb = rpc_get_sb_net(clnt->cl_xprt->xprt_net);
- if (!pipefs_sb) {
- rpc_put_mount();
- return -ENOENT;
- }
+ if (!pipefs_sb)
+ return 0;
path.dentry = rpc_setup_pipedir_sb(pipefs_sb, clnt, dir_name);
rpc_put_sb_net(clnt->cl_xprt->xprt_net);
- if (IS_ERR(path.dentry)) {
- rpc_put_mount();
+ if (IS_ERR(path.dentry))
return PTR_ERR(path.dentry);
- }
clnt->cl_path = path;
return 0;
}
^ permalink raw reply related
* [net] bnx2x: Fix 5461x LED
From: Yaniv Rosner @ 2011-11-23 13:54 UTC (permalink / raw)
To: David Miller; +Cc: netdev, Yaniv Rosner, Eilon Greenstein
Fix port identify test on 5461x PHY by driving LEDs through MDIO.
Signed-off-by: Yaniv Rosner <yanivr@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c | 39 +++++++++++++++++++++-
drivers/net/ethernet/broadcom/bnx2x/bnx2x_reg.h | 1 +
2 files changed, 39 insertions(+), 1 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c
index bce203f..882f48f 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c
@@ -10327,6 +10327,43 @@ static int bnx2x_54618se_config_init(struct bnx2x_phy *phy,
return 0;
}
+
+static void bnx2x_5461x_set_link_led(struct bnx2x_phy *phy,
+ struct link_params *params, u8 mode)
+{
+ struct bnx2x *bp = params->bp;
+ u16 temp;
+
+ bnx2x_cl22_write(bp, phy,
+ MDIO_REG_GPHY_SHADOW,
+ MDIO_REG_GPHY_SHADOW_LED_SEL1);
+ bnx2x_cl22_read(bp, phy,
+ MDIO_REG_GPHY_SHADOW,
+ &temp);
+ temp &= 0xff00;
+
+ DP(NETIF_MSG_LINK, "54618x set link led (mode=%x)\n", mode);
+ switch (mode) {
+ case LED_MODE_FRONT_PANEL_OFF:
+ case LED_MODE_OFF:
+ temp |= 0x00ee;
+ break;
+ case LED_MODE_OPER:
+ temp |= 0x0001;
+ break;
+ case LED_MODE_ON:
+ temp |= 0x00ff;
+ break;
+ default:
+ break;
+ }
+ bnx2x_cl22_write(bp, phy,
+ MDIO_REG_GPHY_SHADOW,
+ MDIO_REG_GPHY_SHADOW_WR_ENA | temp);
+ return;
+}
+
+
static void bnx2x_54618se_link_reset(struct bnx2x_phy *phy,
struct link_params *params)
{
@@ -11103,7 +11140,7 @@ static struct bnx2x_phy phy_54618se = {
.config_loopback = (config_loopback_t)bnx2x_54618se_config_loopback,
.format_fw_ver = (format_fw_ver_t)NULL,
.hw_reset = (hw_reset_t)NULL,
- .set_link_led = (set_link_led_t)NULL,
+ .set_link_led = (set_link_led_t)bnx2x_5461x_set_link_led,
.phy_specific_func = (phy_specific_func_t)NULL
};
/*****************************************************************/
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_reg.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_reg.h
index fc7bd0f..e58073e 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_reg.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_reg.h
@@ -6990,6 +6990,7 @@ Theotherbitsarereservedandshouldbezero*/
#define MDIO_REG_INTR_MASK 0x1b
#define MDIO_REG_INTR_MASK_LINK_STATUS (0x1 << 1)
#define MDIO_REG_GPHY_SHADOW 0x1c
+#define MDIO_REG_GPHY_SHADOW_LED_SEL1 (0x0d << 10)
#define MDIO_REG_GPHY_SHADOW_LED_SEL2 (0x0e << 10)
#define MDIO_REG_GPHY_SHADOW_WR_ENA (0x1 << 15)
#define MDIO_REG_GPHY_SHADOW_AUTO_DET_MED (0x1e << 10)
--
1.7.7.1
^ permalink raw reply related
* Re: linux-next: build failure after merge of the final tree (net-next tree related)
From: Neil Horman @ 2011-11-23 12:09 UTC (permalink / raw)
To: Stephen Rothwell; +Cc: David Miller, netdev, linux-next, linux-kernel
In-Reply-To: <20111123150004.59be04eb36c480ec44edc9d5@canb.auug.org.au>
On Wed, Nov 23, 2011 at 03:00:04PM +1100, Stephen Rothwell wrote:
> Hi all,
>
> After merging the final tree, today's linux-next build (powerpc allnoconfig)
> failed like this:
>
> In file included from include/linux/netdevice.h:53:0,
> from include/linux/icmpv6.h:173,
> from include/linux/ipv6.h:220,
> from include/net/ipv6.h:16,
> from include/linux/sunrpc/clnt.h:26,
> from include/linux/nfs_fs.h:50,
> from init/do_mounts.c:20:
> include/net/netprio_cgroup.h:23:29: error: field 'css' has incomplete type
>
> And several more similar.
>
> Caused by commit 5bc1421e34ec ("net: add network priority cgroup
> infrastructure (v4)").
>
> I have added the following (minimal, but probably not optimal) patch for
> today:
>
> From: Stephen Rothwell <sfr@canb.auug.org.au>
> Date: Wed, 23 Nov 2011 14:49:49 +1100
> Subject: [PATCH] net: fix build error in network priority cgroup
> infrastructure
>
> Fixes this error:
>
> In file included from include/linux/netdevice.h:53:0,
> from include/linux/icmpv6.h:173,
> from include/linux/ipv6.h:220,
> from include/net/ipv6.h:16,
> from include/linux/sunrpc/clnt.h:26,
> from include/linux/nfs_fs.h:50,
> from init/do_mounts.c:20:
> include/net/netprio_cgroup.h:23:29: error: field 'css' has incomplete type
>
> When CONFIG_CGROUPS is not set.
>
> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
> ---
> include/net/netprio_cgroup.h | 2 ++
> 1 files changed, 2 insertions(+), 0 deletions(-)
>
> diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h
> index c432e99..da71b91 100644
> --- a/include/net/netprio_cgroup.h
> +++ b/include/net/netprio_cgroup.h
> @@ -20,7 +20,9 @@
>
> struct cgroup_netprio_state
> {
> +#ifdef CONFIG_CGROUPS
> struct cgroup_subsys_state css;
> +#endif
> u32 prioidx;
> };
>
> --
> 1.7.7.3
>
> --
> Cheers,
> Stephen Rothwell sfr@canb.auug.org.au
FYI, I've got a more appropriate fix building right now getting posted for the
net-next tree. I'll cc you on it.
Neil
^ permalink raw reply
* [PATCH 1/5] route: Use the device mtu as the default for blackhole routes
From: Steffen Klassert @ 2011-11-23 12:12 UTC (permalink / raw)
To: David Miller; +Cc: netdev
As it is, we return zero as the default mtu of blackhole routes.
This may lead to a propagation of a bogus pmtu if the default_mtu
method of a blackhole route is invoked. So return dst->dev->mtu
as the default mtu instead.
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
net/ipv4/route.c | 2 +-
net/ipv6/route.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 0c74da8..5b17bf1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2757,7 +2757,7 @@ static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 coo
static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
{
- return 0;
+ return dst->dev->mtu;
}
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 8473016..d8fbd18 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -157,7 +157,7 @@ static struct dst_ops ip6_dst_ops_template = {
static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
{
- return 0;
+ return dst->dev->mtu;
}
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
--
1.7.0.4
^ permalink raw reply related
* [PATCH 2/5] net: Rename the dst_opt default_mtu method to mtu
From: Steffen Klassert @ 2011-11-23 12:12 UTC (permalink / raw)
To: David Miller; +Cc: netdev
In-Reply-To: <20111123121213.GA6348@secunet.com>
We plan to invoke the dst_ops->default_mtu() method unconditionally
from dst_mtu(). So rename the method to dst_ops->mtu() to match
the name with the new meaning.
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
include/net/dst.h | 2 +-
include/net/dst_ops.h | 2 +-
net/decnet/dn_route.c | 6 +++---
net/ipv4/route.c | 10 +++++-----
net/ipv6/route.c | 10 +++++-----
net/xfrm/xfrm_policy.c | 6 +++---
6 files changed, 18 insertions(+), 18 deletions(-)
diff --git a/include/net/dst.h b/include/net/dst.h
index 4fb6c43..666de31 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -208,7 +208,7 @@ static inline u32 dst_mtu(const struct dst_entry *dst)
u32 mtu = dst_metric_raw(dst, RTAX_MTU);
if (!mtu)
- mtu = dst->ops->default_mtu(dst);
+ mtu = dst->ops->mtu(dst);
return mtu;
}
diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h
index 9adb998..e1c2ee0 100644
--- a/include/net/dst_ops.h
+++ b/include/net/dst_ops.h
@@ -17,7 +17,7 @@ struct dst_ops {
int (*gc)(struct dst_ops *ops);
struct dst_entry * (*check)(struct dst_entry *, __u32 cookie);
unsigned int (*default_advmss)(const struct dst_entry *);
- unsigned int (*default_mtu)(const struct dst_entry *);
+ unsigned int (*mtu)(const struct dst_entry *);
u32 * (*cow_metrics)(struct dst_entry *, unsigned long);
void (*destroy)(struct dst_entry *);
void (*ifdown)(struct dst_entry *,
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index a77d161..db48679 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -112,7 +112,7 @@ static unsigned long dn_rt_deadline;
static int dn_dst_gc(struct dst_ops *ops);
static struct dst_entry *dn_dst_check(struct dst_entry *, __u32);
static unsigned int dn_dst_default_advmss(const struct dst_entry *dst);
-static unsigned int dn_dst_default_mtu(const struct dst_entry *dst);
+static unsigned int dn_dst_mtu(const struct dst_entry *dst);
static void dn_dst_destroy(struct dst_entry *);
static struct dst_entry *dn_dst_negative_advice(struct dst_entry *);
static void dn_dst_link_failure(struct sk_buff *);
@@ -135,7 +135,7 @@ static struct dst_ops dn_dst_ops = {
.gc = dn_dst_gc,
.check = dn_dst_check,
.default_advmss = dn_dst_default_advmss,
- .default_mtu = dn_dst_default_mtu,
+ .mtu = dn_dst_mtu,
.cow_metrics = dst_cow_metrics_generic,
.destroy = dn_dst_destroy,
.negative_advice = dn_dst_negative_advice,
@@ -825,7 +825,7 @@ static unsigned int dn_dst_default_advmss(const struct dst_entry *dst)
return dn_mss_from_pmtu(dst->dev, dst_mtu(dst));
}
-static unsigned int dn_dst_default_mtu(const struct dst_entry *dst)
+static unsigned int dn_dst_mtu(const struct dst_entry *dst)
{
return dst->dev->mtu;
}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 5b17bf1..f1ac3ef 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -138,7 +138,7 @@ static int rt_chain_length_max __read_mostly = 20;
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
-static unsigned int ipv4_default_mtu(const struct dst_entry *dst);
+static unsigned int ipv4_mtu(const struct dst_entry *dst);
static void ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
@@ -193,7 +193,7 @@ static struct dst_ops ipv4_dst_ops = {
.gc = rt_garbage_collect,
.check = ipv4_dst_check,
.default_advmss = ipv4_default_advmss,
- .default_mtu = ipv4_default_mtu,
+ .mtu = ipv4_mtu,
.cow_metrics = ipv4_cow_metrics,
.destroy = ipv4_dst_destroy,
.ifdown = ipv4_dst_ifdown,
@@ -1814,7 +1814,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
return advmss;
}
-static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
+static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
unsigned int mtu = dst->dev->mtu;
@@ -2755,7 +2755,7 @@ static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 coo
return NULL;
}
-static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
+static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
return dst->dev->mtu;
}
@@ -2775,7 +2775,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
.protocol = cpu_to_be16(ETH_P_IP),
.destroy = ipv4_dst_destroy,
.check = ipv4_blackhole_dst_check,
- .default_mtu = ipv4_blackhole_default_mtu,
+ .mtu = ipv4_blackhole_mtu,
.default_advmss = ipv4_default_advmss,
.update_pmtu = ipv4_rt_blackhole_update_pmtu,
.cow_metrics = ipv4_rt_blackhole_cow_metrics,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index d8fbd18..76645d7 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -77,7 +77,7 @@ static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
const struct in6_addr *dest);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
-static unsigned int ip6_default_mtu(const struct dst_entry *dst);
+static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
@@ -144,7 +144,7 @@ static struct dst_ops ip6_dst_ops_template = {
.gc_thresh = 1024,
.check = ip6_dst_check,
.default_advmss = ip6_default_advmss,
- .default_mtu = ip6_default_mtu,
+ .mtu = ip6_mtu,
.cow_metrics = ipv6_cow_metrics,
.destroy = ip6_dst_destroy,
.ifdown = ip6_dst_ifdown,
@@ -155,7 +155,7 @@ static struct dst_ops ip6_dst_ops_template = {
.neigh_lookup = ip6_neigh_lookup,
};
-static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
+static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
return dst->dev->mtu;
}
@@ -175,7 +175,7 @@ static struct dst_ops ip6_dst_blackhole_ops = {
.protocol = cpu_to_be16(ETH_P_IPV6),
.destroy = ip6_dst_destroy,
.check = ip6_dst_check,
- .default_mtu = ip6_blackhole_default_mtu,
+ .mtu = ip6_blackhole_mtu,
.default_advmss = ip6_default_advmss,
.update_pmtu = ip6_rt_blackhole_update_pmtu,
.cow_metrics = ip6_rt_blackhole_cow_metrics,
@@ -1041,7 +1041,7 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst)
return mtu;
}
-static unsigned int ip6_default_mtu(const struct dst_entry *dst)
+static unsigned int ip6_mtu(const struct dst_entry *dst)
{
unsigned int mtu = IPV6_MIN_MTU;
struct inet6_dev *idev;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 552df27..b8be51e 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2382,7 +2382,7 @@ static unsigned int xfrm_default_advmss(const struct dst_entry *dst)
return dst_metric_advmss(dst->path);
}
-static unsigned int xfrm_default_mtu(const struct dst_entry *dst)
+static unsigned int xfrm_mtu(const struct dst_entry *dst)
{
return dst_mtu(dst->path);
}
@@ -2411,8 +2411,8 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
dst_ops->check = xfrm_dst_check;
if (likely(dst_ops->default_advmss == NULL))
dst_ops->default_advmss = xfrm_default_advmss;
- if (likely(dst_ops->default_mtu == NULL))
- dst_ops->default_mtu = xfrm_default_mtu;
+ if (likely(dst_ops->mtu == NULL))
+ dst_ops->mtu = xfrm_mtu;
if (likely(dst_ops->negative_advice == NULL))
dst_ops->negative_advice = xfrm_negative_advice;
if (likely(dst_ops->link_failure == NULL))
--
1.7.0.4
^ permalink raw reply related
* [PATCH 3/5] net: Move mtu handling down to the protocol depended handlers
From: Steffen Klassert @ 2011-11-23 12:13 UTC (permalink / raw)
To: David Miller; +Cc: netdev
In-Reply-To: <20111123121213.GA6348@secunet.com>
We move all mtu handling from dst_mtu() down to the protocol
layer. So each protocol can implement the mtu handling in
a different manner.
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
include/net/dst.h | 7 +------
net/decnet/dn_route.c | 4 +++-
net/ipv4/route.c | 11 +++++++++--
net/ipv6/route.c | 11 +++++++++--
net/xfrm/xfrm_policy.c | 4 +++-
5 files changed, 25 insertions(+), 12 deletions(-)
diff --git a/include/net/dst.h b/include/net/dst.h
index 666de31..6faec1a 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -205,12 +205,7 @@ dst_feature(const struct dst_entry *dst, u32 feature)
static inline u32 dst_mtu(const struct dst_entry *dst)
{
- u32 mtu = dst_metric_raw(dst, RTAX_MTU);
-
- if (!mtu)
- mtu = dst->ops->mtu(dst);
-
- return mtu;
+ return dst->ops->mtu(dst);
}
/* RTT metrics are stored in milliseconds for user ABI, but used as jiffies */
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index db48679..94f4ec0 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -827,7 +827,9 @@ static unsigned int dn_dst_default_advmss(const struct dst_entry *dst)
static unsigned int dn_dst_mtu(const struct dst_entry *dst)
{
- return dst->dev->mtu;
+ unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
+
+ return mtu ? : dst->dev->mtu;
}
static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst, const void *daddr)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f1ac3ef..11d1b20 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1816,7 +1816,12 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
- unsigned int mtu = dst->dev->mtu;
+ unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
+
+ if (mtu)
+ return mtu;
+
+ mtu = dst->dev->mtu;
if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
const struct rtable *rt = (const struct rtable *) dst;
@@ -2757,7 +2762,9 @@ static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 coo
static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
- return dst->dev->mtu;
+ unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
+
+ return mtu ? : dst->dev->mtu;
}
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 76645d7..3399dd3 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -157,7 +157,9 @@ static struct dst_ops ip6_dst_ops_template = {
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
- return dst->dev->mtu;
+ unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
+
+ return mtu ? : dst->dev->mtu;
}
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -1043,8 +1045,13 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst)
static unsigned int ip6_mtu(const struct dst_entry *dst)
{
- unsigned int mtu = IPV6_MIN_MTU;
struct inet6_dev *idev;
+ unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
+
+ if (mtu)
+ return mtu;
+
+ mtu = IPV6_MIN_MTU;
rcu_read_lock();
idev = __in6_dev_get(dst->dev);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index b8be51e..2118d64 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2384,7 +2384,9 @@ static unsigned int xfrm_default_advmss(const struct dst_entry *dst)
static unsigned int xfrm_mtu(const struct dst_entry *dst)
{
- return dst_mtu(dst->path);
+ unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
+
+ return mtu ? : dst_mtu(dst->path);
}
static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, const void *daddr)
--
1.7.0.4
^ permalink raw reply related
* [PATCH 4/5] route: struct rtable can be const in rt_is_input_route and rt_is_output_route
From: Steffen Klassert @ 2011-11-23 12:14 UTC (permalink / raw)
To: David Miller; +Cc: netdev
In-Reply-To: <20111123121213.GA6348@secunet.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
include/net/route.h | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/net/route.h b/include/net/route.h
index db7b343..91855d1 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -71,12 +71,12 @@ struct rtable {
struct fib_info *fi; /* for client ref to shared metrics */
};
-static inline bool rt_is_input_route(struct rtable *rt)
+static inline bool rt_is_input_route(const struct rtable *rt)
{
return rt->rt_route_iif != 0;
}
-static inline bool rt_is_output_route(struct rtable *rt)
+static inline bool rt_is_output_route(const struct rtable *rt)
{
return rt->rt_route_iif == 0;
}
--
1.7.0.4
^ permalink raw reply related
* [PATCH 5/5] ipv4: Don't use the cached pmtu informations for input routes
From: Steffen Klassert @ 2011-11-23 12:14 UTC (permalink / raw)
To: David Miller; +Cc: netdev
In-Reply-To: <20111123121213.GA6348@secunet.com>
The pmtu information on the inetpeer is visible for output and
input routes. On packet forwarding, we might propagate a learned
pmtu to the sender. As we update the pmtu information of the
inetpeer on demand, the original sender of the forwarded packets
might never notice when the pmtu to that inetpeer increases.
So use the mtu of the outgoing device on packet forwarding instead
of the pmtu to the final destination.
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
net/ipv4/route.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 11d1b20..fb47c8f 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1816,15 +1816,15 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
+ const struct rtable *rt = (const struct rtable *) dst;
unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
- if (mtu)
+ if (mtu && rt_is_output_route(rt))
return mtu;
mtu = dst->dev->mtu;
if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
- const struct rtable *rt = (const struct rtable *) dst;
if (rt->rt_gateway != rt->rt_dst && mtu > 576)
mtu = 576;
--
1.7.0.4
^ permalink raw reply related
* Re: [GIT PULL v2] Open vSwitch
From: jamal @ 2011-11-23 12:22 UTC (permalink / raw)
To: Herbert Xu; +Cc: David Miller, jesse, netdev, dev
In-Reply-To: <20111123075433.GA7928@gondor.apana.org.au>
On Wed, 2011-11-23 at 15:54 +0800, Herbert Xu wrote:
> I mostly agree with Jamal. As far as the concept of a policy
> lookup cache goes (which appears to be at the core of OVS), this
> almost fits exactly onto a u32 hash table. All that would be needed
> is to add the tail end of the policies, e.g., with new packet
> actions.
For a classifier, u32 or em matches would do the job - but they may
need a wrapper around it in user space; so from a usability pov, it
would make sense to have a new classifier that is specific to them.
All the VLAN actions could go into one tc action; the checksum action
is already present. The IP/TCP/UDP header re-writes may require
their own actions - I think one would be sufficient for all.
So in my estimate one classifier and two actions.
Then you get rid of half the code (they use generic netlink to set/get
policies)
> However, this is purely based on my conceptual view of OVS, which
> may or may not be accurate. I'll dig into the patches over the
> next couple of days to see if they could be easily turned into
> packet actions or whether this is difficult for reasons that we
> have not yet discovered.
>
I can't find one - you may. After staring at the code, I am also now
questioning if the existing bridge code couldn't have been re-used with
some small tweaks.
The virtual ports attached to the bridging code may be needed.
A lot of the multi-tenancy intelligence belongs in user space controller
(my reading was that was justification for not re-using bridging code
as is).
cheers,
jamal
^ permalink raw reply
* [PATCH] netprio_cgroup: Fix build break
From: Neil Horman @ 2011-11-23 12:32 UTC (permalink / raw)
To: netdev
Cc: Neil Horman, Kirill Smelkov, David S. Miller, john.r.fastabend,
robert.w.love, Stephen Rothwell, linux-next
I broke the build with the addition of netprio_cgroups if CONFIG_CGROUPS=n.
This patch corrects it by moving the offending struct into an ifdef
CONFIG_CGROUPS block. Also clean up a few needless defines and inline functions
that don't get called if CONFIG_CGROUPS isn't defined while I'm at it.
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Kirill Smelkov <kirr@mns.spb.ru>
CC: "David S. Miller" <davem@davemloft.net>
CC: john.r.fastabend@intel.com
CC: robert.w.love@intel.com
CC: Stephen Rothwell <sfr@canb.auug.org.au>
CC: linux-next@vger.kernel.org
---
include/net/netprio_cgroup.h | 18 +++++-------------
1 files changed, 5 insertions(+), 13 deletions(-)
diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h
index c432e99..e503b87 100644
--- a/include/net/netprio_cgroup.h
+++ b/include/net/netprio_cgroup.h
@@ -18,11 +18,6 @@
#include <linux/hardirq.h>
#include <linux/rcupdate.h>
-struct cgroup_netprio_state
-{
- struct cgroup_subsys_state css;
- u32 prioidx;
-};
struct netprio_map {
struct rcu_head rcu;
@@ -32,6 +27,11 @@ struct netprio_map {
#ifdef CONFIG_CGROUPS
+struct cgroup_netprio_state {
+ struct cgroup_subsys_state css;
+ u32 prioidx;
+};
+
#ifndef CONFIG_NETPRIO_CGROUP
extern int net_prio_subsys_id;
#endif
@@ -52,14 +52,6 @@ static inline struct cgroup_netprio_state
#else
#define sock_update_netprioidx(sk)
-#define skb_update_prio(skb)
-
-static inline struct cgroup_netprio_state
- *task_netprio_state(struct task_struct *p)
-{
- return NULL;
-}
-
#endif
#endif /* _NET_CLS_CGROUP_H */
--
1.7.6.4
^ permalink raw reply related
* Re: [GIT PULL v2] Open vSwitch
From: jamal @ 2011-11-23 12:47 UTC (permalink / raw)
To: Eric Dumazet
Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
Herbert Xu, David Miller
In-Reply-To: <1322035942.1298.56.camel@edumazet-laptop>
On Wed, 2011-11-23 at 09:12 +0100, Eric Dumazet wrote:
> I had no time to look at OVS, but current tc model is not scalable,
> everything is performed under a queue lock.
> Maybe its time to redesign a new model, based on modern techniques.
Making the enqueuer/dequeuer lockless would be a big win. What happened
to your idea of ring buffer?
What other hot areas do you see? It used to be ingress/egress share
the qdisc lock - but that is now gone.
> By the way, we seriously lack good documentation on tc, not counting
> many features. Code might be there, but without documenation, working
> samples, who can use it ?
>
> Take a look at last cls_flow extension, and try to use it on a real
> setup, you'll find its almost not possible...
There's no tc-central.org unlike the nice effort the netfilter guys have
put over the years. Documentation is there - sometimes a little too much
with differing "opinions" (lartc that Herbert pointed to is a good
starting point); but googling also helps.
Unfortunately, sometimes the people who understand stuff have no
motivation to do docs.
cheers,
jamal
^ permalink raw reply
* Re: [GIT PULL v2] Open vSwitch
From: Eric Dumazet @ 2011-11-23 12:55 UTC (permalink / raw)
To: jhs-jkUAjuhPggJWk0Htik3J/w
Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
Herbert Xu, David Miller
In-Reply-To: <1322052463.2039.135.camel@mojatatu>
Le mercredi 23 novembre 2011 à 07:47 -0500, jamal a écrit :
> On Wed, 2011-11-23 at 09:12 +0100, Eric Dumazet wrote:
>
> > I had no time to look at OVS, but current tc model is not scalable,
> > everything is performed under a queue lock.
> > Maybe its time to redesign a new model, based on modern techniques.
>
> Making the enqueur/dequeuer lockless would be a big win. What happened
> to your idea of ring buffer?
Currently thinking about it. I was also waiting for Tom Herbert's BQL patches.
Several people are interested, and John Fastabend told me he plans to :
(1) rcu'ify classifiers/actions as needed
(2) add flag to drop qdisc lock on simple or hw qdiscs
(3) mq and mqprio call root qdisc and run a pass over classifiers
actions possibly resetting queue_mapping.
_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev
^ permalink raw reply
* Re: [GIT PULL v2] Open vSwitch
From: David Täht @ 2011-11-23 13:13 UTC (permalink / raw)
To: jhs; +Cc: jamal, Eric Dumazet, Herbert Xu, David Miller, jesse, netdev, dev
In-Reply-To: <1322052463.2039.135.camel@mojatatu>
[-- Attachment #1: Type: text/plain, Size: 2769 bytes --]
On 11/23/2011 01:47 PM, jamal wrote:
> On Wed, 2011-11-23 at 09:12 +0100, Eric Dumazet wrote:
>
>> I had no time to look at OVS, but current tc model is not scalable,
>> everything is performed under a queue lock.
>> Maybe its time to redesign a new model, based on modern techniques.
> Making the enqueur/dequeuer lockless would be a big win. What happened
> to your idea of ring buffer?
It's not so much 'modern techniques', as modern environments.
High on my list would be a way to more easily expose QoS and AQM
features in the hardware all the way up the stack.
I'd like the hardware to be able to express 'I have FQ', or 'I have red',
much like we express many other features in ethtool, only abstractly
enough so that a qdisc setup can be made generic.
> What other hot areas do you see? It used to be ingress/egress share
> the qdisc lock - but that is now gone.
I find the mapping from hardware queues to any sort of complex software
queuing scheme hard to conceptualize. Also, as structured, tc cannot be
easily applied to wireless APs.
>
>> By the way, we seriously lack good documentation on tc, not counting
>> many features. Code might be there, but without documenation, working
>> samples, who can use it ?
I find tc's concepts incredibly difficult to use effectively. They start
with the presumption that what you are working with is a 1998 point to
point link and get harder from there. That said I think I've almost
managed to bend it to my will of late...
(this email written under the influence of Byte Queue Limits + QFQ +
RED, on ethernet)
>>
>> Take a look at last cls_flow extension, and try to use it on a real
>> setup, you'll find its almost not possible...
>
> There's no tc-central.org unlike the nice effort the netfilter guys have
> put over the years. Documentation is there - sometimes a little too much
> with differing "opinions" (lartc that Herbert pointed to is a good
> starting point); but googling also helps.
> Unfortunately, sometimes the people who understand stuff have no
> motivation to do docs.
After burning the last several months getting good enough at the tc layer
to do stuff in it, I would certainly like to have a place to put
documentation,
and also easily update what already exists.
If it helps any I could offer a redmine instance on bufferbloat.net for
this.
redmine has bug tracking and a wiki...
It would be nice also if the iproute2 code contained more working examples,
and man pages.
It's a ton of doc work, but I'd be willing to do some of it.
>
> cheers,
> jamal
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
Dave Täht
[-- Attachment #2: dave_taht.vcf --]
[-- Type: text/x-vcard, Size: 214 bytes --]
begin:vcard
fn;quoted-printable:Dave T=C3=A4ht
n;quoted-printable:T=C3=A4ht;Dave
email;internet:dave.taht@gmail.com
tel;home:1-239-829-5608
tel;cell:0638645374
x-mozilla-html:FALSE
version:2.1
end:vcard
^ permalink raw reply
* Re: [PATCH] netprio_cgroup: Fix build break
From: Kirill Smelkov @ 2011-11-23 13:11 UTC (permalink / raw)
To: Neil Horman
Cc: netdev, David S. Miller, john.r.fastabend, robert.w.love,
Stephen Rothwell, linux-next
In-Reply-To: <1322051553-28987-1-git-send-email-nhorman@tuxdriver.com>
On Wed, Nov 23, 2011 at 07:32:33AM -0500, Neil Horman wrote:
> I broke the build with the addition of netprio_cgroups if CONFIG_CGROUPS=n.
> This patch corrects it by moving the offending struct into an ifdef
> CONFIG_CGROUPS block. Also clean up a few needless defines and inline functions
> that don't get called if CONFIG_CGROUPS isn't defined while Im at it.
Thanks Neil, now it compiles ok.
^ permalink raw reply
* Re: [GIT PULL v2] Open vSwitch
From: jamal @ 2011-11-23 13:36 UTC (permalink / raw)
To: David Täht
Cc: dev-yBygre7rU0TnMu66kgdUjQ, Herbert Xu, Eric Dumazet,
netdev-u79uwXL29TY76Z2rM5mHXA, David Miller
In-Reply-To: <4ECCF17D.5020509-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
On Wed, 2011-11-23 at 14:13 +0100, David Täht wrote:
> It's not so much 'modern tecniques', as modern environments.
modern as in "presence of a gazillion cpus" all trying to send
to that 40G port. You won't see much difference in a 2-4 cpu
sending to a GIG port.
> High on my list would be a way to more easily expose QoS and AQM
> features in the hardware all the way up the stack.
>
> I'd like the hardware to be able to express 'I have FQ', or 'I have red',
> much like we express many other features in ethtool, only abstractly
> enough so that a qdisc setup can be made generic.
>
It's been done - the challenge is agreeing on what the best path is.
My view is that we still need whatever thing the hardware can do
in software so we can configure the hardware with zero changes to
the user space architecture. The datapath can be bypassed.
> I find the mapping from hardware queues to any sort of complex software
> queuing scheme hard to conceptualize. Also, as structured, tc cannot be
> easily applied to wireless APs.
I didn't follow what that means - but you have all the tools you need.
You may need to provide the user a slightly different abstraction than
what tc provides. tc actually has a BNF grammar, so there's plenty of
opportunities to abstract.
i.e if tc was C then you may need to write a python interface
that uses C underneath.
> After burning the last several months getting good enough at the tc layer
> to do stuff in it, I would certainly like to have a place to put
> documentation,
> and also easily update what already exists.
>
> If it helps any I could offer a redmine instance on bufferbloat.net for
> this.
> redmine has bug tracking and a wiki...
>
> It would be nice also if the iproute2 code contained more working examples,
> and man pages.
man pages exist.
iproute2 has docs - that may be dated and need patching.
> It's a ton of doc work, but I'd be willing to do some of it.
If you wanna do this right - I suggest you get a different domain name.
tc.org or something along those lines.
Start aggregating documentation that is validated to be working. There's
a lot of "opinions" out there instead of facts.
cheers,
jamal
_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev
^ permalink raw reply
* Re: [GIT PULL v2] Open vSwitch
From: Jamal Hadi Salim @ 2011-11-23 13:44 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Herbert Xu, David Miller, jesse, netdev, dev
In-Reply-To: <1322052938.17693.9.camel@edumazet-HP-Compaq-6005-Pro-SFF-PC>
On Wed, 2011-11-23 at 13:55 +0100, Eric Dumazet wrote:
> Currently thinking about it. I was also waiting Tom Herbert BQL patches.
Excellent. I can test when you have something.
> Several people are interested, and John Fastabend told me he plans to :
>
> (1) rcu'ify classifiers/actions as needed
Makes sense in most cases. If you have a lot of flow setup/teardown
it may harm.
Another one - but dont see how much you can do about this; useful
when you want to share state (eg multiple flows being policed
by a single rate meter);
An action could be shared across multiple policies i.e you can
have:
match1, action foo instance 1, action bar instance 3
match2, action bar instance3
match3, ....
This would mean a lock contended across cpus when different
flows hitting match1/2 show up on different cpus.
> (2) add flag to drop qdisc lock on simple or hw qdiscs
Where does config for the hardware happen from?
> (3) mq and mqprio call root qdisc and run a pass over classifiers
> actions possibly resetting queue_mapping.
It seems to make sense - but I will wait and see to have better
understanding.
cheers,
jamal
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox