netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 net-next] rds-tcp: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket
@ 2016-03-15 15:12 Sowmini Varadhan
  2016-03-15 16:38 ` santosh shilimkar
  2016-03-15 17:30 ` Tom Herbert
  0 siblings, 2 replies; 11+ messages in thread
From: Sowmini Varadhan @ 2016-03-15 15:12 UTC (permalink / raw)
  To: netdev; +Cc: sowmini.varadhan, santosh.shilimkar, davem


Add per-net sysctl tunables to set the size of sndbuf and
rcvbuf on the kernel tcp socket.

The tunables are added at /proc/sys/net/rds/tcp/rds_tcp_sndbuf
and /proc/sys/net/rds/tcp/rds_tcp_rcvbuf.

These values must be set before accept() or connect(),
and there may be an arbitrary number of existing rds-tcp
sockets when the tunable is modified. To make sure that all
connections in the netns pick up the same value for the tunable,
we reset existing rds-tcp connections in the netns, so that
they can reconnect with the new parameters.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
---
v2: use sysctl instead of module param. Tunabes are now per netns,
    and can be dynamically modified without restarting all namespaces.

 net/rds/tcp.c |  139 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 files changed, 129 insertions(+), 10 deletions(-)

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index ad60299..5d62fd2 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -52,7 +52,27 @@ static LIST_HEAD(rds_tcp_conn_list);
 
 static struct kmem_cache *rds_tcp_conn_slab;
 
-#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024)
+static int sndbuf_handler(struct ctl_table *ctl, int write,
+			  void __user *buffer, size_t *lenp, loff_t *fpos);
+static int rcvbuf_handler(struct ctl_table *ctl, int write,
+			  void __user *buffer, size_t *lenp, loff_t *fpos);
+static struct ctl_table rds_tcp_sysctl_table[] = {
+	{
+		.procname       = "rds_tcp_sndbuf",
+		/* data is per-net pointer */
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = sndbuf_handler,
+	},
+	{
+		.procname       = "rds_tcp_rcvbuf",
+		/* data is per-net pointer */
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = rcvbuf_handler,
+	},
+	{ }
+};
 
 /* doing it this way avoids calling tcp_sk() */
 void rds_tcp_nonagle(struct socket *sock)
@@ -66,15 +86,6 @@ void rds_tcp_nonagle(struct socket *sock)
 	set_fs(oldfs);
 }
 
-/* All module specific customizations to the RDS-TCP socket should be done in
- * rds_tcp_tune() and applied after socket creation. In general these
- * customizations should be tunable via module_param()
- */
-void rds_tcp_tune(struct socket *sock)
-{
-	rds_tcp_nonagle(sock);
-}
-
 u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
 {
 	return tcp_sk(tc->t_sock->sk)->snd_nxt;
@@ -272,8 +283,33 @@ static int rds_tcp_netid;
 struct rds_tcp_net {
 	struct socket *rds_tcp_listen_sock;
 	struct work_struct rds_tcp_accept_w;
+	struct ctl_table_header *rds_tcp_sysctl;
+	int sndbuf_size;
+	int rcvbuf_size;
 };
 
+/* All module specific customizations to the RDS-TCP socket should be done in
+ * rds_tcp_tune() and applied after socket creation.
+ */
+void rds_tcp_tune(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct net *net = sock_net(sk);
+	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+	rds_tcp_nonagle(sock);
+	lock_sock(sk);
+	if (rtn->sndbuf_size > 0) {
+		sk->sk_sndbuf = rtn->sndbuf_size;
+		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+	}
+	if (rtn->rcvbuf_size > 0) {
+		sk->sk_rcvbuf = rtn->rcvbuf_size;
+		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+	}
+	release_sock(sk);
+}
+
 static void rds_tcp_accept_worker(struct work_struct *work)
 {
 	struct rds_tcp_net *rtn = container_of(work,
@@ -296,6 +332,13 @@ static __net_init int rds_tcp_init_net(struct net *net)
 {
 	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
 
+	/* 0 value for {snd, rcv}buf_size implies we let the stack
+	 * pick the default, and permit auto-tuning of buffer size.
+	 */
+	rtn->sndbuf_size = 0;
+	rtn->rcvbuf_size = 0;
+	rtn->rds_tcp_sysctl = register_net_sysctl(net, "net/rds/tcp",
+						  rds_tcp_sysctl_table);
 	rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net);
 	if (!rtn->rds_tcp_listen_sock) {
 		pr_warn("could not set up listen sock\n");
@@ -309,6 +352,7 @@ static void __net_exit rds_tcp_exit_net(struct net *net)
 {
 	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
 
+	unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
 	/* If rds_tcp_exit_net() is called as a result of netns deletion,
 	 * the rds_tcp_kill_sock() device notifier would already have cleaned
 	 * up the listen socket, thus there is no work to do in this function.
@@ -383,6 +427,81 @@ static struct notifier_block rds_tcp_dev_notifier = {
 	.priority = -10, /* must be called after other network notifiers */
 };
 
+static int user_atoi(char __user *ubuf, size_t len)
+{
+	char buf[16];
+	unsigned long ret;
+	int err;
+
+	if (len > 15)
+		return -EINVAL;
+
+	if (copy_from_user(buf, ubuf, len))
+		return -EFAULT;
+
+	buf[len] = 0;
+	err =  kstrtoul(buf, 0, &ret);
+	if (err != 0)
+		return -ERANGE;
+	return  ret;
+}
+
+/* when sysctl is used to modify some kernel socket parameters,this
+ * function  resets the RDS connections in that netns  so that we can
+ * restart with new parameters.  The assumption is that such reset
+ * events are few and far-between.
+ */
+static void rds_tcp_sysctl_reset(struct net *net)
+{
+	struct rds_tcp_connection *tc, *_tc;
+
+	spin_lock_irq(&rds_tcp_conn_lock);
+	list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
+		struct net *c_net = read_pnet(&tc->conn->c_net);
+
+		if (net != c_net || !tc->t_sock)
+			continue;
+		rds_conn_drop(tc->conn); /* reconnect with new parameters */
+	}
+	spin_unlock_irq(&rds_tcp_conn_lock);
+}
+
+static int sndbuf_handler(struct ctl_table *ctl, int write,
+			  void __user *buffer, size_t *lenp, loff_t *fpos)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+	struct ctl_table tmp;
+
+	tmp = *ctl;
+	tmp.data = &rtn->sndbuf_size;
+	if (!write)
+		return proc_dointvec(&tmp, write, buffer, lenp, fpos);
+
+	rtn->sndbuf_size = user_atoi(buffer, *lenp);
+	rds_tcp_sysctl_reset(net);
+
+	return 0;
+}
+
+static int rcvbuf_handler(struct ctl_table *ctl, int write,
+			  void __user *buffer, size_t *lenp, loff_t *fpos)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+	struct ctl_table tmp;
+
+	tmp = *ctl;
+	tmp.data = &rtn->rcvbuf_size;
+	if (!write)
+		return proc_dointvec(&tmp, write, buffer, lenp, fpos);
+
+	rtn->rcvbuf_size = user_atoi(buffer, *lenp);
+	rds_tcp_sysctl_reset(net);
+
+	return 0;
+}
+
 static void rds_tcp_exit(void)
 {
 	rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 net-next] rds-tcp: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket
  2016-03-15 15:12 [PATCH v2 net-next] rds-tcp: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket Sowmini Varadhan
@ 2016-03-15 16:38 ` santosh shilimkar
  2016-03-15 16:58   ` Sowmini Varadhan
  2016-03-15 17:30 ` Tom Herbert
  1 sibling, 1 reply; 11+ messages in thread
From: santosh shilimkar @ 2016-03-15 16:38 UTC (permalink / raw)
  To: Sowmini Varadhan, netdev; +Cc: davem

Hi Sowmini,

$subject
s/rds-tcp: /RDS: TCP:

On 3/15/2016 8:12 AM, Sowmini Varadhan wrote:
>
> Add per-net sysctl tunables to set the size of sndbuf and
> rcvbuf on the kernel tcp socket.
>
> The tunables are added at /proc/sys/net/rds/tcp/rds_tcp_sndbuf
> and /proc/sys/net/rds/tcp/rds_tcp_rcvbuf.
>
> These values must be set before accept() or connect(),
This will be hard to achieve with RDS being reconnect protocol
but since you have added a reset, the above no longer a
problem.

> and there may be an arbitrary number of existing rds-tcp
> sockets when the tunable is modified. To make sure that all
> connections in the netns pick up the same value for the tunable,
> we reset existing rds-tcp connections in the netns, so that
> they can reconnect with the new parameters.
>
> Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
> ---
> v2: use sysctl instead of module param. Tunabes are now per netns,
>      and can be dynamically modified without restarting all namespaces.
>
>   net/rds/tcp.c |  139 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
>   1 files changed, 129 insertions(+), 10 deletions(-)
>
> diff --git a/net/rds/tcp.c b/net/rds/tcp.c
> index ad60299..5d62fd2 100644
> --- a/net/rds/tcp.c
> +++ b/net/rds/tcp.c
> @@ -52,7 +52,27 @@ static LIST_HEAD(rds_tcp_conn_list);
>
>   static struct kmem_cache *rds_tcp_conn_slab;
>
> -#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024)
This seems to be unrelated change and left over from the
previous clean-up. Please do that in separate patch.

> +static int sndbuf_handler(struct ctl_table *ctl, int write,
> +			  void __user *buffer, size_t *lenp, loff_t *fpos);
> +static int rcvbuf_handler(struct ctl_table *ctl, int write,
> +			  void __user *buffer, size_t *lenp, loff_t *fpos);
> +static struct ctl_table rds_tcp_sysctl_table[] = {
> +	{
> +		.procname       = "rds_tcp_sndbuf",
> +		/* data is per-net pointer */
> +		.maxlen         = sizeof(int),
> +		.mode           = 0644,
> +		.proc_handler   = sndbuf_handler,
> +	},
> +	{
> +		.procname       = "rds_tcp_rcvbuf",
> +		/* data is per-net pointer */
> +		.maxlen         = sizeof(int),
> +		.mode           = 0644,
> +		.proc_handler   = rcvbuf_handler,
> +	},
> +	{ }
> +};
>
>   /* doing it this way avoids calling tcp_sk() */
>   void rds_tcp_nonagle(struct socket *sock)
> @@ -66,15 +86,6 @@ void rds_tcp_nonagle(struct socket *sock)
>   	set_fs(oldfs);
>   }
>
> -/* All module specific customizations to the RDS-TCP socket should be done in
> - * rds_tcp_tune() and applied after socket creation. In general these
> - * customizations should be tunable via module_param()
> - */
> -void rds_tcp_tune(struct socket *sock)
> -{
> -	rds_tcp_nonagle(sock);
> -}
> -
>   u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
>   {
>   	return tcp_sk(tc->t_sock->sk)->snd_nxt;
> @@ -272,8 +283,33 @@ static int rds_tcp_netid;
>   struct rds_tcp_net {
>   	struct socket *rds_tcp_listen_sock;
>   	struct work_struct rds_tcp_accept_w;
> +	struct ctl_table_header *rds_tcp_sysctl;
> +	int sndbuf_size;
> +	int rcvbuf_size;
>   };
>
> +/* All module specific customizations to the RDS-TCP socket should be done in
> + * rds_tcp_tune() and applied after socket creation.
> + */
> +void rds_tcp_tune(struct socket *sock)
> +{
> +	struct sock *sk = sock->sk;
> +	struct net *net = sock_net(sk);
> +	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
> +
> +	rds_tcp_nonagle(sock);
> +	lock_sock(sk);
> +	if (rtn->sndbuf_size > 0) {
So value of 1 is allowed as well. There should be some
minimum default or multiple of it. Of course above check
can remain as is as long as you validate the user input
in handlers.

> @@ -296,6 +332,13 @@ static __net_init int rds_tcp_init_net(struct net *net)
>   {
>   	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
>
> +	/* 0 value for {snd, rcv}buf_size implies we let the stack
> +	 * pick the default, and permit auto-tuning of buffer size.
> +	 */
> +	rtn->sndbuf_size = 0;
> +	rtn->rcvbuf_size = 0;
> +	rtn->rds_tcp_sysctl = register_net_sysctl(net, "net/rds/tcp",
> +						  rds_tcp_sysctl_table);
>   	rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net);
>   	if (!rtn->rds_tcp_listen_sock) {
>   		pr_warn("could not set up listen sock\n");
> @@ -309,6 +352,7 @@ static void __net_exit rds_tcp_exit_net(struct net *net)
>   {
>   	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
>
> +	unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
>   	/* If rds_tcp_exit_net() is called as a result of netns deletion,
>   	 * the rds_tcp_kill_sock() device notifier would already have cleaned
>   	 * up the listen socket, thus there is no work to do in this function.
You need to unregister it on rds_tcp_listen_init() failure.

[...]

> +/* when sysctl is used to modify some kernel socket parameters,this
> + * function  resets the RDS connections in that netns  so that we can
> + * restart with new parameters.  The assumption is that such reset
> + * events are few and far-between.
> + */
> +static void rds_tcp_sysctl_reset(struct net *net)
> +{
> +	struct rds_tcp_connection *tc, *_tc;
> +
> +	spin_lock_irq(&rds_tcp_conn_lock);
> +	list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
> +		struct net *c_net = read_pnet(&tc->conn->c_net);
> +
> +		if (net != c_net || !tc->t_sock)
> +			continue;
a next line will be good here.

> +		rds_conn_drop(tc->conn); /* reconnect with new parameters */
> +	}
> +	spin_unlock_irq(&rds_tcp_conn_lock);
> +}
> +
> +static int sndbuf_handler(struct ctl_table *ctl, int write,
> +			  void __user *buffer, size_t *lenp, loff_t *fpos)
> +{
> +	struct net *net = current->nsproxy->net_ns;
> +	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
> +	struct ctl_table tmp;
> +
> +	tmp = *ctl;
> +	tmp.data = &rtn->sndbuf_size;
> +	if (!write)
> +		return proc_dointvec(&tmp, write, buffer, lenp, fpos);
> +
> +	rtn->sndbuf_size = user_atoi(buffer, *lenp);
As mentioned above, you should make sure the buffer lengths are
legitimate.

Regards,
Santosh

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 net-next] rds-tcp: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket
  2016-03-15 16:38 ` santosh shilimkar
@ 2016-03-15 16:58   ` Sowmini Varadhan
  2016-03-15 17:18     ` Eric Dumazet
  0 siblings, 1 reply; 11+ messages in thread
From: Sowmini Varadhan @ 2016-03-15 16:58 UTC (permalink / raw)
  To: santosh shilimkar; +Cc: netdev, davem

On (03/15/16 09:38), santosh shilimkar wrote:
> >+	if (rtn->sndbuf_size > 0) {
> So value of 1 is allowed as well. There should be some
> minimum default or multiple of it. Of course above check
> can remain as is as long as you validate the user input
> in handlers.

yes, just as user-space SO_SNDBUF allows ridiculous values
for buffer size..

> >@@ -309,6 +352,7 @@ static void __net_exit rds_tcp_exit_net(struct net *net)
> >  {
> >  	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
> >
> >+	unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
> >  	/* If rds_tcp_exit_net() is called as a result of netns deletion,
> >  	 * the rds_tcp_kill_sock() device notifier would already have cleaned
> >  	 * up the listen socket, thus there is no work to do in this function.
> You need to unregister it on rds_tcp_listen_init() failure.

Ok, good point.

> >+	rtn->sndbuf_size = user_atoi(buffer, *lenp);
> As mentioned above, you should make sure the buffer lengths are
> legitimate.

As above. We allow any values that the TCP socket itself allows.
If the user wants to shoot themself in the foot, we dont stop them.

I'll fix the nits and send out another patchset in a bit. 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 net-next] rds-tcp: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket
  2016-03-15 16:58   ` Sowmini Varadhan
@ 2016-03-15 17:18     ` Eric Dumazet
  2016-03-15 17:30       ` Sowmini Varadhan
  0 siblings, 1 reply; 11+ messages in thread
From: Eric Dumazet @ 2016-03-15 17:18 UTC (permalink / raw)
  To: Sowmini Varadhan; +Cc: santosh shilimkar, netdev, davem

On Tue, 2016-03-15 at 12:58 -0400, Sowmini Varadhan wrote:
> On (03/15/16 09:38), santosh shilimkar wrote:
> > >+	if (rtn->sndbuf_size > 0) {
> > So value of 1 is allowed as well. There should be some
> > minimum default or multiple of it. Of course above check
> > can remain as is as long as you validate the user input
> > in handlers.
> 
> yes, just as user-space SO_SNDBUF allows ridiculous values
> for buffer size..

Well... Not really.

> 
> > >@@ -309,6 +352,7 @@ static void __net_exit rds_tcp_exit_net(struct net *net)
> > >  {
> > >  	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
> > >
> > >+	unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
> > >  	/* If rds_tcp_exit_net() is called as a result of netns deletion,
> > >  	 * the rds_tcp_kill_sock() device notifier would already have cleaned
> > >  	 * up the listen socket, thus there is no work to do in this function.
> > You need to unregister it on rds_tcp_listen_init() failure.
> 
> Ok, good point.
> 
> > >+	rtn->sndbuf_size = user_atoi(buffer, *lenp);
> > As mentioned above, you should make sure the buffer lengths are
> > legitimate.
> 
> As above. We allow any values that the TCP socket itself allows.
> If the user wants to shoot themself in the foot, we dont stop them.

Look at SO_SNDBUF and SO_RCVBUF implementation.

sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);

sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);

kernel definitely has some logic here.

If you believe SOCK_MIN_SNDBUF and/or SOCK_MIN_RCVBUF are wrong, please
elaborate.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 net-next] rds-tcp: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket
  2016-03-15 17:18     ` Eric Dumazet
@ 2016-03-15 17:30       ` Sowmini Varadhan
  2016-03-15 18:09         ` Eric Dumazet
  0 siblings, 1 reply; 11+ messages in thread
From: Sowmini Varadhan @ 2016-03-15 17:30 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: santosh shilimkar, netdev, davem

On (03/15/16 10:18), Eric Dumazet wrote:
> 
> Look at SO_SNDBUF and SO_RCVBUF implementation.
> 
> sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
> 
> sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
> 
> kernel definitely has some logic here.

Ok, I can do the same thing (and we do this consistently across
all drivers?)

> If you believe SOCK_MIN_SNDBUF and/or SOCK_MIN_RCVBUF are wrong, please
> elaborate.

I dont recall suggesting that. 

BTW, when I tried it, doing a SO_SNDBUF of 1 from uspace does not return
an error. It merely sets the buffer size to 4608 (as reported by
getsockopt in my env. I think the getsockopt value is impacted by
many factors).

--Sowmini

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 net-next] rds-tcp: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket
  2016-03-15 15:12 [PATCH v2 net-next] rds-tcp: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket Sowmini Varadhan
  2016-03-15 16:38 ` santosh shilimkar
@ 2016-03-15 17:30 ` Tom Herbert
  2016-03-15 17:34   ` Sowmini Varadhan
  1 sibling, 1 reply; 11+ messages in thread
From: Tom Herbert @ 2016-03-15 17:30 UTC (permalink / raw)
  To: Sowmini Varadhan
  Cc: Linux Kernel Network Developers, santosh.shilimkar,
	David S. Miller

On Tue, Mar 15, 2016 at 8:12 AM, Sowmini Varadhan
<sowmini.varadhan@oracle.com> wrote:
>
> Add per-net sysctl tunables to set the size of sndbuf and
> rcvbuf on the kernel tcp socket.
>
> The tunables are added at /proc/sys/net/rds/tcp/rds_tcp_sndbuf
> and /proc/sys/net/rds/tcp/rds_tcp_rcvbuf.
>
> These values must be set before accept() or connect(),
> and there may be an arbitrary number of existing rds-tcp
> sockets when the tunable is modified. To make sure that all
> connections in the netns pick up the same value for the tunable,
> we reset existing rds-tcp connections in the netns, so that
> they can reconnect with the new parameters.
>
> Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
> ---
> v2: use sysctl instead of module param. Tunabes are now per netns,
>     and can be dynamically modified without restarting all namespaces.
>
I still don't understand why you won't consider using netlink. sndbuf
and rcvbuf seem like just two of many arbitrary parameters that a user
might want to configure. I would think that being able to configure
the listener port (currently a fixed value), an address to bind to
(instead of always inaddr-any), or not automatically creating an RDS
listener in every namespace are at least as important configuration
parameters as these two.

Tom

>  net/rds/tcp.c |  139 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
>  1 files changed, 129 insertions(+), 10 deletions(-)
>
> diff --git a/net/rds/tcp.c b/net/rds/tcp.c
> index ad60299..5d62fd2 100644
> --- a/net/rds/tcp.c
> +++ b/net/rds/tcp.c
> @@ -52,7 +52,27 @@ static LIST_HEAD(rds_tcp_conn_list);
>
>  static struct kmem_cache *rds_tcp_conn_slab;
>
> -#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024)
> +static int sndbuf_handler(struct ctl_table *ctl, int write,
> +                         void __user *buffer, size_t *lenp, loff_t *fpos);
> +static int rcvbuf_handler(struct ctl_table *ctl, int write,
> +                         void __user *buffer, size_t *lenp, loff_t *fpos);
> +static struct ctl_table rds_tcp_sysctl_table[] = {
> +       {
> +               .procname       = "rds_tcp_sndbuf",
> +               /* data is per-net pointer */
> +               .maxlen         = sizeof(int),
> +               .mode           = 0644,
> +               .proc_handler   = sndbuf_handler,
> +       },
> +       {
> +               .procname       = "rds_tcp_rcvbuf",
> +               /* data is per-net pointer */
> +               .maxlen         = sizeof(int),
> +               .mode           = 0644,
> +               .proc_handler   = rcvbuf_handler,
> +       },
> +       { }
> +};
>
>  /* doing it this way avoids calling tcp_sk() */
>  void rds_tcp_nonagle(struct socket *sock)
> @@ -66,15 +86,6 @@ void rds_tcp_nonagle(struct socket *sock)
>         set_fs(oldfs);
>  }
>
> -/* All module specific customizations to the RDS-TCP socket should be done in
> - * rds_tcp_tune() and applied after socket creation. In general these
> - * customizations should be tunable via module_param()
> - */
> -void rds_tcp_tune(struct socket *sock)
> -{
> -       rds_tcp_nonagle(sock);
> -}
> -
>  u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
>  {
>         return tcp_sk(tc->t_sock->sk)->snd_nxt;
> @@ -272,8 +283,33 @@ static int rds_tcp_netid;
>  struct rds_tcp_net {
>         struct socket *rds_tcp_listen_sock;
>         struct work_struct rds_tcp_accept_w;
> +       struct ctl_table_header *rds_tcp_sysctl;
> +       int sndbuf_size;
> +       int rcvbuf_size;
>  };
>
> +/* All module specific customizations to the RDS-TCP socket should be done in
> + * rds_tcp_tune() and applied after socket creation.
> + */
> +void rds_tcp_tune(struct socket *sock)
> +{
> +       struct sock *sk = sock->sk;
> +       struct net *net = sock_net(sk);
> +       struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
> +
> +       rds_tcp_nonagle(sock);
> +       lock_sock(sk);
> +       if (rtn->sndbuf_size > 0) {
> +               sk->sk_sndbuf = rtn->sndbuf_size;
> +               sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
> +       }
> +       if (rtn->rcvbuf_size > 0) {
> +               sk->sk_rcvbuf = rtn->rcvbuf_size;
> +               sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
> +       }
> +       release_sock(sk);
> +}
> +
>  static void rds_tcp_accept_worker(struct work_struct *work)
>  {
>         struct rds_tcp_net *rtn = container_of(work,
> @@ -296,6 +332,13 @@ static __net_init int rds_tcp_init_net(struct net *net)
>  {
>         struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
>
> +       /* 0 value for {snd, rcv}buf_size implies we let the stack
> +        * pick the default, and permit auto-tuning of buffer size.
> +        */
> +       rtn->sndbuf_size = 0;
> +       rtn->rcvbuf_size = 0;
> +       rtn->rds_tcp_sysctl = register_net_sysctl(net, "net/rds/tcp",
> +                                                 rds_tcp_sysctl_table);
>         rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net);
>         if (!rtn->rds_tcp_listen_sock) {
>                 pr_warn("could not set up listen sock\n");
> @@ -309,6 +352,7 @@ static void __net_exit rds_tcp_exit_net(struct net *net)
>  {
>         struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
>
> +       unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
>         /* If rds_tcp_exit_net() is called as a result of netns deletion,
>          * the rds_tcp_kill_sock() device notifier would already have cleaned
>          * up the listen socket, thus there is no work to do in this function.
> @@ -383,6 +427,81 @@ static struct notifier_block rds_tcp_dev_notifier = {
>         .priority = -10, /* must be called after other network notifiers */
>  };
>
> +static int user_atoi(char __user *ubuf, size_t len)
> +{
> +       char buf[16];
> +       unsigned long ret;
> +       int err;
> +
> +       if (len > 15)
> +               return -EINVAL;
> +
> +       if (copy_from_user(buf, ubuf, len))
> +               return -EFAULT;
> +
> +       buf[len] = 0;
> +       err =  kstrtoul(buf, 0, &ret);
> +       if (err != 0)
> +               return -ERANGE;
> +       return  ret;
> +}
> +
> +/* when sysctl is used to modify some kernel socket parameters,this
> + * function  resets the RDS connections in that netns  so that we can
> + * restart with new parameters.  The assumption is that such reset
> + * events are few and far-between.
> + */
> +static void rds_tcp_sysctl_reset(struct net *net)
> +{
> +       struct rds_tcp_connection *tc, *_tc;
> +
> +       spin_lock_irq(&rds_tcp_conn_lock);
> +       list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
> +               struct net *c_net = read_pnet(&tc->conn->c_net);
> +
> +               if (net != c_net || !tc->t_sock)
> +                       continue;
> +               rds_conn_drop(tc->conn); /* reconnect with new parameters */
> +       }
> +       spin_unlock_irq(&rds_tcp_conn_lock);
> +}
> +
> +static int sndbuf_handler(struct ctl_table *ctl, int write,
> +                         void __user *buffer, size_t *lenp, loff_t *fpos)
> +{
> +       struct net *net = current->nsproxy->net_ns;
> +       struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
> +       struct ctl_table tmp;
> +
> +       tmp = *ctl;
> +       tmp.data = &rtn->sndbuf_size;
> +       if (!write)
> +               return proc_dointvec(&tmp, write, buffer, lenp, fpos);
> +
> +       rtn->sndbuf_size = user_atoi(buffer, *lenp);
> +       rds_tcp_sysctl_reset(net);
> +
> +       return 0;
> +}
> +
> +static int rcvbuf_handler(struct ctl_table *ctl, int write,
> +                         void __user *buffer, size_t *lenp, loff_t *fpos)
> +{
> +       struct net *net = current->nsproxy->net_ns;
> +       struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
> +       struct ctl_table tmp;
> +
> +       tmp = *ctl;
> +       tmp.data = &rtn->rcvbuf_size;
> +       if (!write)
> +               return proc_dointvec(&tmp, write, buffer, lenp, fpos);
> +
> +       rtn->rcvbuf_size = user_atoi(buffer, *lenp);
> +       rds_tcp_sysctl_reset(net);
> +
> +       return 0;
> +}
> +
>  static void rds_tcp_exit(void)
>  {
>         rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
> --
> 1.7.1
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 net-next] rds-tcp: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket
  2016-03-15 17:30 ` Tom Herbert
@ 2016-03-15 17:34   ` Sowmini Varadhan
  2016-03-15 17:47     ` Tom Herbert
  0 siblings, 1 reply; 11+ messages in thread
From: Sowmini Varadhan @ 2016-03-15 17:34 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Linux Kernel Network Developers, santosh.shilimkar,
	David S. Miller

On (03/15/16 10:30), Tom Herbert wrote:
> I still don't understand why you won't consider using netlink. sndbuf
> and rcvbuf seem like just two of many arbitrary parameters that a user
> might want to configure. 

> I would think that being able to configure
> the listener port (currently a fixed value), an address to bind to

the listener port is properly registered with IANA. I dont think
we will need to change that, any more than ssh/nfs ports will need 
to change.

> (instead of always inaddr-any), or not automatically creating an RDS
> listener in every namespace are at least as important configuration
> parameters as these two.

Those parameters can be added via sysctl if needed- we have not
seen the need for that yet.

And if something comes up that needs the netlink etc. we can
add it down the road. It's just not needed now.

Thanks
--Sowmini
> 
> Tom
> 
> >  net/rds/tcp.c |  139 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
> >  1 files changed, 129 insertions(+), 10 deletions(-)
> >
> > diff --git a/net/rds/tcp.c b/net/rds/tcp.c
> > index ad60299..5d62fd2 100644
> > --- a/net/rds/tcp.c
> > +++ b/net/rds/tcp.c
> > @@ -52,7 +52,27 @@ static LIST_HEAD(rds_tcp_conn_list);
> >
> >  static struct kmem_cache *rds_tcp_conn_slab;
> >
> > -#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024)
> > +static int sndbuf_handler(struct ctl_table *ctl, int write,
> > +                         void __user *buffer, size_t *lenp, loff_t *fpos);
> > +static int rcvbuf_handler(struct ctl_table *ctl, int write,
> > +                         void __user *buffer, size_t *lenp, loff_t *fpos);
> > +static struct ctl_table rds_tcp_sysctl_table[] = {
> > +       {
> > +               .procname       = "rds_tcp_sndbuf",
> > +               /* data is per-net pointer */
> > +               .maxlen         = sizeof(int),
> > +               .mode           = 0644,
> > +               .proc_handler   = sndbuf_handler,
> > +       },
> > +       {
> > +               .procname       = "rds_tcp_rcvbuf",
> > +               /* data is per-net pointer */
> > +               .maxlen         = sizeof(int),
> > +               .mode           = 0644,
> > +               .proc_handler   = rcvbuf_handler,
> > +       },
> > +       { }
> > +};
> >
> >  /* doing it this way avoids calling tcp_sk() */
> >  void rds_tcp_nonagle(struct socket *sock)
> > @@ -66,15 +86,6 @@ void rds_tcp_nonagle(struct socket *sock)
> >         set_fs(oldfs);
> >  }
> >
> > -/* All module specific customizations to the RDS-TCP socket should be done in
> > - * rds_tcp_tune() and applied after socket creation. In general these
> > - * customizations should be tunable via module_param()
> > - */
> > -void rds_tcp_tune(struct socket *sock)
> > -{
> > -       rds_tcp_nonagle(sock);
> > -}
> > -
> >  u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
> >  {
> >         return tcp_sk(tc->t_sock->sk)->snd_nxt;
> > @@ -272,8 +283,33 @@ static int rds_tcp_netid;
> >  struct rds_tcp_net {
> >         struct socket *rds_tcp_listen_sock;
> >         struct work_struct rds_tcp_accept_w;
> > +       struct ctl_table_header *rds_tcp_sysctl;
> > +       int sndbuf_size;
> > +       int rcvbuf_size;
> >  };
> >
> > +/* All module specific customizations to the RDS-TCP socket should be done in
> > + * rds_tcp_tune() and applied after socket creation.
> > + */
> > +void rds_tcp_tune(struct socket *sock)
> > +{
> > +       struct sock *sk = sock->sk;
> > +       struct net *net = sock_net(sk);
> > +       struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
> > +
> > +       rds_tcp_nonagle(sock);
> > +       lock_sock(sk);
> > +       if (rtn->sndbuf_size > 0) {
> > +               sk->sk_sndbuf = rtn->sndbuf_size;
> > +               sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
> > +       }
> > +       if (rtn->rcvbuf_size > 0) {
> > +               sk->sk_rcvbuf = rtn->rcvbuf_size;
> > +               sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
> > +       }
> > +       release_sock(sk);
> > +}
> > +
> >  static void rds_tcp_accept_worker(struct work_struct *work)
> >  {
> >         struct rds_tcp_net *rtn = container_of(work,
> > @@ -296,6 +332,13 @@ static __net_init int rds_tcp_init_net(struct net *net)
> >  {
> >         struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
> >
> > +       /* 0 value for {snd, rcv}buf_size implies we let the stack
> > +        * pick the default, and permit auto-tuning of buffer size.
> > +        */
> > +       rtn->sndbuf_size = 0;
> > +       rtn->rcvbuf_size = 0;
> > +       rtn->rds_tcp_sysctl = register_net_sysctl(net, "net/rds/tcp",
> > +                                                 rds_tcp_sysctl_table);
> >         rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net);
> >         if (!rtn->rds_tcp_listen_sock) {
> >                 pr_warn("could not set up listen sock\n");
> > @@ -309,6 +352,7 @@ static void __net_exit rds_tcp_exit_net(struct net *net)
> >  {
> >         struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
> >
> > +       unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
> >         /* If rds_tcp_exit_net() is called as a result of netns deletion,
> >          * the rds_tcp_kill_sock() device notifier would already have cleaned
> >          * up the listen socket, thus there is no work to do in this function.
> > @@ -383,6 +427,81 @@ static struct notifier_block rds_tcp_dev_notifier = {
> >         .priority = -10, /* must be called after other network notifiers */
> >  };
> >
> > +static int user_atoi(char __user *ubuf, size_t len)
> > +{
> > +       char buf[16];
> > +       unsigned long ret;
> > +       int err;
> > +
> > +       if (len > 15)
> > +               return -EINVAL;
> > +
> > +       if (copy_from_user(buf, ubuf, len))
> > +               return -EFAULT;
> > +
> > +       buf[len] = 0;
> > +       err =  kstrtoul(buf, 0, &ret);
> > +       if (err != 0)
> > +               return -ERANGE;
> > +       return  ret;
> > +}
> > +
> > +/* when sysctl is used to modify some kernel socket parameters,this
> > + * function  resets the RDS connections in that netns  so that we can
> > + * restart with new parameters.  The assumption is that such reset
> > + * events are few and far-between.
> > + */
> > +static void rds_tcp_sysctl_reset(struct net *net)
> > +{
> > +       struct rds_tcp_connection *tc, *_tc;
> > +
> > +       spin_lock_irq(&rds_tcp_conn_lock);
> > +       list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
> > +               struct net *c_net = read_pnet(&tc->conn->c_net);
> > +
> > +               if (net != c_net || !tc->t_sock)
> > +                       continue;
> > +               rds_conn_drop(tc->conn); /* reconnect with new parameters */
> > +       }
> > +       spin_unlock_irq(&rds_tcp_conn_lock);
> > +}
> > +
> > +static int sndbuf_handler(struct ctl_table *ctl, int write,
> > +                         void __user *buffer, size_t *lenp, loff_t *fpos)
> > +{
> > +       struct net *net = current->nsproxy->net_ns;
> > +       struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
> > +       struct ctl_table tmp;
> > +
> > +       tmp = *ctl;
> > +       tmp.data = &rtn->sndbuf_size;
> > +       if (!write)
> > +               return proc_dointvec(&tmp, write, buffer, lenp, fpos);
> > +
> > +       rtn->sndbuf_size = user_atoi(buffer, *lenp);
> > +       rds_tcp_sysctl_reset(net);
> > +
> > +       return 0;
> > +}
> > +
> > +static int rcvbuf_handler(struct ctl_table *ctl, int write,
> > +                         void __user *buffer, size_t *lenp, loff_t *fpos)
> > +{
> > +       struct net *net = current->nsproxy->net_ns;
> > +       struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
> > +       struct ctl_table tmp;
> > +
> > +       tmp = *ctl;
> > +       tmp.data = &rtn->rcvbuf_size;
> > +       if (!write)
> > +               return proc_dointvec(&tmp, write, buffer, lenp, fpos);
> > +
> > +       rtn->rcvbuf_size = user_atoi(buffer, *lenp);
> > +       rds_tcp_sysctl_reset(net);
> > +
> > +       return 0;
> > +}
> > +
> >  static void rds_tcp_exit(void)
> >  {
> >         rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
> > --
> > 1.7.1
> >

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 net-next] rds-tcp: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket
  2016-03-15 17:34   ` Sowmini Varadhan
@ 2016-03-15 17:47     ` Tom Herbert
  2016-03-15 19:57       ` Sowmini Varadhan
  0 siblings, 1 reply; 11+ messages in thread
From: Tom Herbert @ 2016-03-15 17:47 UTC (permalink / raw)
  To: Sowmini Varadhan
  Cc: Linux Kernel Network Developers, santosh.shilimkar,
	David S. Miller

On Tue, Mar 15, 2016 at 10:34 AM, Sowmini Varadhan
<sowmini.varadhan@oracle.com> wrote:
> On (03/15/16 10:30), Tom Herbert wrote:
>> I still don't understand why you won't consider using netlink. sndbuf
>> and rcvbuf seem like just two of many arbitrary parameters that a user
>> might want to configure.
>
>> I would think that being able to configure
>> the listener port (currently a fixed value), an address to bind to
>
> the listener port is properly registered with IANA. I dont think
> we will need to change that, any more than ssh/nfs ports will need
> to change.
>
Both sshd and nfsd and allow configurable listener port numbers. Any
listener service will allow a configurable port number. An IANA port
number is good as a default, but there are many reasons why people
want or need to use a different port number. I don't see what makes
RDS special in this regard.

>> (instead of always inaddr-any), or not automatically creating an RDS
>> listener in every namespace are at least as important configuration
>> parameters as these two.
>
> Those parameters can be added via sysctl if needed- we have not
> seen the need for that yet.
>
> And if something comes up that needs the netlink etc. we can
> add it down the road. It's just not needed now.
>
> Thanks
> --Sowmini
>>
>> Tom
>>
>> >  net/rds/tcp.c |  139 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
>> >  1 files changed, 129 insertions(+), 10 deletions(-)
>> >
>> > diff --git a/net/rds/tcp.c b/net/rds/tcp.c
>> > index ad60299..5d62fd2 100644
>> > --- a/net/rds/tcp.c
>> > +++ b/net/rds/tcp.c
>> > @@ -52,7 +52,27 @@ static LIST_HEAD(rds_tcp_conn_list);
>> >
>> >  static struct kmem_cache *rds_tcp_conn_slab;
>> >
>> > -#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024)
>> > +static int sndbuf_handler(struct ctl_table *ctl, int write,
>> > +                         void __user *buffer, size_t *lenp, loff_t *fpos);
>> > +static int rcvbuf_handler(struct ctl_table *ctl, int write,
>> > +                         void __user *buffer, size_t *lenp, loff_t *fpos);
>> > +static struct ctl_table rds_tcp_sysctl_table[] = {
>> > +       {
>> > +               .procname       = "rds_tcp_sndbuf",
>> > +               /* data is per-net pointer */
>> > +               .maxlen         = sizeof(int),
>> > +               .mode           = 0644,
>> > +               .proc_handler   = sndbuf_handler,
>> > +       },
>> > +       {
>> > +               .procname       = "rds_tcp_rcvbuf",
>> > +               /* data is per-net pointer */
>> > +               .maxlen         = sizeof(int),
>> > +               .mode           = 0644,
>> > +               .proc_handler   = rcvbuf_handler,
>> > +       },
>> > +       { }
>> > +};
>> >
>> >  /* doing it this way avoids calling tcp_sk() */
>> >  void rds_tcp_nonagle(struct socket *sock)
>> > @@ -66,15 +86,6 @@ void rds_tcp_nonagle(struct socket *sock)
>> >         set_fs(oldfs);
>> >  }
>> >
>> > -/* All module specific customizations to the RDS-TCP socket should be done in
>> > - * rds_tcp_tune() and applied after socket creation. In general these
>> > - * customizations should be tunable via module_param()
>> > - */
>> > -void rds_tcp_tune(struct socket *sock)
>> > -{
>> > -       rds_tcp_nonagle(sock);
>> > -}
>> > -
>> >  u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
>> >  {
>> >         return tcp_sk(tc->t_sock->sk)->snd_nxt;
>> > @@ -272,8 +283,33 @@ static int rds_tcp_netid;
>> >  struct rds_tcp_net {
>> >         struct socket *rds_tcp_listen_sock;
>> >         struct work_struct rds_tcp_accept_w;
>> > +       struct ctl_table_header *rds_tcp_sysctl;
>> > +       int sndbuf_size;
>> > +       int rcvbuf_size;
>> >  };
>> >
>> > +/* All module specific customizations to the RDS-TCP socket should be done in
>> > + * rds_tcp_tune() and applied after socket creation.
>> > + */
>> > +void rds_tcp_tune(struct socket *sock)
>> > +{
>> > +       struct sock *sk = sock->sk;
>> > +       struct net *net = sock_net(sk);
>> > +       struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
>> > +
>> > +       rds_tcp_nonagle(sock);
>> > +       lock_sock(sk);
>> > +       if (rtn->sndbuf_size > 0) {
>> > +               sk->sk_sndbuf = rtn->sndbuf_size;
>> > +               sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
>> > +       }
>> > +       if (rtn->rcvbuf_size > 0) {
>> > +               sk->sk_rcvbuf = rtn->rcvbuf_size;
>> > +               sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
>> > +       }
>> > +       release_sock(sk);
>> > +}
>> > +
>> >  static void rds_tcp_accept_worker(struct work_struct *work)
>> >  {
>> >         struct rds_tcp_net *rtn = container_of(work,
>> > @@ -296,6 +332,13 @@ static __net_init int rds_tcp_init_net(struct net *net)
>> >  {
>> >         struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
>> >
>> > +       /* 0 value for {snd, rcv}buf_size implies we let the stack
>> > +        * pick the default, and permit auto-tuning of buffer size.
>> > +        */
>> > +       rtn->sndbuf_size = 0;
>> > +       rtn->rcvbuf_size = 0;
>> > +       rtn->rds_tcp_sysctl = register_net_sysctl(net, "net/rds/tcp",
>> > +                                                 rds_tcp_sysctl_table);
>> >         rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net);
>> >         if (!rtn->rds_tcp_listen_sock) {
>> >                 pr_warn("could not set up listen sock\n");
>> > @@ -309,6 +352,7 @@ static void __net_exit rds_tcp_exit_net(struct net *net)
>> >  {
>> >         struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
>> >
>> > +       unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
>> >         /* If rds_tcp_exit_net() is called as a result of netns deletion,
>> >          * the rds_tcp_kill_sock() device notifier would already have cleaned
>> >          * up the listen socket, thus there is no work to do in this function.
>> > @@ -383,6 +427,81 @@ static struct notifier_block rds_tcp_dev_notifier = {
>> >         .priority = -10, /* must be called after other network notifiers */
>> >  };
>> >
>> > +static int user_atoi(char __user *ubuf, size_t len)
>> > +{
>> > +       char buf[16];
>> > +       unsigned long ret;
>> > +       int err;
>> > +
>> > +       if (len > 15)
>> > +               return -EINVAL;
>> > +
>> > +       if (copy_from_user(buf, ubuf, len))
>> > +               return -EFAULT;
>> > +
>> > +       buf[len] = 0;
>> > +       err =  kstrtoul(buf, 0, &ret);
>> > +       if (err != 0)
>> > +               return -ERANGE;
>> > +       return  ret;
>> > +}
>> > +
>> > +/* when sysctl is used to modify some kernel socket parameters,this
>> > + * function  resets the RDS connections in that netns  so that we can
>> > + * restart with new parameters.  The assumption is that such reset
>> > + * events are few and far-between.
>> > + */
>> > +static void rds_tcp_sysctl_reset(struct net *net)
>> > +{
>> > +       struct rds_tcp_connection *tc, *_tc;
>> > +
>> > +       spin_lock_irq(&rds_tcp_conn_lock);
>> > +       list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
>> > +               struct net *c_net = read_pnet(&tc->conn->c_net);
>> > +
>> > +               if (net != c_net || !tc->t_sock)
>> > +                       continue;
>> > +               rds_conn_drop(tc->conn); /* reconnect with new parameters */
>> > +       }
>> > +       spin_unlock_irq(&rds_tcp_conn_lock);
>> > +}
>> > +
>> > +static int sndbuf_handler(struct ctl_table *ctl, int write,
>> > +                         void __user *buffer, size_t *lenp, loff_t *fpos)
>> > +{
>> > +       struct net *net = current->nsproxy->net_ns;
>> > +       struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
>> > +       struct ctl_table tmp;
>> > +
>> > +       tmp = *ctl;
>> > +       tmp.data = &rtn->sndbuf_size;
>> > +       if (!write)
>> > +               return proc_dointvec(&tmp, write, buffer, lenp, fpos);
>> > +
>> > +       rtn->sndbuf_size = user_atoi(buffer, *lenp);
>> > +       rds_tcp_sysctl_reset(net);
>> > +
>> > +       return 0;
>> > +}
>> > +
>> > +static int rcvbuf_handler(struct ctl_table *ctl, int write,
>> > +                         void __user *buffer, size_t *lenp, loff_t *fpos)
>> > +{
>> > +       struct net *net = current->nsproxy->net_ns;
>> > +       struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
>> > +       struct ctl_table tmp;
>> > +
>> > +       tmp = *ctl;
>> > +       tmp.data = &rtn->rcvbuf_size;
>> > +       if (!write)
>> > +               return proc_dointvec(&tmp, write, buffer, lenp, fpos);
>> > +
>> > +       rtn->rcvbuf_size = user_atoi(buffer, *lenp);
>> > +       rds_tcp_sysctl_reset(net);
>> > +
>> > +       return 0;
>> > +}
>> > +
>> >  static void rds_tcp_exit(void)
>> >  {
>> >         rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
>> > --
>> > 1.7.1
>> >

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 net-next] rds-tcp: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket
  2016-03-15 17:30       ` Sowmini Varadhan
@ 2016-03-15 18:09         ` Eric Dumazet
  2016-03-15 18:16           ` Sowmini Varadhan
  0 siblings, 1 reply; 11+ messages in thread
From: Eric Dumazet @ 2016-03-15 18:09 UTC (permalink / raw)
  To: Sowmini Varadhan; +Cc: santosh shilimkar, netdev, davem

On Tue, 2016-03-15 at 13:30 -0400, Sowmini Varadhan wrote:
> On (03/15/16 10:18), Eric Dumazet wrote:
> > 
> > Look at SO_SNDBUF and SO_RCVBUF implementation.
> > 
> > sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
> > 
> > sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
> > 
> > kernel definitely has some logic here.
> 
> Ok, I can do the same thing (and we do this consistently across
> all drivers?)
> 
> > If you believe SOCK_MIN_SNDBUF and/or SOCK_MIN_RCVBUF are wrong, please
> > elaborate.
> 
> I dont recall suggesting that.

You said "just as user-space SO_SNDBUF allows ridiculous values
for buffer size.."

So I understood you believe SOCK_MIN_SNDBUF and/or SOCK_MIN_RCVBUF are
ridiculous ;)


> 
> BTW, when I tried it, doing a SO_SNDBUF of 1 from uspace does not return
> an error. It merely sets the buffer size to 4608 (as reported by
> getsockopt in my env. I think the getsockopt value is impacted by
> many factors).

I pointed to you the actual code.

sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);


No error is returned. kernel enforces a minimal value.

#define SOCK_MIN_SNDBUF         (TCP_SKB_MIN_TRUESIZE * 2)
#define TCP_SKB_MIN_TRUESIZE    (2048 + SKB_DATA_ALIGN(sizeof(struct
sk_buff)))

-> 2 * (2048 + 256) = 4608  given current sk_buff overhead (that might
change in linux 5.4 ... )

But again if your sysctl allows to set a value below SOCK_MIN_SNDBUF,
that might be a problem, because stack could have a hidden bug for very
small values of sndbuf/rcvbuf. 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 net-next] rds-tcp: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket
  2016-03-15 18:09         ` Eric Dumazet
@ 2016-03-15 18:16           ` Sowmini Varadhan
  0 siblings, 0 replies; 11+ messages in thread
From: Sowmini Varadhan @ 2016-03-15 18:16 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: santosh shilimkar, netdev, davem

On (03/15/16 11:09), Eric Dumazet wrote:
> 
> You said "just as user-space SO_SNDBUF allows ridiculous values
> for buffer size.."
> 
> So I understood you believe SOCK_MIN_SNDBUF and/or SOCK_MIN_RCVBUF are
> ridiculous ;)

No, no! I was saying that as a clueless user-space app, I can SO_SNDBUF
to 1, and happily think that everything is fine (error == 0), when in
reality the kernel has helpfully fixed up the value for me..

> I pointed to you the actual code.
> 
> sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);

yes, I'm in the process of changing rds-tcp now (doing sanity tests
etc on it, will send out update in a short while)

> 
> 
> No error is returned. kernel enforces a minimal value.
> 
> #define SOCK_MIN_SNDBUF         (TCP_SKB_MIN_TRUESIZE * 2)
> #define TCP_SKB_MIN_TRUESIZE    (2048 + SKB_DATA_ALIGN(sizeof(struct
> sk_buff)))
> 
> -> 2 * (2048 + 256) = 4608  given current sk_buff overhead (that might
> change in linux 5.4 ... )

Yes, I've seen the comments somewhere (in sock_setsockopt?)

it's a bit unexpectd for someone coming from bsd/solaris because
the value returned by getsockopt is quite unpredictable (as you
point out, depends on the kernel version among other things).

> But again if your sysctl allows to set a value below SOCK_MIN_SNDBUF,
> that might be a problem, because stack could have a hidden bug for very
> small values of sndbuf/rcvbuf. 

sure, fixing/testing it as I write this.

--Sowmini

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 net-next] rds-tcp: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket
  2016-03-15 17:47     ` Tom Herbert
@ 2016-03-15 19:57       ` Sowmini Varadhan
  0 siblings, 0 replies; 11+ messages in thread
From: Sowmini Varadhan @ 2016-03-15 19:57 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Linux Kernel Network Developers, santosh.shilimkar,
	David S. Miller

On (03/15/16 10:47), Tom Herbert wrote:
> Both sshd and nfsd and allow configurable listener port numbers. Any
> listener service will allow a configurable port number. An IANA port
> number is good as a default, but there are many reasons why people
> want or need to use a different port number. I don't see what makes
> RDS special in this regard.

Tom,

TCP is supposed to be the transparent infra for RDS-TCP.
If the server listens at something other than the well-known port,
the client kernel tcp socket has to know the value of the port to
connect to.  That means you also have to push down configuration
at each node, saying "client IP address X is to be connected at port Y"
etc. That is a lot of configuration information to manage in a cluster.
In addition to the volume of information that needs to be managed,
it also makes connectivity problems hard to trouble-shoot.

It has the danger of being a solution looking for a problem, because
we have not seen the use-case for this so far.

Thus to repeat what I said earlier,
 *if* some such need arises
and
 *if* there is absolutely no way to solve it with the existing infra,
there is nothing preventing a design extension to the daemon approach
in the future.

By itself, the sysctl support adds value and can co-exist with those
extensions.

--Sowmini

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2016-03-15 19:57 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-03-15 15:12 [PATCH v2 net-next] rds-tcp: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket Sowmini Varadhan
2016-03-15 16:38 ` santosh shilimkar
2016-03-15 16:58   ` Sowmini Varadhan
2016-03-15 17:18     ` Eric Dumazet
2016-03-15 17:30       ` Sowmini Varadhan
2016-03-15 18:09         ` Eric Dumazet
2016-03-15 18:16           ` Sowmini Varadhan
2016-03-15 17:30 ` Tom Herbert
2016-03-15 17:34   ` Sowmini Varadhan
2016-03-15 17:47     ` Tom Herbert
2016-03-15 19:57       ` Sowmini Varadhan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).