* [net-next RFC 1/4] bindtosubnet: infrastructure
2016-03-16 13:19 [net-next RFC 0/4] SO_BINDTOSUBNET Gilberto Bertin
@ 2016-03-16 13:19 ` Gilberto Bertin
2016-03-16 13:19 ` [net-next RFC 2/4] bindtosubnet: TCP/IPv4 implementation Gilberto Bertin
` (3 subsequent siblings)
4 siblings, 0 replies; 15+ messages in thread
From: Gilberto Bertin @ 2016-03-16 13:19 UTC (permalink / raw)
To: netdev; +Cc: Gilberto Bertin
Signed-off-by: Gilberto Bertin <gilberto.bertin@gmail.com>
---
include/net/sock.h | 20 +++++++
include/uapi/asm-generic/socket.h | 1 +
net/core/sock.c | 111 ++++++++++++++++++++++++++++++++++++++
3 files changed, 132 insertions(+)
diff --git a/include/net/sock.h b/include/net/sock.h
index f5ea148..c115c48 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -109,6 +109,16 @@ typedef struct {
#endif
} socket_lock_t;
+struct ipv4_subnet { /* IPv4 subnet handed to SO_BINDTOSUBNET: network address plus prefix length */
+ __be32 net; /* network address, big-endian as the __be32 type indicates */
+ u_char plen; /* prefix length in bits; sock_setbindtosubnet() rejects values above 32 */
+};
+
+struct ipv6_subnet { /* IPv6 subnet handed to SO_BINDTOSUBNET: network address plus prefix length */
+ struct in6_addr net; /* network address */
+ u_char plen; /* prefix length in bits; sock_setbindtosubnet() rejects values above 128 */
+};
+
struct sock;
struct proto;
struct net;
@@ -176,6 +186,13 @@ struct sock_common {
unsigned char skc_ipv6only:1;
unsigned char skc_net_refcnt:1;
int skc_bound_dev_if;
+
+ unsigned char skc_bind_to_subnet;
+ union {
+ struct ipv4_subnet skc_bind_subnet4;
+ struct ipv6_subnet skc_bind_subnet6;
+ };
+
union {
struct hlist_node skc_bind_node;
struct hlist_nulls_node skc_portaddr_node;
@@ -327,6 +344,9 @@ struct sock {
#define sk_state __sk_common.skc_state
#define sk_reuse __sk_common.skc_reuse
#define sk_reuseport __sk_common.skc_reuseport
+#define sk_bind_to_subnet __sk_common.skc_bind_to_subnet
+#define sk_bind_subnet4 __sk_common.skc_bind_subnet4
+#define sk_bind_subnet6 __sk_common.skc_bind_subnet6
#define sk_ipv6only __sk_common.skc_ipv6only
#define sk_net_refcnt __sk_common.skc_net_refcnt
#define sk_bound_dev_if __sk_common.skc_bound_dev_if
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index fb8a416..b4bcac2 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -30,6 +30,7 @@
#define SO_SNDLOWAT 19
#define SO_RCVTIMEO 20
#define SO_SNDTIMEO 21
+#define SO_BINDTOSUBNET 22
#endif
/* Security levels - as per NRL IPv6 - don't actually do anything */
diff --git a/net/core/sock.c b/net/core/sock.c
index 6c1c8bc..7626153 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -571,6 +571,68 @@ out:
return ret;
}
+/*
+ * sock_setbindtosubnet - handle setsockopt(SOL_SOCKET, SO_BINDTOSUBNET)
+ * @sk:     socket being configured
+ * @optval: user pointer to a struct ipv4_subnet (AF_INET socket) or a
+ *          struct ipv6_subnet (AF_INET6 socket)
+ * @optlen: size of the user buffer; must be exactly the size of that struct
+ *
+ * Stores the subnet on the socket, sets sk_bind_to_subnet and calls
+ * sk_dst_reset(), all under the socket lock.
+ *
+ * Returns 0 on success, -ENOPROTOOPT for any other address family,
+ * -EINVAL for a wrong @optlen or an out-of-range prefix length, and
+ * -EFAULT when the user buffer cannot be read.
+ */
+static int sock_setbindtosubnet(struct sock *sk, char __user *optval,
+				int optlen)
+{
+	int ret = -ENOPROTOOPT;
+
+	if (sk->sk_family == AF_INET) {
+		struct ipv4_subnet bind_subnet4;
+
+		/* A wrong length is a bad argument, not a bad address. */
+		ret = -EINVAL;
+		if (optlen != sizeof(struct ipv4_subnet))
+			goto out;
+
+		ret = -EFAULT;
+		if (copy_from_user(&bind_subnet4, optval,
+				   sizeof(struct ipv4_subnet)))
+			goto out;
+
+		ret = -EINVAL;
+		if (bind_subnet4.plen > 32)
+			goto out;
+
+		lock_sock(sk);
+
+		sk->sk_bind_to_subnet = 1;
+		sk->sk_bind_subnet4.net = bind_subnet4.net;
+		sk->sk_bind_subnet4.plen = bind_subnet4.plen;
+		sk_dst_reset(sk);
+
+		release_sock(sk);
+
+		ret = 0;
+	} else if (sk->sk_family == AF_INET6) {
+		struct ipv6_subnet bind_subnet6;
+
+		/* Same convention as the IPv4 branch: bad length is -EINVAL. */
+		ret = -EINVAL;
+		if (optlen != sizeof(struct ipv6_subnet))
+			goto out;
+
+		ret = -EFAULT;
+		if (copy_from_user(&bind_subnet6, optval,
+				   sizeof(struct ipv6_subnet)))
+			goto out;
+
+		ret = -EINVAL;
+		if (bind_subnet6.plen > 128)
+			goto out;
+
+		lock_sock(sk);
+
+		sk->sk_bind_to_subnet = 1;
+		memcpy(&sk->sk_bind_subnet6.net, &bind_subnet6.net,
+		       sizeof(struct in6_addr));
+		sk->sk_bind_subnet6.plen = bind_subnet6.plen;
+		sk_dst_reset(sk);
+
+		release_sock(sk);
+
+		ret = 0;
+	}
+
+out:
+	return ret;
+}
+
static int sock_getbindtodevice(struct sock *sk, char __user *optval,
int __user *optlen, int len)
{
@@ -611,6 +673,49 @@ out:
return ret;
}
+/*
+ * SO_BINDTOSUBNET getsockopt handler.
+ *
+ * Copies the subnet stored on @sk to @optval and reports the number of
+ * bytes copied through @optlen.  When no subnet has been set nothing is
+ * copied and a length of 0 is reported.  @len is the buffer size supplied
+ * by the caller.
+ *
+ * Returns 0 on success, -EINVAL when @len is smaller than the subnet
+ * structure for the socket's family, and -EFAULT when a user copy fails.
+ */
+static int sock_getbindtosubnet(struct sock *sk, char __user *optval,
+				int __user *optlen, int len)
+{
+	const void *subnet = NULL;
+	size_t subnet_size = 0;
+
+	/* Pick the stored descriptor that matches the socket's family. */
+	if (sk->sk_bind_to_subnet == 0) {
+		len = 0;
+	} else if (sk->sk_family == AF_INET) {
+		subnet = &sk->sk_bind_subnet4;
+		subnet_size = sizeof(struct ipv4_subnet);
+	} else if (sk->sk_family == AF_INET6) {
+		subnet = &sk->sk_bind_subnet6;
+		subnet_size = sizeof(struct ipv6_subnet);
+	}
+
+	if (subnet) {
+		if (len < subnet_size)
+			return -EINVAL;
+
+		len = subnet_size;
+
+		if (copy_to_user(optval, subnet, len))
+			return -EFAULT;
+	}
+
+	/* Always tell the caller how many bytes were written. */
+	if (put_user(len, optlen))
+		return -EFAULT;
+
+	return 0;
+}
+
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
if (valbool)
@@ -659,6 +764,9 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
if (optname == SO_BINDTODEVICE)
return sock_setbindtodevice(sk, optval, optlen);
+ else if (optname == SO_BINDTOSUBNET)
+ return sock_setbindtosubnet(sk, optval, optlen);
+
if (optlen < sizeof(int))
return -EINVAL;
@@ -1214,6 +1322,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
case SO_BINDTODEVICE:
return sock_getbindtodevice(sk, optval, optlen, len);
+ case SO_BINDTOSUBNET:
+ return sock_getbindtosubnet(sk, optval, optlen, len);
+
case SO_GET_FILTER:
len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
if (len < 0)
--
2.7.2
^ permalink raw reply related [flat|nested] 15+ messages in thread* [net-next RFC 2/4] bindtosubnet: TCP/IPv4 implementation
2016-03-16 13:19 [net-next RFC 0/4] SO_BINDTOSUBNET Gilberto Bertin
2016-03-16 13:19 ` [net-next RFC 1/4] bindtosubnet: infrastructure Gilberto Bertin
@ 2016-03-16 13:19 ` Gilberto Bertin
2016-03-16 13:19 ` [net-next RFC 3/4] bindtosubnet: TCP/IPv6 implementation Gilberto Bertin
` (2 subsequent siblings)
4 siblings, 0 replies; 15+ messages in thread
From: Gilberto Bertin @ 2016-03-16 13:19 UTC (permalink / raw)
To: netdev; +Cc: Gilberto Bertin
Signed-off-by: Gilberto Bertin <gilberto.bertin@gmail.com>
---
net/ipv4/inet_connection_sock.c | 20 +++++++++++++++++++-
net/ipv4/inet_hashtables.c | 9 +++++++++
2 files changed, 28 insertions(+), 1 deletion(-)
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 6414891..0a3777c 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/jhash.h>
+#include <linux/inetdevice.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
@@ -43,6 +44,22 @@ void inet_get_local_port_range(struct net *net, int *low, int *high)
}
EXPORT_SYMBOL(inet_get_local_port_range);
+/*
+ * Decide whether SO_BINDTOSUBNET still allows @sk and @sk2 to conflict.
+ *
+ * Returns non-zero when the sockets may conflict and the caller has to
+ * carry on with its remaining checks.  Returns 0 only when both sockets
+ * are bound to IPv4 subnets that differ once masked with the shorter of
+ * the two prefix lengths.
+ */
+static inline int inet_csk_bind_subnet_conflict(const struct sock *sk,
+						const struct sock *sk2)
+{
+	__be32 mask;
+
+	/*
+	 * A socket without a subnet restriction overlaps with everything.
+	 * The caller ANDs this result into its conflict condition, so
+	 * returning 0 here would switch off bind conflict detection for
+	 * every ordinary socket.
+	 */
+	if (!sk->sk_bind_to_subnet || !sk2->sk_bind_to_subnet)
+		return 1;
+
+	/*
+	 * sk_bind_subnet4 shares a union with sk_bind_subnet6, so it is only
+	 * meaningful on AF_INET sockets.  Stay conservative for anything
+	 * else and let the caller's other checks decide.
+	 */
+	if (sk->sk_family != AF_INET || sk2->sk_family != AF_INET)
+		return 1;
+
+	mask = inet_make_mask(min(sk->sk_bind_subnet4.plen,
+				  sk2->sk_bind_subnet4.plen));
+
+	return (sk->sk_bind_subnet4.net & mask) ==
+	       (sk2->sk_bind_subnet4.net & mask);
+}
+
int inet_csk_bind_conflict(const struct sock *sk,
const struct inet_bind_bucket *tb, bool relax)
{
@@ -63,7 +80,8 @@ int inet_csk_bind_conflict(const struct sock *sk,
!inet_v6_ipv6only(sk2) &&
(!sk->sk_bound_dev_if ||
!sk2->sk_bound_dev_if ||
- sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
+ sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
+ inet_csk_bind_subnet_conflict(sk, sk2)) {
if ((!reuse || !sk2->sk_reuse ||
sk2->sk_state == TCP_LISTEN) &&
(!reuseport || !sk2->sk_reuseport ||
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ccc5980..1a0229c 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -13,6 +13,7 @@
* 2 of the License, or (at your option) any later version.
*/
+#include <linux/inetdevice.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
@@ -189,6 +190,14 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score += 4;
}
+ if (sk->sk_bind_to_subnet) {
+ __be32 mask = inet_make_mask(sk->sk_bind_subnet4.plen);
+
+ if ((sk->sk_bind_subnet4.net & mask) != (daddr & mask))
+ return -1;
+ score += 4;
+ }
+
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
}
--
2.7.2
^ permalink raw reply related [flat|nested] 15+ messages in thread* [net-next RFC 3/4] bindtosubnet: TCP/IPv6 implementation
2016-03-16 13:19 [net-next RFC 0/4] SO_BINDTOSUBNET Gilberto Bertin
2016-03-16 13:19 ` [net-next RFC 1/4] bindtosubnet: infrastructure Gilberto Bertin
2016-03-16 13:19 ` [net-next RFC 2/4] bindtosubnet: TCP/IPv4 implementation Gilberto Bertin
@ 2016-03-16 13:19 ` Gilberto Bertin
2016-03-16 13:19 ` [net-next RFC 4/4] bindtosubnet: UPD implementation Gilberto Bertin
2016-03-25 0:25 ` [net-next RFC 0/4] SO_BINDTOSUBNET Tom Herbert
4 siblings, 0 replies; 15+ messages in thread
From: Gilberto Bertin @ 2016-03-16 13:19 UTC (permalink / raw)
To: netdev; +Cc: Gilberto Bertin
Signed-off-by: Gilberto Bertin <gilberto.bertin@gmail.com>
---
net/ipv6/inet6_connection_sock.c | 17 ++++++++++++++++-
net/ipv6/inet6_hashtables.c | 6 ++++++
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 36c3f01..288bab6 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -27,6 +27,20 @@
#include <net/sock.h>
#include <net/inet6_connection_sock.h>
+/*
+ * Decide whether SO_BINDTOSUBNET still allows @sk and @sk2 to conflict.
+ *
+ * Returns non-zero when the sockets may conflict and the caller has to
+ * carry on with its remaining checks.  Returns 0 only when both sockets
+ * are bound to IPv6 subnets whose prefixes differ over the shorter of the
+ * two prefix lengths.
+ *
+ * NOTE(review): this function has external linkage but no prototype is
+ * added by this patch; consider making it static.
+ */
+int inet6_csk_bind_subnet_conflict(const struct sock *sk,
+				   const struct sock *sk2)
+{
+	u_char plen;
+
+	/*
+	 * A socket without a subnet restriction overlaps with everything.
+	 * The caller ANDs this result into its conflict condition, so
+	 * returning 0 here would switch off bind conflict detection for
+	 * every ordinary socket.
+	 */
+	if (!sk->sk_bind_to_subnet || !sk2->sk_bind_to_subnet)
+		return 1;
+
+	/*
+	 * sk_bind_subnet6 shares a union with sk_bind_subnet4, so it is only
+	 * meaningful on AF_INET6 sockets.  Stay conservative for anything
+	 * else and let the caller's other checks decide.
+	 */
+	if (sk->sk_family != AF_INET6 || sk2->sk_family != AF_INET6)
+		return 1;
+
+	plen = min(sk->sk_bind_subnet6.plen, sk2->sk_bind_subnet6.plen);
+
+	return ipv6_prefix_equal(&sk->sk_bind_subnet6.net,
+				 &sk2->sk_bind_subnet6.net, plen);
+}
+
int inet6_csk_bind_conflict(const struct sock *sk,
const struct inet_bind_bucket *tb, bool relax)
{
@@ -44,7 +58,8 @@ int inet6_csk_bind_conflict(const struct sock *sk,
if (sk != sk2 &&
(!sk->sk_bound_dev_if ||
!sk2->sk_bound_dev_if ||
- sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
+ sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
+ inet6_csk_bind_subnet_conflict(sk, sk2)) {
if ((!reuse || !sk2->sk_reuse ||
sk2->sk_state == TCP_LISTEN) &&
(!reuseport || !sk2->sk_reuseport ||
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 21ace5a..e88c82d 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -114,6 +114,12 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score++;
}
+ if (sk->sk_bind_to_subnet) {
+ if (!ipv6_prefix_equal(&sk->sk_bind_subnet6.net, daddr,
+ sk->sk_bind_subnet6.plen))
+ return -1;
+ score++;
+ }
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
}
--
2.7.2
^ permalink raw reply related [flat|nested] 15+ messages in thread* [net-next RFC 4/4] bindtosubnet: UPD implementation
2016-03-16 13:19 [net-next RFC 0/4] SO_BINDTOSUBNET Gilberto Bertin
` (2 preceding siblings ...)
2016-03-16 13:19 ` [net-next RFC 3/4] bindtosubnet: TCP/IPv6 implementation Gilberto Bertin
@ 2016-03-16 13:19 ` Gilberto Bertin
2016-03-25 0:25 ` [net-next RFC 0/4] SO_BINDTOSUBNET Tom Herbert
4 siblings, 0 replies; 15+ messages in thread
From: Gilberto Bertin @ 2016-03-16 13:19 UTC (permalink / raw)
To: netdev; +Cc: Gilberto Bertin
Signed-off-by: Gilberto Bertin <gilberto.bertin@gmail.com>
---
net/ipv4/udp.c | 36 ++++++++++++++++++++++++++++++++++++
1 file changed, 36 insertions(+)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 95d2f19..1ecffa8 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -133,6 +133,23 @@ EXPORT_SYMBOL(udp_memory_allocated);
#define MAX_UDP_PORTS 65536
#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
+/*
+ * Decide whether SO_BINDTOSUBNET still allows @sk and @sk2 to conflict.
+ *
+ * Returns non-zero when the sockets may conflict and the caller has to
+ * carry on with its remaining checks.  Returns 0 only when both sockets
+ * are bound to IPv4 subnets that differ once masked with the shorter of
+ * the two prefix lengths.
+ */
+static inline int udp_csk_bind_subnet_conflict(const struct sock *sk,
+					       const struct sock *sk2)
+{
+	__be32 mask;
+
+	/*
+	 * A socket without a subnet restriction overlaps with everything.
+	 * The callers AND this result into their "port in use" condition,
+	 * so returning 0 here would let any two ordinary sockets share a
+	 * port.
+	 */
+	if (!sk->sk_bind_to_subnet || !sk2->sk_bind_to_subnet)
+		return 1;
+
+	/*
+	 * sk_bind_subnet4 shares a union with sk_bind_subnet6, so it is only
+	 * meaningful on AF_INET sockets.  Stay conservative for anything
+	 * else and let the callers' other checks decide.
+	 */
+	if (sk->sk_family != AF_INET || sk2->sk_family != AF_INET)
+		return 1;
+
+	mask = inet_make_mask(min(sk->sk_bind_subnet4.plen,
+				  sk2->sk_bind_subnet4.plen));
+
+	return (sk->sk_bind_subnet4.net & mask) ==
+	       (sk2->sk_bind_subnet4.net & mask);
+}
+
+
static int udp_lib_lport_inuse(struct net *net, __u16 num,
const struct udp_hslot *hslot,
unsigned long *bitmap,
@@ -153,6 +170,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
(!sk2->sk_reuse || !sk->sk_reuse) &&
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ udp_csk_bind_subnet_conflict(sk, sk2) &&
(!sk2->sk_reuseport || !sk->sk_reuseport ||
rcu_access_pointer(sk->sk_reuseport_cb) ||
!uid_eq(uid, sock_i_uid(sk2))) &&
@@ -189,6 +207,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
(!sk2->sk_reuse || !sk->sk_reuse) &&
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ udp_csk_bind_subnet_conflict(sk, sk2) &&
(!sk2->sk_reuseport || !sk->sk_reuseport ||
rcu_access_pointer(sk->sk_reuseport_cb) ||
!uid_eq(uid, sock_i_uid(sk2))) &&
@@ -426,6 +445,15 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score += 4;
}
+
+ if (sk->sk_bind_to_subnet) {
+ __be32 mask = inet_make_mask(sk->sk_bind_subnet4.plen);
+
+ if ((sk->sk_bind_subnet4.net & mask) != (daddr & mask))
+ return -1;
+ score += 4;
+ }
+
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
return score;
@@ -471,6 +499,14 @@ static inline int compute_score2(struct sock *sk, struct net *net,
score += 4;
}
+ if (sk->sk_bind_to_subnet) {
+ __be32 mask = inet_make_mask(sk->sk_bind_subnet4.plen);
+
+ if ((sk->sk_bind_subnet4.net & mask) != (daddr & mask))
+ return -1;
+ score += 4;
+ }
+
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
--
2.7.2
^ permalink raw reply related [flat|nested] 15+ messages in thread* Re: [net-next RFC 0/4] SO_BINDTOSUBNET
2016-03-16 13:19 [net-next RFC 0/4] SO_BINDTOSUBNET Gilberto Bertin
` (3 preceding siblings ...)
2016-03-16 13:19 ` [net-next RFC 4/4] bindtosubnet: UPD implementation Gilberto Bertin
@ 2016-03-25 0:25 ` Tom Herbert
2016-03-25 22:29 ` Gilberto
4 siblings, 1 reply; 15+ messages in thread
From: Tom Herbert @ 2016-03-25 0:25 UTC (permalink / raw)
To: Gilberto Bertin; +Cc: Linux Kernel Network Developers
On Wed, Mar 16, 2016 at 6:19 AM, Gilberto Bertin
<gilberto.bertin@gmail.com> wrote:
> This is my second attempt to submit an RFC for this patch.
>
> Some arguments for and against it since the first submission:
> * SO_BINDTOSUBNET is an arbitrary option and can be seen as another use
> case of the SO_REUSEPORT BPF patch
> * but at the same time using BPF requires more work/code on the server
> and since the bind to subnet use case could potentially become a
> common one maybe there is some value in having it as an option instead
> of having to code (either manually or with clang) an eBPF program that
> would do the same
Gilberto, I'm not sure I understand this argument. Have you
implemented the BPF bind solution?
Thanks,
Tom
> * it may probably be possible to achieve the same results using VRF. This
> would require creating a VRF device, configuring the device routing
> table and binding each process to a different VRF device (but
> I'm not sure how this would work/interfere with an existing iptables
> setup for example)
>
> -----------------------------------------------------------------------------
>
> This series introduces support for the SO_BINDTOSUBNET socket option, which
> allows a listener socket to bind to a subnet instead of * or a single address.
>
> Motivation:
> consider a set of servers, each one with thousands and thousands of IP
> addresses. Since assigning /32 or /128 IP individual addresses would be
> inefficient, one solution can be assigning subnets using local routes
> (with 'ip route add local').
>
> This allows a listener to listen and terminate connections going to any
> of the IP addresses of these subnets without explicitly configuring all
> the IP addresses of the subnet range.
> This is very efficient.
>
> Unfortunately there may be the need to use different subnets for
> different purposes.
> One can imagine port 80 being served by one HTTP server for some IP
> subnet, while another server used for another subnet.
> Right now Linux does not allow this.
> It is either possible to bind to *, indicating ALL traffic going to
> given port, or to individual IP addresses.
> The first only allows to accept connections from all the subnets.
> The latter does not scale well with lots of IP addresses.
>
> Using bindtosubnet would solve this problem: just by adding a local
> route rule and setting the SO_BINDTOSUBNET option for a socket it would
> be possible to easily partition traffic by subnets.
>
> API:
> the subnet is specified (as argument of the setsockopt syscall) by the
> address of the network, and the prefix length of the netmask.
>
> IPv4:
> struct ipv4_subnet {
> __be32 net;
> u_char plen;
> };
>
> and IPv6:
> struct ipv6_subnet {
> struct in6_addr net;
> u_char plen;
> };
>
> Bind conflicts:
> two sockets with the bindtosubnet option enabled generate a bind
> conflict if their network addresses masked with the shortest of their
> prefix are equal.
> The bindtosubnet option can be combined with soreuseport so that two
> listener can bind on the same subnet.
>
> Any questions/feedback appreciated.
>
> Thanks,
> Gilberto
>
> Gilberto Bertin (4):
> bindtosubnet: infrastructure
> bindtosubnet: TCP/IPv4 implementation
> bindtosubnet: TCP/IPv6 implementation
> bindtosubnet: UPD implementation
>
> include/net/sock.h | 20 +++++++
> include/uapi/asm-generic/socket.h | 1 +
> net/core/sock.c | 111 ++++++++++++++++++++++++++++++++++++++
> net/ipv4/inet_connection_sock.c | 20 ++++++-
> net/ipv4/inet_hashtables.c | 9 ++++
> net/ipv4/udp.c | 36 +++++++++++++
> net/ipv6/inet6_connection_sock.c | 17 +++++-
> net/ipv6/inet6_hashtables.c | 6 +++
> 8 files changed, 218 insertions(+), 2 deletions(-)
>
> --
> 2.7.2
>
^ permalink raw reply [flat|nested] 15+ messages in thread* Re: [net-next RFC 0/4] SO_BINDTOSUBNET
2016-03-25 0:25 ` [net-next RFC 0/4] SO_BINDTOSUBNET Tom Herbert
@ 2016-03-25 22:29 ` Gilberto
0 siblings, 0 replies; 15+ messages in thread
From: Gilberto @ 2016-03-25 22:29 UTC (permalink / raw)
To: Tom Herbert; +Cc: Linux Kernel Network Developers
On 03/25/2016 12:25 AM, Tom Herbert wrote:
> On Wed, Mar 16, 2016 at 6:19 AM, Gilberto Bertin
> <gilberto.bertin@gmail.com> wrote:
>> This is my second attempt to submit an RFC for this patch.
>>
>> Some arguments for and against it since the first submission:
>> * SO_BINDTOSUBNET is an arbitrary option and can be seen as another use
>> case of the SO_REUSEPORT BPF patch
>> * but at the same time using BPF requires more work/code on the server
>> and since the bind to subnet use case could potentially become a
>> common one maybe there is some value in having it as an option instead
>> of having to code (either manually or with clang) an eBPF program that
>> would do the same
>
> Gilberto, I'm not sure I understand this argument. Have you
> implemented the BPF bind solution?
>
> Thanks,
> Tom
Yes, I wrote up a very basic draft for this feature (I didn't know there
was already some work going on with SO_ATTACH_REUSEPORT_[CE]BPF).
Thanks,
Gilberto
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [net-next RFC 0/4] SO_BINDTOSUBNET
2016-02-23 15:27 Gilberto Bertin
@ 2016-02-24 5:06 ` Tom Herbert
2016-02-25 10:09 ` Gilberto Bertin
2016-03-07 17:22 ` Gilberto Bertin
2016-03-07 19:09 ` David Ahern
1 sibling, 2 replies; 15+ messages in thread
From: Tom Herbert @ 2016-02-24 5:06 UTC (permalink / raw)
To: Gilberto Bertin; +Cc: Linux Kernel Network Developers
On Tue, Feb 23, 2016 at 7:27 AM, Gilberto Bertin
<gilberto.bertin@gmail.com> wrote:
> This series introduces support for the SO_BINDTOSUBNET socket option, which
> allows a listener socket to bind to a subnet instead of * or a single address.
>
> Motivation:
> consider a set of servers, each one with thousands and thousands of IP
> addresses. Since assigning /32 or /128 IP individual addresses would be
> inefficient, one solution can be assigning subnets using local routes
> (with 'ip route add local').
>
Hi Gilberto,
The concept is certainly relevant, but allowing binds by subnet seems
arbitrary. I can imagine that someone might want to bind to a list of
addresses, list of interfaces, list of subnets, or complex
combinations like a subnet on one interface, and list of addresses on
another. So I wonder if this is another use case for a BPF program on
a listener socket, like a program for a scoring function. Maybe this
could even be combined with BPF SO_REUSEPORT somehow?
Tom
> This allows a listener to listen and terminate connections going to any
> of the IP addresses of these subnets without explicitly configuring all
> of them. This is very efficient.
>
> Unfortunately there may be the need to use different subnets for
> different purposes.
> One can imagine port 80 being served by one HTTP server for some IP
> subnet, while another server used for another subnet.
> Right now Linux does not allow this.
> It is either possible to bind to *, indicating ALL traffic going to
> given port, or to individual IP addresses.
> The first only allows to accept connections from all the subnets.
> The latter does not scale well with lots of IP addresses.
>
> Using bindtosubnet would solve this problem: just by adding a local
> route rule and setting the SO_BINDTOSUBNET option for a socket it would
> be possible to easily partition traffic by subnets.
>
> API:
> the subnet is specified (as argument of the setsockopt syscall) by the
> address of the network, and the prefix length of the netmask.
>
> IPv4:
> struct ipv4_subnet {
> __be32 net;
> u_char plen;
> };
>
> and IPv6:
> struct ipv6_subnet {
> struct in6_addr net;
> u_char plen;
> };
>
> Bind conflicts:
> two sockets with the bindtosubnet option enabled generate a bind
> conflict if their network addresses masked with the shortest of their
> prefix are equal.
> The bindtosubnet option can be combined with soreuseport so that two
> listener can bind on the same subnet.
>
> Any questions/feedback appreciated.
>
> Thanks,
> Gilberto
>
> Gilberto Bertin (4):
> bindtosubnet: infrastructure
> bindtosubnet: TCP/IPv4 implementation
> bindtosubnet: TCP/IPv6 implementation
> bindtosubnet: UPD implementation
>
> include/net/sock.h | 20 +++++++
> include/uapi/asm-generic/socket.h | 1 +
> net/core/sock.c | 111 ++++++++++++++++++++++++++++++++++++++
> net/ipv4/inet_connection_sock.c | 20 ++++++-
> net/ipv4/inet_hashtables.c | 9 ++++
> net/ipv4/udp.c | 35 ++++++++++++
> net/ipv6/inet6_connection_sock.c | 17 +++++-
> net/ipv6/inet6_hashtables.c | 6 +++
> 8 files changed, 217 insertions(+), 2 deletions(-)
>
> --
> 2.7.1
>
^ permalink raw reply [flat|nested] 15+ messages in thread* Re: [net-next RFC 0/4] SO_BINDTOSUBNET
2016-02-24 5:06 ` Tom Herbert
@ 2016-02-25 10:09 ` Gilberto Bertin
2016-03-07 17:22 ` Gilberto Bertin
1 sibling, 0 replies; 15+ messages in thread
From: Gilberto Bertin @ 2016-02-25 10:09 UTC (permalink / raw)
To: Tom Herbert; +Cc: Linux Kernel Network Developers
> On 24 Feb 2016, at 05:06, Tom Herbert <tom@herbertland.com> wrote:
>
> On Tue, Feb 23, 2016 at 7:27 AM, Gilberto Bertin
> <gilberto.bertin@gmail.com> wrote:
>> This series introduces support for the SO_BINDTOSUBNET socket option, which
>> allows a listener socket to bind to a subnet instead of * or a single address.
>>
>> Motivation:
>> consider a set of servers, each one with thousands and thousands of IP
>> addresses. Since assigning /32 or /128 IP individual addresses would be
>> inefficient, one solution can be assigning subnets using local routes
>> (with 'ip route add local').
>>
> Hi Gilberto,
>
> The concept is certainly relevant, but allowing binds by subnet seems
> arbitrary. I can imagine that someone might want to bind to a list of
> addresses, list of interfaces, list of subnets, or complex
> combinations like a subnet on one interface, and list of addresses on
> another. So I wonder if this is another use case for a BPF program on
> a listener socket, like a program for a scoring function. Maybe this
> could even combined with BPF SO_REUSERPORT somehow?
>
> Tom
Hi Tom,
thanks for commenting on my patch.
Your proposal is certainly more generic, but at the same time it would require
a bit more work to be used (at least for my specific case).
Indeed in my case it would be only a matter of calling setsockopt() with a
struct that describes the subnet, while with BPF it would be necessary to write
and load a scoring function (but at least the subnet descriptor can be stored
in a map, so there's no need to rewrite the filter for each different subnet).
Anyway, I’m curious to explore your idea.
Thanks,
gilberto
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [net-next RFC 0/4] SO_BINDTOSUBNET
2016-02-24 5:06 ` Tom Herbert
2016-02-25 10:09 ` Gilberto Bertin
@ 2016-03-07 17:22 ` Gilberto Bertin
2016-03-07 17:49 ` Tom Herbert
2016-03-14 14:12 ` Willem de Bruijn
1 sibling, 2 replies; 15+ messages in thread
From: Gilberto Bertin @ 2016-03-07 17:22 UTC (permalink / raw)
To: Tom Herbert; +Cc: Linux Kernel Network Developers
> On 24 Feb 2016, at 05:06, Tom Herbert <tom@herbertland.com> wrote:
>
> On Tue, Feb 23, 2016 at 7:27 AM, Gilberto Bertin
> <gilberto.bertin@gmail.com> wrote:
>> This series introduces support for the SO_BINDTOSUBNET socket option, which
>> allows a listener socket to bind to a subnet instead of * or a single address.
>>
>> Motivation:
>> consider a set of servers, each one with thousands and thousands of IP
>> addresses. Since assigning /32 or /128 IP individual addresses would be
>> inefficient, one solution can be assigning subnets using local routes
>> (with 'ip route add local').
>>
> Hi Gilberto,
>
> The concept is certainly relevant, but allowing binds by subnet seems
> arbitrary. I can imagine that someone might want to bind to a list of
> addresses, list of interfaces, list of subnets, or complex
> combinations like a subnet on one interface, and list of addresses on
> another. So I wonder if this is another use case for a BPF program on
> a listener socket, like a program for a scoring function. Maybe this
> could even combined with BPF SO_REUSERPORT somehow?
>
> Tom
Hi Tom,
I have a working POC of the patch that adds support for BPF into the
compute_score function, and I would like to share some thoughts about
advantages and disadvantages of both solutions.
First, setup.
SO_BINDTOSUBNET:
- add this to some_server.c:
subnet.net = addr.s_addr;
subnet.plen = 24;
setsockopt(sock, SOL_SOCKET, SO_BINDTOSUBNET, &subnet, sizeof(subnet));
and you are done. Your server will accept all connections addressed to the
specified subnet.
BPF_LISTENER_FILTER:
- write a bpf filter like this:
SEC("socket_bpf")
int bpf_prog1(struct __sk_buff *skb)
{
unsigned int daddr;
daddr = load_word(skb, ETH_HLEN + offsetof(struct iphdr, daddr));
if (/* daddr matches subnet */) {
return -1; //accept
}
return 0; // reject
}
- compile it:
$ clang -target bpf -c -o socket_bpf.o socket_bpf.c
- add this to your server.c:
bpf_load_file("/path/to/socket_bpf.o");
setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd, sizeof(prog_fd[0]));
- link your server with a couple of libbpf libraries (I'm
using the kernel ones from samples/bpf) and -lelf
And this is still simplified (since instead of hardcoding the subnet
into the bpf filter it would be preferable to use maps).
thoughts:
- SO_BINDTOSUBNET is much simpler to configure than BPF
- BPF requires some external C libraries and I think it would not be
trivial to get it working with other languages than C/C++.
As an example, I have two working servers for SO_BINDTOSUBNET written
in Ruby and Go (since both these languages expose setsockopt), but it
would be necessary to write something that wrap the C libbpf to use
BPF
- I (personally) do not think SO_BINDTOSUBNET is that much arbitrary, I
see it more as the logical missing piece between * and a single
address when calling bind() (otherwise I think we should consider
arbitrary even SO_BINDTODEVICE)
That said, do you believe it could be an option to maybe have both these
options? I think that the ability to run BPF in the listening path is
really interesting, but it's probably an overkill for the bind-to-subnet
use case.
Thank you,
gilberto
^ permalink raw reply [flat|nested] 15+ messages in thread* Re: [net-next RFC 0/4] SO_BINDTOSUBNET
2016-03-07 17:22 ` Gilberto Bertin
@ 2016-03-07 17:49 ` Tom Herbert
2016-03-11 14:43 ` Gilberto Bertin
2016-03-14 14:12 ` Willem de Bruijn
1 sibling, 1 reply; 15+ messages in thread
From: Tom Herbert @ 2016-03-07 17:49 UTC (permalink / raw)
To: Gilberto Bertin; +Cc: Linux Kernel Network Developers
On Mon, Mar 7, 2016 at 9:22 AM, Gilberto Bertin
<gilberto.bertin@gmail.com> wrote:
>
>> On 24 Feb 2016, at 05:06, Tom Herbert <tom@herbertland.com> wrote:
>>
>> On Tue, Feb 23, 2016 at 7:27 AM, Gilberto Bertin
>> <gilberto.bertin@gmail.com> wrote:
>>> This series introduces support for the SO_BINDTOSUBNET socket option, which
>>> allows a listener socket to bind to a subnet instead of * or a single address.
>>>
>>> Motivation:
>>> consider a set of servers, each one with thousands and thousands of IP
>>> addresses. Since assigning /32 or /128 IP individual addresses would be
>>> inefficient, one solution can be assigning subnets using local routes
>>> (with 'ip route add local').
>>>
>> Hi Gilberto,
>>
>> The concept is certainly relevant, but allowing binds by subnet seems
>> arbitrary. I can imagine that someone might want to bind to a list of
>> addresses, list of interfaces, list of subnets, or complex
>> combinations like a subnet on one interface, and list of addresses on
>> another. So I wonder if this is another use case for a BPF program on
>> a listener socket, like a program for a scoring function. Maybe this
>> could even combined with BPF SO_REUSERPORT somehow?
>>
>> Tom
>
> Hi Tom,
>
> I have a working POC of the patch that adds support for BPF into the
> compute_score function, and I would like to share some thoughts about
> advantages and disadvantages of both solutions.
>
Cool, thanks for implementing that!
> First, setup.
>
> SO_BINDTOSUBET:
> - add this to some_server.c:
>
> subnet.net = addr.s_addr;
> subnet.plen = 24
> setsockopt(sock, SOL_SOCKET, SO_BINDTOSUBNET, &subnet, sizeof(subnet));
>
> and you are done. Your server will accept all connections from the
> specified subnet.
>
> BPF_LISTENER_FILTER:
> - write a bpf filter like this:
>
> SEC("socket_bpf")
> int bpf_prog1(struct __sk_buff *skb)
> {
> unsigned int daddr;
> daddr = load_word(skb, ETH_HLEN + offsetof(struct iphdr, daddr));
>
> if (/* daddr matches subnet */) {
> return -1; //accept
> }
>
> return 0; // reject
> }
>
> - compile it:
> $ clang -target bpf -c -o socket_bpf.o socket_bpf.c
>
> - add this to your server.c:
> bpf_load_file("/path/to/socket_bpf.o");
> setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd, sizeof(prog_fd[0]));
>
> - link your server with a couple of libbpf libraries (I'm
> using the kernel ones from samples/bpf) and -lelf
>
> And this is still simplified (since instead of hardcoding the subnet
> into the bpf filter it would be preferable to use maps).
>
>
> thoughts:
> - SO_BINDTOSUBNET is much simpler to configure than BPF
> - BPF requires some external C libraries and I think it would not be
> trivial to get it working with other languages than C/C++.
Yes, but the direction seems to be to this type of potentially open
ended socket level filtering is done via BPF. The SO_REUSEPORT BPF
patches really demonstrates the potential.
> As an example, I have two working servers for SO_BINDTOSUBNET written
> in Ruby and Go (since both these languages expose setsockopt), but it
> would be necessary to write something that wrap the C libbpf to use
> BPF
> - I (personally) do not think SO_BINDTOSUBNET is that much arbitrary, I
> see it more as the logical missing piece between * and a single
> address when calling bind() (otherwise I think we should consider
> arbitrary even SO_BINDTODEVICE)
>
Yes SO_BINDTODEVICE is arbitrary. It seems like we could just as
easily have BINDTODEVICES. Or, as I said SO_BINDTOADDRESSES also makes
perfect sense.
> That said, do you believe it could be an option to maybe have both these
> options? I think that the ability to run BPF in the listening path is
> really interesting, but it's probably an overkill for the bind-to-subnet
> use case.
>
Maybe. It will be quite common server configuration with IPv6 to
assign each server its own /64 prefix(es). From that POV I suppose
there is some value in having SO_BINDTOSUBNET.
Tom
> Thank you,
> gilberto
>
^ permalink raw reply [flat|nested] 15+ messages in thread* Re: [net-next RFC 0/4] SO_BINDTOSUBNET
2016-03-07 17:49 ` Tom Herbert
@ 2016-03-11 14:43 ` Gilberto Bertin
0 siblings, 0 replies; 15+ messages in thread
From: Gilberto Bertin @ 2016-03-11 14:43 UTC (permalink / raw)
To: Tom Herbert; +Cc: Linux Kernel Network Developers
> On 7 Mar 2016, at 17:49, Tom Herbert <tom@herbertland.com> wrote:
>
>> That said, do you believe it could be an option to maybe have both these
>> options? I think that the ability to run BPF in the listening path is
>> really interesting, but it's probably an overkill for the bind-to-subnet
>> use case.
>>
>
> Maybe. It will be quite common server configuration with IPv6 to
> assign each server its own /64 prefix(es). From that POV I suppose
> there is some value in having SO_BINDTOSUBNET.
Good, in this case I will submit this RFC again when the net-next window
opens for the 4.6 release, so that we can gather more comments and
decide what to do.
Thank you,
Gilberto
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [net-next RFC 0/4] SO_BINDTOSUBNET
2016-03-07 17:22 ` Gilberto Bertin
2016-03-07 17:49 ` Tom Herbert
@ 2016-03-14 14:12 ` Willem de Bruijn
1 sibling, 0 replies; 15+ messages in thread
From: Willem de Bruijn @ 2016-03-14 14:12 UTC (permalink / raw)
To: Gilberto Bertin; +Cc: Tom Herbert, Linux Kernel Network Developers
> - write a bpf filter like this:
..
>
> - compile it:
> $ clang -target bpf -c -o socket_bpf.o socket_bpf.c
>
> - add this to your server.c:
> bpf_load_file("/path/to/socket_bpf.o");
> setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd, sizeof(prog_fd[0]));
>
> - link your server with a couple of libbpf libraries (I'm
> using the kernel ones from samples/bpf) and -lelf
>
> And this is still simplified (since instead of hardcoding the subnet
> into the bpf filter it would be preferable to use maps).
>
>
> thoughts:
> - SO_BINDTOSUBNET is much simpler to configure than BPF
> - BPF requires some external C libraries and I think it would not be
> trivial to get it working with other languages than C/C++.
You can easily write BPF by hand.
See for instance attach_ebpf() in tools/testing/selftests/net/reuseport_bpf.c
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [net-next RFC 0/4] SO_BINDTOSUBNET
2016-02-23 15:27 Gilberto Bertin
2016-02-24 5:06 ` Tom Herbert
@ 2016-03-07 19:09 ` David Ahern
1 sibling, 0 replies; 15+ messages in thread
From: David Ahern @ 2016-03-07 19:09 UTC (permalink / raw)
To: Gilberto Bertin, netdev
On 2/23/16 8:27 AM, Gilberto Bertin wrote:
> This series introduces support for the SO_BINDTOSUBNET socket option, which
> allows a listener socket to bind to a subnet instead of * or a single address.
>
> Motivation:
> consider a set of servers, each one with thousands and thousands of IP
> addresses. Since assigning /32 or /128 IP individual addresses would be
> inefficient, one solution can be assigning subnets using local routes
> (with 'ip route add local').
>
> This allows a listener to listen and terminate connections going to any
> of the IP addresses of these subnets without explicitly configuring all
> of them. This is very efficient.
>
> Unfortunately there may be the need to use different subnets for
> different purposes.
> One can imagine port 80 being served by one HTTP server for some IP
> subnet, while another server used for another subnet.
> Right now Linux does not allow this.
> It is either possible to bind to *, indicating ALL traffic going to
> given port, or to individual IP addresses.
> The first only allows to accept connections from all the subnets.
> The latter does not scale well with lots of IP addresses.
Have you looked at the VRF implementation?
Documentation/networking/vrf.txt
It certainly handles some of your requirements -- e.g., create L3
domains (VRFs) for subnets of interest. Apps can bind to the VRF device
to provide service to only those networks in the domain.
^ permalink raw reply [flat|nested] 15+ messages in thread