* [RFC PATCH] packet: Add fanout support.
@ 2011-06-21 9:53 David Miller
2011-06-21 10:39 ` Victor Julien
0 siblings, 1 reply; 10+ messages in thread
From: David Miller @ 2011-06-21 9:53 UTC (permalink / raw)
To: netdev; +Cc: victor
This adds demuxing support for AF_PACKET sockets. It's just to give
people an idea, I've only build tested this patch.
Basically it allows to spread the AF_PACKET processing load amongst
several AF_PACKET sockets. The distribution can either be based upon
hashing (PACKET_FANOUT_HASH) or round-robin based load-balancing
(PACKET_FANOUT_LB).
The hash based fanout takes advantage of the precomputed skb->rxhash
and only costs ~20 cpu cycles.
A restriction is that you must bind the AF_PACKET socket fully before
you add it to a fanout.
The encoding of the PACKET_FANOUT socket option argument is:
(PACKET_FANOUT_{HASH,LB} << 16) | (ID & 0xffff)
All sockets adding themselves to the same fanout ID must all use
the same PACKET_FANOUT_* type and also must be bound to the same
device/protocol.
The implementation is agnostic to the type of AF_PACKET sockets in
use. You can use mmap based, and non-mmap based, AF_PACKET sockets.
It simply doesn't care.
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
index 7b31863..1efa1cb 100644
--- a/include/linux/if_packet.h
+++ b/include/linux/if_packet.h
@@ -49,6 +49,10 @@ struct sockaddr_ll {
#define PACKET_VNET_HDR 15
#define PACKET_TX_TIMESTAMP 16
#define PACKET_TIMESTAMP 17
+#define PACKET_FANOUT 18
+
+#define PACKET_FANOUT_HASH 0
+#define PACKET_FANOUT_LB 1
struct tpacket_stats {
unsigned int tp_packets;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 461b16f..e6af2eb 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -187,9 +187,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static void packet_flush_mclist(struct sock *sk);
+struct packet_fanout;
struct packet_sock {
/* struct sock has to be the first member of packet_sock */
struct sock sk;
+ struct packet_fanout *fanout;
struct tpacket_stats stats;
struct packet_ring_buffer rx_ring;
struct packet_ring_buffer tx_ring;
@@ -212,6 +214,22 @@ struct packet_sock {
struct packet_type prot_hook ____cacheline_aligned_in_smp;
};
+#define PACKET_FANOUT_MAX 2048
+
+struct packet_fanout {
+#ifdef CONFIG_NET_NS
+ struct net *net;
+#endif
+ int num_members;
+ u16 id;
+ u8 type;
+ u8 pad;
+ atomic_t rr_cur;
+ struct list_head list;
+ struct sock *arr[PACKET_FANOUT_MAX];
+ struct packet_type prot_hook ____cacheline_aligned_in_smp;
+};
+
struct packet_skb_cb {
unsigned int origlen;
union {
@@ -344,6 +362,164 @@ static void packet_sock_destruct(struct sock *sk)
sk_refcnt_debug_dec(sk);
}
+static int fanout_rr_next(struct packet_fanout *f)
+{
+ int x = atomic_read(&f->rr_cur) + 1;
+
+ if (x >= f->num_members)
+ x = 0;
+
+ return x;
+}
+
+static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb)
+{
+ u32 idx = ((u64)skb->rxhash * f->num_members) >> 32;
+
+ return f->arr[idx];
+}
+
+static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb)
+{
+ int cur, old;
+
+ cur = atomic_read(&f->rr_cur);
+ while ((old = atomic_cmpxchg(&f->rr_cur, cur,
+ fanout_rr_next(f))) != cur)
+ cur = old;
+ return f->arr[cur];
+}
+
+static int packet_rcv_fanout_hash(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct packet_fanout *f = pt->af_packet_priv;
+ struct packet_sock *po;
+ struct sock *sk;
+
+ if (!net_eq(dev_net(dev), read_pnet(&f->net))) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ sk = fanout_demux_hash(f, skb);
+ po = pkt_sk(sk);
+
+ return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
+}
+
+static int packet_rcv_fanout_lb(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct packet_fanout *f = pt->af_packet_priv;
+ struct packet_sock *po;
+ struct sock *sk;
+
+ if (!net_eq(dev_net(dev), read_pnet(&f->net))) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ sk = fanout_demux_lb(f, skb);
+ po = pkt_sk(sk);
+
+ return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
+}
+
+static DEFINE_MUTEX(fanout_mutex);
+static LIST_HEAD(fanout_list);
+
+static int fanout_add(struct sock *sk, u16 id, u8 type)
+{
+ struct packet_sock *po = pkt_sk(sk);
+ struct packet_fanout *f, *match;
+ int err;
+
+ switch (type) {
+ case PACKET_FANOUT_HASH:
+ case PACKET_FANOUT_LB:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (!po->running)
+ return -EINVAL;
+
+ mutex_lock(&fanout_mutex);
+ match = NULL;
+ list_for_each_entry(f, &fanout_list, list) {
+ if (f->id == id) {
+ match = f;
+ break;
+ }
+ }
+ if (!match) {
+ match = kzalloc(sizeof(*match), GFP_KERNEL);
+ if (match) {
+ write_pnet(&match->net, sock_net(sk));
+ match->id = id;
+ match->type = type;
+ atomic_set(&match->rr_cur, 0);
+ INIT_LIST_HEAD(&match->list);
+ match->prot_hook.type = po->prot_hook.type;
+ match->prot_hook.dev = po->prot_hook.dev;
+ switch (type) {
+ case PACKET_FANOUT_HASH:
+ match->prot_hook.func = packet_rcv_fanout_hash;
+ break;
+ case PACKET_FANOUT_LB:
+ match->prot_hook.func = packet_rcv_fanout_lb;
+ break;
+ }
+ match->prot_hook.af_packet_priv = match;
+ dev_add_pack(&match->prot_hook);
+ }
+ }
+ err = -ENOMEM;
+ if (match) {
+ err = -EINVAL;
+ if (match->type == type) {
+ err = -ENOSPC;
+ if (match->num_members < PACKET_FANOUT_MAX) {
+ __dev_remove_pack(&po->prot_hook);
+ po->fanout = match;
+ match->arr[match->num_members] = sk;
+ smp_wmb();
+ match->num_members++;
+ err = 0;
+ }
+ }
+ }
+ mutex_unlock(&fanout_mutex);
+ return err;
+}
+
+static void fanout_del(struct sock *sk)
+{
+ struct packet_sock *po = pkt_sk(sk);
+ struct packet_fanout *f;
+ int i;
+
+ f = po->fanout;
+ po->fanout = NULL;
+
+ mutex_lock(&fanout_mutex);
+ for (i = 0; i < f->num_members; i++) {
+ if (f->arr[i] == sk)
+ break;
+ }
+ BUG_ON(i >= f->num_members);
+ f->arr[i] = f->arr[f->num_members - 1];
+ f->num_members--;
+
+ if (!f->num_members) {
+ list_del(&f->list);
+ dev_remove_pack(&f->prot_hook);
+ kfree(f);
+ }
+ mutex_unlock(&fanout_mutex);
+}
static const struct proto_ops packet_ops;
@@ -1343,7 +1519,10 @@ static int packet_release(struct socket *sock)
*/
po->running = 0;
po->num = 0;
- __dev_remove_pack(&po->prot_hook);
+ if (po->fanout)
+ fanout_del(sk);
+ else
+ __dev_remove_pack(&po->prot_hook);
__sock_put(sk);
}
if (po->prot_hook.dev) {
@@ -1396,9 +1575,11 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc
__sock_put(sk);
po->running = 0;
po->num = 0;
- spin_unlock(&po->bind_lock);
- dev_remove_pack(&po->prot_hook);
- spin_lock(&po->bind_lock);
+ if (!po->fanout) {
+ spin_unlock(&po->bind_lock);
+ dev_remove_pack(&po->prot_hook);
+ spin_lock(&po->bind_lock);
+ }
}
po->num = protocol;
@@ -1413,7 +1594,8 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc
goto out_unlock;
if (!dev || (dev->flags & IFF_UP)) {
- dev_add_pack(&po->prot_hook);
+ if (!po->fanout)
+ dev_add_pack(&po->prot_hook);
sock_hold(sk);
po->running = 1;
} else {
@@ -1542,7 +1724,8 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
if (proto) {
po->prot_hook.type = proto;
- dev_add_pack(&po->prot_hook);
+ if (!po->fanout)
+ dev_add_pack(&po->prot_hook);
sock_hold(sk);
po->running = 1;
}
@@ -2109,6 +2292,17 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
po->tp_tstamp = val;
return 0;
}
+ case PACKET_FANOUT:
+ {
+ int val;
+
+ if (optlen != sizeof(val))
+ return -EINVAL;
+ if (copy_from_user(&val, optval, sizeof(val)))
+ return -EFAULT;
+
+ return fanout_add(sk, val & 0xffff, val >> 16);
+ }
default:
return -ENOPROTOOPT;
}
@@ -2207,6 +2401,15 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
val = po->tp_tstamp;
data = &val;
break;
+ case PACKET_FANOUT:
+ if (len > sizeof(int))
+ len = sizeof(int);
+ val = (po->fanout ?
+ ((u32)po->fanout->id |
+ ((u32)po->fanout->type << 16)) :
+ 0);
+ data = &val;
+ break;
default:
return -ENOPROTOOPT;
}
@@ -2260,7 +2463,8 @@ static int packet_notifier(struct notifier_block *this, unsigned long msg, void
if (dev->ifindex == po->ifindex) {
spin_lock(&po->bind_lock);
if (po->num && !po->running) {
- dev_add_pack(&po->prot_hook);
+ if (!po->fanout)
+ dev_add_pack(&po->prot_hook);
sock_hold(sk);
po->running = 1;
}
@@ -2530,7 +2734,8 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
was_running = po->running;
num = po->num;
if (was_running) {
- __dev_remove_pack(&po->prot_hook);
+ if (!po->fanout)
+ __dev_remove_pack(&po->prot_hook);
po->num = 0;
po->running = 0;
__sock_put(sk);
@@ -2568,7 +2773,8 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
sock_hold(sk);
po->running = 1;
po->num = num;
- dev_add_pack(&po->prot_hook);
+ if (!po->fanout)
+ dev_add_pack(&po->prot_hook);
}
spin_unlock(&po->bind_lock);
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [RFC PATCH] packet: Add fanout support.
2011-06-21 9:53 [RFC PATCH] packet: Add fanout support David Miller
@ 2011-06-21 10:39 ` Victor Julien
2011-06-21 10:46 ` David Miller
0 siblings, 1 reply; 10+ messages in thread
From: Victor Julien @ 2011-06-21 10:39 UTC (permalink / raw)
To: David Miller; +Cc: netdev
On 06/21/2011 11:53 AM, David Miller wrote:
>
> This adds demuxing support for AF_PACKET sockets. It's just to give
> people an idea, I've only build tested this patch.
>
> Basically it allows to spread the AF_PACKET processing load amongst
> several AF_PACKET sockets. The distribution can either be based upon
> hashing (PACKET_FANOUT_HASH) or round-robin based load-balancing
> (PACKET_FANOUT_LB).
>
> The hash based fanout takes advantage of the precomputed skb->rxhash
> and only costs ~20 cpu cycles.
>
> A restriction is that you must bind the AF_PACKET socket fully before
> you add it to a fanout.
>
> The encoding of the PACKET_FANOUT socket option argument is:
>
> (PACKET_FANOUT_{HASH,LB} << 16) | (ID & 0xffff)
>
> All sockets adding themselves to the same fanout ID must all use
> the same PACKET_FANOUT_* type and also must be bound to the same
> device/protocol.
>
> The implementation is agnostic to the type of AF_PACKET sockets in
> use. You can use mmap based, and non-mmap based, AF_PACKET sockets.
> It simply doesn't care.
Thanks David! Looks interesting. I'm not familiar with the kernel
internals, so just a quick question. The hash based on skb->rxhash, does
that result in a "flow" based distribution over the listeners? So all
packets sharing a tuple being sent to the same socket?
Cheers,
Victor
> Signed-off-by: David S. Miller <davem@davemloft.net>
>
> diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
> index 7b31863..1efa1cb 100644
> --- a/include/linux/if_packet.h
> +++ b/include/linux/if_packet.h
> @@ -49,6 +49,10 @@ struct sockaddr_ll {
> #define PACKET_VNET_HDR 15
> #define PACKET_TX_TIMESTAMP 16
> #define PACKET_TIMESTAMP 17
> +#define PACKET_FANOUT 18
> +
> +#define PACKET_FANOUT_HASH 0
> +#define PACKET_FANOUT_LB 1
>
> struct tpacket_stats {
> unsigned int tp_packets;
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index 461b16f..e6af2eb 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -187,9 +187,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
>
> static void packet_flush_mclist(struct sock *sk);
>
> +struct packet_fanout;
> struct packet_sock {
> /* struct sock has to be the first member of packet_sock */
> struct sock sk;
> + struct packet_fanout *fanout;
> struct tpacket_stats stats;
> struct packet_ring_buffer rx_ring;
> struct packet_ring_buffer tx_ring;
> @@ -212,6 +214,22 @@ struct packet_sock {
> struct packet_type prot_hook ____cacheline_aligned_in_smp;
> };
>
> +#define PACKET_FANOUT_MAX 2048
> +
> +struct packet_fanout {
> +#ifdef CONFIG_NET_NS
> + struct net *net;
> +#endif
> + int num_members;
> + u16 id;
> + u8 type;
> + u8 pad;
> + atomic_t rr_cur;
> + struct list_head list;
> + struct sock *arr[PACKET_FANOUT_MAX];
> + struct packet_type prot_hook ____cacheline_aligned_in_smp;
> +};
> +
> struct packet_skb_cb {
> unsigned int origlen;
> union {
> @@ -344,6 +362,164 @@ static void packet_sock_destruct(struct sock *sk)
> sk_refcnt_debug_dec(sk);
> }
>
> +static int fanout_rr_next(struct packet_fanout *f)
> +{
> + int x = atomic_read(&f->rr_cur) + 1;
> +
> + if (x >= f->num_members)
> + x = 0;
> +
> + return x;
> +}
> +
> +static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb)
> +{
> + u32 idx = ((u64)skb->rxhash * f->num_members) >> 32;
> +
> + return f->arr[idx];
> +}
> +
> +static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb)
> +{
> + int cur, old;
> +
> + cur = atomic_read(&f->rr_cur);
> + while ((old = atomic_cmpxchg(&f->rr_cur, cur,
> + fanout_rr_next(f))) != cur)
> + cur = old;
> + return f->arr[cur];
> +}
> +
> +static int packet_rcv_fanout_hash(struct sk_buff *skb, struct net_device *dev,
> + struct packet_type *pt, struct net_device *orig_dev)
> +{
> + struct packet_fanout *f = pt->af_packet_priv;
> + struct packet_sock *po;
> + struct sock *sk;
> +
> + if (!net_eq(dev_net(dev), read_pnet(&f->net))) {
> + kfree_skb(skb);
> + return 0;
> + }
> +
> + sk = fanout_demux_hash(f, skb);
> + po = pkt_sk(sk);
> +
> + return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
> +}
> +
> +static int packet_rcv_fanout_lb(struct sk_buff *skb, struct net_device *dev,
> + struct packet_type *pt, struct net_device *orig_dev)
> +{
> + struct packet_fanout *f = pt->af_packet_priv;
> + struct packet_sock *po;
> + struct sock *sk;
> +
> + if (!net_eq(dev_net(dev), read_pnet(&f->net))) {
> + kfree_skb(skb);
> + return 0;
> + }
> +
> + sk = fanout_demux_lb(f, skb);
> + po = pkt_sk(sk);
> +
> + return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
> +}
> +
> +static DEFINE_MUTEX(fanout_mutex);
> +static LIST_HEAD(fanout_list);
> +
> +static int fanout_add(struct sock *sk, u16 id, u8 type)
> +{
> + struct packet_sock *po = pkt_sk(sk);
> + struct packet_fanout *f, *match;
> + int err;
> +
> + switch (type) {
> + case PACKET_FANOUT_HASH:
> + case PACKET_FANOUT_LB:
> + break;
> + default:
> + return -EINVAL;
> + }
> +
> + if (!po->running)
> + return -EINVAL;
> +
> + mutex_lock(&fanout_mutex);
> + match = NULL;
> + list_for_each_entry(f, &fanout_list, list) {
> + if (f->id == id) {
> + match = f;
> + break;
> + }
> + }
> + if (!match) {
> + match = kzalloc(sizeof(*match), GFP_KERNEL);
> + if (match) {
> + write_pnet(&match->net, sock_net(sk));
> + match->id = id;
> + match->type = type;
> + atomic_set(&match->rr_cur, 0);
> + INIT_LIST_HEAD(&match->list);
> + match->prot_hook.type = po->prot_hook.type;
> + match->prot_hook.dev = po->prot_hook.dev;
> + switch (type) {
> + case PACKET_FANOUT_HASH:
> + match->prot_hook.func = packet_rcv_fanout_hash;
> + break;
> + case PACKET_FANOUT_LB:
> + match->prot_hook.func = packet_rcv_fanout_lb;
> + break;
> + }
> + match->prot_hook.af_packet_priv = match;
> + dev_add_pack(&match->prot_hook);
> + }
> + }
> + err = -ENOMEM;
> + if (match) {
> + err = -EINVAL;
> + if (match->type == type) {
> + err = -ENOSPC;
> + if (match->num_members < PACKET_FANOUT_MAX) {
> + __dev_remove_pack(&po->prot_hook);
> + po->fanout = match;
> + match->arr[match->num_members] = sk;
> + smp_wmb();
> + match->num_members++;
> + err = 0;
> + }
> + }
> + }
> + mutex_unlock(&fanout_mutex);
> + return err;
> +}
> +
> +static void fanout_del(struct sock *sk)
> +{
> + struct packet_sock *po = pkt_sk(sk);
> + struct packet_fanout *f;
> + int i;
> +
> + f = po->fanout;
> + po->fanout = NULL;
> +
> + mutex_lock(&fanout_mutex);
> + for (i = 0; i < f->num_members; i++) {
> + if (f->arr[i] == sk)
> + break;
> + }
> + BUG_ON(i >= f->num_members);
> + f->arr[i] = f->arr[f->num_members - 1];
> + f->num_members--;
> +
> + if (!f->num_members) {
> + list_del(&f->list);
> + dev_remove_pack(&f->prot_hook);
> + kfree(f);
> + }
> + mutex_unlock(&fanout_mutex);
> +}
>
> static const struct proto_ops packet_ops;
>
> @@ -1343,7 +1519,10 @@ static int packet_release(struct socket *sock)
> */
> po->running = 0;
> po->num = 0;
> - __dev_remove_pack(&po->prot_hook);
> + if (po->fanout)
> + fanout_del(sk);
> + else
> + __dev_remove_pack(&po->prot_hook);
> __sock_put(sk);
> }
> if (po->prot_hook.dev) {
> @@ -1396,9 +1575,11 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc
> __sock_put(sk);
> po->running = 0;
> po->num = 0;
> - spin_unlock(&po->bind_lock);
> - dev_remove_pack(&po->prot_hook);
> - spin_lock(&po->bind_lock);
> + if (!po->fanout) {
> + spin_unlock(&po->bind_lock);
> + dev_remove_pack(&po->prot_hook);
> + spin_lock(&po->bind_lock);
> + }
> }
>
> po->num = protocol;
> @@ -1413,7 +1594,8 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc
> goto out_unlock;
>
> if (!dev || (dev->flags & IFF_UP)) {
> - dev_add_pack(&po->prot_hook);
> + if (!po->fanout)
> + dev_add_pack(&po->prot_hook);
> sock_hold(sk);
> po->running = 1;
> } else {
> @@ -1542,7 +1724,8 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
>
> if (proto) {
> po->prot_hook.type = proto;
> - dev_add_pack(&po->prot_hook);
> + if (!po->fanout)
> + dev_add_pack(&po->prot_hook);
> sock_hold(sk);
> po->running = 1;
> }
> @@ -2109,6 +2292,17 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
> po->tp_tstamp = val;
> return 0;
> }
> + case PACKET_FANOUT:
> + {
> + int val;
> +
> + if (optlen != sizeof(val))
> + return -EINVAL;
> + if (copy_from_user(&val, optval, sizeof(val)))
> + return -EFAULT;
> +
> + return fanout_add(sk, val & 0xffff, val >> 16);
> + }
> default:
> return -ENOPROTOOPT;
> }
> @@ -2207,6 +2401,15 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
> val = po->tp_tstamp;
> data = &val;
> break;
> + case PACKET_FANOUT:
> + if (len > sizeof(int))
> + len = sizeof(int);
> + val = (po->fanout ?
> + ((u32)po->fanout->id |
> + ((u32)po->fanout->type << 16)) :
> + 0);
> + data = &val;
> + break;
> default:
> return -ENOPROTOOPT;
> }
> @@ -2260,7 +2463,8 @@ static int packet_notifier(struct notifier_block *this, unsigned long msg, void
> if (dev->ifindex == po->ifindex) {
> spin_lock(&po->bind_lock);
> if (po->num && !po->running) {
> - dev_add_pack(&po->prot_hook);
> + if (!po->fanout)
> + dev_add_pack(&po->prot_hook);
> sock_hold(sk);
> po->running = 1;
> }
> @@ -2530,7 +2734,8 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
> was_running = po->running;
> num = po->num;
> if (was_running) {
> - __dev_remove_pack(&po->prot_hook);
> + if (!po->fanout)
> + __dev_remove_pack(&po->prot_hook);
> po->num = 0;
> po->running = 0;
> __sock_put(sk);
> @@ -2568,7 +2773,8 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
> sock_hold(sk);
> po->running = 1;
> po->num = num;
> - dev_add_pack(&po->prot_hook);
> + if (!po->fanout)
> + dev_add_pack(&po->prot_hook);
> }
> spin_unlock(&po->bind_lock);
>
>
--
---------------------------------------------
Victor Julien
http://www.inliniac.net/
PGP: http://www.inliniac.net/victorjulien.asc
---------------------------------------------
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [RFC PATCH] packet: Add fanout support.
2011-06-21 10:39 ` Victor Julien
@ 2011-06-21 10:46 ` David Miller
2011-06-21 13:05 ` Changli Gao
0 siblings, 1 reply; 10+ messages in thread
From: David Miller @ 2011-06-21 10:46 UTC (permalink / raw)
To: victor; +Cc: netdev
From: Victor Julien <victor@inliniac.net>
Date: Tue, 21 Jun 2011 12:39:11 +0200
> The hash based on skb->rxhash, does that result in a "flow" based
> distribution over the listeners? So all packets sharing a tuple
> being sent to the same socket?
Yes, that's exactly right.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [RFC PATCH] packet: Add fanout support.
2011-06-21 10:46 ` David Miller
@ 2011-06-21 13:05 ` Changli Gao
2011-06-21 13:27 ` Victor Julien
2011-06-21 21:31 ` David Miller
0 siblings, 2 replies; 10+ messages in thread
From: Changli Gao @ 2011-06-21 13:05 UTC (permalink / raw)
To: David Miller; +Cc: victor, netdev
On Tue, Jun 21, 2011 at 6:46 PM, David Miller <davem@davemloft.net> wrote:
> From: Victor Julien <victor@inliniac.net>
> Date: Tue, 21 Jun 2011 12:39:11 +0200
>
>> The hash based on skb->rxhash, does that result in a "flow" based
>> distribution over the listeners? So all packets sharing a tuple
>> being sent to the same socket?
>
> Yes, that's exactly right.
But not for fragments, in additional.
--
Regards,
Changli Gao(xiaosuo@gmail.com)
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [RFC PATCH] packet: Add fanout support.
2011-06-21 13:05 ` Changli Gao
@ 2011-06-21 13:27 ` Victor Julien
2011-06-21 21:39 ` David Miller
2011-06-21 21:31 ` David Miller
1 sibling, 1 reply; 10+ messages in thread
From: Victor Julien @ 2011-06-21 13:27 UTC (permalink / raw)
To: Changli Gao; +Cc: David Miller, netdev
On 06/21/2011 03:05 PM, Changli Gao wrote:
> On Tue, Jun 21, 2011 at 6:46 PM, David Miller <davem@davemloft.net> wrote:
>> From: Victor Julien <victor@inliniac.net>
>> Date: Tue, 21 Jun 2011 12:39:11 +0200
>>
>>> The hash based on skb->rxhash, does that result in a "flow" based
>>> distribution over the listeners? So all packets sharing a tuple
>>> being sent to the same socket?
>>
>> Yes, that's exactly right.
>
> But not for fragments, in additional.
>From a Suricata IDS point of view, I would need to have the fragments of
a flow/tuple on the same socket.
--
---------------------------------------------
Victor Julien
http://www.inliniac.net/
PGP: http://www.inliniac.net/victorjulien.asc
---------------------------------------------
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [RFC PATCH] packet: Add fanout support.
2011-06-21 13:05 ` Changli Gao
2011-06-21 13:27 ` Victor Julien
@ 2011-06-21 21:31 ` David Miller
1 sibling, 0 replies; 10+ messages in thread
From: David Miller @ 2011-06-21 21:31 UTC (permalink / raw)
To: xiaosuo; +Cc: victor, netdev
From: Changli Gao <xiaosuo@gmail.com>
Date: Tue, 21 Jun 2011 21:05:24 +0800
> On Tue, Jun 21, 2011 at 6:46 PM, David Miller <davem@davemloft.net> wrote:
>> From: Victor Julien <victor@inliniac.net>
>> Date: Tue, 21 Jun 2011 12:39:11 +0200
>>
>>> The hash based on skb->rxhash, does that result in a "flow" based
>>> distribution over the listeners? So all packets sharing a tuple
>>> being sent to the same socket?
>>
>> Yes, that's exactly right.
>
> But not for fragments, in additional.
That's a fundamental issue in the RFS code which I've proposed
solutions for.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [RFC PATCH] packet: Add fanout support.
2011-06-21 13:27 ` Victor Julien
@ 2011-06-21 21:39 ` David Miller
2011-06-22 1:44 ` Changli Gao
0 siblings, 1 reply; 10+ messages in thread
From: David Miller @ 2011-06-21 21:39 UTC (permalink / raw)
To: victor; +Cc: xiaosuo, netdev
From: Victor Julien <victor@inliniac.net>
Date: Tue, 21 Jun 2011 15:27:54 +0200
> From a Suricata IDS point of view, I would need to have the
> fragments of a flow/tuple on the same socket.
Currently you would, they would all go to the first socket in
the fanout.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [RFC PATCH] packet: Add fanout support.
2011-06-21 21:39 ` David Miller
@ 2011-06-22 1:44 ` Changli Gao
2011-06-22 2:12 ` David Miller
2011-06-22 6:49 ` Victor Julien
0 siblings, 2 replies; 10+ messages in thread
From: Changli Gao @ 2011-06-22 1:44 UTC (permalink / raw)
To: David Miller; +Cc: victor, netdev
On Wed, Jun 22, 2011 at 5:39 AM, David Miller <davem@davemloft.net> wrote:
> From: Victor Julien <victor@inliniac.net>
> Date: Tue, 21 Jun 2011 15:27:54 +0200
>
>> From a Suricata IDS point of view, I would need to have the
>> fragments of a flow/tuple on the same socket.
>
> Currently you would, they would all go to the first socket in
> the fanout.
>
I think he also needs all the packets belong to the related
connections are received via the same socket. I am afraid that he has
to dispatch these kind of packets among the uesrland processes again.
:)
--
Regards,
Changli Gao(xiaosuo@gmail.com)
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [RFC PATCH] packet: Add fanout support.
2011-06-22 1:44 ` Changli Gao
@ 2011-06-22 2:12 ` David Miller
2011-06-22 6:49 ` Victor Julien
1 sibling, 0 replies; 10+ messages in thread
From: David Miller @ 2011-06-22 2:12 UTC (permalink / raw)
To: xiaosuo; +Cc: victor, netdev
From: Changli Gao <xiaosuo@gmail.com>
Date: Wed, 22 Jun 2011 09:44:00 +0800
> I think he also needs all the packets belong to the related
> connections are received via the same socket. I am afraid that he has
> to dispatch these kind of packets among the uesrland processes again.
> :)
I mean, if we really wanted to, we could create a new ip_defrag()
client case in the rxhash receive code. But this would need to be
configurable and off by default.
It would provide the desired behavior.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [RFC PATCH] packet: Add fanout support.
2011-06-22 1:44 ` Changli Gao
2011-06-22 2:12 ` David Miller
@ 2011-06-22 6:49 ` Victor Julien
1 sibling, 0 replies; 10+ messages in thread
From: Victor Julien @ 2011-06-22 6:49 UTC (permalink / raw)
To: Changli Gao; +Cc: David Miller, netdev
On 06/22/2011 03:44 AM, Changli Gao wrote:
> On Wed, Jun 22, 2011 at 5:39 AM, David Miller <davem@davemloft.net> wrote:
>> From: Victor Julien <victor@inliniac.net>
>> Date: Tue, 21 Jun 2011 15:27:54 +0200
>>
>>> From a Suricata IDS point of view, I would need to have the
>>> fragments of a flow/tuple on the same socket.
>>
>> Currently you would, they would all go to the first socket in
>> the fanout.
>>
>
> I think he also needs all the packets belong to the related
> connections are received via the same socket. I am afraid that he has
> to dispatch these kind of packets among the uesrland processes again.
> :)
>
Indeed. Although in Suricata we *could* work around it as we distribute
the flows over threads, not processes. It would still be messy. For this
to be useful to a tool like Snort (I'm sure they're interested) I think
this would be a deal breaker.
--
---------------------------------------------
Victor Julien
http://www.inliniac.net/
PGP: http://www.inliniac.net/victorjulien.asc
---------------------------------------------
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2011-06-22 6:50 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-06-21 9:53 [RFC PATCH] packet: Add fanout support David Miller
2011-06-21 10:39 ` Victor Julien
2011-06-21 10:46 ` David Miller
2011-06-21 13:05 ` Changli Gao
2011-06-21 13:27 ` Victor Julien
2011-06-21 21:39 ` David Miller
2011-06-22 1:44 ` Changli Gao
2011-06-22 2:12 ` David Miller
2011-06-22 6:49 ` Victor Julien
2011-06-21 21:31 ` David Miller
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).