[RFC PATCH] packet: Add fanout support.

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [RFC PATCH] packet: Add fanout support.
@ 2011-06-21  9:53 David Miller
  2011-06-21 10:39 ` Victor Julien
  0 siblings, 1 reply; 10+ messages in thread
From: David Miller @ 2011-06-21  9:53 UTC (permalink / raw)
  To: netdev; +Cc: victor


This adds demuxing support for AF_PACKET sockets.  It's just to give
people an idea; I've only build-tested this patch.

Basically it allows spreading the AF_PACKET processing load among
several AF_PACKET sockets.  The distribution can be based either upon
hashing (PACKET_FANOUT_HASH) or round-robin based load-balancing
(PACKET_FANOUT_LB).

The hash based fanout takes advantage of the precomputed skb->rxhash
and only costs ~20 cpu cycles.

A restriction is that you must bind the AF_PACKET socket fully before
you add it to a fanout.

The encoding of the PACKET_FANOUT socket option argument is:

	(PACKET_FANOUT_{HASH,LB} << 16) | (ID & 0xffff)

All sockets adding themselves to the same fanout ID must use
the same PACKET_FANOUT_* type and must also be bound to the same
device/protocol.

The implementation is agnostic to the type of AF_PACKET sockets in
use.  You can use mmap based, and non-mmap based, AF_PACKET sockets.
It simply doesn't care.

Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
index 7b31863..1efa1cb 100644
--- a/include/linux/if_packet.h
+++ b/include/linux/if_packet.h
@@ -49,6 +49,10 @@ struct sockaddr_ll {
 #define PACKET_VNET_HDR			15
 #define PACKET_TX_TIMESTAMP		16
 #define PACKET_TIMESTAMP		17
+#define PACKET_FANOUT			18
+
+#define PACKET_FANOUT_HASH		0
+#define PACKET_FANOUT_LB		1
 
 struct tpacket_stats {
 	unsigned int	tp_packets;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 461b16f..e6af2eb 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -187,9 +187,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
 
 static void packet_flush_mclist(struct sock *sk);
 
+struct packet_fanout;
 struct packet_sock {
 	/* struct sock has to be the first member of packet_sock */
 	struct sock		sk;
+	struct packet_fanout	*fanout;
 	struct tpacket_stats	stats;
 	struct packet_ring_buffer	rx_ring;
 	struct packet_ring_buffer	tx_ring;
@@ -212,6 +214,22 @@ struct packet_sock {
 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
 };
 
+#define PACKET_FANOUT_MAX	2048
+
+struct packet_fanout {
+#ifdef CONFIG_NET_NS
+	struct net		*net;
+#endif
+	int			num_members;
+	u16			id;
+	u8			type;
+	u8			pad;
+	atomic_t		rr_cur;
+	struct list_head	list;
+	struct sock		*arr[PACKET_FANOUT_MAX];
+	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
+};
+
 struct packet_skb_cb {
 	unsigned int origlen;
 	union {
@@ -344,6 +362,164 @@ static void packet_sock_destruct(struct sock *sk)
 	sk_refcnt_debug_dec(sk);
 }
 
+static int fanout_rr_next(struct packet_fanout *f)
+{
+	int x = atomic_read(&f->rr_cur) + 1;
+
+	if (x >= f->num_members)
+		x = 0;
+
+	return x;
+}
+
+static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb)
+{
+	u32 idx = ((u64)skb->rxhash * f->num_members) >> 32;
+
+	return f->arr[idx];
+}
+
+static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb)
+{
+	int cur, old;
+
+	cur = atomic_read(&f->rr_cur);
+	while ((old = atomic_cmpxchg(&f->rr_cur, cur,
+				     fanout_rr_next(f))) != cur)
+		cur = old;
+	return f->arr[cur];
+}
+
+static int packet_rcv_fanout_hash(struct sk_buff *skb, struct net_device *dev,
+				  struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct packet_fanout *f = pt->af_packet_priv;
+	struct packet_sock *po;
+	struct sock *sk;
+
+	if (!net_eq(dev_net(dev), read_pnet(&f->net))) {
+		kfree_skb(skb);
+		return 0;
+	}
+
+	sk = fanout_demux_hash(f, skb);
+	po = pkt_sk(sk);
+
+	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
+}
+
+static int packet_rcv_fanout_lb(struct sk_buff *skb, struct net_device *dev,
+				struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct packet_fanout *f = pt->af_packet_priv;
+	struct packet_sock *po;
+	struct sock *sk;
+
+	if (!net_eq(dev_net(dev), read_pnet(&f->net))) {
+		kfree_skb(skb);
+		return 0;
+	}
+
+	sk = fanout_demux_lb(f, skb);
+	po = pkt_sk(sk);
+
+	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
+}
+
+static DEFINE_MUTEX(fanout_mutex);
+static LIST_HEAD(fanout_list);
+
+static int fanout_add(struct sock *sk, u16 id, u8 type)
+{
+	struct packet_sock *po = pkt_sk(sk);
+	struct packet_fanout *f, *match;
+	int err;
+
+	switch (type) {
+	case PACKET_FANOUT_HASH:
+	case PACKET_FANOUT_LB:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (!po->running)
+		return -EINVAL;
+
+	mutex_lock(&fanout_mutex);
+	match = NULL;
+	list_for_each_entry(f, &fanout_list, list) {
+		if (f->id == id) {
+			match = f;
+			break;
+		}
+	}
+	if (!match) {
+		match = kzalloc(sizeof(*match), GFP_KERNEL);
+		if (match) {
+			write_pnet(&match->net, sock_net(sk));
+			match->id = id;
+			match->type = type;
+			atomic_set(&match->rr_cur, 0);
+			INIT_LIST_HEAD(&match->list);
+			match->prot_hook.type = po->prot_hook.type;
+			match->prot_hook.dev = po->prot_hook.dev;
+			switch (type) {
+			case PACKET_FANOUT_HASH:
+				match->prot_hook.func = packet_rcv_fanout_hash;
+				break;
+			case PACKET_FANOUT_LB:
+				match->prot_hook.func = packet_rcv_fanout_lb;
+				break;
+			}
+			match->prot_hook.af_packet_priv = match;
+			dev_add_pack(&match->prot_hook);
+		}
+	}
+	err = -ENOMEM;
+	if (match) {
+		err = -EINVAL;
+		if (match->type == type) {
+			err = -ENOSPC;
+			if (match->num_members < PACKET_FANOUT_MAX) {
+				__dev_remove_pack(&po->prot_hook);
+				po->fanout = match;
+				match->arr[match->num_members] = sk;
+				smp_wmb();
+				match->num_members++;
+				err = 0;
+			}
+		}
+	}
+	mutex_unlock(&fanout_mutex);
+	return err;
+}
+
+static void fanout_del(struct sock *sk)
+{
+	struct packet_sock *po = pkt_sk(sk);
+	struct packet_fanout *f;
+	int i;
+
+	f = po->fanout;
+	po->fanout = NULL;
+
+	mutex_lock(&fanout_mutex);
+	for (i = 0; i < f->num_members; i++) {
+		if (f->arr[i] == sk)
+			break;
+	}
+	BUG_ON(i >= f->num_members);
+	f->arr[i] = f->arr[f->num_members - 1];
+	f->num_members--;
+
+	if (!f->num_members) {
+		list_del(&f->list);
+		dev_remove_pack(&f->prot_hook);
+		kfree(f);
+	}
+	mutex_unlock(&fanout_mutex);
+}
 
 static const struct proto_ops packet_ops;
 
@@ -1343,7 +1519,10 @@ static int packet_release(struct socket *sock)
 		 */
 		po->running = 0;
 		po->num = 0;
-		__dev_remove_pack(&po->prot_hook);
+		if (po->fanout)
+			fanout_del(sk);
+		else
+			__dev_remove_pack(&po->prot_hook);
 		__sock_put(sk);
 	}
 	if (po->prot_hook.dev) {
@@ -1396,9 +1575,11 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc
 		__sock_put(sk);
 		po->running = 0;
 		po->num = 0;
-		spin_unlock(&po->bind_lock);
-		dev_remove_pack(&po->prot_hook);
-		spin_lock(&po->bind_lock);
+		if (!po->fanout) {
+			spin_unlock(&po->bind_lock);
+			dev_remove_pack(&po->prot_hook);
+			spin_lock(&po->bind_lock);
+		}
 	}
 
 	po->num = protocol;
@@ -1413,7 +1594,8 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc
 		goto out_unlock;
 
 	if (!dev || (dev->flags & IFF_UP)) {
-		dev_add_pack(&po->prot_hook);
+		if (!po->fanout)
+			dev_add_pack(&po->prot_hook);
 		sock_hold(sk);
 		po->running = 1;
 	} else {
@@ -1542,7 +1724,8 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 
 	if (proto) {
 		po->prot_hook.type = proto;
-		dev_add_pack(&po->prot_hook);
+		if (!po->fanout)
+			dev_add_pack(&po->prot_hook);
 		sock_hold(sk);
 		po->running = 1;
 	}
@@ -2109,6 +2292,17 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 		po->tp_tstamp = val;
 		return 0;
 	}
+	case PACKET_FANOUT:
+	{
+		int val;
+
+		if (optlen != sizeof(val))
+			return -EINVAL;
+		if (copy_from_user(&val, optval, sizeof(val)))
+			return -EFAULT;
+
+		return fanout_add(sk, val & 0xffff, val >> 16);
+	}
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -2207,6 +2401,15 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 		val = po->tp_tstamp;
 		data = &val;
 		break;
+	case PACKET_FANOUT:
+		if (len > sizeof(int))
+			len = sizeof(int);
+		val = (po->fanout ?
+		       ((u32)po->fanout->id |
+			((u32)po->fanout->type << 16)) :
+		       0);
+		data = &val;
+		break;
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -2260,7 +2463,8 @@ static int packet_notifier(struct notifier_block *this, unsigned long msg, void
 			if (dev->ifindex == po->ifindex) {
 				spin_lock(&po->bind_lock);
 				if (po->num && !po->running) {
-					dev_add_pack(&po->prot_hook);
+					if (!po->fanout)
+						dev_add_pack(&po->prot_hook);
 					sock_hold(sk);
 					po->running = 1;
 				}
@@ -2530,7 +2734,8 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 	was_running = po->running;
 	num = po->num;
 	if (was_running) {
-		__dev_remove_pack(&po->prot_hook);
+		if (!po->fanout)
+			__dev_remove_pack(&po->prot_hook);
 		po->num = 0;
 		po->running = 0;
 		__sock_put(sk);
@@ -2568,7 +2773,8 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		sock_hold(sk);
 		po->running = 1;
 		po->num = num;
-		dev_add_pack(&po->prot_hook);
+		if (!po->fanout)
+			dev_add_pack(&po->prot_hook);
 	}
 	spin_unlock(&po->bind_lock);
 

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] packet: Add fanout support.
  2011-06-21  9:53 [RFC PATCH] packet: Add fanout support David Miller
@ 2011-06-21 10:39 ` Victor Julien
  2011-06-21 10:46   ` David Miller
  0 siblings, 1 reply; 10+ messages in thread
From: Victor Julien @ 2011-06-21 10:39 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On 06/21/2011 11:53 AM, David Miller wrote:
> 
> This adds demuxing support for AF_PACKET sockets.  It's just to give
> people an idea, I've only build tested this patch.
> 
> Basically it allows to spread the AF_PACKET processing load amongst
> several AF_PACKET sockets.  The distribution can either be based upon
> hashing (PACKET_FANOUT_HASH) or round-robin based load-balancing
> (PACKET_FANOUT_LB).
> 
> The hash based fanout takes advantage of the precomputed skb->rxhash
> and only costs ~20 cpu cycles.
> 
> A restriction is that you must bind the AF_PACKET socket fully before
> you add it to a fanout.
> 
> The encoding of the PACKET_FANOUT socket option argument is:
> 
> 	(PACKET_FANOUT_{HASH,LB} << 16) | (ID & 0xffff)
> 
> All sockets adding themselves to the same fanout ID must all use
> the same PACKET_FANOUT_* type and also must be bound to the same
> device/protocol.
> 
> The implementation is agnostic to the type of AF_PACKET sockets in
> use.  You can use mmap based, and non-mmap based, AF_PACKET sockets.
> It simply doesn't care.

Thanks David! Looks interesting. I'm not familiar with the kernel
internals, so just a quick question. The hash based on skb->rxhash, does
that result in a "flow" based distribution over the listeners? So all
packets sharing a tuple being sent to the same socket?

Cheers,
Victor

> Signed-off-by: David S. Miller <davem@davemloft.net>
> 
> diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
> index 7b31863..1efa1cb 100644
> --- a/include/linux/if_packet.h
> +++ b/include/linux/if_packet.h
> @@ -49,6 +49,10 @@ struct sockaddr_ll {
>  #define PACKET_VNET_HDR			15
>  #define PACKET_TX_TIMESTAMP		16
>  #define PACKET_TIMESTAMP		17
> +#define PACKET_FANOUT			18
> +
> +#define PACKET_FANOUT_HASH		0
> +#define PACKET_FANOUT_LB		1
>  
>  struct tpacket_stats {
>  	unsigned int	tp_packets;
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index 461b16f..e6af2eb 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -187,9 +187,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
>  
>  static void packet_flush_mclist(struct sock *sk);
>  
> +struct packet_fanout;
>  struct packet_sock {
>  	/* struct sock has to be the first member of packet_sock */
>  	struct sock		sk;
> +	struct packet_fanout	*fanout;
>  	struct tpacket_stats	stats;
>  	struct packet_ring_buffer	rx_ring;
>  	struct packet_ring_buffer	tx_ring;
> @@ -212,6 +214,22 @@ struct packet_sock {
>  	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
>  };
>  
> +#define PACKET_FANOUT_MAX	2048
> +
> +struct packet_fanout {
> +#ifdef CONFIG_NET_NS
> +	struct net		*net;
> +#endif
> +	int			num_members;
> +	u16			id;
> +	u8			type;
> +	u8			pad;
> +	atomic_t		rr_cur;
> +	struct list_head	list;
> +	struct sock		*arr[PACKET_FANOUT_MAX];
> +	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
> +};
> +
>  struct packet_skb_cb {
>  	unsigned int origlen;
>  	union {
> @@ -344,6 +362,164 @@ static void packet_sock_destruct(struct sock *sk)
>  	sk_refcnt_debug_dec(sk);
>  }
>  
> +static int fanout_rr_next(struct packet_fanout *f)
> +{
> +	int x = atomic_read(&f->rr_cur) + 1;
> +
> +	if (x >= f->num_members)
> +		x = 0;
> +
> +	return x;
> +}
> +
> +static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb)
> +{
> +	u32 idx = ((u64)skb->rxhash * f->num_members) >> 32;
> +
> +	return f->arr[idx];
> +}
> +
> +static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb)
> +{
> +	int cur, old;
> +
> +	cur = atomic_read(&f->rr_cur);
> +	while ((old = atomic_cmpxchg(&f->rr_cur, cur,
> +				     fanout_rr_next(f))) != cur)
> +		cur = old;
> +	return f->arr[cur];
> +}
> +
> +static int packet_rcv_fanout_hash(struct sk_buff *skb, struct net_device *dev,
> +				  struct packet_type *pt, struct net_device *orig_dev)
> +{
> +	struct packet_fanout *f = pt->af_packet_priv;
> +	struct packet_sock *po;
> +	struct sock *sk;
> +
> +	if (!net_eq(dev_net(dev), read_pnet(&f->net))) {
> +		kfree_skb(skb);
> +		return 0;
> +	}
> +
> +	sk = fanout_demux_hash(f, skb);
> +	po = pkt_sk(sk);
> +
> +	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
> +}
> +
> +static int packet_rcv_fanout_lb(struct sk_buff *skb, struct net_device *dev,
> +				struct packet_type *pt, struct net_device *orig_dev)
> +{
> +	struct packet_fanout *f = pt->af_packet_priv;
> +	struct packet_sock *po;
> +	struct sock *sk;
> +
> +	if (!net_eq(dev_net(dev), read_pnet(&f->net))) {
> +		kfree_skb(skb);
> +		return 0;
> +	}
> +
> +	sk = fanout_demux_lb(f, skb);
> +	po = pkt_sk(sk);
> +
> +	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
> +}
> +
> +static DEFINE_MUTEX(fanout_mutex);
> +static LIST_HEAD(fanout_list);
> +
> +static int fanout_add(struct sock *sk, u16 id, u8 type)
> +{
> +	struct packet_sock *po = pkt_sk(sk);
> +	struct packet_fanout *f, *match;
> +	int err;
> +
> +	switch (type) {
> +	case PACKET_FANOUT_HASH:
> +	case PACKET_FANOUT_LB:
> +		break;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	if (!po->running)
> +		return -EINVAL;
> +
> +	mutex_lock(&fanout_mutex);
> +	match = NULL;
> +	list_for_each_entry(f, &fanout_list, list) {
> +		if (f->id == id) {
> +			match = f;
> +			break;
> +		}
> +	}
> +	if (!match) {
> +		match = kzalloc(sizeof(*match), GFP_KERNEL);
> +		if (match) {
> +			write_pnet(&match->net, sock_net(sk));
> +			match->id = id;
> +			match->type = type;
> +			atomic_set(&match->rr_cur, 0);
> +			INIT_LIST_HEAD(&match->list);
> +			match->prot_hook.type = po->prot_hook.type;
> +			match->prot_hook.dev = po->prot_hook.dev;
> +			switch (type) {
> +			case PACKET_FANOUT_HASH:
> +				match->prot_hook.func = packet_rcv_fanout_hash;
> +				break;
> +			case PACKET_FANOUT_LB:
> +				match->prot_hook.func = packet_rcv_fanout_lb;
> +				break;
> +			}
> +			match->prot_hook.af_packet_priv = match;
> +			dev_add_pack(&match->prot_hook);
> +		}
> +	}
> +	err = -ENOMEM;
> +	if (match) {
> +		err = -EINVAL;
> +		if (match->type == type) {
> +			err = -ENOSPC;
> +			if (match->num_members < PACKET_FANOUT_MAX) {
> +				__dev_remove_pack(&po->prot_hook);
> +				po->fanout = match;
> +				match->arr[match->num_members] = sk;
> +				smp_wmb();
> +				match->num_members++;
> +				err = 0;
> +			}
> +		}
> +	}
> +	mutex_unlock(&fanout_mutex);
> +	return err;
> +}
> +
> +static void fanout_del(struct sock *sk)
> +{
> +	struct packet_sock *po = pkt_sk(sk);
> +	struct packet_fanout *f;
> +	int i;
> +
> +	f = po->fanout;
> +	po->fanout = NULL;
> +
> +	mutex_lock(&fanout_mutex);
> +	for (i = 0; i < f->num_members; i++) {
> +		if (f->arr[i] == sk)
> +			break;
> +	}
> +	BUG_ON(i >= f->num_members);
> +	f->arr[i] = f->arr[f->num_members - 1];
> +	f->num_members--;
> +
> +	if (!f->num_members) {
> +		list_del(&f->list);
> +		dev_remove_pack(&f->prot_hook);
> +		kfree(f);
> +	}
> +	mutex_unlock(&fanout_mutex);
> +}
>  
>  static const struct proto_ops packet_ops;
>  
> @@ -1343,7 +1519,10 @@ static int packet_release(struct socket *sock)
>  		 */
>  		po->running = 0;
>  		po->num = 0;
> -		__dev_remove_pack(&po->prot_hook);
> +		if (po->fanout)
> +			fanout_del(sk);
> +		else
> +			__dev_remove_pack(&po->prot_hook);
>  		__sock_put(sk);
>  	}
>  	if (po->prot_hook.dev) {
> @@ -1396,9 +1575,11 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc
>  		__sock_put(sk);
>  		po->running = 0;
>  		po->num = 0;
> -		spin_unlock(&po->bind_lock);
> -		dev_remove_pack(&po->prot_hook);
> -		spin_lock(&po->bind_lock);
> +		if (!po->fanout) {
> +			spin_unlock(&po->bind_lock);
> +			dev_remove_pack(&po->prot_hook);
> +			spin_lock(&po->bind_lock);
> +		}
>  	}
>  
>  	po->num = protocol;
> @@ -1413,7 +1594,8 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc
>  		goto out_unlock;
>  
>  	if (!dev || (dev->flags & IFF_UP)) {
> -		dev_add_pack(&po->prot_hook);
> +		if (!po->fanout)
> +			dev_add_pack(&po->prot_hook);
>  		sock_hold(sk);
>  		po->running = 1;
>  	} else {
> @@ -1542,7 +1724,8 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
>  
>  	if (proto) {
>  		po->prot_hook.type = proto;
> -		dev_add_pack(&po->prot_hook);
> +		if (!po->fanout)
> +			dev_add_pack(&po->prot_hook);
>  		sock_hold(sk);
>  		po->running = 1;
>  	}
> @@ -2109,6 +2292,17 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
>  		po->tp_tstamp = val;
>  		return 0;
>  	}
> +	case PACKET_FANOUT:
> +	{
> +		int val;
> +
> +		if (optlen != sizeof(val))
> +			return -EINVAL;
> +		if (copy_from_user(&val, optval, sizeof(val)))
> +			return -EFAULT;
> +
> +		return fanout_add(sk, val & 0xffff, val >> 16);
> +	}
>  	default:
>  		return -ENOPROTOOPT;
>  	}
> @@ -2207,6 +2401,15 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
>  		val = po->tp_tstamp;
>  		data = &val;
>  		break;
> +	case PACKET_FANOUT:
> +		if (len > sizeof(int))
> +			len = sizeof(int);
> +		val = (po->fanout ?
> +		       ((u32)po->fanout->id |
> +			((u32)po->fanout->type << 16)) :
> +		       0);
> +		data = &val;
> +		break;
>  	default:
>  		return -ENOPROTOOPT;
>  	}
> @@ -2260,7 +2463,8 @@ static int packet_notifier(struct notifier_block *this, unsigned long msg, void
>  			if (dev->ifindex == po->ifindex) {
>  				spin_lock(&po->bind_lock);
>  				if (po->num && !po->running) {
> -					dev_add_pack(&po->prot_hook);
> +					if (!po->fanout)
> +						dev_add_pack(&po->prot_hook);
>  					sock_hold(sk);
>  					po->running = 1;
>  				}
> @@ -2530,7 +2734,8 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
>  	was_running = po->running;
>  	num = po->num;
>  	if (was_running) {
> -		__dev_remove_pack(&po->prot_hook);
> +		if (!po->fanout)
> +			__dev_remove_pack(&po->prot_hook);
>  		po->num = 0;
>  		po->running = 0;
>  		__sock_put(sk);
> @@ -2568,7 +2773,8 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
>  		sock_hold(sk);
>  		po->running = 1;
>  		po->num = num;
> -		dev_add_pack(&po->prot_hook);
> +		if (!po->fanout)
> +			dev_add_pack(&po->prot_hook);
>  	}
>  	spin_unlock(&po->bind_lock);
>  
> 


-- 
---------------------------------------------
Victor Julien
http://www.inliniac.net/
PGP: http://www.inliniac.net/victorjulien.asc
---------------------------------------------


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] packet: Add fanout support.
  2011-06-21 10:39 ` Victor Julien
@ 2011-06-21 10:46   ` David Miller
  2011-06-21 13:05     ` Changli Gao
  0 siblings, 1 reply; 10+ messages in thread
From: David Miller @ 2011-06-21 10:46 UTC (permalink / raw)
  To: victor; +Cc: netdev

From: Victor Julien <victor@inliniac.net>
Date: Tue, 21 Jun 2011 12:39:11 +0200

> The hash based on skb->rxhash, does that result in a "flow" based
> distribution over the listeners? So all packets sharing a tuple
> being sent to the same socket?

Yes, that's exactly right.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] packet: Add fanout support.
  2011-06-21 10:46   ` David Miller
@ 2011-06-21 13:05     ` Changli Gao
  2011-06-21 13:27       ` Victor Julien
  2011-06-21 21:31       ` David Miller
  0 siblings, 2 replies; 10+ messages in thread
From: Changli Gao @ 2011-06-21 13:05 UTC (permalink / raw)
  To: David Miller; +Cc: victor, netdev

On Tue, Jun 21, 2011 at 6:46 PM, David Miller <davem@davemloft.net> wrote:
> From: Victor Julien <victor@inliniac.net>
> Date: Tue, 21 Jun 2011 12:39:11 +0200
>
>> The hash based on skb->rxhash, does that result in a "flow" based
>> distribution over the listeners? So all packets sharing a tuple
>> being sent to the same socket?
>
> Yes, that's exactly right.

But not for fragments, in addition.


-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] packet: Add fanout support.
  2011-06-21 13:05     ` Changli Gao
@ 2011-06-21 13:27       ` Victor Julien
  2011-06-21 21:39         ` David Miller
  2011-06-21 21:31       ` David Miller
  1 sibling, 1 reply; 10+ messages in thread
From: Victor Julien @ 2011-06-21 13:27 UTC (permalink / raw)
  To: Changli Gao; +Cc: David Miller, netdev

On 06/21/2011 03:05 PM, Changli Gao wrote:
> On Tue, Jun 21, 2011 at 6:46 PM, David Miller <davem@davemloft.net> wrote:
>> From: Victor Julien <victor@inliniac.net>
>> Date: Tue, 21 Jun 2011 12:39:11 +0200
>>
>>> The hash based on skb->rxhash, does that result in a "flow" based
>>> distribution over the listeners? So all packets sharing a tuple
>>> being sent to the same socket?
>>
>> Yes, that's exactly right.
> 
> But not for fragments, in additional.

From a Suricata IDS point of view, I would need to have the fragments of
a flow/tuple on the same socket.

-- 
---------------------------------------------
Victor Julien
http://www.inliniac.net/
PGP: http://www.inliniac.net/victorjulien.asc
---------------------------------------------


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] packet: Add fanout support.
  2011-06-21 13:05     ` Changli Gao
  2011-06-21 13:27       ` Victor Julien
@ 2011-06-21 21:31       ` David Miller
  1 sibling, 0 replies; 10+ messages in thread
From: David Miller @ 2011-06-21 21:31 UTC (permalink / raw)
  To: xiaosuo; +Cc: victor, netdev

From: Changli Gao <xiaosuo@gmail.com>
Date: Tue, 21 Jun 2011 21:05:24 +0800

> On Tue, Jun 21, 2011 at 6:46 PM, David Miller <davem@davemloft.net> wrote:
>> From: Victor Julien <victor@inliniac.net>
>> Date: Tue, 21 Jun 2011 12:39:11 +0200
>>
>>> The hash based on skb->rxhash, does that result in a "flow" based
>>> distribution over the listeners? So all packets sharing a tuple
>>> being sent to the same socket?
>>
>> Yes, that's exactly right.
> 
> But not for fragments, in additional.

That's a fundamental issue in the RFS code which I've proposed
solutions for.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] packet: Add fanout support.
  2011-06-21 13:27       ` Victor Julien
@ 2011-06-21 21:39         ` David Miller
  2011-06-22  1:44           ` Changli Gao
  0 siblings, 1 reply; 10+ messages in thread
From: David Miller @ 2011-06-21 21:39 UTC (permalink / raw)
  To: victor; +Cc: xiaosuo, netdev

From: Victor Julien <victor@inliniac.net>
Date: Tue, 21 Jun 2011 15:27:54 +0200

> From a Suricata IDS point of view, I would need to have the
> fragments of a flow/tuple on the same socket.

Currently you would, they would all go to the first socket in
the fanout.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] packet: Add fanout support.
  2011-06-21 21:39         ` David Miller
@ 2011-06-22  1:44           ` Changli Gao
  2011-06-22  2:12             ` David Miller
  2011-06-22  6:49             ` Victor Julien
  0 siblings, 2 replies; 10+ messages in thread
From: Changli Gao @ 2011-06-22  1:44 UTC (permalink / raw)
  To: David Miller; +Cc: victor, netdev

On Wed, Jun 22, 2011 at 5:39 AM, David Miller <davem@davemloft.net> wrote:
> From: Victor Julien <victor@inliniac.net>
> Date: Tue, 21 Jun 2011 15:27:54 +0200
>
>> From a Suricata IDS point of view, I would need to have the
>> fragments of a flow/tuple on the same socket.
>
> Currently you would, they would all go to the first socket in
> the fanout.
>

I think he also needs all the packets belonging to related
connections to be received via the same socket. I am afraid that he has
to dispatch these kinds of packets among the userland processes again.
:)

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] packet: Add fanout support.
  2011-06-22  1:44           ` Changli Gao
@ 2011-06-22  2:12             ` David Miller
  2011-06-22  6:49             ` Victor Julien
  1 sibling, 0 replies; 10+ messages in thread
From: David Miller @ 2011-06-22  2:12 UTC (permalink / raw)
  To: xiaosuo; +Cc: victor, netdev

From: Changli Gao <xiaosuo@gmail.com>
Date: Wed, 22 Jun 2011 09:44:00 +0800

> I think he also needs all the packets belong to the related
> connections are received via the same socket. I am afraid that he has
> to dispatch these kind of packets among the uesrland processes again.
> :)

I mean, if we really wanted to, we could create a new ip_defrag()
client case in the rxhash receive code.  But this would need to be
configurable and off by default.

It would provide the desired behavior.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] packet: Add fanout support.
  2011-06-22  1:44           ` Changli Gao
  2011-06-22  2:12             ` David Miller
@ 2011-06-22  6:49             ` Victor Julien
  1 sibling, 0 replies; 10+ messages in thread
From: Victor Julien @ 2011-06-22  6:49 UTC (permalink / raw)
  To: Changli Gao; +Cc: David Miller, netdev

On 06/22/2011 03:44 AM, Changli Gao wrote:
> On Wed, Jun 22, 2011 at 5:39 AM, David Miller <davem@davemloft.net> wrote:
>> From: Victor Julien <victor@inliniac.net>
>> Date: Tue, 21 Jun 2011 15:27:54 +0200
>>
>>> From a Suricata IDS point of view, I would need to have the
>>> fragments of a flow/tuple on the same socket.
>>
>> Currently you would, they would all go to the first socket in
>> the fanout.
>>
> 
> I think he also needs all the packets belong to the related
> connections are received via the same socket. I am afraid that he has
> to dispatch these kind of packets among the uesrland processes again.
> :)
> 

Indeed. Although in Suricata we *could* work around it as we distribute
the flows over threads, not processes. It would still be messy. For this
to be useful to a tool like Snort (I'm sure they're interested) I think
this would be a deal breaker.

-- 
---------------------------------------------
Victor Julien
http://www.inliniac.net/
PGP: http://www.inliniac.net/victorjulien.asc
---------------------------------------------


^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2011-06-22  6:50 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-06-21  9:53 [RFC PATCH] packet: Add fanout support David Miller
2011-06-21 10:39 ` Victor Julien
2011-06-21 10:46   ` David Miller
2011-06-21 13:05     ` Changli Gao
2011-06-21 13:27       ` Victor Julien
2011-06-21 21:39         ` David Miller
2011-06-22  1:44           ` Changli Gao
2011-06-22  2:12             ` David Miller
2011-06-22  6:49             ` Victor Julien
2011-06-21 21:31       ` David Miller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).