netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/2] AF_PACKET fanout support
@ 2011-07-05  4:20 David Miller
  2011-07-06  0:46 ` Tom Herbert
  0 siblings, 1 reply; 8+ messages in thread
From: David Miller @ 2011-07-05  4:20 UTC (permalink / raw)
  To: victor; +Cc: netdev


This is a fully functional version, I've tested both hash and
load-balance modes successfully.  I plan to commit this to
net-next-2.6 very soon.

Below is a test program that other people can play with
if they want.  It basically creates 4 threads, and creates
an AF_PACKET fanout amongst them.  Each thread prints out
its pid in parentheses every time it receives 10 packets.
After each thread processes 10,000 packets, it exits.

Try things like "./test eth0 hash", "./test eth0 lb", etc.

Signed-off-by: David S. Miller <davem@davemloft.net>

--------------------
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include <sys/types.h>
#include <sys/wait.h>
#include <sys/socket.h>
#include <sys/ioctl.h>

#include <unistd.h>

#include <arpa/inet.h>

#include <linux/if_ether.h>
#include <linux/if_packet.h>

#include <net/if.h>

/* Interface to capture on; main() sets this from the first command-line argument. */
static const char *device_name;
/* Fanout mode chosen on the command line: PACKET_FANOUT_HASH or PACKET_FANOUT_LB. */
static int fanout_type;
/* Fanout group id; main() sets it to the low 16 bits of its own pid. */
static int fanout_id;

/* Fallback definitions for build environments whose headers do not define PACKET_FANOUT. */
#ifndef PACKET_FANOUT
#define PACKET_FANOUT		18
#define PACKET_FANOUT_HASH		0
#define PACKET_FANOUT_LB		1
#endif

/*
 * Open an AF_PACKET raw socket for IPv4 frames, bind it to the interface
 * named by device_name, and join the fanout group described by fanout_id
 * (low 16 bits of the option value) and fanout_type (upper 16 bits).
 *
 * Returns the socket descriptor on success.  On any failure a diagnostic
 * is printed, the descriptor (if one was opened) is closed, and -1 is
 * returned so that the result can never be mistaken for a valid descriptor.
 */
static int setup_socket(void)
{
	struct sockaddr_ll ll;
	struct ifreq ifr;
	size_t name_len;
	int fanout_arg;
	int fd;

	fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_IP));
	if (fd < 0) {
		perror("socket");
		return -1;
	}

	/* ifr_name is a fixed-size array: refuse names that cannot fit with their NUL. */
	name_len = strlen(device_name);
	if (name_len >= sizeof(ifr.ifr_name)) {
		fprintf(stderr, "Interface name too long [%s]\n", device_name);
		goto err_close;
	}

	/* The memset leaves the name NUL-terminated after the bounded copy. */
	memset(&ifr, 0, sizeof(ifr));
	memcpy(ifr.ifr_name, device_name, name_len);
	if (ioctl(fd, SIOCGIFINDEX, &ifr) < 0) {
		perror("SIOCGIFINDEX");
		goto err_close;
	}

	memset(&ll, 0, sizeof(ll));
	ll.sll_family = AF_PACKET;
	ll.sll_ifindex = ifr.ifr_ifindex;
	if (bind(fd, (struct sockaddr *) &ll, sizeof(ll)) < 0) {
		perror("bind");
		goto err_close;
	}

	fanout_arg = (fanout_id | (fanout_type << 16));
	if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT,
		       &fanout_arg, sizeof(fanout_arg)) < 0) {
		perror("setsockopt");
		goto err_close;
	}

	return fd;

err_close:
	/* perror() has already run, so close() clobbering errno is harmless. */
	close(fd);
	return -1;
}

/*
 * Body of one fanout worker.  Obtains a socket from setup_socket(), reads
 * PACKET_LIMIT packets from it, printing "(pid)" after every REPORT_EVERY
 * packets, then prints a summary line and exits with status 0.
 *
 * This function never returns: it exits with EXIT_FAILURE if setup_socket()
 * yields a negative value or a read() fails, and with 0 otherwise.
 */
static void fanout_thread(void)
{
	enum { PACKET_LIMIT = 10000, REPORT_EVERY = 10 };
	int fd = setup_socket();
	int received;

	/* Use a well-defined failure status rather than passing a negative fd to exit(). */
	if (fd < 0)
		exit(EXIT_FAILURE);

	for (received = 1; received <= PACKET_LIMIT; received++) {
		char buf[1600];
		/* read() returns ssize_t; the packet contents themselves are discarded. */
		ssize_t len = read(fd, buf, sizeof(buf));

		if (len < 0) {
			perror("read");
			exit(EXIT_FAILURE);
		}
		if ((received % REPORT_EVERY) == 0)
			fprintf(stdout, "(%d) \n", (int) getpid());
	}

	fprintf(stdout, "%d: Received %d packets\n", (int) getpid(), PACKET_LIMIT);

	close(fd);
	exit(0);
}

/*
 * Usage: PROG INTERFACE {hash|lb}
 *
 * Records the interface name and fanout mode from the command line, derives
 * the fanout group id from the low 16 bits of this process's pid, forks
 * NUM_WORKERS children that each run fanout_thread(), and waits for them.
 *
 * Returns 0 when every child exited with status 0, EXIT_FAILURE otherwise
 * (bad arguments, fork/wait failure, or a child that failed or was killed).
 */
int main(int argc, char **argp)
{
	enum { NUM_WORKERS = 4 };
	int failed = 0;
	int i;

	if (argc != 3) {
		fprintf(stderr, "Usage: %s INTERFACE {hash|lb}\n", argp[0]);
		return EXIT_FAILURE;
	}

	if (!strcmp(argp[2], "hash"))
		fanout_type = PACKET_FANOUT_HASH;
	else if (!strcmp(argp[2], "lb"))
		fanout_type = PACKET_FANOUT_LB;
	else {
		fprintf(stderr, "Unknown fanout type [%s]\n", argp[2]);
		exit(EXIT_FAILURE);
	}

	device_name = argp[1];
	fanout_id = getpid() & 0xffff;

	for (i = 0; i < NUM_WORKERS; i++) {
		pid_t pid = fork();

		if (pid < 0) {
			perror("fork");
			exit(EXIT_FAILURE);
		}
		if (pid == 0) {
			fanout_thread();
			/*
			 * fanout_thread() exits on its own; this guard makes sure a
			 * child can never continue into the parent's fork loop.
			 */
			exit(EXIT_FAILURE);
		}
	}

	/* Reap every worker and remember whether any of them failed. */
	for (i = 0; i < NUM_WORKERS; i++) {
		int status;

		if (wait(&status) < 0) {
			perror("wait");
			return EXIT_FAILURE;
		}
		if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
			failed = 1;
	}

	return failed ? EXIT_FAILURE : 0;
}

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 0/2] AF_PACKET fanout support
  2011-07-05  4:20 [PATCH 0/2] AF_PACKET fanout support David Miller
@ 2011-07-06  0:46 ` Tom Herbert
  2011-07-06  1:20   ` David Miller
  0 siblings, 1 reply; 8+ messages in thread
From: Tom Herbert @ 2011-07-06  0:46 UTC (permalink / raw)
  To: David Miller; +Cc: victor, netdev, Willem Bruijn

Dave,

Thanks for these patches!  Is it possible you could use an alternative
term than "fanout"?  I think this may be more often associated with a
transmit operation (e.g. multicast fanout).

Also, another useful mode of steering would be to steer packets to a
socket which was recently processed by a thread running on the same
CPU; somewhat analogous to RFS (cc'ed Willem Bruijn who is already
working on this I believe).

Tom

On Mon, Jul 4, 2011 at 9:20 PM, David Miller <davem@davemloft.net> wrote:
>
> This is a fully functional version, I've tested both hash and
> load-balance modes successfully.  I plan to commit this to
> net-next-2.6 very soon.
>
> Below is a test program that other people can play with
> if they want.  It basically creates 4 threads, and creates
> an AF_PACKET fanout amongst them.  Each thread prints out
> it's pid in parentheses every time it receives 10 packets.
> After each thread processes 10,000 packets, it exits.
>
> Try things like "./test eth0 hash", "./test eth0 lb", etc.
>
> Signed-off-by: David S. Miller <davem@davemloft.net>
>
> --------------------
> #include <stddef.h>
> #include <stdlib.h>
> #include <stdio.h>
> #include <string.h>
>
> #include <sys/types.h>
> #include <sys/wait.h>
> #include <sys/socket.h>
> #include <sys/ioctl.h>
>
> #include <unistd.h>
>
> #include <linux/if_ether.h>
> #include <linux/if_packet.h>
>
> #include <net/if.h>
>
> static const char *device_name;
> static int fanout_type;
> static int fanout_id;
>
> #ifndef PACKET_FANOUT
> #define PACKET_FANOUT           18
> #define PACKET_FANOUT_HASH              0
> #define PACKET_FANOUT_LB                1
> #endif
>
> static int setup_socket(void)
> {
>        int err, fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_IP));
>        struct sockaddr_ll ll;
>        struct ifreq ifr;
>        int fanout_arg;
>
>        if (fd < 0) {
>                perror("socket");
>                return EXIT_FAILURE;
>        }
>
>        memset(&ifr, 0, sizeof(ifr));
>        strcpy(ifr.ifr_name, device_name);
>        err = ioctl(fd, SIOCGIFINDEX, &ifr);
>        if (err < 0) {
>                perror("SIOCGIFINDEX");
>                return EXIT_FAILURE;
>        }
>
>        memset(&ll, 0, sizeof(ll));
>        ll.sll_family = AF_PACKET;
>        ll.sll_ifindex = ifr.ifr_ifindex;
>        err = bind(fd, (struct sockaddr *) &ll, sizeof(ll));
>        if (err < 0) {
>                perror("bind");
>                return EXIT_FAILURE;
>        }
>
>        fanout_arg = (fanout_id | (fanout_type << 16));
>        err = setsockopt(fd, SOL_PACKET, PACKET_FANOUT,
>                         &fanout_arg, sizeof(fanout_arg));
>        if (err) {
>                perror("setsockopt");
>                return EXIT_FAILURE;
>        }
>
>        return fd;
> }
>
> static void fanout_thread(void)
> {
>        int fd = setup_socket();
>        int limit = 10000;
>
>        if (fd < 0)
>                exit(fd);
>
>        while (limit-- > 0) {
>                char buf[1600];
>                int err;
>
>                err = read(fd, buf, sizeof(buf));
>                if (err < 0) {
>                        perror("read");
>                        exit(EXIT_FAILURE);
>                }
>                if ((limit % 10) == 0)
>                        fprintf(stdout, "(%d) \n", getpid());
>        }
>
>        fprintf(stdout, "%d: Received 10000 packets\n", getpid());
>
>        close(fd);
>        exit(0);
> }
>
> int main(int argc, char **argp)
> {
>        int fd, err;
>        int i;
>
>        if (argc != 3) {
>                fprintf(stderr, "Usage: %s INTERFACE {hash|lb}\n", argp[0]);
>                return EXIT_FAILURE;
>        }
>
>        if (!strcmp(argp[2], "hash"))
>                fanout_type = PACKET_FANOUT_HASH;
>        else if (!strcmp(argp[2], "lb"))
>                fanout_type = PACKET_FANOUT_LB;
>        else {
>                fprintf(stderr, "Unknown fanout type [%s]\n", argp[2]);
>                exit(EXIT_FAILURE);
>        }
>
>        device_name = argp[1];
>        fanout_id = getpid() & 0xffff;
>
>        for (i = 0; i < 4; i++) {
>                pid_t pid = fork();
>
>                switch (pid) {
>                case 0:
>                        fanout_thread();
>
>                case -1:
>                        perror("fork");
>                        exit(EXIT_FAILURE);
>                }
>        }
>
>        for (i = 0; i < 4; i++) {
>                int status;
>
>                wait(&status);
>        }
>
>        return 0;
> }
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 0/2] AF_PACKET fanout support
  2011-07-06  0:46 ` Tom Herbert
@ 2011-07-06  1:20   ` David Miller
  2011-07-06  3:13     ` Tom Herbert
  0 siblings, 1 reply; 8+ messages in thread
From: David Miller @ 2011-07-06  1:20 UTC (permalink / raw)
  To: therbert; +Cc: victor, netdev, willemb

From: Tom Herbert <therbert@google.com>
Date: Tue, 5 Jul 2011 17:46:36 -0700

> Thanks for these patches!  Is it possible you could use an alternative
> term than "fanout"?  I think this may be more often associated with a
> transmit operation (e.g. multicast fanout).

I've never heard such terminology myself.

Sorry, the fanout name is staying :-)

> Also, another useful mode of steering would be to steer packets to a
> socket which was recently processed by a thread running on the same
> CPU; somewhat analogous to RFS (cc'ed WIllem Bruijn who is already
> working on this I believe).

This sounds like a good way to overload a local socket and prevent
pushing the work to lesser used sockets on other cpus.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 0/2] AF_PACKET fanout support
  2011-07-06  1:20   ` David Miller
@ 2011-07-06  3:13     ` Tom Herbert
  2011-07-06  3:19       ` David Miller
  0 siblings, 1 reply; 8+ messages in thread
From: Tom Herbert @ 2011-07-06  3:13 UTC (permalink / raw)
  To: David Miller; +Cc: victor, netdev, willemb

>> Also, another useful mode of steering would be to steer packets to a
>> socket which was recently processed by a thread running on the same
>> CPU; somewhat analogous to RFS (cc'ed WIllem Bruijn who is already
>> working on this I believe).
>
> This sounds like a good way to overload a local socket and prevent
> pushing the work to lesser used sockets on other cpus.
>
Sure, if you're not using RPS or RSS!  These should already be
distributing the RX work amongst CPUs.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 0/2] AF_PACKET fanout support
  2011-07-06  3:13     ` Tom Herbert
@ 2011-07-06  3:19       ` David Miller
  2011-07-06  4:07         ` Eric Dumazet
  0 siblings, 1 reply; 8+ messages in thread
From: David Miller @ 2011-07-06  3:19 UTC (permalink / raw)
  To: therbert; +Cc: victor, netdev, willemb

From: Tom Herbert <therbert@google.com>
Date: Tue, 5 Jul 2011 20:13:27 -0700

>>> Also, another useful mode of steering would be to steer packets to a
>>> socket which was recently processed by a thread running on the same
>>> CPU; somewhat analogous to RFS (cc'ed WIllem Bruijn who is already
>>> working on this I believe).
>>
>> This sounds like a good way to overload a local socket and prevent
>> pushing the work to lesser used sockets on other cpus.
>>
> Sure, it you're not using RPS or RSS!  These should already be
> distributing the RX work amongst CPUs.

One idea I did have while working on the PACKET_FANOUT bits was
to allow a packet socket to be bound to a particular cpu.  And
to implement this we'd have a per-cpu list of packet_type taps.

But in order for the user to make sure he gets all the traffic,
he'd have to make sure he bound one AF_PACKET socket to every
online cpu and then listened for all cpu hotplug events.

It doesn't really work.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 0/2] AF_PACKET fanout support
  2011-07-06  3:19       ` David Miller
@ 2011-07-06  4:07         ` Eric Dumazet
  2011-07-06  6:44           ` David Miller
  0 siblings, 1 reply; 8+ messages in thread
From: Eric Dumazet @ 2011-07-06  4:07 UTC (permalink / raw)
  To: David Miller; +Cc: therbert, victor, netdev, willemb

Le mardi 05 juillet 2011 à 20:19 -0700, David Miller a écrit :
> From: Tom Herbert <therbert@google.com>
> Date: Tue, 5 Jul 2011 20:13:27 -0700
> 
> >>> Also, another useful mode of steering would be to steer packets to a
> >>> socket which was recently processed by a thread running on the same
> >>> CPU; somewhat analogous to RFS (cc'ed WIllem Bruijn who is already
> >>> working on this I believe).
> >>
> >> This sounds like a good way to overload a local socket and prevent
> >> pushing the work to lesser used sockets on other cpus.
> >>
> > Sure, it you're not using RPS or RSS!  These should already be
> > distributing the RX work amongst CPUs.
> 
> One idea I did have while working on the PACKET_FANOUT bits was
> to allow a packet socket to be bound to a particular cpu.  And
> to implement this we'd have a per-cpu list of packet_type taps.
> 
> But in order for the user to make sure he gets all the traffic,
> he'd have to make sure he bound one AF_PACKET socket to every
> online cpu and then listened for all cpu hotplug events.
> 
> It doesn't really work.

It is working right now if you don't have too many cpus, adding as many
sockets as possible cpus, and a convenient BPF filter (matching CPU X) per
packet socket. Of course, if a cpu is offlined, the corresponding socket
won't receive any packet.

Currently, with a multiqueue NIC, the two policies you have might be in
conflict with NIC flow distribution among its queues.

In the end, a lot of different cpus will access all the sockets.

I suspect this can be solved adding a third policy : hash by CPU only





^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 0/2] AF_PACKET fanout support
  2011-07-06  4:07         ` Eric Dumazet
@ 2011-07-06  6:44           ` David Miller
  2011-07-06  6:55             ` David Miller
  0 siblings, 1 reply; 8+ messages in thread
From: David Miller @ 2011-07-06  6:44 UTC (permalink / raw)
  To: eric.dumazet; +Cc: therbert, victor, netdev, willemb

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 06 Jul 2011 06:07:46 +0200

> I suspect this can be solved adding a third policy : hash by CPU only

Agreed, I'll implement this policy.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 0/2] AF_PACKET fanout support
  2011-07-06  6:44           ` David Miller
@ 2011-07-06  6:55             ` David Miller
  0 siblings, 0 replies; 8+ messages in thread
From: David Miller @ 2011-07-06  6:55 UTC (permalink / raw)
  To: eric.dumazet; +Cc: therbert, victor, netdev, willemb

From: David Miller <davem@davemloft.net>
Date: Tue, 05 Jul 2011 23:44:24 -0700 (PDT)

> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Wed, 06 Jul 2011 06:07:46 +0200
> 
>> I suspect this can be solved adding a third policy : hash by CPU only
> 
> Agreed, I'll implement this policy.

packet: Add 'cpu' fanout policy.

Unfortunately we have to use a real modulus here as
the multiply trick won't work as effectively with cpu
numbers as it does with rxhash values.

Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
index 84e684e..c148606 100644
--- a/include/linux/if_packet.h
+++ b/include/linux/if_packet.h
@@ -53,6 +53,7 @@ struct sockaddr_ll {
 
 #define PACKET_FANOUT_HASH		0
 #define PACKET_FANOUT_LB		1
+#define PACKET_FANOUT_CPU		2
 #define PACKET_FANOUT_FLAG_DEFRAG	0x8000
 
 struct tpacket_stats {
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 7ba6871..41f0489 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -447,6 +447,13 @@ static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb
 	return f->arr[cur];
 }
 
+static struct sock *fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
+{
+	unsigned int cpu = smp_processor_id();
+
+	return f->arr[cpu % num];
+}
+
 static struct sk_buff *fanout_check_defrag(struct sk_buff *skb)
 {
 	const struct iphdr *iph;
@@ -482,8 +489,8 @@ static struct sk_buff *fanout_check_defrag(struct sk_buff *skb)
 	return skb;
 }
 
-static int packet_rcv_fanout_hash(struct sk_buff *skb, struct net_device *dev,
-				  struct packet_type *pt, struct net_device *orig_dev)
+static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
+			     struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct packet_fanout *f = pt->af_packet_priv;
 	unsigned int num = f->num_members;
@@ -496,35 +503,25 @@ static int packet_rcv_fanout_hash(struct sk_buff *skb, struct net_device *dev,
 		return 0;
 	}
 
-	if (f->defrag) {
-		skb = fanout_check_defrag(skb);
-		if (!skb)
-			return 0;
-	}
-
-	skb_get_rxhash(skb);
-
-	sk = fanout_demux_hash(f, skb, num);
-	po = pkt_sk(sk);
-
-	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
-}
-
-static int packet_rcv_fanout_lb(struct sk_buff *skb, struct net_device *dev,
-				struct packet_type *pt, struct net_device *orig_dev)
-{
-	struct packet_fanout *f = pt->af_packet_priv;
-	unsigned int num = f->num_members;
-	struct packet_sock *po;
-	struct sock *sk;
-
-	if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
-	    !num) {
-		kfree_skb(skb);
-		return 0;
+	switch (f->type) {
+	case PACKET_FANOUT_HASH:
+	default:
+		if (f->defrag) {
+			skb = fanout_check_defrag(skb);
+			if (!skb)
+				return 0;
+		}
+		skb_get_rxhash(skb);
+		sk = fanout_demux_hash(f, skb, num);
+		break;
+	case PACKET_FANOUT_LB:
+		sk = fanout_demux_lb(f, skb, num);
+		break;
+	case PACKET_FANOUT_CPU:
+		sk = fanout_demux_cpu(f, skb, num);
+		break;
 	}
 
-	sk = fanout_demux_lb(f, skb, num);
 	po = pkt_sk(sk);
 
 	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
@@ -571,6 +568,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 	switch (type) {
 	case PACKET_FANOUT_HASH:
 	case PACKET_FANOUT_LB:
+	case PACKET_FANOUT_CPU:
 		break;
 	default:
 		return -EINVAL;
@@ -606,14 +604,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 			atomic_set(&match->sk_ref, 0);
 			match->prot_hook.type = po->prot_hook.type;
 			match->prot_hook.dev = po->prot_hook.dev;
-			switch (type) {
-			case PACKET_FANOUT_HASH:
-				match->prot_hook.func = packet_rcv_fanout_hash;
-				break;
-			case PACKET_FANOUT_LB:
-				match->prot_hook.func = packet_rcv_fanout_lb;
-				break;
-			}
+			match->prot_hook.func = packet_rcv_fanout;
 			match->prot_hook.af_packet_priv = match;
 			dev_add_pack(&match->prot_hook);
 			list_add(&match->list, &fanout_list);

^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2011-07-06  6:55 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-07-05  4:20 [PATCH 0/2] AF_PACKET fanout support David Miller
2011-07-06  0:46 ` Tom Herbert
2011-07-06  1:20   ` David Miller
2011-07-06  3:13     ` Tom Herbert
2011-07-06  3:19       ` David Miller
2011-07-06  4:07         ` Eric Dumazet
2011-07-06  6:44           ` David Miller
2011-07-06  6:55             ` David Miller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).