[PATCH nf-next] netfilter: nfnetlink_queue: zero copy support

netfilter-devel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH nf-next] netfilter: nfnetlink_queue: zero copy support
@ 2013-03-18  3:15 Eric Dumazet
  2013-03-18  9:24 ` Florian Westphal
  2013-03-19 22:52 ` Pablo Neira Ayuso
  0 siblings, 2 replies; 5+ messages in thread
From: Eric Dumazet @ 2013-03-18  3:15 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netdev, Netfilter Developer Mailing List

From: Eric Dumazet <edumazet@google.com>

nfqnl_build_packet_message() currently copies the packet
into the netlink message, while it could instead use
zero copy.

Make sure the skb 'copy' is the last component of the
cooked netlink message, as we can't add anything after it.

Patch cooked in Copenhagen at Netfilter Workshop ;)

Still to be addressed in separate patches :

-GRO/GSO packets are segmented in nf_queue()
and checksummed in nfqnl_build_packet_message().

Proper support for GSO/GRO packets (no segmentation,
and no checksumming) needs application cooperation, if we
want no regressions.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/netfilter/nfnetlink_queue_core.c |   94 +++++++++++++++++++------
 1 file changed, 72 insertions(+), 22 deletions(-)

diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index 858fd52..388fb8ba3 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -217,14 +217,59 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
 	spin_unlock_bh(&queue->lock);
 }
 
+static void
+nfqnl_zcopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen)
+{
+	int i, j = 0;
+	int plen = 0; /* length of skb->head fragment */
+	struct page *page;
+	unsigned int offset;
+
+	/* dont bother with small payloads */
+	if (len <= skb_tailroom(to)) {
+		skb_copy_bits(from, 0, skb_put(to, len), len);
+		return;
+	}
+
+	if (hlen) {
+		skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
+		len -= hlen;
+	} else {
+		plen = min_t(int, skb_headlen(from), len);
+		if (plen) {
+			page = virt_to_head_page(from->head);
+			offset = from->data - (unsigned char *)page_address(page);
+			__skb_fill_page_desc(to, 0, page, offset, plen);
+			get_page(page);
+			j = 1;
+			len -= plen;
+		}
+	}
+
+	to->truesize += len + plen;
+	to->len += len + plen;
+	to->data_len += len + plen;
+
+	for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
+		if (!len)
+			break;
+		skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
+		skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len);
+		len -= skb_shinfo(to)->frags[j].size;
+		skb_frag_ref(to, j);
+		j++;
+	}
+	skb_shinfo(to)->nr_frags = j;
+}
+
 static struct sk_buff *
 nfqnl_build_packet_message(struct nfqnl_instance *queue,
 			   struct nf_queue_entry *entry,
 			   __be32 **packet_id_ptr)
 {
-	sk_buff_data_t old_tail;
 	size_t size;
 	size_t data_len = 0, cap_len = 0;
+	int hlen = 0;
 	struct sk_buff *skb;
 	struct nlattr *nla;
 	struct nfqnl_msg_packet_hdr *pmsg;
@@ -246,8 +291,10 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 #endif
 		+ nla_total_size(sizeof(u_int32_t))	/* mark */
 		+ nla_total_size(sizeof(struct nfqnl_msg_packet_hw))
-		+ nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)
-		+ nla_total_size(sizeof(u_int32_t)));	/* cap_len */
+		+ nla_total_size(sizeof(u_int32_t));	/* cap_len */
+
+	if (entskb->tstamp.tv64)
+		size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp));
 
 	outdev = entry->outdev;
 
@@ -265,7 +312,16 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 		if (data_len == 0 || data_len > entskb->len)
 			data_len = entskb->len;
 
-		size += nla_total_size(data_len);
+
+		if (!entskb->head_frag ||
+		    skb_headlen(entskb) < L1_CACHE_BYTES ||
+		    skb_shinfo(entskb)->nr_frags >= MAX_SKB_FRAGS)
+			hlen = skb_headlen(entskb);
+
+		if (skb_has_frag_list(entskb))
+			hlen = entskb->len;
+		hlen = min_t(int, data_len, hlen);
+		size += sizeof(struct nlattr) + hlen;
 		cap_len = entskb->len;
 		break;
 	}
@@ -277,7 +333,6 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 	if (!skb)
 		return NULL;
 
-	old_tail = skb->tail;
 	nlh = nlmsg_put(skb, 0, 0,
 			NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET,
 			sizeof(struct nfgenmsg), 0);
@@ -382,31 +437,26 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 			goto nla_put_failure;
 	}
 
+	if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0)
+		goto nla_put_failure;
+
+	if (cap_len > 0 && nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len)))
+		goto nla_put_failure;
+
 	if (data_len) {
 		struct nlattr *nla;
-		int sz = nla_attr_size(data_len);
 
-		if (skb_tailroom(skb) < nla_total_size(data_len)) {
-			printk(KERN_WARNING "nf_queue: no tailroom!\n");
-			kfree_skb(skb);
-			return NULL;
-		}
+		if (skb_tailroom(skb) < sizeof(*nla) + hlen)
+			goto nla_put_failure;
 
-		nla = (struct nlattr *)skb_put(skb, nla_total_size(data_len));
+		nla = (struct nlattr *)skb_put(skb, sizeof(*nla));
 		nla->nla_type = NFQA_PAYLOAD;
-		nla->nla_len = sz;
+		nla->nla_len = nla_attr_size(data_len);
 
-		if (skb_copy_bits(entskb, 0, nla_data(nla), data_len))
-			BUG();
+		nfqnl_zcopy(skb, entskb, data_len, hlen);
 	}
 
-	if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0)
-		goto nla_put_failure;
-
-	if (cap_len > 0 && nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len)))
-		goto nla_put_failure;
-
-	nlh->nlmsg_len = skb->tail - old_tail;
+	nlh->nlmsg_len = skb->len;
 	return skb;
 
 nla_put_failure:

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH nf-next] netfilter: nfnetlink_queue: zero copy support
  2013-03-18  3:15 [PATCH nf-next] netfilter: nfnetlink_queue: zero copy support Eric Dumazet
@ 2013-03-18  9:24 ` Florian Westphal
  2013-03-18 13:51   ` Eric Dumazet
  2013-03-19 22:52 ` Pablo Neira Ayuso
  1 sibling, 1 reply; 5+ messages in thread
From: Florian Westphal @ 2013-03-18  9:24 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Pablo Neira Ayuso, netdev, Netfilter Developer Mailing List

Eric Dumazet <eric.dumazet@gmail.com> wrote:
> From: Eric Dumazet <edumazet@google.com>
> 
> nfqnl_build_packet_message() actually copy the packet
> inside the netlink message, while it can instead use
> zero copy.
> 
> Make sure the skb 'copy' is the last component of the
> cooked netlink message, as we cant add anything after it.
> 
> Patch cooked in Copenhagen at Netfilter Workshop ;)

This is awesome.
Was there a consensus wrt. mmap'd netlink vs. your patch?

[ I ask because both get rid of one skb data copy ]

> Still to be addressed in separate patches :
>
> -GRO/GSO packets are segmented in nf_queue()
> and checksummed in nfqnl_build_packet_message().
> Proper support for GSO/GRO packets (no segmentation,
> and no checksumming) needs application cooperation, if we
> want no regressions.

Since ipqueue is gone we might be able to push the segmentation
down to nfnetlink_queue.  Then new userspace applications
could indicate a 'I won't verify checksums and will handle huge
packets'.

Are you working on something like this?

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH nf-next] netfilter: nfnetlink_queue: zero copy support
  2013-03-18  9:24 ` Florian Westphal
@ 2013-03-18 13:51   ` Eric Dumazet
  2013-03-18 15:36     ` Pablo Neira Ayuso
  0 siblings, 1 reply; 5+ messages in thread
From: Eric Dumazet @ 2013-03-18 13:51 UTC (permalink / raw)
  To: Florian Westphal
  Cc: Pablo Neira Ayuso, netdev, Netfilter Developer Mailing List

On Mon, 2013-03-18 at 10:24 +0100, Florian Westphal wrote:
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> > -GRO/GSO packets are segmented in nf_queue()
> > and checksummed in nfqnl_build_packet_message().
> > Proper support for GSO/GRO packets (no segmentation,
> > and no checksumming) needs application cooperation, if we
> > want no regressions.
> 
> Since ipqueue is gone we might be able to push the segmentation
> down to nfnetlink_queue.  Then new userspace applications
> could indicate a 'I won't verify checksums and will handle huge
> packets'.
> 
> Are you working on something like this?

I validated that it was only an API concern, by commenting out the code,
and got 20Gbps (link speed) using the sample program (using a bigger
buffer to receive the skbs and removing the printf() for each packet)

Pablo followed the experiments and I believe he has an idea of the
needed API. Anyway, after one week at NFWS, I won't have the time to do
it; I have a huge backlog...

Note that it's not truly zero copy:

Before the patch we had 2 copies. (kernel->kernel done in softirq
context, and kernel->user in process context)

After the patch we have only one copy, from kernel to user land, done
in process context.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH nf-next] netfilter: nfnetlink_queue: zero copy support
  2013-03-18 13:51   ` Eric Dumazet
@ 2013-03-18 15:36     ` Pablo Neira Ayuso
  0 siblings, 0 replies; 5+ messages in thread
From: Pablo Neira Ayuso @ 2013-03-18 15:36 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Florian Westphal, netdev, Netfilter Developer Mailing List

On Mon, Mar 18, 2013 at 06:51:19AM -0700, Eric Dumazet wrote:
> On Mon, 2013-03-18 at 10:24 +0100, Florian Westphal wrote:
> > Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > >
> > > -GRO/GSO packets are segmented in nf_queue()
> > > and checksummed in nfqnl_build_packet_message().
> > > Proper support for GSO/GRO packets (no segmentation,
> > > and no checksumming) needs application cooperation, if we
> > > want no regressions.
> > 
> > Since ipqueue is gone we might be able to push the segmentation
> > down to nfnetlink_queue.  Then new userspace applications
> > could indicate a 'I won't verify checksums and will handle huge
> > packets'.
> > 
> > Are you working on something like this?
> 
> I validated that it was only an API concern, by commenting out the code,
> and got 20Gbps (link speed) using the sample program (using a bigger
> buffer to receive the skbs and removing the printf() for each packet)
> 
> Pablo followed the experiments and I believe he has an idea of the
> needed API.

Will take over this. Florian, ping me if interested in helping.

Thanks a lot for the patch and ideas Eric!

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH nf-next] netfilter: nfnetlink_queue: zero copy support
  2013-03-18  3:15 [PATCH nf-next] netfilter: nfnetlink_queue: zero copy support Eric Dumazet
  2013-03-18  9:24 ` Florian Westphal
@ 2013-03-19 22:52 ` Pablo Neira Ayuso
  1 sibling, 0 replies; 5+ messages in thread
From: Pablo Neira Ayuso @ 2013-03-19 22:52 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Netfilter Developer Mailing List

On Sun, Mar 17, 2013 at 08:15:55PM -0700, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
> 
> nfqnl_build_packet_message() actually copy the packet
> inside the netlink message, while it can instead use
> zero copy.
> 
> Make sure the skb 'copy' is the last component of the
> cooked netlink message, as we cant add anything after it.
> 
> Patch cooked in Copenhagen at Netfilter Workshop ;)
> 
> Still to be addressed in separate patches :
> 
> -GRO/GSO packets are segmented in nf_queue()
> and checksummed in nfqnl_build_packet_message().
> 
> Proper support for GSO/GRO packets (no segmentation,
> and no checksumming) needs application cooperation, if we
> want no regressions.

Applied, thanks.

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2013-03-19 22:52 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-03-18  3:15 [PATCH nf-next] netfilter: nfnetlink_queue: zero copy support Eric Dumazet
2013-03-18  9:24 ` Florian Westphal
2013-03-18 13:51   ` Eric Dumazet
2013-03-18 15:36     ` Pablo Neira Ayuso
2013-03-19 22:52 ` Pablo Neira Ayuso

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).