Netdev List
 help / color / mirror / Atom feed
From: Eric Dumazet <eric.dumazet@gmail.com>
To: "Maciej Żenczykowski" <maze@google.com>
Cc: David Miller <davem@davemloft.net>,
	ilpo.jarvinen@helsinki.fi, rick.jones2@hp.com,
	netdev@vger.kernel.org, therbert@google.com,
	ncardwell@google.com, ycheng@google.com
Subject: Re: [RFC] allow skb->head to point/alias to first skb frag
Date: Thu, 26 Apr 2012 15:50:52 +0200	[thread overview]
Message-ID: <1335448252.2775.41.camel@edumazet-glaptop> (raw)
In-Reply-To: <1335443554.2775.33.camel@edumazet-glaptop>

On Thu, 2012-04-26 at 14:32 +0200, Eric Dumazet wrote:
> On Thu, 2012-04-26 at 03:09 -0700, Maciej Żenczykowski wrote:
> > it would be very useful if there was an api for freeing skb->head memory...
> > 
> > I've fooled around with a single memory buffer for both the head and
> > the data and seen huge performance wins under heavy packet load (half
> > the amount of malloc/free), but could never quite get it to be 100%
> > stable.
> 
> My patch is almost ready, and seems very good.
> 
> I added a param to skb_build() to tell the memory comes from a (sub)page
> 
> tg3 uses the new thing.
> 
> GRO does the correct merging.
> 
> (by the way, I noticed GRO lies about skb truesize... since it
> accumulates frag lengths instead of allocated space... with a 1448/2048
> mismatch for MTU=1500)
> 
> And this apparently solves my slub performance issue I had on my dual
> quad core machine, since I no longer hit slub slow path.
> 
> 

Here is the raw patch. (tg3 part is dirty and needs some helpers)

I am working on the TCP coalesce part, the splice() helpers (to avoid a
copy), and split in five different patches for further
discussion/inclusion.


 drivers/net/ethernet/broadcom/bnx2.c            |    2 
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c |    4 
 drivers/net/ethernet/broadcom/tg3.c             |   61 +++++++++++---
 drivers/net/ethernet/broadcom/tg3.h             |    2 
 include/linux/skbuff.h                          |    7 +
 net/core/dev.c                                  |    5 -
 net/core/skbuff.c                               |   46 ++++++++--
 net/ipv4/tcp_input.c                            |    1 
 8 files changed, 103 insertions(+), 25 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c
index ab55979..ac7b744 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -3006,7 +3006,7 @@ error:
 
 	dma_unmap_single(&bp->pdev->dev, dma_addr, bp->rx_buf_use_size,
 			 PCI_DMA_FROMDEVICE);
-	skb = build_skb(data);
+	skb = build_skb(data, 0);
 	if (!skb) {
 		kfree(data);
 		goto error;
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 60d5b54..9c5bc5d 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -513,7 +513,7 @@ static inline void bnx2x_tpa_stop(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 	dma_unmap_single(&bp->pdev->dev, dma_unmap_addr(rx_buf, mapping),
 			 fp->rx_buf_size, DMA_FROM_DEVICE);
 	if (likely(new_data))
-		skb = build_skb(data);
+		skb = build_skb(data, 0);
 
 	if (likely(skb)) {
 #ifdef BNX2X_STOP_ON_ERROR
@@ -721,7 +721,7 @@ int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
 						 dma_unmap_addr(rx_buf, mapping),
 						 fp->rx_buf_size,
 						 DMA_FROM_DEVICE);
-				skb = build_skb(data);
+				skb = build_skb(data, 0);
 				if (unlikely(!skb)) {
 					kfree(data);
 					fp->eth_q_stats.rx_skb_alloc_failed++;
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 0c3e7c7..d216076 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -5619,12 +5619,18 @@ static void tg3_tx(struct tg3_napi *tnapi)
 
 static void tg3_rx_data_free(struct tg3 *tp, struct ring_info *ri, u32 map_sz)
 {
+	unsigned int skb_size = SKB_DATA_ALIGN(map_sz + TG3_RX_OFFSET(tp)) +
+		   SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
 	if (!ri->data)
 		return;
 
 	pci_unmap_single(tp->pdev, dma_unmap_addr(ri, mapping),
 			 map_sz, PCI_DMA_FROMDEVICE);
-	kfree(ri->data);
+	if (skb_size <= 2048)
+		put_page(virt_to_head_page(ri->data));
+	else
+		kfree(ri->data);
 	ri->data = NULL;
 }
 
@@ -5640,7 +5646,8 @@ static void tg3_rx_data_free(struct tg3 *tp, struct ring_info *ri, u32 map_sz)
  * (to fetch the error flags, vlan tag, checksum, and opaque cookie).
  */
 static int tg3_alloc_rx_data(struct tg3 *tp, struct tg3_rx_prodring_set *tpr,
-			    u32 opaque_key, u32 dest_idx_unmasked)
+			     u32 opaque_key, u32 dest_idx_unmasked,
+			     unsigned int *frag_size)
 {
 	struct tg3_rx_buffer_desc *desc;
 	struct ring_info *map;
@@ -5675,16 +5682,36 @@ static int tg3_alloc_rx_data(struct tg3 *tp, struct tg3_rx_prodring_set *tpr,
 	 */
 	skb_size = SKB_DATA_ALIGN(data_size + TG3_RX_OFFSET(tp)) +
 		   SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	data = kmalloc(skb_size, GFP_ATOMIC);
-	if (!data)
-		return -ENOMEM;
-
+	if (skb_size <= 2048) {
+		if (tpr->rx_page_size < 2048) {
+			struct page *page = alloc_page(GFP_ATOMIC);
+			if (!page)
+				return -ENOMEM;
+			atomic_add((PAGE_SIZE / 2048) - 1, &page->_count); 
+			tpr->rx_page_addr = page_address(page);
+			tpr->rx_page_size = PAGE_SIZE;
+		}
+		data = tpr->rx_page_addr;
+		tpr->rx_page_addr += 2048;
+		tpr->rx_page_size -= 2048;
+		*frag_size = 2048;
+	} else {
+		data = kmalloc(skb_size, GFP_ATOMIC);
+		if (!data)
+			return -ENOMEM;
+		*frag_size = 0;
+	}
 	mapping = pci_map_single(tp->pdev,
 				 data + TG3_RX_OFFSET(tp),
 				 data_size,
 				 PCI_DMA_FROMDEVICE);
 	if (pci_dma_mapping_error(tp->pdev, mapping)) {
-		kfree(data);
+		if (skb_size <= 2048) {
+			tpr->rx_page_addr -= 2048;
+			tpr->rx_page_size += 2048;
+		} else {
+			kfree(data);
+		}
 		return -EIO;
 	}
 
@@ -5835,18 +5862,22 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget)
 
 		if (len > TG3_RX_COPY_THRESH(tp)) {
 			int skb_size;
+			unsigned int frag_size;
 
 			skb_size = tg3_alloc_rx_data(tp, tpr, opaque_key,
-						    *post_ptr);
+						    *post_ptr, &frag_size);
 			if (skb_size < 0)
 				goto drop_it;
 
 			pci_unmap_single(tp->pdev, dma_addr, skb_size,
 					 PCI_DMA_FROMDEVICE);
 
-			skb = build_skb(data);
+			skb = build_skb(data, frag_size);
 			if (!skb) {
-				kfree(data);
+				if (frag_size)
+					put_page(virt_to_head_page(data));
+				else
+					kfree(data);
 				goto drop_it_no_recycle;
 			}
 			skb_reserve(skb, TG3_RX_OFFSET(tp));
@@ -7279,7 +7310,10 @@ static int tg3_rx_prodring_alloc(struct tg3 *tp,
 
 	/* Now allocate fresh SKBs for each rx ring. */
 	for (i = 0; i < tp->rx_pending; i++) {
-		if (tg3_alloc_rx_data(tp, tpr, RXD_OPAQUE_RING_STD, i) < 0) {
+		unsigned int frag_size;
+
+		if (tg3_alloc_rx_data(tp, tpr, RXD_OPAQUE_RING_STD, i,
+				      &frag_size) < 0) {
 			netdev_warn(tp->dev,
 				    "Using a smaller RX standard ring. Only "
 				    "%d out of %d buffers were allocated "
@@ -7311,7 +7345,10 @@ static int tg3_rx_prodring_alloc(struct tg3 *tp,
 	}
 
 	for (i = 0; i < tp->rx_jumbo_pending; i++) {
-		if (tg3_alloc_rx_data(tp, tpr, RXD_OPAQUE_RING_JUMBO, i) < 0) {
+		unsigned int frag_size;
+
+		if (tg3_alloc_rx_data(tp, tpr, RXD_OPAQUE_RING_JUMBO, i,
+				      &frag_size) < 0) {
 			netdev_warn(tp->dev,
 				    "Using a smaller RX jumbo ring. Only %d "
 				    "out of %d buffers were allocated "
diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h
index 93865f8..7c85545 100644
--- a/drivers/net/ethernet/broadcom/tg3.h
+++ b/drivers/net/ethernet/broadcom/tg3.h
@@ -2815,6 +2815,8 @@ struct tg3_rx_prodring_set {
 	struct ring_info		*rx_jmb_buffers;
 	dma_addr_t			rx_std_mapping;
 	dma_addr_t			rx_jmb_mapping;
+	void				*rx_page_addr;
+	unsigned int			rx_page_size;
 };
 
 #define TG3_IRQ_MAX_VECS_RSS		5
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4a656b5..1161111 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -470,7 +470,8 @@ struct sk_buff {
 	__u8			wifi_acked_valid:1;
 	__u8			wifi_acked:1;
 	__u8			no_fcs:1;
-	/* 9/11 bit hole (depending on ndisc_nodetype presence) */
+	__u8			head_frag:1;
+	/* 8/10 bit hole (depending on ndisc_nodetype presence) */
 	kmemcheck_bitfield_end(flags2);
 
 #ifdef CONFIG_NET_DMA
@@ -558,11 +559,13 @@ static inline struct rtable *skb_rtable(const struct sk_buff *skb)
 }
 
 extern void kfree_skb(struct sk_buff *skb);
+extern struct kmem_cache *skbuff_head_cache;
+
 extern void consume_skb(struct sk_buff *skb);
 extern void	       __kfree_skb(struct sk_buff *skb);
 extern struct sk_buff *__alloc_skb(unsigned int size,
 				   gfp_t priority, int fclone, int node);
-extern struct sk_buff *build_skb(void *data);
+extern struct sk_buff *build_skb(void *data, unsigned int frag_size);
 static inline struct sk_buff *alloc_skb(unsigned int size,
 					gfp_t priority)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index 501f3cc..9537321 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3546,7 +3546,10 @@ gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 		break;
 
 	case GRO_MERGED_FREE:
-		consume_skb(skb);
+		if (skb->head_frag)
+			kmem_cache_free(skbuff_head_cache, skb);
+		else
+			consume_skb(skb);
 		break;
 
 	case GRO_HELD:
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2342a72..d8fb4b2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -69,7 +69,7 @@
 #include <trace/events/skb.h>
 #include <linux/highmem.h>
 
-static struct kmem_cache *skbuff_head_cache __read_mostly;
+struct kmem_cache *skbuff_head_cache __read_mostly;
 static struct kmem_cache *skbuff_fclone_cache __read_mostly;
 
 static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
@@ -245,6 +245,7 @@ EXPORT_SYMBOL(__alloc_skb);
 /**
  * build_skb - build a network buffer
  * @data: data buffer provided by caller
+ * @frag_size: size of fragment, or 0 if head was kmalloced
  *
  * Allocate a new &sk_buff. Caller provides space holding head and
  * skb_shared_info. @data must have been allocated by kmalloc()
@@ -258,20 +259,21 @@ EXPORT_SYMBOL(__alloc_skb);
  *  before giving packet to stack.
  *  RX rings only contains data buffers, not full skbs.
  */
-struct sk_buff *build_skb(void *data)
+struct sk_buff *build_skb(void *data, unsigned int frag_size)
 {
 	struct skb_shared_info *shinfo;
 	struct sk_buff *skb;
-	unsigned int size;
+	unsigned int size = frag_size ? : ksize(data);
 
 	skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
 	if (!skb)
 		return NULL;
 
-	size = ksize(data) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
 	memset(skb, 0, offsetof(struct sk_buff, tail));
 	skb->truesize = SKB_TRUESIZE(size);
+	skb->head_frag = frag_size != 0;
 	atomic_set(&skb->users, 1);
 	skb->head = data;
 	skb->data = data;
@@ -376,6 +378,14 @@ static void skb_clone_fraglist(struct sk_buff *skb)
 		skb_get(list);
 }
 
+static void skb_free_head(struct sk_buff *skb)
+{
+	if (skb->head_frag)
+		put_page(virt_to_head_page(skb->head));
+	else
+		kfree(skb->head);
+}
+
 static void skb_release_data(struct sk_buff *skb)
 {
 	if (!skb->cloned ||
@@ -402,7 +412,7 @@ static void skb_release_data(struct sk_buff *skb)
 		if (skb_has_frag_list(skb))
 			skb_drop_fraglist(skb);
 
-		kfree(skb->head);
+		skb_free_head(skb);
 	}
 }
 
@@ -644,6 +654,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
 	C(tail);
 	C(end);
 	C(head);
+	C(head_frag);
 	C(data);
 	C(truesize);
 	atomic_set(&n->users, 1);
@@ -940,7 +951,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta;
 	}
 
-	if (fastpath &&
+	if (fastpath && !skb->head_frag &&
 	    size + sizeof(struct skb_shared_info) <= ksize(skb->head)) {
 		memmove(skb->head + size, skb_shinfo(skb),
 			offsetof(struct skb_shared_info,
@@ -967,7 +978,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
 
 	if (fastpath) {
-		kfree(skb->head);
+		skb_free_head(skb);
 	} else {
 		/* copy this zero copy skb frags */
 		if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
@@ -985,6 +996,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	off = (data + nhead) - skb->head;
 
 	skb->head     = data;
+	skb->head_frag = 0;
 adjust_others:
 	skb->data    += off;
 #ifdef NET_SKBUFF_DATA_USES_OFFSET
@@ -2889,6 +2901,26 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 
 		NAPI_GRO_CB(skb)->free = 1;
 		goto done;
+	} else if (skb->head_frag) {
+		int nr_frags = pinfo->nr_frags;
+		skb_frag_t *frag = pinfo->frags + nr_frags;
+		struct page *page = virt_to_head_page(skb->head);
+
+
+		if (nr_frags >= MAX_SKB_FRAGS)
+			return -E2BIG;
+		offset -= headlen;
+		offset += skb->head - (unsigned char *)page_address(page);
+
+		pinfo->nr_frags = nr_frags + 1;
+
+		frag->page.p	  = page;
+		frag->page_offset = offset;
+		skb_frag_size_set(frag, len);
+
+		/* note : napi_skb_finish() will free skb, not skb->head */
+		NAPI_GRO_CB(skb)->free = 1;
+		goto done;
 	} else if (skb_gro_len(p) != pinfo->gso_size)
 		return -E2BIG;
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c93b0cb..69d379a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4478,6 +4478,7 @@ merge:
 		TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
 		return true;
 	}
+	/* TODO : handle special case where from->frag_head is true */
 	if (skb_headlen(from) == 0 &&
 	    !skb_has_frag_list(to) &&
 	    !skb_has_frag_list(from) &&

  reply	other threads:[~2012-04-26 13:50 UTC|newest]

Thread overview: 37+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-04-23  9:38 [PATCH 2/2 net-next] tcp: sk_add_backlog() is too agressive for TCP Eric Dumazet
2012-04-23 17:14 ` Rick Jones
2012-04-23 17:23   ` Eric Dumazet
2012-04-23 20:01     ` David Miller
2012-04-23 20:37       ` Eric Dumazet
2012-04-23 20:57         ` Rick Jones
2012-04-23 21:30           ` Eric Dumazet
2012-04-23 21:51             ` Rick Jones
2012-04-23 21:56               ` Rick Jones
2012-04-23 22:05               ` Eric Dumazet
2012-04-23 22:16                 ` Rick Jones
2012-04-24 15:25                   ` Christoph Lameter
2012-04-23 21:01         ` David Miller
2012-04-23 21:38           ` Eric Dumazet
2012-04-24  2:20         ` Eric Dumazet
2012-04-24  2:27           ` David Miller
2012-04-24  8:01           ` Ilpo Järvinen
2012-04-24  8:10             ` David Miller
2012-04-24  8:21               ` Ilpo Järvinen
2012-04-24  8:25                 ` David Miller
2012-04-24  8:40                   ` Ilpo Järvinen
2012-04-24  8:48                     ` David Miller
2012-04-24 10:32                       ` Ilpo Järvinen
2012-04-24  8:56                     ` Eric Dumazet
2012-04-26  8:10                       ` [RFC] allow skb->head to point/alias to first skb frag Eric Dumazet
2012-04-26  8:36                         ` David Miller
2012-04-26  9:10                           ` Eric Dumazet
2012-04-26  9:18                             ` David Miller
2012-04-26  9:22                               ` Eric Dumazet
2012-04-26 10:09                                 ` Maciej Żenczykowski
2012-04-26 12:32                                   ` Eric Dumazet
2012-04-26 13:50                                     ` Eric Dumazet [this message]
2012-04-26 20:12                                       ` David Miller
2012-04-26 20:18                                         ` Eric Dumazet
2012-04-24  8:49             ` [PATCH 2/2 net-next] tcp: sk_add_backlog() is too agressive for TCP Eric Dumazet
2012-04-24  8:44         ` David Laight
2012-04-24  8:53           ` Eric Dumazet

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1335448252.2775.41.camel@edumazet-glaptop \
    --to=eric.dumazet@gmail.com \
    --cc=davem@davemloft.net \
    --cc=ilpo.jarvinen@helsinki.fi \
    --cc=maze@google.com \
    --cc=ncardwell@google.com \
    --cc=netdev@vger.kernel.org \
    --cc=rick.jones2@hp.com \
    --cc=therbert@google.com \
    --cc=ycheng@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox