netdev.vger.kernel.org archive mirror
* PATCH zero-copy send completion callback
@ 2006-10-16 17:25 Eric Barton
  0 siblings, 0 replies; 10+ messages in thread
From: Eric Barton @ 2006-10-16 17:25 UTC (permalink / raw)
  To: netdev


This patch has been used with the Lustre cluster file system (www.lustre.org)
to give notification when page buffers used to send bulk data via TCP/IP may be
overwritten.  It implements...

  a) A general-purpose callback to inform higher-level protocols when a
     zero-copy send of a set of pages has completed.

  b) tcp_sendpage_zccd(), a variation on tcp_sendpage() that includes a
     completion callback parameter.

How to use it ("you" are a higher-level protocol driver)...

  a) Initialise a zero-copy descriptor with your callback procedure.

  b) Pass this descriptor in all zero-copy sends for an arbitrary set of pages.
     Skbuffs that reference your pages also take a reference on your zero-copy
     callback descriptor.  They release this reference when they release their
     page references.

  c) Release your own reference when you've posted all your pages and you're
     ready for the callback.

  d) The callback occurs when the last reference is dropped.  (A minimal
     usage sketch follows below.)
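
For example, a minimal sketch of steps (a)..(d) (a hypothetical caller;
'my_bulk_send' and its fields are illustrative and not part of this patch;
assumes <linux/completion.h> and the declarations added below):

	struct my_bulk_send {
		struct zccd       zccd;
		struct completion done;
		/* ... page list and other send state ... */
	};

	static void my_bulk_send_done(struct zccd *d)
	{
		/* last skbuff reference gone: pages may be overwritten */
		struct my_bulk_send *bs =
			container_of(d, struct my_bulk_send, zccd);

		complete(&bs->done);
	}

	static int my_bulk_send_pages(struct socket *sock,
				      struct my_bulk_send *bs,
				      struct page **pages, int npages)
	{
		int i;

		init_completion(&bs->done);
		zccd_init(&bs->zccd, my_bulk_send_done);	/* step (a) */

		for (i = 0; i < npages; i++) {			/* step (b) */
			ssize_t rc = tcp_sendpage_zccd(sock, pages[i], 0,
						       PAGE_SIZE,
						       i < npages - 1 ?
							       MSG_MORE : 0,
						       &bs->zccd);
			if (rc < 0)
				break;	/* error handling elided */
		}

		zccd_decref(&bs->zccd);				/* step (c) */
		wait_for_completion(&bs->done);			/* step (d) */
		return 0;
	}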


This patch applies on branch 'master' of
git://kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6

================================================================================
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 85577a4..4afaef1 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -129,6 +129,36 @@ struct skb_frag_struct {
 	__u16 size;
 };
 
+/* Zero Copy Callback Descriptor
+ * This struct supports receiving notification when zero-copy network I/O has
+ * completed.  The ZCCD can be embedded in a struct containing the state of a
+ * zero-copy network send.  Every skbuff that references that send's pages also
+ * keeps a reference on the ZCCD.  When they have all been disposed of, the
+ * reference count on the ZCCD drops to zero and the callback is made, telling
+ * the original caller that the pages may now be overwritten. */
+struct zccd 
+{
+	atomic_t	 zccd_refcount;
+	void           (*zccd_callback)(struct zccd *); 
+};
+
+static inline void zccd_init (struct zccd *d, void (*callback)(struct zccd *))
+{
+	atomic_set (&d->zccd_refcount, 1);
+	d->zccd_callback = callback;
+}
+
+static inline void zccd_incref (struct zccd *d)	/* take a reference */
+{
+	atomic_inc (&d->zccd_refcount);
+}
+
+static inline void zccd_decref (struct zccd *d)	/* release a reference */
+{
+	if (atomic_dec_and_test (&d->zccd_refcount))
+		(d->zccd_callback)(d);
+}
+
 /* This data is invariant across clones and lives at
  * the end of the header data, ie. at skb->end.
  */
@@ -141,6 +171,11 @@ struct skb_shared_info {
 	unsigned short  gso_type;
 	unsigned int    ip6_frag_id;
 	struct sk_buff	*frag_list;
+	struct zccd     *zccd1;
+	struct zccd     *zccd2;
+	/* NB zero-copy data is normally whole pages.  We keep 2 zccds in an
+	 * skbuff so that pages from successive zero-copy sends can share a
+	 * packet without forcing an unnecessary packet boundary. */
 	skb_frag_t	frags[MAX_SKB_FRAGS];
 };
 
@@ -1311,6 +1346,23 @@ #ifdef CONFIG_HIGHMEM
 #endif
 }
 
+/* This skbuff has dropped its pages: drop refs on any zero-copy callback
+ * descriptors it has. */
+static inline void skb_complete_zccd (struct sk_buff *skb)
+{
+	struct skb_shared_info *info = skb_shinfo(skb);
+	
+	if (info->zccd1 != NULL) {
+		zccd_decref(info->zccd1);
+		info->zccd1 = NULL;
+	}
+
+	if (info->zccd2 != NULL) {
+		zccd_decref(info->zccd2);
+		info->zccd2 = NULL;
+	}
+}
+
 #define skb_queue_walk(queue, skb) \
 		for (skb = (queue)->next;					\
 		     prefetch(skb->next), (skb != (struct sk_buff *)(queue));	\
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7a093d0..e02b55f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -278,6 +278,8 @@ extern int		    	tcp_v4_tw_remember_stam
 extern int			tcp_sendmsg(struct kiocb *iocb, struct sock *sk,
 					    struct msghdr *msg, size_t size);
 extern ssize_t			tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
+extern ssize_t			tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
+						  int flags, struct zccd *zccd);
 
 extern int			tcp_ioctl(struct sock *sk, 
 					  int cmd, 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3c23760..a1d2ed0 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -177,6 +177,8 @@ struct sk_buff *__alloc_skb(unsigned int
 	shinfo->gso_type = 0;
 	shinfo->ip6_frag_id = 0;
 	shinfo->frag_list = NULL;
+	shinfo->zccd1 = NULL;
+	shinfo->zccd2 = NULL;
 
 	if (fclone) {
 		struct sk_buff *child = skb + 1;
@@ -242,6 +244,8 @@ struct sk_buff *alloc_skb_from_cache(kme
 	skb_shinfo(skb)->gso_segs = 0;
 	skb_shinfo(skb)->gso_type = 0;
 	skb_shinfo(skb)->frag_list = NULL;
+	skb_shinfo(skb)->zccd1 = NULL;
+	skb_shinfo(skb)->zccd2 = NULL;
 out:
 	return skb;
 nodata:
@@ -307,6 +311,9 @@ static void skb_release_data(struct sk_b
 	if (!skb->cloned ||
 	    !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
 			       &skb_shinfo(skb)->dataref)) {
+		/* complete zero-copy callbacks (if any) */
+		skb_complete_zccd(skb);
+
 		if (skb_shinfo(skb)->nr_frags) {
 			int i;
 			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
@@ -650,6 +657,18 @@ struct sk_buff *pskb_copy(struct sk_buff
 			get_page(skb_shinfo(n)->frags[i].page);
 		}
 		skb_shinfo(n)->nr_frags = i;
+
+		if (skb_shinfo(skb)->zccd1 != NULL) {
+			BUG_TRAP(skb_shinfo(n)->zccd1 == NULL);
+			skb_shinfo(n)->zccd1 = skb_shinfo(skb)->zccd1;
+			zccd_incref(skb_shinfo(n)->zccd1);
+		}
+
+		if (skb_shinfo(skb)->zccd2 != NULL) {
+			BUG_TRAP(skb_shinfo(n)->zccd2 == NULL);
+			skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
+			zccd_incref(skb_shinfo(n)->zccd2);
+		}
 	}
 
 	if (skb_shinfo(skb)->frag_list) {
@@ -700,6 +719,13 @@ int pskb_expand_head(struct sk_buff *skb
 	memcpy(data + nhead, skb->head, skb->tail - skb->head);
 	memcpy(data + size, skb->end, sizeof(struct skb_shared_info));
 
+	/* zero-copy descriptors have been copied into the new shinfo -
+	 * account for the new references */
+	if (skb_shinfo(skb)->zccd1 != NULL)
+		zccd_incref(skb_shinfo(skb)->zccd1);
+	if (skb_shinfo(skb)->zccd2 != NULL)
+		zccd_incref(skb_shinfo(skb)->zccd2);
+
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
 		get_page(skb_shinfo(skb)->frags[i].page);
 
@@ -881,6 +907,8 @@ int ___pskb_trim(struct sk_buff *skb, un
 
 drop_pages:
 		skb_shinfo(skb)->nr_frags = i;
+		if (i == 0)
+			skb_complete_zccd(skb);
 
 		for (; i < nfrags; i++)
 			put_page(skb_shinfo(skb)->frags[i].page);
@@ -1066,6 +1094,9 @@ pull_pages:
 	}
 	skb_shinfo(skb)->nr_frags = k;
 
+	if (k == 0)				/* dropped all the pages */
+		skb_complete_zccd(skb);		/* drop zccd refs */
+		
 	skb->tail     += delta;
 	skb->data_len -= delta;
 
@@ -1598,6 +1629,15 @@ static inline void skb_split_inside_head
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
 		skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
 
+	/* Transfer zero-copy callback descriptors */
+	BUG_TRAP(skb_shinfo(skb1)->zccd1 == NULL);
+	skb_shinfo(skb1)->zccd1    = skb_shinfo(skb)->zccd1;
+	skb_shinfo(skb)->zccd1     = NULL;
+
+	BUG_TRAP(skb_shinfo(skb1)->zccd2 == NULL);
+	skb_shinfo(skb1)->zccd2    = skb_shinfo(skb)->zccd2;
+	skb_shinfo(skb)->zccd2     = NULL;
+
 	skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
 	skb_shinfo(skb)->nr_frags  = 0;
 	skb1->data_len		   = skb->data_len;
@@ -1646,6 +1686,30 @@ static inline void skb_split_no_header(s
 		pos += size;
 	}
 	skb_shinfo(skb1)->nr_frags = k;
+
+	if (k != 0) {				
+		/* skb1 has pages. Transfer or clone the zccds */
+
+		if (skb_shinfo(skb)->zccd1 != NULL) {
+			BUG_TRAP(skb_shinfo(skb1)->zccd1 == NULL);
+			skb_shinfo(skb1)->zccd1 = skb_shinfo(skb)->zccd1;
+
+			if (skb_shinfo(skb)->nr_frags == 0)
+				skb_shinfo(skb)->zccd1 = NULL;
+			else
+				zccd_incref(skb_shinfo(skb)->zccd1);
+		}
+		
+		if (skb_shinfo(skb)->zccd2 != NULL) {
+			BUG_TRAP(skb_shinfo(skb1)->zccd2 == NULL);
+			skb_shinfo(skb1)->zccd2 = skb_shinfo(skb)->zccd2;
+
+			if (skb_shinfo(skb)->nr_frags == 0)
+				skb_shinfo(skb)->zccd2 = NULL;
+			else
+				zccd_incref(skb_shinfo(skb)->zccd2);
+		}
+	}
 }
 
 /**
@@ -2024,6 +2088,21 @@ struct sk_buff *skb_segment(struct sk_bu
 			frag++;
 		}
 
+		if (k != 0) {
+			/* nskb has pages.  Clone the zccds */
+			if (skb_shinfo(skb)->zccd1 != NULL) {
+				BUG_TRAP(skb_shinfo(nskb)->zccd1 == NULL);
+				skb_shinfo(nskb)->zccd1 = skb_shinfo(skb)->zccd1;
+				zccd_incref(skb_shinfo(skb)->zccd1);
+			}
+		
+			if (skb_shinfo(skb)->zccd2 != NULL) {
+				BUG_TRAP(skb_shinfo(nskb)->zccd2 == NULL);
+				skb_shinfo(nskb)->zccd2 = skb_shinfo(skb)->zccd2;
+				zccd_incref(skb_shinfo(skb)->zccd2);
+			}
+		}
+		
 		skb_shinfo(nskb)->nr_frags = k;
 		nskb->data_len = len - hsize;
 		nskb->len += nskb->data_len;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 66e9a72..515c8b4 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -499,8 +499,9 @@ static inline void tcp_push(struct sock 
 	}
 }
 
+/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
-			 size_t psize, int flags)
+				size_t psize, int flags, struct zccd *zccd)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int mss_now, size_goal;
@@ -548,6 +549,16 @@ new_segment:
 			copy = size;
 
 		i = skb_shinfo(skb)->nr_frags;
+
+		if (zccd != NULL &&                   /* completion callback wanted */
+		    skb_shinfo(skb)->zccd1 != NULL && /* both zccd slots taken */
+		    skb_shinfo(skb)->zccd2 != NULL &&
+		    skb_shinfo(skb)->zccd1 != zccd && /* and neither is ours */
+		    skb_shinfo(skb)->zccd2 != zccd) {
+			tcp_mark_push (tp, skb);
+			goto new_segment;
+		}
+
 		can_coalesce = skb_can_coalesce(skb, i, page, offset);
 		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
 			tcp_mark_push(tp, skb);
@@ -563,6 +574,18 @@ new_segment:
 			skb_fill_page_desc(skb, i, page, offset, copy);
 		}
 
+		if (zccd != NULL &&		      /* completion callback wanted */
+		    skb_shinfo(skb)->zccd1 != zccd && /* new to this skbuff */
+		    skb_shinfo(skb)->zccd2 != zccd) {
+			if (skb_shinfo(skb)->zccd1 == NULL) {
+				skb_shinfo(skb)->zccd1 = zccd;
+			} else {
+				BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
+				skb_shinfo(skb)->zccd2 = zccd;
+			}
+			zccd_incref(zccd);	      /* new reference */
+		}
+
 		skb->len += copy;
 		skb->data_len += copy;
 		skb->truesize += copy;
@@ -616,8 +639,8 @@ out_err:
 	return sk_stream_error(sk, flags, err);
 }
 
-ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
-		     size_t size, int flags)
+ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset,
+			  size_t size, int flags, struct zccd *zccd)
 {
 	ssize_t res;
 	struct sock *sk = sock->sk;
@@ -628,12 +651,18 @@ ssize_t tcp_sendpage(struct socket *sock
 
 	lock_sock(sk);
 	TCP_CHECK_TIMER(sk);
-	res = do_tcp_sendpages(sk, &page, offset, size, flags);
+	res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
 	TCP_CHECK_TIMER(sk);
 	release_sock(sk);
 	return res;
 }
 
+ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
+		     size_t size, int flags)
+{
+	return tcp_sendpage_zccd(sock, page, offset, size, flags, NULL);
+}
+
 #define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
 #define TCP_OFF(sk)	(sk->sk_sndmsg_off)
 
@@ -2347,6 +2376,7 @@ EXPORT_SYMBOL(tcp_read_sock);
 EXPORT_SYMBOL(tcp_recvmsg);
 EXPORT_SYMBOL(tcp_sendmsg);
 EXPORT_SYMBOL(tcp_sendpage);
+EXPORT_SYMBOL(tcp_sendpage_zccd);
 EXPORT_SYMBOL(tcp_setsockopt);
 EXPORT_SYMBOL(tcp_shutdown);
 EXPORT_SYMBOL(tcp_statistics);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f22536e..943bc7b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -680,6 +680,9 @@ static void __pskb_trim_head(struct sk_b
 	}
 	skb_shinfo(skb)->nr_frags = k;
 
+	if (k == 0)				/* dropped all pages */
+		skb_complete_zccd(skb);
+	
 	skb->tail = skb->data;
 	skb->data_len -= len;
 	skb->len = skb->data_len;


* RE: PATCH zero-copy send completion callback
       [not found] <20061016.135222.78711520.davem@davemloft.net>
@ 2006-10-17  0:53 ` Eric Barton
  2006-10-17  9:01   ` Eric Dumazet
  2006-10-17 11:19   ` Evgeniy Polyakov
  0 siblings, 2 replies; 10+ messages in thread
From: Eric Barton @ 2006-10-17  0:53 UTC (permalink / raw)
  To: 'David Miller'; +Cc: netdev

David,

> Also, the correct mailing list to get to the networking developers
> is netdev@vger.kernel.org.  "linux-net" is for users.

Noted.

> Finally, I very much doubt you have much chance getting this
> change in, the infrastructure is implemented in a very ad-hoc
> fashion and it takes into consideration none of the potential
> other users of such a thing.  

Are you referring to the absence of a callback argument other than the
callback descriptor itself?  It seemed natural to me to contain the
descriptor in whatever state the higher-level protocol associates with the
message it's sending, and to derive this from the descriptor address in the
callback.

If this isn't what you mean, could you explain?  I'm not at all religious
about it.

> And these days we're trying to figure
> out how to eliminate skbuff and skb_shared_info struct members
> whereas you're adding 16-bytes of space on 64-bit platforms.

Do you think the general concept of a zero-copy completion callback is
useful?

If so, do you have any ideas about how to do it more economically?  It's 2
pointers rather than 1 to avoid forcing an unnecessary packet boundary
between successive zero-copy sends.  But I guess that might not be hugely
significant since you're generally sending many pages when zero-copy is
needed for performance.  Also, (please correct me if I'm wrong) I didn't
think this would push the allocation over to the next entry in
'malloc_sizes'.

                Cheers,
                        Eric




* Re: PATCH zero-copy send completion callback
  2006-10-17  0:53 ` Eric Barton
@ 2006-10-17  9:01   ` Eric Dumazet
  2006-10-17 12:23     ` Eric Barton
  2006-10-17 11:19   ` Evgeniy Polyakov
  1 sibling, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2006-10-17  9:01 UTC (permalink / raw)
  To: Eric Barton; +Cc: 'David Miller', netdev

On Tuesday 17 October 2006 02:53, Eric Barton wrote:
> If so, do you have any ideas about how to do it more economically?  It's 2
> pointers rather than 1 to avoid forcing an unnecessary packet boundary
> between successive zero-copy sends.  But I guess that might not be hugely
> significant since you're generally sending many pages when zero-copy is
> needed for performance.  Also, (please correct me if I'm wrong) I didn't
> think this would push the allocation over to the next entry in
> 'malloc_sizes'.

Well, skbuff heads are allocated from dedicated kmem_caches
(skbuff_fclone_cache & skbuff_head_cache), and these caches are not
constrained by the sizes available in malloc_sizes. Their size is a multiple
of the L1 cache line size, which is 64 bytes for most common machines.
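
For reference, skb_init() creates them roughly like this (paraphrased;
note SLAB_HWCACHE_ALIGN rather than a malloc_sizes bucket):

	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
					      sizeof(struct sk_buff), 0,
					      SLAB_HWCACHE_ALIGN|SLAB_PANIC,
					      NULL, NULL);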

Even if your two-pointer addition (16 bytes on x86_64) doesn't cross a
64-byte line (I didn't check), they are going to be set to NULL each time a
skbuff is allocated, and checked against NULL each time a skbuff is destroyed.

Eric


* Re: PATCH zero-copy send completion callback
  2006-10-17  0:53 ` Eric Barton
  2006-10-17  9:01   ` Eric Dumazet
@ 2006-10-17 11:19   ` Evgeniy Polyakov
  1 sibling, 0 replies; 10+ messages in thread
From: Evgeniy Polyakov @ 2006-10-17 11:19 UTC (permalink / raw)
  To: Eric Barton; +Cc: 'David Miller', netdev

On Tue, Oct 17, 2006 at 01:53:02AM +0100, Eric Barton (eeb@bartonsoftware.com) wrote:
> > And these days we're trying to figure
> > out how to eliminate skbuff and skb_shared_info struct members
> > whereas you're adding 16-bytes of space on 64-bit platforms.
> 
> Do you think the general concept of a zero-copy completion callback is
> useful?

You can use the existing skb destructor; an appropriate reference counter is
already there. In your own destructor you need to call the old one, of
course, and its type can be determined from analysis of the headers and the
skb itself (there are not that many destructor types, actually).
If that level of abstraction is not enough, it is possible to change
skb_release_data()/__kfree_skb() so that skb->destructor() can determine
whether the attached pages will be freed or not.
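
A rough sketch of that stacking (the helpers here are hypothetical; how
the old destructor is saved and looked up again is exactly the part you
would derive from the skb analysis above):

	static void my_destructor(struct sk_buff *skb)
	{
		void (*old)(struct sk_buff *) = my_saved_destructor(skb);

		my_notify_completion(skb);	/* higher-level callback */
		if (old != NULL)
			old(skb);		/* always chain to the old one */
	}

	static void my_hook(struct sk_buff *skb)
	{
		my_save_destructor(skb, skb->destructor);
		skb->destructor = my_destructor;
	}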

> If so, do you have any ideas about how to do it more economically?  It's 2
> pointers rather than 1 to avoid forcing an unnecessary packet boundary
> between successive zero-copy sends.  But I guess that might not be hugely
> significant since you're generally sending many pages when zero-copy is

The existing sendfile() implementation is synchronous; it does not require
an async callback. It looks like Lustre submits a number of pages to be sent
asynchronously and reports to the user that everything is OK, and when the
appropriate callback is invoked, it updates its metadata? Fair enough; it
looks similar to the VFS cache in the case of an ordinary write.

> needed for performance.  Also, (please correct me if I'm wrong) I didn't
> think this would push the allocation over to the next entry in
> 'malloc_sizes'.

skbs are allocated from their own cache, and the smaller it is, the better.

>                 Cheers,
>                         Eric

-- 
	Evgeniy Polyakov


* RE: PATCH zero-copy send completion callback
  2006-10-17  9:01   ` Eric Dumazet
@ 2006-10-17 12:23     ` Eric Barton
  2006-10-17 21:45       ` David Miller
  0 siblings, 1 reply; 10+ messages in thread
From: Eric Barton @ 2006-10-17 12:23 UTC (permalink / raw)
  To: 'Eric Dumazet'; +Cc: 'David Miller', netdev

> > Also, (please correct me if I'm wrong) I didn't
> > think this would push the allocation over to the next entry in
> > 'malloc_sizes'.
> 
> Well, skbuff heads are allocated from dedicated kmem_caches
> (skbuff_fclone_cache & skbuff_head_cache), and these caches are not
> constrained by the sizes available in malloc_sizes. Their size is a
> multiple of the L1 cache line size, which is 64 bytes for most
> common machines.

Indeed, struct sk_buff is allocated that way.  But I added the callback
pointers to struct skb_shared_info, where the page pointers are stored,
and this struct is allocated along with the packet header using kmalloc.
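
For reference, the allocation in __alloc_skb() is roughly (paraphrased):

	size = SKB_DATA_ALIGN(size);
	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);

so the two extra pointers only matter for data sizes where that sum is
pushed into the next kmalloc bucket.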

> Even if your two-pointer addition (16 bytes on x86_64) doesn't cross
> a 64-byte line (I didn't check), they are going to be set to NULL
> each time a skbuff is allocated, and checked against NULL each time
> a skbuff is destroyed.

Indeed.  Do you think that's significant?

                Cheers,
                        Eric

---------------------------------------------------
|Eric Barton        Barton Software               |
|9 York Gardens     Tel:    +44 (117) 330 1575    |
|Clifton            Mobile: +44 (7909) 680 356    |
|Bristol BS8 4LL    Fax:    call first            |
|United Kingdom     E-Mail: eeb@bartonsoftware.com|
---------------------------------------------------




* RE: PATCH zero-copy send completion callback
       [not found] <20061017094643.GA28926@infradead.org>
@ 2006-10-17 12:27 ` Eric Barton
  0 siblings, 0 replies; 10+ messages in thread
From: Eric Barton @ 2006-10-17 12:27 UTC (permalink / raw)
  To: 'Christoph Hellwig', 'David Miller'; +Cc: netdev

> In addition to that I'm pretty sure I remember that some clusterfs
> person already posted these patches a while ago and got ripped apart
> in the same way.

Yes - unfortunately I didn't submit my patch personally.  And I've
rewritten it since to avoid the obvious criticisms.  This time
around, I find the comments much more to the point.

                Cheers,
                        Eric




* RE: PATCH zero-copy send completion callback
@ 2006-10-17 12:50 Eric Barton
  2006-10-17 13:13 ` Evgeniy Polyakov
  0 siblings, 1 reply; 10+ messages in thread
From: Eric Barton @ 2006-10-17 12:50 UTC (permalink / raw)
  To: 'Evgeniy Polyakov'; +Cc: 'David Miller', netdev

Evgeniy,

> You can use the existing skb destructor; an appropriate reference
> counter is already there. In your own destructor you need to
> call the old one, of course, and its type can be determined from
> analysis of the headers and the skb itself (there are not that
> many destructor types, actually).  If that level of
> abstraction is not enough, it is possible to change
> skb_release_data()/__kfree_skb() so that skb->destructor()
> can determine whether the attached pages will be
> freed or not.

Yes absolutely.  My first thought was to use the skbuff destructor,
but I was paranoid I might screw up the destructor stacking.
Maybe I should have been braver?

Since the callback descriptor needs to track the pages in
skb_shinfo() rather than the skbuff itself, it seemed "natural"
to make skb_release_data() the trigger.

> The existing sendfile() implementation is synchronous; it does not
> require an async callback.

Is it not true that you cannot know when it is safe to overwrite
pages sent in this way?

> skbs are allocated from their own cache, and the smaller it is, the better.

As I mentioned in another reply, skbs are indeed allocated from
their own cache, but skb_shinfo() is allocated contiguously with
the packet header using kmalloc.

-- 

                Cheers,
                        Eric




* Re: PATCH zero-copy send completion callback
  2006-10-17 12:50 Eric Barton
@ 2006-10-17 13:13 ` Evgeniy Polyakov
  0 siblings, 0 replies; 10+ messages in thread
From: Evgeniy Polyakov @ 2006-10-17 13:13 UTC (permalink / raw)
  To: Eric Barton; +Cc: 'David Miller', netdev

On Tue, Oct 17, 2006 at 01:50:04PM +0100, Eric Barton (eeb@bartonsoftware.com) wrote:
> Evgeniy,
> 
> > You can use the existing skb destructor; an appropriate reference
> > counter is already there. In your own destructor you need to
> > call the old one, of course, and its type can be determined from
> > analysis of the headers and the skb itself (there are not that
> > many destructor types, actually).  If that level of
> > abstraction is not enough, it is possible to change
> > skb_release_data()/__kfree_skb() so that skb->destructor()
> > can determine whether the attached pages will be
> > freed or not.
> 
> Yes absolutely.  My first thought was to use the skbuff destructor,
> but I was paranoid I might screw up the destructor stacking.
> Maybe I should have been braver?

It depends on the quality of the results...

> Since the callback descriptor needs to track the pages in
> skb_shinfo() rather than the skbuff itself, it seemed "natural"
> to make skb_release_data() the trigger.
> 
> > The existing sendfile() implementation is synchronous; it does not
> > require an async callback.
> 
> Is it not true that you cannot know when it is safe to overwrite
> pages sent in this way?

There are tricks all over the place in sendfile.  The first one is the
sendpage() implementation, which copies the data if the hardware does not
support checksumming and scatter-gather, and simultaneous writing is
"protected" in the higher layer (check do_generic_mapping_read()).  We do
not care about "later" writing, i.e. while the skb sits in some queue on
the local machine, since new data will be transferred in that case.
Truncation is also protected by the fact that the page's reference counter
is increased, so the same page cannot be freed and reused.

It was a design decision not to care about page overwrites (and thus no
page locking): either smart hardware transfers the new data, or we copy
and send the old data.
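
The copy fallback is the entry check in tcp_sendpage(), roughly:

	if (!(sk->sk_route_caps & NETIF_F_SG) ||
	    !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
		return sock_no_sendpage(sock, page, offset, size, flags);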

> > skbs are allocated from their own cache, and the smaller it is, the better.
> 
> As I mentioned in another reply, skbs are indeed allocated from
> their own cache, but skb_shinfo() is allocated contiguously with
> the packet header using kmalloc.

Yes, the skb itself is not touched.

You probably saw a lot of discussions about problems with e1000 hardware,
memory fragmentation and jumbo frames.
Since skb_shared_info is appended to the actual data, it frequently forces
higher-order allocations, so one of the solutions is to put skb_shared_info
into a separate allocation in some cases.  Although those discussions are
dormant right now, the problem still exists, and if your current needs can
be handled within the existing interfaces, that should be tried first.

> -- 
> 
>                 Cheers,
>                         Eric
> 

-- 
	Evgeniy Polyakov


* Re: PATCH zero-copy send completion callback
  2006-10-17 12:23     ` Eric Barton
@ 2006-10-17 21:45       ` David Miller
  0 siblings, 0 replies; 10+ messages in thread
From: David Miller @ 2006-10-17 21:45 UTC (permalink / raw)
  To: eeb; +Cc: dada1, netdev

From: "Eric Barton" <eeb@bartonsoftware.com>
Date: Tue, 17 Oct 2006 13:23:10 +0100

> > Even if your two-pointer addition (16 bytes on x86_64) doesn't cross
> > a 64-byte line (I didn't check), they are going to be set to NULL
> > each time a skbuff is allocated, and checked against NULL each time
> > a skbuff is destroyed.
> 
> Indeed.  Do you think that's significant?

On a machine routing a million packets per second, it
definitely is.  It is the most crucial data structure
for performance in all of networking.

