* PATCH zero-copy send completion callback
@ 2006-10-16 17:25 Eric Barton
0 siblings, 0 replies; 10+ messages in thread
From: Eric Barton @ 2006-10-16 17:25 UTC (permalink / raw)
To: netdev
This patch has been used with the Lustre cluster file system (www.lustre.org)
to give notification when page buffers used to send bulk data via TCP/IP may be
overwritten. It implements...
a) A general-purpose callback to inform higher-level protocols when a
zero-copy send of a set of pages has completed.
b) tcp_sendpage_zccd(), a variation on tcp_sendpage() that includes a
completion callback parameter.
How to use it ("you" are a higher-level protocol driver; a minimal usage sketch follows the list)...
a) Initialise a zero-copy descriptor with your callback procedure.
b) Pass this descriptor in all zero-copy sends for an arbitrary set of pages.
Skbuffs that reference your pages also take a reference on your zero-copy
callback descriptor. They release this reference when they release their
page references.
c) Release your own reference when you've posted all your pages and you're
ready for the callback.
d) The callback occurs when the last reference is dropped.
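For illustration only, here is a minimal sketch of such a caller. The
my_bulk_send structure, my_send_complete() and my_send_bulk() are hypothetical,
as are the socket and the page array (and it assumes the driver holds its own
references on the pages); only zccd_init(), zccd_decref() and
tcp_sendpage_zccd() come from the patch below, and error handling is omitted
for brevity.
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/net.h>
#include <linux/skbuff.h>
#include <net/tcp.h>
/* Hypothetical per-send state embedding the completion descriptor */
struct my_bulk_send {
	struct zccd	  mbs_zccd;
	struct page	**mbs_pages;
	int		  mbs_npages;
};
static void my_send_complete(struct zccd *d)
{
	struct my_bulk_send *s = container_of(d, struct my_bulk_send, mbs_zccd);
	int i;
	/* No skbuff references the pages any more: safe to overwrite/reuse them.
	 * Here we just drop the references the driver was holding. */
	for (i = 0; i < s->mbs_npages; i++)
		put_page(s->mbs_pages[i]);
}
static void my_send_bulk(struct socket *sock, struct my_bulk_send *s)
{
	int i;
	zccd_init(&s->mbs_zccd, my_send_complete);	/* we hold the initial reference */
	for (i = 0; i < s->mbs_npages; i++)
		tcp_sendpage_zccd(sock, s->mbs_pages[i], 0, PAGE_SIZE,
				  i + 1 < s->mbs_npages ? MSG_MORE : 0,
				  &s->mbs_zccd);
	/* All pages posted: drop our reference.  The callback fires once the
	 * last skbuff referencing these pages drops its reference too. */
	zccd_decref(&s->mbs_zccd);
}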
This patch applies on branch 'master' of
git://kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
================================================================================
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 85577a4..4afaef1 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -129,6 +129,36 @@ struct skb_frag_struct {
__u16 size;
};
+/* Zero Copy Callback Descriptor
+ * This struct supports receiving notification when zero-copy network I/O has
+ * completed. The ZCCD can be embedded in a struct containing the state of a
+ * zero-copy network send. Every skbuff that references that send's pages also
+ * keeps a reference on the ZCCD. When they have all been disposed of, the
+ * reference count on the ZCCD drops to zero and the callback is made, telling
+ * the original caller that the pages may now be overwritten. */
+struct zccd
+{
+ atomic_t zccd_refcount;
+ void (*zccd_callback)(struct zccd *);
+};
+
+static inline void zccd_init (struct zccd *d, void (*callback)(struct zccd *))
+{
+ atomic_set (&d->zccd_refcount, 1);
+ d->zccd_callback = callback;
+}
+
+static inline void zccd_incref (struct zccd *d) /* take a reference */
+{
+ atomic_inc (&d->zccd_refcount);
+}
+
+static inline void zccd_decref (struct zccd *d) /* release a reference */
+{
+ if (atomic_dec_and_test (&d->zccd_refcount))
+ (d->zccd_callback)(d);
+}
+
/* This data is invariant across clones and lives at
* the end of the header data, ie. at skb->end.
*/
@@ -141,6 +171,11 @@ struct skb_shared_info {
unsigned short gso_type;
unsigned int ip6_frag_id;
struct sk_buff *frag_list;
+ struct zccd *zccd1;
+ struct zccd *zccd2;
+ /* NB zero-copy data is normally whole pages. We keep 2 zccds in an
+ * skbuff so that pages from two successive zero-copy sends can share
+ * a packet without forcing an unnecessary packet boundary. */
skb_frag_t frags[MAX_SKB_FRAGS];
};
@@ -1311,6 +1346,23 @@ #ifdef CONFIG_HIGHMEM
#endif
}
+/* This skbuff has dropped its pages: drop refs on any zero-copy callback
+ * descriptors it holds. */
+static inline void skb_complete_zccd (struct sk_buff *skb)
+{
+ struct skb_shared_info *info = skb_shinfo(skb);
+
+ if (info->zccd1 != NULL) {
+ zccd_decref(info->zccd1);
+ info->zccd1 = NULL;
+ }
+
+ if (info->zccd2 != NULL) {
+ zccd_decref(info->zccd2);
+ info->zccd2 = NULL;
+ }
+}
+
#define skb_queue_walk(queue, skb) \
for (skb = (queue)->next; \
prefetch(skb->next), (skb != (struct sk_buff *)(queue)); \
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7a093d0..e02b55f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -278,6 +278,8 @@ extern int tcp_v4_tw_remember_stam
extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk,
struct msghdr *msg, size_t size);
extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
+extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
+ int flags, struct zccd *zccd);
extern int tcp_ioctl(struct sock *sk,
int cmd,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3c23760..a1d2ed0 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -177,6 +177,8 @@ struct sk_buff *__alloc_skb(unsigned int
shinfo->gso_type = 0;
shinfo->ip6_frag_id = 0;
shinfo->frag_list = NULL;
+ shinfo->zccd1 = NULL;
+ shinfo->zccd2 = NULL;
if (fclone) {
struct sk_buff *child = skb + 1;
@@ -242,6 +244,8 @@ struct sk_buff *alloc_skb_from_cache(kme
skb_shinfo(skb)->gso_segs = 0;
skb_shinfo(skb)->gso_type = 0;
skb_shinfo(skb)->frag_list = NULL;
+ skb_shinfo(skb)->zccd1 = NULL;
+ skb_shinfo(skb)->zccd2 = NULL;
out:
return skb;
nodata:
@@ -307,6 +311,9 @@ static void skb_release_data(struct sk_b
if (!skb->cloned ||
!atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
&skb_shinfo(skb)->dataref)) {
+ /* complete zero-copy callbacks (if any) */
+ skb_complete_zccd(skb);
+
if (skb_shinfo(skb)->nr_frags) {
int i;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
@@ -650,6 +657,18 @@ struct sk_buff *pskb_copy(struct sk_buff
get_page(skb_shinfo(n)->frags[i].page);
}
skb_shinfo(n)->nr_frags = i;
+
+ if (skb_shinfo(skb)->zccd1 != NULL) {
+ BUG_TRAP(skb_shinfo(n)->zccd1 == NULL);
+ skb_shinfo(n)->zccd1 = skb_shinfo(skb)->zccd1;
+ zccd_incref(skb_shinfo(n)->zccd1);
+ }
+
+ if (skb_shinfo(skb)->zccd2 != NULL) {
+ BUG_TRAP(skb_shinfo(n)->zccd2 == NULL);
+ skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
+ zccd_incref(skb_shinfo(n)->zccd2);
+ }
}
if (skb_shinfo(skb)->frag_list) {
@@ -700,6 +719,13 @@ int pskb_expand_head(struct sk_buff *skb
memcpy(data + nhead, skb->head, skb->tail - skb->head);
memcpy(data + size, skb->end, sizeof(struct skb_shared_info));
+ /* zero-copy descriptors have been copied into the new shinfo -
+ * account the new references */
+ if (skb_shinfo(skb)->zccd1 != NULL)
+ zccd_incref(skb_shinfo(skb)->zccd1);
+ if (skb_shinfo(skb)->zccd2 != NULL)
+ zccd_incref(skb_shinfo(skb)->zccd2);
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
get_page(skb_shinfo(skb)->frags[i].page);
@@ -881,6 +907,8 @@ int ___pskb_trim(struct sk_buff *skb, un
drop_pages:
skb_shinfo(skb)->nr_frags = i;
+ if (i == 0)
+ skb_complete_zccd(skb);
for (; i < nfrags; i++)
put_page(skb_shinfo(skb)->frags[i].page);
@@ -1066,6 +1094,9 @@ pull_pages:
}
skb_shinfo(skb)->nr_frags = k;
+ if (k == 0) /* dropped all the pages */
+ skb_complete_zccd(skb); /* drop zccd refs */
+
skb->tail += delta;
skb->data_len -= delta;
@@ -1598,6 +1629,15 @@ static inline void skb_split_inside_head
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
+ /* Transfer zero-copy callback descriptors */
+ BUG_TRAP(skb_shinfo(skb1)->zccd1 == NULL);
+ skb_shinfo(skb1)->zccd1 = skb_shinfo(skb)->zccd1;
+ skb_shinfo(skb)->zccd1 = NULL;
+
+ BUG_TRAP(skb_shinfo(skb1)->zccd2 == NULL);
+ skb_shinfo(skb1)->zccd2 = skb_shinfo(skb)->zccd2;
+ skb_shinfo(skb)->zccd2 = NULL;
+
skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
skb_shinfo(skb)->nr_frags = 0;
skb1->data_len = skb->data_len;
@@ -1646,6 +1686,30 @@ static inline void skb_split_no_header(s
pos += size;
}
skb_shinfo(skb1)->nr_frags = k;
+
+ if (k != 0) {
+ /* skb1 has pages. Transfer or clone the zccds */
+
+ if (skb_shinfo(skb)->zccd1 != NULL) {
+ BUG_TRAP(skb_shinfo(skb1)->zccd1 == NULL);
+ skb_shinfo(skb1)->zccd1 = skb_shinfo(skb)->zccd1;
+
+ if (skb_shinfo(skb)->nr_frags == 0)
+ skb_shinfo(skb)->zccd1 = NULL;
+ else
+ zccd_incref(skb_shinfo(skb)->zccd1);
+ }
+
+ if (skb_shinfo(skb)->zccd2 != NULL) {
+ BUG_TRAP(skb_shinfo(skb1)->zccd2 == NULL);
+ skb_shinfo(skb1)->zccd2 = skb_shinfo(skb)->zccd2;
+
+ if (skb_shinfo(skb)->nr_frags == 0)
+ skb_shinfo(skb)->zccd2 = NULL;
+ else
+ zccd_incref(skb_shinfo(skb)->zccd2);
+ }
+ }
}
/**
@@ -2024,6 +2088,21 @@ struct sk_buff *skb_segment(struct sk_bu
frag++;
}
+ if (k != 0) {
+ /* nskb has pages. Clone the zccds */
+ if (skb_shinfo(skb)->zccd1 != NULL) {
+ BUG_TRAP(skb_shinfo(nskb)->zccd1 == NULL);
+ skb_shinfo(nskb)->zccd1 = skb_shinfo(skb)->zccd1;
+ zccd_incref(skb_shinfo(skb)->zccd1);
+ }
+
+ if (skb_shinfo(skb)->zccd2 != NULL) {
+ BUG_TRAP(skb_shinfo(nskb)->zccd2 == NULL);
+ skb_shinfo(nskb)->zccd2 = skb_shinfo(skb)->zccd2;
+ zccd_incref(skb_shinfo(skb)->zccd2);
+ }
+ }
+
skb_shinfo(nskb)->nr_frags = k;
nskb->data_len = len - hsize;
nskb->len += nskb->data_len;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 66e9a72..515c8b4 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -499,8 +499,9 @@ static inline void tcp_push(struct sock
}
}
+/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
- size_t psize, int flags)
+ size_t psize, int flags, struct zccd *zccd)
{
struct tcp_sock *tp = tcp_sk(sk);
int mss_now, size_goal;
@@ -548,6 +549,16 @@ new_segment:
copy = size;
i = skb_shinfo(skb)->nr_frags;
+
+ if (zccd != NULL && /* completion callback wanted */
+ skb_shinfo(skb)->zccd1 != NULL && /* no room for zccd */
+ skb_shinfo(skb)->zccd2 != NULL &&
+ skb_shinfo(skb)->zccd1 != zccd && /* room needed */
+ skb_shinfo(skb)->zccd2 != zccd) {
+ tcp_mark_push (tp, skb);
+ goto new_segment;
+ }
+
can_coalesce = skb_can_coalesce(skb, i, page, offset);
if (!can_coalesce && i >= MAX_SKB_FRAGS) {
tcp_mark_push(tp, skb);
@@ -563,6 +574,18 @@ new_segment:
skb_fill_page_desc(skb, i, page, offset, copy);
}
+ if (zccd != NULL && /* completion callback wanted */
skb_shinfo(skb)->zccd1 != zccd && /* new to this skbuff */
+ skb_shinfo(skb)->zccd2 != zccd) {
+ if (skb_shinfo(skb)->zccd1 == NULL) {
+ skb_shinfo(skb)->zccd1 = zccd;
+ } else {
+ BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
+ skb_shinfo(skb)->zccd2 = zccd;
+ }
+ zccd_incref(zccd); /* new reference */
+ }
+
skb->len += copy;
skb->data_len += copy;
skb->truesize += copy;
@@ -616,8 +639,8 @@ out_err:
return sk_stream_error(sk, flags, err);
}
-ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
- size_t size, int flags)
+ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset,
+ size_t size, int flags, struct zccd *zccd)
{
ssize_t res;
struct sock *sk = sock->sk;
@@ -628,12 +651,18 @@ ssize_t tcp_sendpage(struct socket *sock
lock_sock(sk);
TCP_CHECK_TIMER(sk);
- res = do_tcp_sendpages(sk, &page, offset, size, flags);
+ res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
TCP_CHECK_TIMER(sk);
release_sock(sk);
return res;
}
+ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
+ size_t size, int flags)
+{
+ return tcp_sendpage_zccd(sock, page, offset, size, flags, NULL);
+}
+
#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
#define TCP_OFF(sk) (sk->sk_sndmsg_off)
@@ -2347,6 +2376,7 @@ EXPORT_SYMBOL(tcp_read_sock);
EXPORT_SYMBOL(tcp_recvmsg);
EXPORT_SYMBOL(tcp_sendmsg);
EXPORT_SYMBOL(tcp_sendpage);
+EXPORT_SYMBOL(tcp_sendpage_zccd);
EXPORT_SYMBOL(tcp_setsockopt);
EXPORT_SYMBOL(tcp_shutdown);
EXPORT_SYMBOL(tcp_statistics);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f22536e..943bc7b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -680,6 +680,9 @@ static void __pskb_trim_head(struct sk_b
}
skb_shinfo(skb)->nr_frags = k;
+ if (k == 0) /* dropped all pages */
+ skb_complete_zccd(skb);
+
skb->tail = skb->data;
skb->data_len -= len;
skb->len = skb->data_len;
* RE: PATCH zero-copy send completion callback
[not found] <20061016.135222.78711520.davem@davemloft.net>
@ 2006-10-17 0:53 ` Eric Barton
2006-10-17 9:01 ` Eric Dumazet
2006-10-17 11:19 ` Evgeniy Polyakov
0 siblings, 2 replies; 10+ messages in thread
From: Eric Barton @ 2006-10-17 0:53 UTC (permalink / raw)
To: 'David Miller'; +Cc: netdev
David,
> Also, the correct mailing list to get to the networking developers
> is netdev@vger.kernel.org. "linux-net" is for users.
Noted.
> Finally, I very much doubt you have much chance getting this
> change in, the infrastructure is implemented in a very ad-hoc
> fashion and it takes into consideration none of the potential
> other users of such a thing.
Are you referring to the absence of a callback argument other than the
callback descriptor itself? It seemed natural to me to embed the descriptor
in whatever state the higher-level protocol associates with the message it's
sending, and to derive that state from the descriptor's address in the
callback.
If this isn't what you mean, could you explain? I'm not at all religious
about it.
> And these days we're trying to figure
> out how to eliminate skbuff and skb_shared_info struct members
> whereas you're adding 16-bytes of space on 64-bit platforms.
Do you think the general concept of a zero-copy completion callback is
useful?
If so, do you have any ideas about how to do it more economically? It's 2
pointers rather than 1 to avoid forcing an unnecessary packet boundary
between successive zero-copy sends. But I guess that might not be hugely
significant since you're generally sending many pages when zero-copy is
needed for performance. Also, (please correct me if I'm wrong) I didn't
think this would push the allocation over to the next entry in
'malloc_sizes'.
Cheers,
Eric
* Re: PATCH zero-copy send completion callback
2006-10-17 0:53 ` Eric Barton
@ 2006-10-17 9:01 ` Eric Dumazet
2006-10-17 12:23 ` Eric Barton
2006-10-17 11:19 ` Evgeniy Polyakov
1 sibling, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2006-10-17 9:01 UTC (permalink / raw)
To: Eric Barton; +Cc: 'David Miller', netdev
On Tuesday 17 October 2006 02:53, Eric Barton wrote:
> If so, do you have any ideas about how to do it more economically? It's 2
> pointers rather than 1 to avoid forcing an unnecessary packet boundary
> between successive zero-copy sends. But I guess that might not be hugely
> significant since you're generally sending many pages when zero-copy is
> needed for performance. Also, (please correct me if I'm wrong) I didn't
> think this would push the allocation over to the next entry in
> 'malloc_sizes'.
Well, skbuff heads are allocated from dedicated kmem_caches
(skbuff_fclone_cache & skbuff_head_cache), and these caches are not
constrained by the sizes available in malloc_sizes. Their sizes are a
multiple of the L1 cache line size, which is 64 bytes on most common machines.
Even if your two-pointer addition (16 bytes on x86_64) doesn't cross a
64-byte line (I didn't check), they are going to be set to NULL each time a
skbuff is allocated, and checked against NULL each time a skbuff is destroyed.
Eric
* Re: PATCH zero-copy send completion callback
2006-10-17 0:53 ` Eric Barton
2006-10-17 9:01 ` Eric Dumazet
@ 2006-10-17 11:19 ` Evgeniy Polyakov
1 sibling, 0 replies; 10+ messages in thread
From: Evgeniy Polyakov @ 2006-10-17 11:19 UTC (permalink / raw)
To: Eric Barton; +Cc: 'David Miller', netdev
On Tue, Oct 17, 2006 at 01:53:02AM +0100, Eric Barton (eeb@bartonsoftware.com) wrote:
> > And these days we're trying to figure
> > out how to eliminate skbuff and skb_shared_info struct members
> > whereas you're adding 16-bytes of space on 64-bit platforms.
>
> Do you think the general concept of a zero-copy completion callback is
> useful?
You can use the existing skb destructor, and an appropriate reference
counter is already there. In your own destructor you need to call the old
one, of course, and its type can be determined from analysis of the headers
and the skb itself (there are not that many destructor types, actually).
If that level of abstraction is not enough, it is possible to change
skb_release_data()/__kfree_skb() so that skb->destructor() can determine
whether the attached pages will be freed or not.
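(A rough sketch of that destructor stacking, purely for illustration and not
from any posted patch: it assumes the skbs in question would otherwise have
used sock_wfree() as their destructor, as socket writes do, and it glosses
over the point that the destructor fires when the skb head is freed rather
than when the shared page data is freed; my_pending_sends, my_pages_reusable()
and my_hook_skb() are hypothetical names.)
#include <linux/skbuff.h>
#include <net/sock.h>
static atomic_t my_pending_sends = ATOMIC_INIT(0);	/* one count per zero-copy skb posted */
static void my_pages_reusable(void)
{
	/* hypothetical: wake whoever is waiting to overwrite the pages */
}
static void my_zc_destructor(struct sk_buff *skb)
{
	sock_wfree(skb);			/* the destructor this skb would otherwise have had */
	if (atomic_dec_and_test(&my_pending_sends))
		my_pages_reusable();
}
/* Call after skb_set_owner_w() has charged the skb to the socket */
static void my_hook_skb(struct sk_buff *skb)
{
	atomic_inc(&my_pending_sends);
	skb->destructor = my_zc_destructor;
}
A real implementation would also need to hold its own count until all the
skbs are posted (as the zccd in the patch does) and a way to map from the skb
back to the right per-send state.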
> If so, do you have any ideas about how to do it more economically? It's 2
> pointers rather than 1 to avoid forcing an unnecessary packet boundary
> between successive zero-copy sends. But I guess that might not be hugely
> significant since you're generally sending many pages when zero-copy is
Existing sendfile() implementation is synchronous, it does not require
async callback. It looks like Lustre queues a number of pages to be sent
asynchronously, reports to the user that everything is ok, and when the
appropriate callback is invoked, updates its metadata? Fair enough; it looks
similar to the VFS cache in the case of an ordinary write.
> needed for performance. Also, (please correct me if I'm wrong) I didn't
> think this would push the allocation over to the next entry in
> 'malloc_sizes'.
skbs are allocated from their own cache, and the smaller it is, the better.
> Cheers,
> Eric
--
Evgeniy Polyakov
* RE: PATCH zero-copy send completion callback
2006-10-17 9:01 ` Eric Dumazet
@ 2006-10-17 12:23 ` Eric Barton
2006-10-17 21:45 ` David Miller
0 siblings, 1 reply; 10+ messages in thread
From: Eric Barton @ 2006-10-17 12:23 UTC (permalink / raw)
To: 'Eric Dumazet'; +Cc: 'David Miller', netdev
> > Also, (please correct me if I'm wrong) I didn't
> > think this would push the allocation over to the next entry in
> > 'malloc_sizes'.
>
> Well, skbuff heads are allocated from dedicated kmem_caches
> (skbuff_fclone_cache & skbuff_head_cache), and these caches are not
> constrained by the sizes available in malloc_sizes. Their sizes are a
> multiple of the L1 cache line size, which is 64 bytes on most common
> machines.
Indeed, struct sk_buff is allocated that way. But I added the callback
pointers to struct skb_shared_info, where the page pointers are stored,
and that struct is allocated along with the packet header using kmalloc.
> Even if your two-pointer addition (16 bytes on x86_64) doesn't cross a
> 64-byte line (I didn't check), they are going to be set to NULL each time
> a skbuff is allocated, and checked against NULL each time a skbuff is
> destroyed.
Indeed. Do you think that's significant?
Cheers,
Eric
---------------------------------------------------
|Eric Barton Barton Software |
|9 York Gardens Tel: +44 (117) 330 1575 |
|Clifton Mobile: +44 (7909) 680 356 |
|Bristol BS8 4LL Fax: call first |
|United Kingdom E-Mail: eeb@bartonsoftware.com|
---------------------------------------------------
* RE: PATCH zero-copy send completion callback
[not found] <20061017094643.GA28926@infradead.org>
@ 2006-10-17 12:27 ` Eric Barton
0 siblings, 0 replies; 10+ messages in thread
From: Eric Barton @ 2006-10-17 12:27 UTC (permalink / raw)
To: 'Christoph Hellwig', 'David Miller'; +Cc: netdev
> In addition to that I'm pretty sure I remember that some clusterfs
> person already posted these patches a while ago and got ripped apart
> in the same way.
Yes - unfortunately I didn't submit my patch personally. And I've
rewritten it since to avoid the obvious criticisms. This time
around, I find the comments much more to the point.
Cheers,
Eric
* RE: PATCH zero-copy send completion callback
@ 2006-10-17 12:50 Eric Barton
2006-10-17 13:13 ` Evgeniy Polyakov
0 siblings, 1 reply; 10+ messages in thread
From: Eric Barton @ 2006-10-17 12:50 UTC (permalink / raw)
To: 'Evgeniy Polyakov'; +Cc: 'David Miller', netdev
Evgeniy,
> You can use the existing skb destructor, and an appropriate reference
> counter is already there. In your own destructor you need to call the
> old one, of course, and its type can be determined from analysis of
> the headers and the skb itself (there are not that many destructor
> types, actually). If that level of abstraction is not enough, it is
> possible to change skb_release_data()/__kfree_skb() so that
> skb->destructor() can determine whether the attached pages will be
> freed or not.
Yes, absolutely. My first thought was to use the skbuff destructor
but I was paranoid I might screw up the destructor stacking.
Maybe I should have been braver?
Since the callback descriptor needs to track the pages in
skb_shinfo() rather than the skbuff itself, it seemed "natural"
to make skb_release_data() the trigger.
> Existing sendfile() implementation is synchronous, it does not
> require async callback.
Is it not true that you cannot know when it is safe to overwrite
pages sent in this way?
> skbs are allocated from their own cache, and the smaller it is, the better.
As I mentioned in another reply, skbs are indeed allocated from
their own cache, but skb_shinfo() is allocated contiguously with
the packet header using kmalloc.
--
Cheers,
Eric
* Re: PATCH zero-copy send completion callback
2006-10-17 12:50 Eric Barton
@ 2006-10-17 13:13 ` Evgeniy Polyakov
0 siblings, 0 replies; 10+ messages in thread
From: Evgeniy Polyakov @ 2006-10-17 13:13 UTC (permalink / raw)
To: Eric Barton; +Cc: 'David Miller', netdev
On Tue, Oct 17, 2006 at 01:50:04PM +0100, Eric Barton (eeb@bartonsoftware.com) wrote:
> Evgeniy,
>
> > You can use the existing skb destructor, and an appropriate reference
> > counter is already there. In your own destructor you need to call the
> > old one, of course, and its type can be determined from analysis of
> > the headers and the skb itself (there are not that many destructor
> > types, actually). If that level of abstraction is not enough, it is
> > possible to change skb_release_data()/__kfree_skb() so that
> > skb->destructor() can determine whether the attached pages will be
> > freed or not.
>
> Yes, absolutely. My first thought was to use the skbuff destructor
> but I was paranoid I might screw up the destructor stacking.
> Maybe I should have been braver?
It depends on the quality of the results...
> Since the callback descriptor needs to track the pages in
> skb_shinfo() rather than the skbuff itself, it seemed "natural"
> to make skb_release_data() the trigger.
>
> > Existing sendfile() implementation is synchronous, it does not
> > require async callback.
>
> Is it not true that you cannot know when it is safe to overwrite
> pages sent in this way?
There are tricks all over the place in sendfile. The first one is the
sendpage() implementation, which copies the data if the hardware does not
support checksumming and scatter-gather, and simultaneous writing is
"protected" in the higher layer (check do_generic_mapping_read()). We do
not care about 'later' writing, i.e. while the skb sits in some queue on
the local machine, since the new data will be transferred in that case.
Truncation is also protected by the fact that the page's reference counter
is increased, so the same page cannot be freed and reused.
It was a design decision not to care about page overwrites (and thus no
page locking) - either smart hardware transfers the new data, or we copy
and send the old data.
> > skbs are allocated from their own cache, and the smaller it is, the better.
>
> As I mentioned in another reply, skbs are indeed allocated from
> their own cache, but skb_shinfo() is allocated contiguously with
> the packet header using kmalloc.
Yes, skb itself is not touched.
You probably saw a lot of discussions about problems with e1000
hardware, memory fragmentation and jumbo frames.
Since skb_shared_info is appended to the actual data, it frequently forces
next-order allocations, so one of the proposed solutions is to put
skb_shared_info into a separate allocation in some cases. Although those
discussions are dormant right now, the problem still exists, so if your
current needs can be handled within the existing interfaces, that should be
tried first.
> --
>
> Cheers,
> Eric
>
--
Evgeniy Polyakov
* Re: PATCH zero-copy send completion callback
2006-10-17 12:23 ` Eric Barton
@ 2006-10-17 21:45 ` David Miller
0 siblings, 0 replies; 10+ messages in thread
From: David Miller @ 2006-10-17 21:45 UTC (permalink / raw)
To: eeb; +Cc: dada1, netdev
From: "Eric Barton" <eeb@bartonsoftware.com>
Date: Tue, 17 Oct 2006 13:23:10 +0100
> > Even if your two-pointer addition (16 bytes on x86_64) doesn't cross a
> > 64-byte line (I didn't check), they are going to be set to NULL each
> > time a skbuff is allocated, and checked against NULL each time a skbuff
> > is destroyed.
>
> Indeed. Do you think that's significant?
On a machine routing a million packets per second, it
definitely is. It is the most crucial data structure
for performance in all of the networking.
Thread overview (10+ messages):
2006-10-16 17:25 PATCH zero-copy send completion callback Eric Barton
2006-10-16 18:21 Eric Barton
[not found] <20061016.135222.78711520.davem@davemloft.net>
2006-10-17 0:53 ` Eric Barton
2006-10-17 9:01 ` Eric Dumazet
2006-10-17 12:23 ` Eric Barton
2006-10-17 21:45 ` David Miller
2006-10-17 11:19 ` Evgeniy Polyakov
[not found] <20061017094643.GA28926@infradead.org>
2006-10-17 12:27 ` Eric Barton
2006-10-17 12:50 Eric Barton
2006-10-17 13:13 ` Evgeniy Polyakov