From: David Miller <davem@davemloft.net>
To: daniel@iogearbox.net
Cc: chamaken@gmail.com, fw@strlen.de, netdev@vger.kernel.org
Subject: Re: [PATCH net] netlink, mmap: transform mmap skb into full skb on taps
Date: Fri, 11 Sep 2015 12:42:05 -0700 (PDT) [thread overview]
Message-ID: <20150911.124205.1992250802389107845.davem@davemloft.net> (raw)
In-Reply-To: <55F2AC29.3030209@iogearbox.net>
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 11 Sep 2015 12:25:45 +0200
> Already calling into skb_clone() is an issue itself, as the data
> area is user space buffer, and skb_clone() as well as skb_copy()
> access skb_shinfo() area. :/ So in that regard netlink mmap skbs are
> even further restrictive on what we can do than netlink large skbs.
Indeed, this is fatal.
So we'd still need something special like your
netlink_to_full_skb_clone to elide trying to touch the skb_shinfo
area.
I thought briefly about somehow cobbling up extra space in the ring
entries so we could have a real skb_shinfo() there, but that's illegal
too as the user could scribble all over it randomly while we interpret
the contents. We don't own that memory. So this doesn't work.
We could rename the clone_preserves_destructor bit (private_buffers in the
patch below) and have it also mean that the SKB lacks frags and that
skb_shinfo() should not be inspected.
Something like this:
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 639e9b8..47d5875 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -97,22 +97,6 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
void netlink_detachskb(struct sock *sk, struct sk_buff *skb);
int netlink_sendskb(struct sock *sk, struct sk_buff *skb);
-static inline struct sk_buff *
-netlink_skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
-{
- struct sk_buff *nskb;
-
- nskb = skb_clone(skb, gfp_mask);
- if (!nskb)
- return NULL;
-
- /* This is a large skb, set destructor callback to release head */
- if (is_vmalloc_addr(skb->head))
- nskb->destructor = skb->destructor;
-
- return nskb;
-}
-
/*
* skb should fit one page. This choice is good for headerless malloc.
* But we should limit to 8K so that userspace does not have to
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2738d35..898c53d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -584,8 +584,9 @@ struct sk_buff {
fclone:2,
peeked:1,
head_frag:1,
- xmit_more:1;
- /* one bit hole */
+ xmit_more:1,
+ private_buffers:1;
+
kmemcheck_bitfield_end(flags1);
/* fields enclosed in headers_start/headers_end are copied
@@ -2220,7 +2221,8 @@ static inline void skb_orphan(struct sk_buff *skb)
*/
static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
{
- if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY)))
+ if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) ||
+ skb->private_buffers))
return 0;
return skb_copy_ubufs(skb, gfp_mask);
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index dad4dd3..54f9d6e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -825,7 +825,10 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
n->cloned = 1;
n->nohdr = 0;
- n->destructor = NULL;
+ if (!skb->private_buffers)
+ n->destructor = NULL;
+ else
+ C(destructor);
C(tail);
C(end);
C(head);
@@ -1675,6 +1678,9 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
to += copy;
}
+ if (skb->private_buffers)
+ goto out;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
skb_frag_t *f = &skb_shinfo(skb)->frags[i];
@@ -1720,7 +1726,7 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
}
start = end;
}
-
+out:
if (!len)
return 0;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 6fcbd21..2ec5425 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1075,7 +1075,7 @@ static void nl_fib_input(struct sk_buff *skb)
nlmsg_len(nlh) < sizeof(*frn))
return;
- skb = netlink_skb_clone(skb, GFP_KERNEL);
+ skb = skb_clone(skb, GFP_KERNEL);
if (!skb)
return;
nlh = nlmsg_hdr(skb);
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 70277b1..4c612481 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -291,7 +291,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
replay:
status = 0;
- skb = netlink_skb_clone(oskb, GFP_KERNEL);
+ skb = skb_clone(oskb, GFP_KERNEL);
if (!skb)
return netlink_ack(oskb, nlh, -ENOMEM);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 7f86d3b..523adac 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -719,6 +719,7 @@ static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk,
skb->end = skb->tail + size;
skb->len = 0;
+ skb->private_buffers = 1;
skb->destructor = netlink_skb_destructor;
NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED;
NETLINK_CB(skb).sk = sk;
@@ -854,6 +855,14 @@ static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)
#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, scm) 0
#endif /* CONFIG_NETLINK_MMAP */
+static bool skb_can_release_head(struct sk_buff *skb)
+{
+ if (!skb->cloned ||
+ !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
+ return true;
+ return false;
+}
+
static void netlink_skb_destructor(struct sk_buff *skb)
{
#ifdef CONFIG_NETLINK_MMAP
@@ -866,31 +875,35 @@ static void netlink_skb_destructor(struct sk_buff *skb)
* the status. In the direction userspace to kernel, the status is
* always reset here after the packet was processed and freed.
*/
- if (netlink_skb_is_mmaped(skb)) {
- hdr = netlink_mmap_hdr(skb);
- sk = NETLINK_CB(skb).sk;
+ if (!netlink_skb_is_mmaped(skb))
+ goto not_mmaped;
- if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
- netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
- ring = &nlk_sk(sk)->tx_ring;
- } else {
- if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
- hdr->nm_len = 0;
- netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
- }
- ring = &nlk_sk(sk)->rx_ring;
- }
+ if (!skb_can_release_head(skb))
+ goto clone_refs_remain;
- WARN_ON(atomic_read(&ring->pending) == 0);
- atomic_dec(&ring->pending);
- sock_put(sk);
+ hdr = netlink_mmap_hdr(skb);
+ sk = NETLINK_CB(skb).sk;
- skb->head = NULL;
+ if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
+ netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
+ ring = &nlk_sk(sk)->tx_ring;
+ } else {
+ if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
+ hdr->nm_len = 0;
+ netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
+ }
+ ring = &nlk_sk(sk)->rx_ring;
}
+
+ WARN_ON(atomic_read(&ring->pending) == 0);
+ atomic_dec(&ring->pending);
+ sock_put(sk);
+clone_refs_remain:
+ skb->head = NULL;
+not_mmaped:
#endif
if (is_vmalloc_addr(skb->head)) {
- if (!skb->cloned ||
- !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
+ if (skb_can_release_head(skb))
vfree(skb->head);
skb->head = NULL;
@@ -1669,6 +1682,7 @@ static struct sk_buff *netlink_alloc_large_skb(unsigned int size,
if (data == NULL)
return NULL;
+ skb->private_buffers = 1;
skb = __build_skb(data, size);
if (skb == NULL)
vfree(data);
next prev parent reply other threads:[~2015-09-11 19:42 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-09-10 18:05 [PATCH net] netlink, mmap: transform mmap skb into full skb on taps Daniel Borkmann
2015-09-11 5:11 ` David Miller
2015-09-11 10:25 ` Daniel Borkmann
2015-09-11 19:42 ` David Miller [this message]
2015-09-11 20:35 ` Daniel Borkmann
2015-09-11 21:34 ` David Miller
2015-09-11 22:18 ` Daniel Borkmann
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20150911.124205.1992250802389107845.davem@davemloft.net \
--to=davem@davemloft.net \
--cc=chamaken@gmail.com \
--cc=daniel@iogearbox.net \
--cc=fw@strlen.de \
--cc=netdev@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).