* [PATCH nf,v3] netfilter: nf_tables: coalesce multiple notifications into one skbuff
@ 2020-09-02 16:37 Pablo Neira Ayuso
2020-09-02 16:39 ` Phil Sutter
0 siblings, 1 reply; 4+ messages in thread
From: Pablo Neira Ayuso @ 2020-09-02 16:37 UTC (permalink / raw)
To: netfilter-devel; +Cc: phil, eric
On x86_64, each notification results in one skbuff allocation which
consumes at least 768 bytes due to the skbuff overhead.
This patch coalesces several notifications into one single skbuff, so
each notification consumes at least ~211 bytes, which is ~3.5 times less
memory consumption. As a result, this reduces the chances of exhausting
the netlink socket receive buffer.
The rule of thumb is that each notification batch only contains netlink
messages whose report flag is the same; nfnetlink_send() requires this
to perform appropriate delivery to userspace, either via unicast (echo
mode) or multicast (monitor mode).
The skbuff control buffer is used to annotate the report flag for later
handling at the new coalescing routine.
The batch skbuff notification size is NLMSG_GOODSIZE; using a larger
skbuff would allow for more socket receive buffer savings (to amortize
the cost of the skbuff even more), however, going over that size might
break userspace applications, so let's be conservative and stick to
NLMSG_GOODSIZE.
Reported-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
v3: remove unneeded assignment, per Phil Sutter.
include/net/netns/nftables.h | 1 +
net/netfilter/nf_tables_api.c | 70 ++++++++++++++++++++++++++++-------
2 files changed, 58 insertions(+), 13 deletions(-)
diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h
index a1a8d45adb42..6c0806bd8d1e 100644
--- a/include/net/netns/nftables.h
+++ b/include/net/netns/nftables.h
@@ -8,6 +8,7 @@ struct netns_nftables {
struct list_head tables;
struct list_head commit_list;
struct list_head module_list;
+ struct list_head notify_list;
struct mutex commit_mutex;
unsigned int base_seq;
u8 gencursor;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index b7dc1cbf40ea..4603b667973a 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -684,6 +684,18 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
return -1;
}
+struct nftnl_skb_parms {
+ bool report;
+};
+#define NFT_CB(skb) (*(struct nftnl_skb_parms*)&((skb)->cb))
+
+static void nft_notify_enqueue(struct sk_buff *skb, bool report,
+ struct list_head *notify_list)
+{
+ NFT_CB(skb).report = report;
+ list_add_tail(&skb->list, notify_list);
+}
+
static void nf_tables_table_notify(const struct nft_ctx *ctx, int event)
{
struct sk_buff *skb;
@@ -715,8 +727,7 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event)
goto err;
}
- nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- ctx->report, GFP_KERNEL);
+ nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list);
return;
err:
nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -1468,8 +1479,7 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
goto err;
}
- nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- ctx->report, GFP_KERNEL);
+ nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list);
return;
err:
nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -2807,8 +2817,7 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx,
goto err;
}
- nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- ctx->report, GFP_KERNEL);
+ nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list);
return;
err:
nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -3837,8 +3846,7 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx,
goto err;
}
- nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, ctx->report,
- gfp_flags);
+ nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list);
return;
err:
nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -4959,8 +4967,7 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx,
goto err;
}
- nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report,
- GFP_KERNEL);
+ nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list);
return;
err:
nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -6275,7 +6282,7 @@ void nft_obj_notify(struct net *net, const struct nft_table *table,
goto err;
}
- nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, gfp);
+ nft_notify_enqueue(skb, report, &net->nft.notify_list);
return;
err:
nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -7085,8 +7092,7 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
goto err;
}
- nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- ctx->report, GFP_KERNEL);
+ nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list);
return;
err:
nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -7695,6 +7701,41 @@ static void nf_tables_commit_release(struct net *net)
mutex_unlock(&net->nft.commit_mutex);
}
+static void nft_commit_notify(struct net *net, u32 portid)
+{
+ struct sk_buff *batch_skb = NULL, *nskb, *skb;
+ unsigned char *data;
+ int len;
+
+ list_for_each_entry_safe(skb, nskb, &net->nft.notify_list, list) {
+ if (!batch_skb) {
+new_batch:
+ batch_skb = skb;
+ len = NLMSG_GOODSIZE - skb->len;
+ list_del(&skb->list);
+ continue;
+ }
+ len -= skb->len;
+ if (len > 0 && NFT_CB(skb).report == NFT_CB(batch_skb).report) {
+ data = skb_put(batch_skb, skb->len);
+ memcpy(data, skb->data, skb->len);
+ list_del(&skb->list);
+ kfree_skb(skb);
+ continue;
+ }
+ nfnetlink_send(batch_skb, net, portid, NFNLGRP_NFTABLES,
+ NFT_CB(batch_skb).report, GFP_KERNEL);
+ goto new_batch;
+ }
+
+ if (batch_skb) {
+ nfnetlink_send(batch_skb, net, portid, NFNLGRP_NFTABLES,
+ NFT_CB(batch_skb).report, GFP_KERNEL);
+ }
+
+ WARN_ON_ONCE(!list_empty(&net->nft.notify_list));
+}
+
static int nf_tables_commit(struct net *net, struct sk_buff *skb)
{
struct nft_trans *trans, *next;
@@ -7897,6 +7938,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
}
}
+ nft_commit_notify(net, NETLINK_CB(skb).portid);
nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
nf_tables_commit_release(net);
@@ -8721,6 +8763,7 @@ static int __net_init nf_tables_init_net(struct net *net)
INIT_LIST_HEAD(&net->nft.tables);
INIT_LIST_HEAD(&net->nft.commit_list);
INIT_LIST_HEAD(&net->nft.module_list);
+ INIT_LIST_HEAD(&net->nft.notify_list);
mutex_init(&net->nft.commit_mutex);
net->nft.base_seq = 1;
net->nft.validate_state = NFT_VALIDATE_SKIP;
@@ -8737,6 +8780,7 @@ static void __net_exit nf_tables_exit_net(struct net *net)
mutex_unlock(&net->nft.commit_mutex);
WARN_ON_ONCE(!list_empty(&net->nft.tables));
WARN_ON_ONCE(!list_empty(&net->nft.module_list));
+ WARN_ON_ONCE(!list_empty(&net->nft.notify_list));
}
static struct pernet_operations nf_tables_net_ops = {
--
2.20.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH nf,v3] netfilter: nf_tables: coalesce multiple notifications into one skbuff
2020-09-02 16:37 [PATCH nf,v3] netfilter: nf_tables: coalesce multiple notifications into one skbuff Pablo Neira Ayuso
@ 2020-09-02 16:39 ` Phil Sutter
2020-09-02 16:54 ` Pablo Neira Ayuso
0 siblings, 1 reply; 4+ messages in thread
From: Phil Sutter @ 2020-09-02 16:39 UTC (permalink / raw)
To: Pablo Neira Ayuso; +Cc: netfilter-devel, eric
On Wed, Sep 02, 2020 at 06:37:43PM +0200, Pablo Neira Ayuso wrote:
> On x86_64, each notification results in one skbuff allocation which
> consumes at least 768 bytes due to the skbuff overhead.
>
> This patch coalesces several notifications into one single skbuff, so
> each notification consumes at least ~211 bytes, that ~3.5 times less
> memory consumption. As a result, this is reducing the chances to exhaust
> the netlink socket receive buffer.
>
> Rule of thumb is that each notification batch only contains netlink
> messages whose report flag is the same, nfnetlink_send() requires this
> to do appropriately delivery to userspace, either via unicast (echo
> mode) or multicast (monitor mode).
>
> The skbuff control buffer is used to annotate the report flag for later
> handling at the new coalescing routine.
>
> The batch skbuff notification size is NLMSG_GOODSIZE, using a larger
> skbuff would allow for more socket receiver buffer savings (to amortize
> the cost of the skbuff even more), however, going over that size might
> break userspace applications, so let's be conservative and stick to
> NLMSG_GOODSIZE.
>
> Reported-by: Phil Sutter <phil@nwl.cc>
> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: Phil Sutter <phil@nwl.cc>
Thanks, Phil
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH nf,v3] netfilter: nf_tables: coalesce multiple notifications into one skbuff
2020-09-02 16:39 ` Phil Sutter
@ 2020-09-02 16:54 ` Pablo Neira Ayuso
2020-09-03 8:47 ` Phil Sutter
0 siblings, 1 reply; 4+ messages in thread
From: Pablo Neira Ayuso @ 2020-09-02 16:54 UTC (permalink / raw)
To: Phil Sutter, netfilter-devel, eric
On Wed, Sep 02, 2020 at 06:39:34PM +0200, Phil Sutter wrote:
> On Wed, Sep 02, 2020 at 06:37:43PM +0200, Pablo Neira Ayuso wrote:
> > On x86_64, each notification results in one skbuff allocation which
> > consumes at least 768 bytes due to the skbuff overhead.
> >
> > This patch coalesces several notifications into one single skbuff, so
> > each notification consumes at least ~211 bytes, that ~3.5 times less
> > memory consumption. As a result, this is reducing the chances to exhaust
> > the netlink socket receive buffer.
> >
> > Rule of thumb is that each notification batch only contains netlink
> > messages whose report flag is the same, nfnetlink_send() requires this
> > to do appropriately delivery to userspace, either via unicast (echo
> > mode) or multicast (monitor mode).
> >
> > The skbuff control buffer is used to annotate the report flag for later
> > handling at the new coalescing routine.
> >
> > The batch skbuff notification size is NLMSG_GOODSIZE, using a larger
> > skbuff would allow for more socket receiver buffer savings (to amortize
> > the cost of the skbuff even more), however, going over that size might
> > break userspace applications, so let's be conservative and stick to
> > NLMSG_GOODSIZE.
> >
> > Reported-by: Phil Sutter <phil@nwl.cc>
> > Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
>
> Acked-by: Phil Sutter <phil@nwl.cc>
Thanks, I'll place this into nf.git
BTW, I assume this mitigates the problem that Eric reported? Is it
not so easy to trigger the problem after this patch?
I forgot to say, probably it would be good to monitor
/proc/net/netlink to catch how busy the socket receive buffer is
getting with your firewalld ruleset.
Thanks.
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH nf,v3] netfilter: nf_tables: coalesce multiple notifications into one skbuff
2020-09-02 16:54 ` Pablo Neira Ayuso
@ 2020-09-03 8:47 ` Phil Sutter
0 siblings, 0 replies; 4+ messages in thread
From: Phil Sutter @ 2020-09-03 8:47 UTC (permalink / raw)
To: Pablo Neira Ayuso; +Cc: netfilter-devel, eric
Hi Pablo,
On Wed, Sep 02, 2020 at 06:54:42PM +0200, Pablo Neira Ayuso wrote:
> On Wed, Sep 02, 2020 at 06:39:34PM +0200, Phil Sutter wrote:
> > On Wed, Sep 02, 2020 at 06:37:43PM +0200, Pablo Neira Ayuso wrote:
> > > On x86_64, each notification results in one skbuff allocation which
> > > consumes at least 768 bytes due to the skbuff overhead.
> > >
> > > This patch coalesces several notifications into one single skbuff, so
> > > each notification consumes at least ~211 bytes, that ~3.5 times less
> > > memory consumption. As a result, this is reducing the chances to exhaust
> > > the netlink socket receive buffer.
> > >
> > > Rule of thumb is that each notification batch only contains netlink
> > > messages whose report flag is the same, nfnetlink_send() requires this
> > > to do appropriately delivery to userspace, either via unicast (echo
> > > mode) or multicast (monitor mode).
> > >
> > > The skbuff control buffer is used to annotate the report flag for later
> > > handling at the new coalescing routine.
> > >
> > > The batch skbuff notification size is NLMSG_GOODSIZE, using a larger
> > > skbuff would allow for more socket receiver buffer savings (to amortize
> > > the cost of the skbuff even more), however, going over that size might
> > > break userspace applications, so let's be conservative and stick to
> > > NLMSG_GOODSIZE.
> > >
> > > Reported-by: Phil Sutter <phil@nwl.cc>
> > > Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> >
> > Acked-by: Phil Sutter <phil@nwl.cc>
>
> Thanks, I'll place this into nf.git
Thanks!
> BTW, I assume this mitigates the problem that Eric reported? Is it
> not so easy to trigger the problem after this patch?
Eric plans to push zones individually into the kernel from firewalld so
the problem shouldn't occur anymore unless one uses a ridiculously large
zone.
> I forgot to say, probably it would be good to monitor
> /proc/net/netlink to catch how busy the socket receive buffer is
> getting with your firewalld ruleset.
The socket doesn't live long enough to monitor it this way, but I tested
at which point things start failing again:
In firewalld, I see startup errors when having more than eight zones
configured. This is not too much, but given that we're talking about a
restrictive environment and the above change is planned anyway, it's not
a real problem.
The simple reproducer script I pasted earlier fails if the number of
rules exceeds 382. The error message is:
| netlink: Error: Could not process rule: Message too long
So I assume it is simply exhausting netlink send buffer space.
BTW: Outside of lxc, my script still succeeds for 100k rules and 1M
rules triggers OOM killer. :)
Cheers, Phil
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2020-09-03 8:47 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2020-09-02 16:37 [PATCH nf,v3] netfilter: nf_tables: coalesce multiple notifications into one skbuff Pablo Neira Ayuso
2020-09-02 16:39 ` Phil Sutter
2020-09-02 16:54 ` Pablo Neira Ayuso
2020-09-03 8:47 ` Phil Sutter
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).