* [PATCH v1 net-next 01/16] af_unix: Add struct unix_vertex in struct unix_sock.
2024-02-03 3:00 [PATCH v1 net-next 00/16] af_unix: Reimplement GC Kuniyuki Iwashima
@ 2024-02-03 3:00 ` Kuniyuki Iwashima
2024-02-03 3:00 ` [PATCH v1 net-next 02/16] af_unix: Allocate struct unix_edge for each inflight AF_UNIX fd Kuniyuki Iwashima
` (14 subsequent siblings)
15 siblings, 0 replies; 21+ messages in thread
From: Kuniyuki Iwashima @ 2024-02-03 3:00 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
Cc: Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
We will replace the garbage collection algorithm for AF_UNIX,
where we will consider each inflight file descriptor of AF_UNIX
sockets as an edge in a directed graph.
Here, we introduce a new struct unix_vertex representing a vertex
in the graph and add it to struct unix_sock.
In the following patch, we will allocate another struct per edge,
which we finally link to the inflight socket's unix_vertex.edges
when sendmsg() succeeds.
The first time an AF_UNIX socket is passed successfully, we will
link its unix_vertex.entry to a global list.
Also, we will count the number of edges as unix_vertex.out_degree.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
include/net/af_unix.h | 8 ++++++++
net/unix/af_unix.c | 1 +
net/unix/garbage.c | 9 +++++++++
3 files changed, 18 insertions(+)
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 627ea8e2d915..664f6bff60ab 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -22,9 +22,16 @@ extern unsigned int unix_tot_inflight;
void unix_inflight(struct user_struct *user, struct file *fp);
void unix_notinflight(struct user_struct *user, struct file *fp);
+void unix_init_vertex(struct unix_sock *u);
void unix_gc(void);
void wait_for_unix_gc(struct scm_fp_list *fpl);
+struct unix_vertex {
+ struct list_head edges;
+ struct list_head entry;
+ unsigned long out_degree;
+};
+
struct sock *unix_peer_get(struct sock *sk);
#define UNIX_HASH_MOD (256 - 1)
@@ -62,6 +69,7 @@ struct unix_sock {
struct path path;
struct mutex iolock, bindlock;
struct sock *peer;
+ struct unix_vertex vertex;
struct list_head link;
unsigned long inflight;
spinlock_t lock;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 4892e9428c9f..ae145b6f77d8 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -996,6 +996,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
u->path.dentry = NULL;
u->path.mnt = NULL;
spin_lock_init(&u->lock);
+ unix_init_vertex(u);
INIT_LIST_HEAD(&u->link);
mutex_init(&u->iolock); /* single task reading lock */
mutex_init(&u->bindlock); /* single task binding lock */
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 9b8473dd79a4..db9ac289ce08 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -101,6 +101,15 @@ struct unix_sock *unix_get_socket(struct file *filp)
return NULL;
}
+void unix_init_vertex(struct unix_sock *u)
+{
+ struct unix_vertex *vertex = &u->vertex;
+
+ vertex->out_degree = 0;
+ INIT_LIST_HEAD(&vertex->edges);
+ INIT_LIST_HEAD(&vertex->entry);
+}
+
DEFINE_SPINLOCK(unix_gc_lock);
unsigned int unix_tot_inflight;
static LIST_HEAD(gc_candidates);
--
2.30.2
^ permalink raw reply related [flat|nested] 21+ messages in thread* [PATCH v1 net-next 02/16] af_unix: Allocate struct unix_edge for each inflight AF_UNIX fd.
2024-02-03 3:00 [PATCH v1 net-next 00/16] af_unix: Reimplement GC Kuniyuki Iwashima
2024-02-03 3:00 ` [PATCH v1 net-next 01/16] af_unix: Add struct unix_vertex in struct unix_sock Kuniyuki Iwashima
@ 2024-02-03 3:00 ` Kuniyuki Iwashima
2024-02-03 20:20 ` kernel test robot
2024-02-03 3:00 ` [PATCH v1 net-next 03/16] af_unix: Link struct unix_edge when queuing skb Kuniyuki Iwashima
` (13 subsequent siblings)
15 siblings, 1 reply; 21+ messages in thread
From: Kuniyuki Iwashima @ 2024-02-03 3:00 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
Cc: Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
When we send fd using SCM_RIGHTS message, we allocate struct
scm_fp_list to struct scm_cookie in scm_fp_copy(). Also, we bump
each refcount of the inflight fds' struct file and save them in
scm_fp_list.fp.
Later, we clone scm_fp_list of scm_cookie and set it to skb in
unix_attach_fds().
Then, we just preallocate to skb's scm_fp_list an array of struct
unix_edge in the number of inflight AF_UNIX fds and do not use them
there because sendmsg() could fail after this point. The actual
use will be in the next patch.
When we queue skb with inflight edges, we will set the inflight
socket's unix_vertex as unix_edge->predecessor and the receiver's
vertex as successor, and then we will link the edge to the inflight
socket's unix_vertex.edges.
Note that we set NULL to cloned scm_fp_list.edges in scm_fp_dup()
so that MSG_PEEK does not change the shape of the directed graph.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
include/net/af_unix.h | 8 ++++++++
include/net/scm.h | 7 +++++++
net/core/scm.c | 3 +++
net/unix/af_unix.c | 5 +++++
net/unix/garbage.c | 18 ++++++++++++++++++
5 files changed, 41 insertions(+)
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 664f6bff60ab..cab9dfb666f3 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -23,6 +23,8 @@ extern unsigned int unix_tot_inflight;
void unix_inflight(struct user_struct *user, struct file *fp);
void unix_notinflight(struct user_struct *user, struct file *fp);
void unix_init_vertex(struct unix_sock *u);
+int unix_alloc_edges(struct scm_fp_list *fpl);
+void unix_free_edges(struct scm_fp_list *fpl);
void unix_gc(void);
void wait_for_unix_gc(struct scm_fp_list *fpl);
@@ -32,6 +34,12 @@ struct unix_vertex {
unsigned long out_degree;
};
+struct unix_edge {
+ struct unix_vertex *predecessor;
+ struct unix_vertex *successor;
+ struct list_head entry;
+};
+
struct sock *unix_peer_get(struct sock *sk);
#define UNIX_HASH_MOD (256 - 1)
diff --git a/include/net/scm.h b/include/net/scm.h
index 92276a2c5543..a1142dee086c 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -23,10 +23,17 @@ struct scm_creds {
kgid_t gid;
};
+#ifdef CONFIG_UNIX
+struct unix_edge;
+#endif
+
struct scm_fp_list {
short count;
short count_unix;
short max;
+#ifdef CONFIG_UNIX
+ struct unix_edge *edges;
+#endif
struct user_struct *user;
struct file *fp[SCM_MAX_FD];
};
diff --git a/net/core/scm.c b/net/core/scm.c
index 9cd4b0a01cd6..8661524ed6e5 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -87,6 +87,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
*fplp = fpl;
fpl->count = 0;
fpl->count_unix = 0;
+ fpl->edges = NULL;
fpl->max = SCM_MAX_FD;
fpl->user = NULL;
}
@@ -376,6 +377,8 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
if (new_fpl) {
for (i = 0; i < fpl->count; i++)
get_file(fpl->fp[i]);
+
+ new_fpl->edges = NULL;
new_fpl->max = new_fpl->count;
new_fpl->user = get_uid(fpl->user);
}
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index ae145b6f77d8..0391f66546a6 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1819,6 +1819,9 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
for (i = scm->fp->count - 1; i >= 0; i--)
unix_inflight(scm->fp->user, scm->fp->fp[i]);
+ if (unix_alloc_edges(UNIXCB(skb).fp))
+ return -ENOMEM;
+
return 0;
}
@@ -1829,6 +1832,8 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
scm->fp = UNIXCB(skb).fp;
UNIXCB(skb).fp = NULL;
+ unix_free_edges(scm->fp);
+
for (i = scm->fp->count - 1; i >= 0; i--)
unix_notinflight(scm->fp->user, scm->fp->fp[i]);
}
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index db9ac289ce08..6a3572e43b9f 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -110,6 +110,24 @@ void unix_init_vertex(struct unix_sock *u)
INIT_LIST_HEAD(&vertex->entry);
}
+int unix_alloc_edges(struct scm_fp_list *fpl)
+{
+ if (!fpl->count_unix)
+ return 0;
+
+ fpl->edges = kvmalloc_array(fpl->count_unix, sizeof(*fpl->edges),
+ GFP_KERNEL_ACCOUNT);
+ if (!fpl->edges)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void unix_free_edges(struct scm_fp_list *fpl)
+{
+ kvfree(fpl->edges);
+}
+
DEFINE_SPINLOCK(unix_gc_lock);
unsigned int unix_tot_inflight;
static LIST_HEAD(gc_candidates);
--
2.30.2
^ permalink raw reply related [flat|nested] 21+ messages in thread* Re: [PATCH v1 net-next 02/16] af_unix: Allocate struct unix_edge for each inflight AF_UNIX fd.
2024-02-03 3:00 ` [PATCH v1 net-next 02/16] af_unix: Allocate struct unix_edge for each inflight AF_UNIX fd Kuniyuki Iwashima
@ 2024-02-03 20:20 ` kernel test robot
0 siblings, 0 replies; 21+ messages in thread
From: kernel test robot @ 2024-02-03 20:20 UTC (permalink / raw)
To: Kuniyuki Iwashima, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: llvm, oe-kbuild-all, netdev, Kuniyuki Iwashima
Hi Kuniyuki,
kernel test robot noticed the following build errors:
[auto build test ERROR on net-next/main]
url: https://github.com/intel-lab-lkp/linux/commits/Kuniyuki-Iwashima/af_unix-Add-struct-unix_vertex-in-struct-unix_sock/20240203-110847
base: net-next/main
patch link: https://lore.kernel.org/r/20240203030058.60750-3-kuniyu%40amazon.com
patch subject: [PATCH v1 net-next 02/16] af_unix: Allocate struct unix_edge for each inflight AF_UNIX fd.
config: i386-buildonly-randconfig-004-20240203 (https://download.01.org/0day-ci/archive/20240204/202402040416.3h4bSPKV-lkp@intel.com/config)
compiler: clang version 17.0.6 (https://github.com/llvm/llvm-project 6009708b4367171ccdbf4b5905cb6a803753fe18)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240204/202402040416.3h4bSPKV-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202402040416.3h4bSPKV-lkp@intel.com/
All errors (new ones prefixed by >>):
>> net/core/scm.c:90:8: error: no member named 'edges' in 'struct scm_fp_list'
90 | fpl->edges = NULL;
| ~~~ ^
net/core/scm.c:381:12: error: no member named 'edges' in 'struct scm_fp_list'
381 | new_fpl->edges = NULL;
| ~~~~~~~ ^
2 errors generated.
vim +90 net/core/scm.c
66
67 static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
68 {
69 int *fdp = (int*)CMSG_DATA(cmsg);
70 struct scm_fp_list *fpl = *fplp;
71 struct file **fpp;
72 int i, num;
73
74 num = (cmsg->cmsg_len - sizeof(struct cmsghdr))/sizeof(int);
75
76 if (num <= 0)
77 return 0;
78
79 if (num > SCM_MAX_FD)
80 return -EINVAL;
81
82 if (!fpl)
83 {
84 fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL_ACCOUNT);
85 if (!fpl)
86 return -ENOMEM;
87 *fplp = fpl;
88 fpl->count = 0;
89 fpl->count_unix = 0;
> 90 fpl->edges = NULL;
91 fpl->max = SCM_MAX_FD;
92 fpl->user = NULL;
93 }
94 fpp = &fpl->fp[fpl->count];
95
96 if (fpl->count + num > fpl->max)
97 return -EINVAL;
98
99 /*
100 * Verify the descriptors and increment the usage count.
101 */
102
103 for (i=0; i< num; i++)
104 {
105 int fd = fdp[i];
106 struct file *file;
107
108 if (fd < 0 || !(file = fget_raw(fd)))
109 return -EBADF;
110 /* don't allow io_uring files */
111 if (io_is_uring_fops(file)) {
112 fput(file);
113 return -EINVAL;
114 }
115 if (unix_get_socket(file))
116 fpl->count_unix++;
117
118 *fpp++ = file;
119 fpl->count++;
120 }
121
122 if (!fpl->user)
123 fpl->user = get_uid(current_user());
124
125 return num;
126 }
127
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH v1 net-next 03/16] af_unix: Link struct unix_edge when queuing skb.
2024-02-03 3:00 [PATCH v1 net-next 00/16] af_unix: Reimplement GC Kuniyuki Iwashima
2024-02-03 3:00 ` [PATCH v1 net-next 01/16] af_unix: Add struct unix_vertex in struct unix_sock Kuniyuki Iwashima
2024-02-03 3:00 ` [PATCH v1 net-next 02/16] af_unix: Allocate struct unix_edge for each inflight AF_UNIX fd Kuniyuki Iwashima
@ 2024-02-03 3:00 ` Kuniyuki Iwashima
2024-02-20 12:06 ` Paolo Abeni
2024-02-03 3:00 ` [PATCH v1 net-next 04/16] af_unix: Save listener for embryo socket Kuniyuki Iwashima
` (12 subsequent siblings)
15 siblings, 1 reply; 21+ messages in thread
From: Kuniyuki Iwashima @ 2024-02-03 3:00 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
Cc: Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
Just before queuing skb with inflight fds, we call scm_stat_add(),
which is a good place to set up the preallocated struct unix_edge
in UNIXCB(skb).fp->edges.
Then, we call unix_add_edges() and construct the directed graph
as follows:
1. Set the inflight socket's unix_vertex to unix_edge.predecessor
2. Set the receiver's unix_vertex to unix_edge.successor
3. Link unix_edge.entry to the inflight socket's unix_vertex.edges
4. Link inflight socket's unix_vertex.entry to unix_unvisited_vertices.
Let's say we pass the fd of AF_UNIX socket A to B and the fd of B
to C. The graph looks like this:
+-------------------------+
| unix_unvisited_vertices | <------------------------.
+-------------------------+ |
+ |
| +-------------+ +-------------+ | +-------------+
| | unix_sock A | | unix_sock B | | | unix_sock C |
| +-------------+ +-------------+ | +-------------+
| | unix_vertex | <----. .----> | unix_vertex | <-|--. .----> | unix_vertex |
| | +-----------+ | | | +-----------+ | | | | +-----------+
`-> | | entry | +------------> | | entry | +-' | | | | entry |
| |-----------| | | | |-----------| | | | |-----------|
| | edges | <-. | | | | edges | <-. | | | | edges |
+-+-----------+ | | | +-+-----------+ | | | +-+-----------+
| | | | | |
.---------------------' | | .---------------------' | |
| | | | | |
| +-------------+ | | | +-------------+ | |
| | unix_edge | | | | | unix_edge | | |
| +-------------+ | | | +-------------+ | |
`-> | entry | | | `-> | entry | | |
|-------------| | | |-------------| | |
| predecessor | +----' | | predecessor | +----' |
|-------------| | |-------------| |
| successor | +-------' | successor | +-------'
+-------------+ +-------------+
Henceforth, we denote such a graph as A -> B (-> C).
Now, we can express all inflight fd graphs that do not contain
embryo sockets. The following two patches will support the
particular case.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
include/net/af_unix.h | 2 ++
include/net/scm.h | 1 +
net/core/scm.c | 2 ++
net/unix/af_unix.c | 8 +++++--
net/unix/garbage.c | 56 ++++++++++++++++++++++++++++++++++++++++++-
5 files changed, 66 insertions(+), 3 deletions(-)
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index cab9dfb666f3..54d62467a70b 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -23,6 +23,8 @@ extern unsigned int unix_tot_inflight;
void unix_inflight(struct user_struct *user, struct file *fp);
void unix_notinflight(struct user_struct *user, struct file *fp);
void unix_init_vertex(struct unix_sock *u);
+void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
+void unix_del_edges(struct scm_fp_list *fpl);
int unix_alloc_edges(struct scm_fp_list *fpl);
void unix_free_edges(struct scm_fp_list *fpl);
void unix_gc(void);
diff --git a/include/net/scm.h b/include/net/scm.h
index a1142dee086c..7d807fe466a3 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -32,6 +32,7 @@ struct scm_fp_list {
short count_unix;
short max;
#ifdef CONFIG_UNIX
+ bool inflight;
struct unix_edge *edges;
#endif
struct user_struct *user;
diff --git a/net/core/scm.c b/net/core/scm.c
index 8661524ed6e5..d141c00eb116 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -87,6 +87,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
*fplp = fpl;
fpl->count = 0;
fpl->count_unix = 0;
+ fpl->inflight = false;
fpl->edges = NULL;
fpl->max = SCM_MAX_FD;
fpl->user = NULL;
@@ -378,6 +379,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
for (i = 0; i < fpl->count; i++)
get_file(fpl->fp[i]);
+ new_fpl->inflight = false;
new_fpl->edges = NULL;
new_fpl->max = new_fpl->count;
new_fpl->user = get_uid(fpl->user);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 0391f66546a6..ea7bac18a781 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1956,8 +1956,10 @@ static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
struct scm_fp_list *fp = UNIXCB(skb).fp;
struct unix_sock *u = unix_sk(sk);
- if (unlikely(fp && fp->count))
+ if (unlikely(fp && fp->count)) {
atomic_add(fp->count, &u->scm_stat.nr_fds);
+ unix_add_edges(fp, u);
+ }
}
static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
@@ -1965,8 +1967,10 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
struct scm_fp_list *fp = UNIXCB(skb).fp;
struct unix_sock *u = unix_sk(sk);
- if (unlikely(fp && fp->count))
+ if (unlikely(fp && fp->count)) {
atomic_sub(fp->count, &u->scm_stat.nr_fds);
+ unix_del_edges(fp);
+ }
}
/*
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 6a3572e43b9f..572ac0994c69 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -110,6 +110,58 @@ void unix_init_vertex(struct unix_sock *u)
INIT_LIST_HEAD(&vertex->entry);
}
+DEFINE_SPINLOCK(unix_gc_lock);
+static LIST_HEAD(unix_unvisited_vertices);
+
+void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
+{
+ int i = 0, j = 0;
+
+ spin_lock(&unix_gc_lock);
+
+ while (i < fpl->count_unix) {
+ struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]);
+ struct unix_edge *edge;
+
+ if (!inflight)
+ continue;
+
+ edge = fpl->edges + i++;
+ edge->predecessor = &inflight->vertex;
+ edge->successor = &receiver->vertex;
+
+ if (!edge->predecessor->out_degree++)
+ list_add_tail(&edge->predecessor->entry, &unix_unvisited_vertices);
+
+ INIT_LIST_HEAD(&edge->entry);
+ list_add_tail(&edge->entry, &edge->predecessor->edges);
+ }
+
+ spin_unlock(&unix_gc_lock);
+
+ fpl->inflight = true;
+}
+
+void unix_del_edges(struct scm_fp_list *fpl)
+{
+ int i = 0;
+
+ spin_lock(&unix_gc_lock);
+
+ while (i < fpl->count_unix) {
+ struct unix_edge *edge = fpl->edges + i++;
+
+ list_del(&edge->entry);
+
+ if (!--edge->predecessor->out_degree)
+ list_del_init(&edge->predecessor->entry);
+ }
+
+ spin_unlock(&unix_gc_lock);
+
+ fpl->inflight = false;
+}
+
int unix_alloc_edges(struct scm_fp_list *fpl)
{
if (!fpl->count_unix)
@@ -125,10 +177,12 @@ int unix_alloc_edges(struct scm_fp_list *fpl)
void unix_free_edges(struct scm_fp_list *fpl)
{
+ if (fpl->inflight)
+ unix_del_edges(fpl);
+
kvfree(fpl->edges);
}
-DEFINE_SPINLOCK(unix_gc_lock);
unsigned int unix_tot_inflight;
static LIST_HEAD(gc_candidates);
static LIST_HEAD(gc_inflight_list);
--
2.30.2
^ permalink raw reply related [flat|nested] 21+ messages in thread* Re: [PATCH v1 net-next 03/16] af_unix: Link struct unix_edge when queuing skb.
2024-02-03 3:00 ` [PATCH v1 net-next 03/16] af_unix: Link struct unix_edge when queuing skb Kuniyuki Iwashima
@ 2024-02-20 12:06 ` Paolo Abeni
0 siblings, 0 replies; 21+ messages in thread
From: Paolo Abeni @ 2024-02-20 12:06 UTC (permalink / raw)
To: Kuniyuki Iwashima, David S. Miller, Eric Dumazet, Jakub Kicinski
Cc: Kuniyuki Iwashima, netdev
On Fri, 2024-02-02 at 19:00 -0800, Kuniyuki Iwashima wrote:
> Just before queuing skb with inflight fds, we call scm_stat_add(),
> which is a good place to set up the preallocated struct unix_edge
> in UNIXCB(skb).fp->edges.
>
> Then, we call unix_add_edges() and construct the directed graph
> as follows:
>
> 1. Set the inflight socket's unix_vertex to unix_edge.predecessor
> 2. Set the receiver's unix_vertex to unix_edge.successor
> 3. Link unix_edge.entry to the inflight socket's unix_vertex.edges
> 4. Link inflight socket's unix_vertex.entry to unix_unvisited_vertices.
>
> Let's say we pass the fd of AF_UNIX socket A to B and the fd of B
> to C. The graph looks like this:
>
> +-------------------------+
> | unix_unvisited_vertices | <------------------------.
> +-------------------------+ |
> + |
> | +-------------+ +-------------+ | +-------------+
> | | unix_sock A | | unix_sock B | | | unix_sock C |
> | +-------------+ +-------------+ | +-------------+
> | | unix_vertex | <----. .----> | unix_vertex | <-|--. .----> | unix_vertex |
> | | +-----------+ | | | +-----------+ | | | | +-----------+
> `-> | | entry | +------------> | | entry | +-' | | | | entry |
> | |-----------| | | | |-----------| | | | |-----------|
> | | edges | <-. | | | | edges | <-. | | | | edges |
> +-+-----------+ | | | +-+-----------+ | | | +-+-----------+
> | | | | | |
> .---------------------' | | .---------------------' | |
> | | | | | |
> | +-------------+ | | | +-------------+ | |
> | | unix_edge | | | | | unix_edge | | |
> | +-------------+ | | | +-------------+ | |
> `-> | entry | | | `-> | entry | | |
> |-------------| | | |-------------| | |
> | predecessor | +----' | | predecessor | +----' |
> |-------------| | |-------------| |
> | successor | +-------' | successor | +-------'
> +-------------+ +-------------+
>
> Henceforth, we denote such a graph as A -> B (-> C).
>
> Now, we can express all inflight fd graphs that do not contain
> embryo sockets. The following two patches will support the
> particular case.
>
> Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> ---
> include/net/af_unix.h | 2 ++
> include/net/scm.h | 1 +
> net/core/scm.c | 2 ++
> net/unix/af_unix.c | 8 +++++--
> net/unix/garbage.c | 56 ++++++++++++++++++++++++++++++++++++++++++-
> 5 files changed, 66 insertions(+), 3 deletions(-)
>
> diff --git a/include/net/af_unix.h b/include/net/af_unix.h
> index cab9dfb666f3..54d62467a70b 100644
> --- a/include/net/af_unix.h
> +++ b/include/net/af_unix.h
> @@ -23,6 +23,8 @@ extern unsigned int unix_tot_inflight;
> void unix_inflight(struct user_struct *user, struct file *fp);
> void unix_notinflight(struct user_struct *user, struct file *fp);
> void unix_init_vertex(struct unix_sock *u);
> +void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
> +void unix_del_edges(struct scm_fp_list *fpl);
> int unix_alloc_edges(struct scm_fp_list *fpl);
> void unix_free_edges(struct scm_fp_list *fpl);
> void unix_gc(void);
> diff --git a/include/net/scm.h b/include/net/scm.h
> index a1142dee086c..7d807fe466a3 100644
> --- a/include/net/scm.h
> +++ b/include/net/scm.h
> @@ -32,6 +32,7 @@ struct scm_fp_list {
> short count_unix;
> short max;
> #ifdef CONFIG_UNIX
> + bool inflight;
> struct unix_edge *edges;
> #endif
> struct user_struct *user;
> diff --git a/net/core/scm.c b/net/core/scm.c
> index 8661524ed6e5..d141c00eb116 100644
> --- a/net/core/scm.c
> +++ b/net/core/scm.c
> @@ -87,6 +87,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
> *fplp = fpl;
> fpl->count = 0;
> fpl->count_unix = 0;
> + fpl->inflight = false;
> fpl->edges = NULL;
> fpl->max = SCM_MAX_FD;
> fpl->user = NULL;
> @@ -378,6 +379,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
> for (i = 0; i < fpl->count; i++)
> get_file(fpl->fp[i]);
>
> + new_fpl->inflight = false;
> new_fpl->edges = NULL;
> new_fpl->max = new_fpl->count;
> new_fpl->user = get_uid(fpl->user);
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index 0391f66546a6..ea7bac18a781 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -1956,8 +1956,10 @@ static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
> struct scm_fp_list *fp = UNIXCB(skb).fp;
> struct unix_sock *u = unix_sk(sk);
>
> - if (unlikely(fp && fp->count))
> + if (unlikely(fp && fp->count)) {
> atomic_add(fp->count, &u->scm_stat.nr_fds);
> + unix_add_edges(fp, u);
> + }
> }
>
> static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
> @@ -1965,8 +1967,10 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
> struct scm_fp_list *fp = UNIXCB(skb).fp;
> struct unix_sock *u = unix_sk(sk);
>
> - if (unlikely(fp && fp->count))
> + if (unlikely(fp && fp->count)) {
> atomic_sub(fp->count, &u->scm_stat.nr_fds);
> + unix_del_edges(fp);
> + }
> }
>
> /*
> diff --git a/net/unix/garbage.c b/net/unix/garbage.c
> index 6a3572e43b9f..572ac0994c69 100644
> --- a/net/unix/garbage.c
> +++ b/net/unix/garbage.c
> @@ -110,6 +110,58 @@ void unix_init_vertex(struct unix_sock *u)
> INIT_LIST_HEAD(&vertex->entry);
> }
>
> +DEFINE_SPINLOCK(unix_gc_lock);
> +static LIST_HEAD(unix_unvisited_vertices);
> +
> +void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
> +{
> + int i = 0, j = 0;
> +
> + spin_lock(&unix_gc_lock);
> +
> + while (i < fpl->count_unix) {
> + struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]);
> + struct unix_edge *edge;
> +
> + if (!inflight)
> + continue;
> +
> + edge = fpl->edges + i++;
> + edge->predecessor = &inflight->vertex;
> + edge->successor = &receiver->vertex;
> +
> + if (!edge->predecessor->out_degree++)
> + list_add_tail(&edge->predecessor->entry, &unix_unvisited_vertices);
> +
> + INIT_LIST_HEAD(&edge->entry);
Here 'edge->predecessor->entry' and 'edge->entry' refer to different
object types right ? edge vs vertices. Perhaps using different field
names could clarify the code a bit?
Also the edge->entry initialization just before the list_add_tail below
looks strange/suspect. Perhaps it would be better to do the init at
allocation time?
Thanks!
Paolo
^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH v1 net-next 04/16] af_unix: Save listener for embryo socket.
2024-02-03 3:00 [PATCH v1 net-next 00/16] af_unix: Reimplement GC Kuniyuki Iwashima
` (2 preceding siblings ...)
2024-02-03 3:00 ` [PATCH v1 net-next 03/16] af_unix: Link struct unix_edge when queuing skb Kuniyuki Iwashima
@ 2024-02-03 3:00 ` Kuniyuki Iwashima
2024-02-03 3:00 ` [PATCH v1 net-next 05/16] af_unix: Fix up unix_edge.successor " Kuniyuki Iwashima
` (11 subsequent siblings)
15 siblings, 0 replies; 21+ messages in thread
From: Kuniyuki Iwashima @ 2024-02-03 3:00 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
Cc: Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
This is a prep patch for the following change, where we need to
fetch the listening socket from an embryo socket in sendmsg().
We add a new field to struct unix_sock to save a pointer to a
listening socket.
We set it when connect() creates a new socket, and clear it when
accept() is called.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
include/net/af_unix.h | 1 +
net/unix/af_unix.c | 5 ++++-
2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 54d62467a70b..438d2a18ba2e 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -79,6 +79,7 @@ struct unix_sock {
struct path path;
struct mutex iolock, bindlock;
struct sock *peer;
+ struct sock *listener;
struct unix_vertex vertex;
struct list_head link;
unsigned long inflight;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index ea7bac18a781..1ebc3c15f972 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -992,6 +992,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen;
sk->sk_destruct = unix_sock_destructor;
u = unix_sk(sk);
+ u->listener = NULL;
u->inflight = 0;
u->path.dentry = NULL;
u->path.mnt = NULL;
@@ -1611,6 +1612,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
newsk->sk_type = sk->sk_type;
init_peercred(newsk);
newu = unix_sk(newsk);
+ newu->listener = other;
RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
otheru = unix_sk(other);
@@ -1706,8 +1708,8 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
bool kern)
{
struct sock *sk = sock->sk;
- struct sock *tsk;
struct sk_buff *skb;
+ struct sock *tsk;
int err;
err = -EOPNOTSUPP;
@@ -1732,6 +1734,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
}
tsk = skb->sk;
+ unix_sk(tsk)->listener = NULL;
skb_free_datagram(sk, skb);
wake_up_interruptible(&unix_sk(sk)->peer_wait);
--
2.30.2
^ permalink raw reply related [flat|nested] 21+ messages in thread* [PATCH v1 net-next 05/16] af_unix: Fix up unix_edge.successor for embryo socket.
2024-02-03 3:00 [PATCH v1 net-next 00/16] af_unix: Reimplement GC Kuniyuki Iwashima
` (3 preceding siblings ...)
2024-02-03 3:00 ` [PATCH v1 net-next 04/16] af_unix: Save listener for embryo socket Kuniyuki Iwashima
@ 2024-02-03 3:00 ` Kuniyuki Iwashima
2024-02-03 3:00 ` [PATCH v1 net-next 06/16] af_unix: Bulk update unix_tot_inflight/unix_inflight when queuing skb Kuniyuki Iwashima
` (10 subsequent siblings)
15 siblings, 0 replies; 21+ messages in thread
From: Kuniyuki Iwashima @ 2024-02-03 3:00 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
Cc: Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
To garbage collect inflight AF_UNIX sockets, we must define the
cyclic reference appropriately. This is a bit tricky if the loop
consists of embryo sockets.
Suppose that the fd of AF_UNIX socket A is passed to D and the fd of B
to C and that C and D are embryo sockets of A and B, respectively.
It may appear that there are two separate graphs, A (-> D) and
B (-> C), but this is not correct.
A --. .-- B
X
C <-' `-> D
Now, D holds A's refcount, and C has B's refcount, so unix_release()
will never be called for A and B when we close() them. However, no
one can call close() for D and C to free skbs holding refcounts of A
and B because C/D is in A/B's receive queue, which should have been
purged by unix_release() for A and B.
So, here's a new type of cyclic reference. When a fd of an AF_UNIX
socket is passed to an embryo socket, the reference is indirectly
held by its parent listening socket.
.-> A .-> B
| `- sk_receive_queue | `- sk_receive_queue
| `- skb | `- skb
| `- sk == C | `- sk == D
| `- sk_receive_queue | `- sk_receive_queue
| `- skb +----------' `- skb +--.
| |
`-----------------------------------------------------------'
Technically, the graph must be denoted as A <-> B instead of A (-> D)
and B (-> C) to find such a cyclic reference without touching each
socket's receive queue.
.-> A --. .-- B <-.
| X | == A <-> B
`-- C <-' `-> D --'
We apply this fixup in unix_add_edges() if the receiver is an embryo
socket.
We also link such edges to the embryo socket because we need
to restore the original separate graphs A (-> D) and B (-> C) in
unix_update_edges() once accept() is called.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
include/net/af_unix.h | 2 ++
net/unix/af_unix.c | 2 +-
net/unix/garbage.c | 29 ++++++++++++++++++++++++++++-
3 files changed, 31 insertions(+), 2 deletions(-)
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 438d2a18ba2e..2d8e93775e61 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -25,6 +25,7 @@ void unix_notinflight(struct user_struct *user, struct file *fp);
void unix_init_vertex(struct unix_sock *u);
void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
void unix_del_edges(struct scm_fp_list *fpl);
+void unix_update_edges(struct unix_sock *receiver);
int unix_alloc_edges(struct scm_fp_list *fpl);
void unix_free_edges(struct scm_fp_list *fpl);
void unix_gc(void);
@@ -40,6 +41,7 @@ struct unix_edge {
struct unix_vertex *predecessor;
struct unix_vertex *successor;
struct list_head entry;
+ struct list_head embryo_entry;
};
struct sock *unix_peer_get(struct sock *sk);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 1ebc3c15f972..dab5d8d96e87 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1734,7 +1734,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
}
tsk = skb->sk;
- unix_sk(tsk)->listener = NULL;
+ unix_update_edges(unix_sk(tsk));
skb_free_datagram(sk, skb);
wake_up_interruptible(&unix_sk(sk)->peer_wait);
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 572ac0994c69..d731438d3c2b 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -115,10 +115,16 @@ static LIST_HEAD(unix_unvisited_vertices);
void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
{
+ struct unix_vertex *successor;
int i = 0, j = 0;
spin_lock(&unix_gc_lock);
+ if (receiver->listener)
+ successor = &unix_sk(receiver->listener)->vertex;
+ else
+ successor = &receiver->vertex;
+
while (i < fpl->count_unix) {
struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]);
struct unix_edge *edge;
@@ -128,13 +134,18 @@ void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
edge = fpl->edges + i++;
edge->predecessor = &inflight->vertex;
- edge->successor = &receiver->vertex;
+ edge->successor = successor;
if (!edge->predecessor->out_degree++)
list_add_tail(&edge->predecessor->entry, &unix_unvisited_vertices);
INIT_LIST_HEAD(&edge->entry);
list_add_tail(&edge->entry, &edge->predecessor->edges);
+
+ if (receiver->listener) {
+ INIT_LIST_HEAD(&edge->embryo_entry);
+ list_add_tail(&edge->embryo_entry, &receiver->vertex.edges);
+ }
}
spin_unlock(&unix_gc_lock);
@@ -162,6 +173,22 @@ void unix_del_edges(struct scm_fp_list *fpl)
fpl->inflight = false;
}
+void unix_update_edges(struct unix_sock *receiver)
+{
+ struct unix_edge *edge;
+
+ spin_lock(&unix_gc_lock);
+
+ list_for_each_entry(edge, &receiver->vertex.edges, embryo_entry)
+ edge->successor = &receiver->vertex;
+
+ list_del_init(&receiver->vertex.edges);
+
+ receiver->listener = NULL;
+
+ spin_unlock(&unix_gc_lock);
+}
+
int unix_alloc_edges(struct scm_fp_list *fpl)
{
if (!fpl->count_unix)
--
2.30.2
^ permalink raw reply related [flat|nested] 21+ messages in thread* [PATCH v1 net-next 06/16] af_unix: Bulk update unix_tot_inflight/unix_inflight when queuing skb.
2024-02-03 3:00 [PATCH v1 net-next 00/16] af_unix: Reimplment GC Kuniyuki Iwashima
` (4 preceding siblings ...)
2024-02-03 3:00 ` [PATCH v1 net-next 05/16] af_unix: Fix up unix_edge.successor " Kuniyuki Iwashima
@ 2024-02-03 3:00 ` Kuniyuki Iwashima
2024-02-03 3:00 ` [PATCH v1 net-next 07/16] af_unix: Detect Strongly Connected Components Kuniyuki Iwashima
` (9 subsequent siblings)
15 siblings, 0 replies; 21+ messages in thread
From: Kuniyuki Iwashima @ 2024-02-03 3:00 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
Cc: Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
Currently, we track the number of inflight sockets in two variables.
unix_tot_inflight is the total number of inflight AF_UNIX sockets on
the host, and user->unix_inflight is the number of inflight fds per
user.
We update them one by one in unix_inflight(), which can be done once
in batch. Also, sendmsg() could fail even after unix_inflight(), then
we need to acquire unix_gc_lock only to decrease the counters.
Let's bulk update the counters in unix_add_edges() and unix_del_edges(),
which is called only for successfully passed fds.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/unix/garbage.c | 18 +++++++-----------
1 file changed, 7 insertions(+), 11 deletions(-)
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index d731438d3c2b..42ed886c75d1 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -112,6 +112,7 @@ void unix_init_vertex(struct unix_sock *u)
DEFINE_SPINLOCK(unix_gc_lock);
static LIST_HEAD(unix_unvisited_vertices);
+unsigned int unix_tot_inflight;
void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
{
@@ -148,6 +149,9 @@ void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
}
}
+ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix);
+ WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count);
+
spin_unlock(&unix_gc_lock);
fpl->inflight = true;
@@ -168,6 +172,9 @@ void unix_del_edges(struct scm_fp_list *fpl)
list_del_init(&edge->predecessor->entry);
}
+ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix);
+ WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count);
+
spin_unlock(&unix_gc_lock);
fpl->inflight = false;
@@ -210,7 +217,6 @@ void unix_free_edges(struct scm_fp_list *fpl)
kvfree(fpl->edges);
}
-unsigned int unix_tot_inflight;
static LIST_HEAD(gc_candidates);
static LIST_HEAD(gc_inflight_list);
@@ -231,13 +237,8 @@ void unix_inflight(struct user_struct *user, struct file *filp)
WARN_ON_ONCE(list_empty(&u->link));
}
u->inflight++;
-
- /* Paired with READ_ONCE() in wait_for_unix_gc() */
- WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1);
}
- WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1);
-
spin_unlock(&unix_gc_lock);
}
@@ -254,13 +255,8 @@ void unix_notinflight(struct user_struct *user, struct file *filp)
u->inflight--;
if (!u->inflight)
list_del_init(&u->link);
-
- /* Paired with READ_ONCE() in wait_for_unix_gc() */
- WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1);
}
- WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1);
-
spin_unlock(&unix_gc_lock);
}
--
2.30.2
^ permalink raw reply related [flat|nested] 21+ messages in thread* [PATCH v1 net-next 07/16] af_unix: Detect Strongly Connected Components.
2024-02-03 3:00 [PATCH v1 net-next 00/16] af_unix: Reimplment GC Kuniyuki Iwashima
` (5 preceding siblings ...)
2024-02-03 3:00 ` [PATCH v1 net-next 06/16] af_unix: Bulk update unix_tot_inflight/unix_inflight when queuing skb Kuniyuki Iwashima
@ 2024-02-03 3:00 ` Kuniyuki Iwashima
2024-02-03 19:59 ` kernel test robot
2024-02-03 21:36 ` kernel test robot
2024-02-03 3:00 ` [PATCH v1 net-next 08/16] af_unix: Save O(n) setup of Tarjan's algo Kuniyuki Iwashima
` (8 subsequent siblings)
15 siblings, 2 replies; 21+ messages in thread
From: Kuniyuki Iwashima @ 2024-02-03 3:00 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
Cc: Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In the new GC, we use a simple graph algorithm, Tarjan's Strongly
Connected Components (SCC) algorithm, to find cyclic references.
The algorithm visits every vertex exactly once using depth-first
search (DFS). We implement it without recursion so that no one can
abuse it.
There could be multiple graphs, so we iterate unix_unvisited_vertices
in unix_walk_scc() and do DFS in __unix_walk_scc(), where we move
visited vertices to another list, unix_visited_vertices, not to
restart DFS twice on a visited vertex later in unix_walk_scc().
DFS starts by pushing an input vertex to a stack and assigning it
a unique number. Two fields, index and lowlink, are initialised
with the number, but lowlink could be updated later during DFS.
If a vertex has an edge to an unvisited inflight vertex, we visit
it and do the same processing. So, we will have vertices in the
stack in the order they appear and number them consecutively in
the same order.
If a vertex has an edge to a visited vertex in stack, so-called
back-edge, we update the predecessor's lowlink with the successor's
index.
After iterating edges from the vertex, we check if its index
equals its lowlink.
If the lowlink is different from the index, it shows there was a
back-edge. Then, we propagate the lowlink to its predecessor.
If the lowlink is the same as the index, we pop vertices before
and including the vertex from the stack. Then, the set of vertices
is SCC, possibly forming a cycle. At the same time, we move the
vertices to unix_visited_vertices.
When we finish the algorithm, all vertices in each SCC will be
linked via unix_vertex.scc_entry.
Let's take an example. We have a graph including five inflight
vertices (F is not inflight):
A -> B -> C -> D -> E (-> F)
^ |
`---------'
Suppose that we start DFS from C. We will visit C, D, and B first
and initialise their index and lowlink. Then, the stack looks like
this:
> B = (3, 3) (index, lowlink)
D = (2, 2)
C = (1, 1)
When checking B's edge to C, we update B's lowlink with C's index
and propagate it to D.
> B = (3, 1) (index, lowlink)
D = (2, 1)
C = (1, 1)
Next, we visit E, which has no edge to an inflight vertex.
> E = (4, 4) (index, lowlink)
B = (3, 1)
D = (2, 1)
C = (1, 1)
When we leave from E, its index and lowlink are the same, so we pop
E from the stack.
B = (3, 1) (index, lowlink)
D = (2, 1)
> C = (1, 1)
Then, we leave C, whose index and lowlink are the same, so we pop
B, D and C as SCC.
Last, we do DFS for the rest of vertices, A, which is also a
single-vertex SCC.
Finally, each unix_vertex.scc_entry is linked as follows:
A -. B -> C -> D E -.
^ | ^ | ^ |
`--' `---------' `--'
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
include/net/af_unix.h | 5 +++
net/unix/garbage.c | 80 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 85 insertions(+)
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 2d8e93775e61..0874f6b41adc 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -34,7 +34,11 @@ void wait_for_unix_gc(struct scm_fp_list *fpl);
struct unix_vertex {
struct list_head edges;
struct list_head entry;
+ struct list_head scc_entry;
unsigned long out_degree;
+ unsigned long index;
+ unsigned long lowlink;
+ bool on_stack;
};
struct unix_edge {
@@ -42,6 +46,7 @@ struct unix_edge {
struct unix_vertex *successor;
struct list_head entry;
struct list_head embryo_entry;
+ struct list_head stack_entry;
};
struct sock *unix_peer_get(struct sock *sk);
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 42ed886c75d1..e235c03ee3c3 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -108,6 +108,7 @@ void unix_init_vertex(struct unix_sock *u)
vertex->out_degree = 0;
INIT_LIST_HEAD(&vertex->edges);
INIT_LIST_HEAD(&vertex->entry);
+ INIT_LIST_HEAD(&vertex->scc_entry);
}
DEFINE_SPINLOCK(unix_gc_lock);
@@ -217,6 +218,83 @@ void unix_free_edges(struct scm_fp_list *fpl)
kvfree(fpl->edges);
}
+enum unix_vertex_index {
+ UNIX_VERTEX_INDEX_UNVISITED,
+ UNIX_VERTEX_INDEX_START,
+};
+
+static LIST_HEAD(unix_visited_vertices);
+
+static void __unix_walk_scc(struct unix_vertex *vertex)
+{
+ unsigned long index = UNIX_VERTEX_INDEX_START;
+ LIST_HEAD(vertex_stack);
+ struct unix_edge *edge;
+ LIST_HEAD(edge_stack);
+
+next_vertex:
+ vertex->index = index;
+ vertex->lowlink = index;
+ index++;
+
+ vertex->on_stack = true;
+ list_move(&vertex->scc_entry, &vertex_stack);
+
+ list_for_each_entry(edge, &vertex->edges, entry) {
+ if (!edge->successor->out_degree)
+ continue;
+
+ if (edge->successor->index == UNIX_VERTEX_INDEX_UNVISITED) {
+ list_add(&edge->stack_entry, &edge_stack);
+
+ vertex = edge->successor;
+ goto next_vertex;
+ }
+
+ if (edge->successor->on_stack)
+ vertex->lowlink = min(vertex->lowlink, edge->successor->index);
+next_edge:
+ }
+
+ if (vertex->index == vertex->lowlink) {
+ LIST_HEAD(scc);
+
+ list_cut_position(&scc, &vertex_stack, &vertex->scc_entry);
+
+ list_for_each_entry_reverse(vertex, &scc, scc_entry) {
+ list_move_tail(&vertex->entry, &unix_visited_vertices);
+
+ vertex->on_stack = false;
+ }
+
+ list_del(&scc);
+ }
+
+ if (!list_empty(&edge_stack)) {
+ edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry);
+ list_del_init(&edge->stack_entry);
+
+ vertex = edge->predecessor;
+ vertex->lowlink = min(vertex->lowlink, edge->successor->lowlink);
+ goto next_edge;
+ }
+}
+
+static void unix_walk_scc(void)
+{
+ struct unix_vertex *vertex;
+
+ list_for_each_entry(vertex, &unix_unvisited_vertices, entry)
+ vertex->index = UNIX_VERTEX_INDEX_UNVISITED;
+
+ while (!list_empty(&unix_unvisited_vertices)) {
+ vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
+ __unix_walk_scc(vertex);
+ }
+
+ list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
+}
+
static LIST_HEAD(gc_candidates);
static LIST_HEAD(gc_inflight_list);
@@ -364,6 +442,8 @@ static void __unix_gc(struct work_struct *work)
spin_lock(&unix_gc_lock);
+ unix_walk_scc();
+
/* First, select candidates for garbage collection. Only
* in-flight sockets are considered, and from those only ones
* which don't have any external reference.
--
2.30.2
^ permalink raw reply related [flat|nested] 21+ messages in thread* Re: [PATCH v1 net-next 07/16] af_unix: Detect Strongly Connected Components.
2024-02-03 3:00 ` [PATCH v1 net-next 07/16] af_unix: Detect Strongly Connected Components Kuniyuki Iwashima
@ 2024-02-03 19:59 ` kernel test robot
2024-02-03 21:36 ` kernel test robot
1 sibling, 0 replies; 21+ messages in thread
From: kernel test robot @ 2024-02-03 19:59 UTC (permalink / raw)
To: Kuniyuki Iwashima, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: llvm, oe-kbuild-all, netdev, Kuniyuki Iwashima
Hi Kuniyuki,
kernel test robot noticed the following build warnings:
[auto build test WARNING on net-next/main]
url: https://github.com/intel-lab-lkp/linux/commits/Kuniyuki-Iwashima/af_unix-Add-struct-unix_vertex-in-struct-unix_sock/20240203-110847
base: net-next/main
patch link: https://lore.kernel.org/r/20240203030058.60750-8-kuniyu%40amazon.com
patch subject: [PATCH v1 net-next 07/16] af_unix: Detect Strongly Connected Components.
config: x86_64-rhel-8.3-bpf (https://download.01.org/0day-ci/archive/20240204/202402040348.uejoTcrq-lkp@intel.com/config)
compiler: clang version 17.0.6 (https://github.com/llvm/llvm-project 6009708b4367171ccdbf4b5905cb6a803753fe18)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240204/202402040348.uejoTcrq-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202402040348.uejoTcrq-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> net/unix/garbage.c:257:2: warning: label at end of compound statement is a C2x extension [-Wc2x-extensions]
257 | }
| ^
1 warning generated.
vim +257 net/unix/garbage.c
227
228 static void __unix_walk_scc(struct unix_vertex *vertex)
229 {
230 unsigned long index = UNIX_VERTEX_INDEX_START;
231 LIST_HEAD(vertex_stack);
232 struct unix_edge *edge;
233 LIST_HEAD(edge_stack);
234
235 next_vertex:
236 vertex->index = index;
237 vertex->lowlink = index;
238 index++;
239
240 vertex->on_stack = true;
241 list_move(&vertex->scc_entry, &vertex_stack);
242
243 list_for_each_entry(edge, &vertex->edges, entry) {
244 if (!edge->successor->out_degree)
245 continue;
246
247 if (edge->successor->index == UNIX_VERTEX_INDEX_UNVISITED) {
248 list_add(&edge->stack_entry, &edge_stack);
249
250 vertex = edge->successor;
251 goto next_vertex;
252 }
253
254 if (edge->successor->on_stack)
255 vertex->lowlink = min(vertex->lowlink, edge->successor->index);
256 next_edge:
> 257 }
258
259 if (vertex->index == vertex->lowlink) {
260 LIST_HEAD(scc);
261
262 list_cut_position(&scc, &vertex_stack, &vertex->scc_entry);
263
264 list_for_each_entry_reverse(vertex, &scc, scc_entry) {
265 list_move_tail(&vertex->entry, &unix_visited_vertices);
266
267 vertex->on_stack = false;
268 }
269
270 list_del(&scc);
271 }
272
273 if (!list_empty(&edge_stack)) {
274 edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry);
275 list_del_init(&edge->stack_entry);
276
277 vertex = edge->predecessor;
278 vertex->lowlink = min(vertex->lowlink, edge->successor->lowlink);
279 goto next_edge;
280 }
281 }
282
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [PATCH v1 net-next 07/16] af_unix: Detect Strongly Connected Components.
2024-02-03 3:00 ` [PATCH v1 net-next 07/16] af_unix: Detect Strongly Connected Components Kuniyuki Iwashima
2024-02-03 19:59 ` kernel test robot
@ 2024-02-03 21:36 ` kernel test robot
1 sibling, 0 replies; 21+ messages in thread
From: kernel test robot @ 2024-02-03 21:36 UTC (permalink / raw)
To: Kuniyuki Iwashima, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: llvm, oe-kbuild-all, netdev, Kuniyuki Iwashima
Hi Kuniyuki,
kernel test robot noticed the following build warnings:
[auto build test WARNING on net-next/main]
url: https://github.com/intel-lab-lkp/linux/commits/Kuniyuki-Iwashima/af_unix-Add-struct-unix_vertex-in-struct-unix_sock/20240203-110847
base: net-next/main
patch link: https://lore.kernel.org/r/20240203030058.60750-8-kuniyu%40amazon.com
patch subject: [PATCH v1 net-next 07/16] af_unix: Detect Strongly Connected Components.
config: riscv-defconfig (https://download.01.org/0day-ci/archive/20240204/202402040541.rLg7ze2a-lkp@intel.com/config)
compiler: clang version 19.0.0git (https://github.com/llvm/llvm-project 7dd790db8b77c4a833c06632e903dc4f13877a64)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240204/202402040541.rLg7ze2a-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202402040541.rLg7ze2a-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> net/unix/garbage.c:257:2: warning: label at end of compound statement is a C23 extension [-Wc23-extensions]
257 | }
| ^
1 warning generated.
vim +257 net/unix/garbage.c
227
228 static void __unix_walk_scc(struct unix_vertex *vertex)
229 {
230 unsigned long index = UNIX_VERTEX_INDEX_START;
231 LIST_HEAD(vertex_stack);
232 struct unix_edge *edge;
233 LIST_HEAD(edge_stack);
234
235 next_vertex:
236 vertex->index = index;
237 vertex->lowlink = index;
238 index++;
239
240 vertex->on_stack = true;
241 list_move(&vertex->scc_entry, &vertex_stack);
242
243 list_for_each_entry(edge, &vertex->edges, entry) {
244 if (!edge->successor->out_degree)
245 continue;
246
247 if (edge->successor->index == UNIX_VERTEX_INDEX_UNVISITED) {
248 list_add(&edge->stack_entry, &edge_stack);
249
250 vertex = edge->successor;
251 goto next_vertex;
252 }
253
254 if (edge->successor->on_stack)
255 vertex->lowlink = min(vertex->lowlink, edge->successor->index);
256 next_edge:
> 257 }
258
259 if (vertex->index == vertex->lowlink) {
260 LIST_HEAD(scc);
261
262 list_cut_position(&scc, &vertex_stack, &vertex->scc_entry);
263
264 list_for_each_entry_reverse(vertex, &scc, scc_entry) {
265 list_move_tail(&vertex->entry, &unix_visited_vertices);
266
267 vertex->on_stack = false;
268 }
269
270 list_del(&scc);
271 }
272
273 if (!list_empty(&edge_stack)) {
274 edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry);
275 list_del_init(&edge->stack_entry);
276
277 vertex = edge->predecessor;
278 vertex->lowlink = min(vertex->lowlink, edge->successor->lowlink);
279 goto next_edge;
280 }
281 }
282
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH v1 net-next 08/16] af_unix: Save O(n) setup of Tarjan's algo.
2024-02-03 3:00 [PATCH v1 net-next 00/16] af_unix: Reimplment GC Kuniyuki Iwashima
` (6 preceding siblings ...)
2024-02-03 3:00 ` [PATCH v1 net-next 07/16] af_unix: Detect Strongly Connected Components Kuniyuki Iwashima
@ 2024-02-03 3:00 ` Kuniyuki Iwashima
2024-02-03 3:00 ` [PATCH v1 net-next 09/16] af_unix: Avoid Tarjan's algorithm if unnecessary Kuniyuki Iwashima
` (7 subsequent siblings)
15 siblings, 0 replies; 21+ messages in thread
From: Kuniyuki Iwashima @ 2024-02-03 3:00 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
Cc: Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
Before starting Tarjan's algorithm, we need to mark all vertices
as unvisited. We can save this O(n) setup by reserving two special
indices (0, 1) and using two variables.
The first time we link a vertex to unix_unvisited_vertices, we set
unix_vertex_unvisited_index to index.
During DFS, we can see that the index of unvisited vertices is the
same as unix_vertex_unvisited_index.
When we finalise SCC later, we set unix_vertex_grouped_index to each
vertex's index.
Then, we can know that the index of a visited vertex is >= 2 if the
vertex is on the stack, and if the index is unix_vertex_grouped_index,
it is not on the stack and belongs to a different SCC.
After the whole algorithm, all indices of vertices are set as
unix_vertex_grouped_index.
Next time we start DFS, we know that all unvisited vertices have
unix_vertex_grouped_index, and we can use unix_vertex_unvisited_index
as the not-on-stack marker.
To do this, we swap unix_vertex_(grouped|unvisited)_index at the end
of Tarjan's algorithm.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
include/net/af_unix.h | 1 -
net/unix/garbage.c | 34 +++++++++++++++++++---------------
2 files changed, 19 insertions(+), 16 deletions(-)
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 0874f6b41adc..b3ba5e949d62 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -38,7 +38,6 @@ struct unix_vertex {
unsigned long out_degree;
unsigned long index;
unsigned long lowlink;
- bool on_stack;
};
struct unix_edge {
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index e235c03ee3c3..24137bf95e02 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -115,6 +115,14 @@ DEFINE_SPINLOCK(unix_gc_lock);
static LIST_HEAD(unix_unvisited_vertices);
unsigned int unix_tot_inflight;
+enum unix_vertex_index {
+ UNIX_VERTEX_INDEX_MARK1,
+ UNIX_VERTEX_INDEX_MARK2,
+ UNIX_VERTEX_INDEX_START,
+};
+
+static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1;
+
void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
{
struct unix_vertex *successor;
@@ -138,8 +146,11 @@ void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
edge->predecessor = &inflight->vertex;
edge->successor = successor;
- if (!edge->predecessor->out_degree++)
+ if (!edge->predecessor->out_degree++) {
+ edge->predecessor->index = unix_vertex_unvisited_index;
+
list_add_tail(&edge->predecessor->entry, &unix_unvisited_vertices);
+ }
INIT_LIST_HEAD(&edge->entry);
list_add_tail(&edge->entry, &edge->predecessor->edges);
@@ -218,12 +229,8 @@ void unix_free_edges(struct scm_fp_list *fpl)
kvfree(fpl->edges);
}
-enum unix_vertex_index {
- UNIX_VERTEX_INDEX_UNVISITED,
- UNIX_VERTEX_INDEX_START,
-};
-
static LIST_HEAD(unix_visited_vertices);
+static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
static void __unix_walk_scc(struct unix_vertex *vertex)
{
@@ -237,21 +244,20 @@ static void __unix_walk_scc(struct unix_vertex *vertex)
vertex->lowlink = index;
index++;
- vertex->on_stack = true;
list_move(&vertex->scc_entry, &vertex_stack);
list_for_each_entry(edge, &vertex->edges, entry) {
if (!edge->successor->out_degree)
continue;
- if (edge->successor->index == UNIX_VERTEX_INDEX_UNVISITED) {
+ if (edge->successor->index == unix_vertex_unvisited_index) {
list_add(&edge->stack_entry, &edge_stack);
vertex = edge->successor;
goto next_vertex;
}
- if (edge->successor->on_stack)
+ if (edge->successor->index != unix_vertex_grouped_index)
vertex->lowlink = min(vertex->lowlink, edge->successor->index);
next_edge:
}
@@ -264,7 +270,7 @@ static void __unix_walk_scc(struct unix_vertex *vertex)
list_for_each_entry_reverse(vertex, &scc, scc_entry) {
list_move_tail(&vertex->entry, &unix_visited_vertices);
- vertex->on_stack = false;
+ vertex->index = unix_vertex_grouped_index;
}
list_del(&scc);
@@ -282,17 +288,15 @@ static void __unix_walk_scc(struct unix_vertex *vertex)
static void unix_walk_scc(void)
{
- struct unix_vertex *vertex;
-
- list_for_each_entry(vertex, &unix_unvisited_vertices, entry)
- vertex->index = UNIX_VERTEX_INDEX_UNVISITED;
-
while (!list_empty(&unix_unvisited_vertices)) {
+ struct unix_vertex *vertex;
+
vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
__unix_walk_scc(vertex);
}
list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
+ swap(unix_vertex_unvisited_index, unix_vertex_grouped_index);
}
static LIST_HEAD(gc_candidates);
--
2.30.2
^ permalink raw reply related [flat|nested] 21+ messages in thread* [PATCH v1 net-next 09/16] af_unix: Avoid Tarjan's algorithm if unnecessary.
2024-02-03 3:00 [PATCH v1 net-next 00/16] af_unix: Reimplment GC Kuniyuki Iwashima
` (7 preceding siblings ...)
2024-02-03 3:00 ` [PATCH v1 net-next 08/16] af_unix: Save O(n) setup of Tarjan's algo Kuniyuki Iwashima
@ 2024-02-03 3:00 ` Kuniyuki Iwashima
2024-02-03 3:00 ` [PATCH v1 net-next 10/16] af_unix: Skip GC if no cycle exists Kuniyuki Iwashima
` (6 subsequent siblings)
15 siblings, 0 replies; 21+ messages in thread
From: Kuniyuki Iwashima @ 2024-02-03 3:00 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
Cc: Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In a typical use case, there should be no cyclic references, so we
should skip full cycle detection as much as possible.
Every time we add/delete/update an edge, we check if the successor
is already inflight.
The vertex does not form a cycle if the successor is not inflight.
Thus, we do not need to run Tarjan's algorithm. Instead, we can
just iterate the already grouped SCC in unix_walk_scc_fast().
But if the successor is already in flight, we set unix_graph_updated
to true and do the grouping again later in unix_walk_scc().
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/unix/garbage.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 44 insertions(+), 2 deletions(-)
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 24137bf95e02..0f46df05a019 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -111,6 +111,19 @@ void unix_init_vertex(struct unix_sock *u)
INIT_LIST_HEAD(&vertex->scc_entry);
}
+static bool unix_graph_updated;
+
+static void unix_graph_update(struct unix_edge *edge)
+{
+ if (unix_graph_updated)
+ return;
+
+ if (!edge->successor->out_degree)
+ return;
+
+ unix_graph_updated = true;
+}
+
DEFINE_SPINLOCK(unix_gc_lock);
static LIST_HEAD(unix_unvisited_vertices);
unsigned int unix_tot_inflight;
@@ -159,6 +172,8 @@ void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
INIT_LIST_HEAD(&edge->embryo_entry);
list_add_tail(&edge->embryo_entry, &receiver->vertex.edges);
}
+
+ unix_graph_update(edge);
}
WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix);
@@ -178,6 +193,8 @@ void unix_del_edges(struct scm_fp_list *fpl)
while (i < fpl->count_unix) {
struct unix_edge *edge = fpl->edges + i++;
+ unix_graph_update(edge);
+
list_del(&edge->entry);
if (!--edge->predecessor->out_degree)
@@ -198,8 +215,11 @@ void unix_update_edges(struct unix_sock *receiver)
spin_lock(&unix_gc_lock);
- list_for_each_entry(edge, &receiver->vertex.edges, embryo_entry)
+ list_for_each_entry(edge, &receiver->vertex.edges, embryo_entry) {
+ unix_graph_update(edge);
+
edge->successor = &receiver->vertex;
+ }
list_del_init(&receiver->vertex.edges);
@@ -297,6 +317,25 @@ static void unix_walk_scc(void)
list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
swap(unix_vertex_unvisited_index, unix_vertex_grouped_index);
+ unix_graph_updated = false;
+}
+
+static void unix_walk_scc_fast(void)
+{
+ while (!list_empty(&unix_unvisited_vertices)) {
+ struct unix_vertex *vertex;
+ LIST_HEAD(scc);
+
+ vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
+ list_add(&scc, &vertex->scc_entry);
+
+ list_for_each_entry_reverse(vertex, &scc, scc_entry)
+ list_move_tail(&vertex->entry, &unix_visited_vertices);
+
+ list_del(&scc);
+ }
+
+ list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
}
static LIST_HEAD(gc_candidates);
@@ -446,7 +485,10 @@ static void __unix_gc(struct work_struct *work)
spin_lock(&unix_gc_lock);
- unix_walk_scc();
+ if (unix_graph_updated)
+ unix_walk_scc();
+ else
+ unix_walk_scc_fast();
/* First, select candidates for garbage collection. Only
* in-flight sockets are considered, and from those only ones
--
2.30.2
^ permalink raw reply related [flat|nested] 21+ messages in thread* [PATCH v1 net-next 10/16] af_unix: Skip GC if no cycle exists.
2024-02-03 3:00 [PATCH v1 net-next 00/16] af_unix: Reimplment GC Kuniyuki Iwashima
` (8 preceding siblings ...)
2024-02-03 3:00 ` [PATCH v1 net-next 09/16] af_unix: Avoid Tarjan's algorithm if unnecessary Kuniyuki Iwashima
@ 2024-02-03 3:00 ` Kuniyuki Iwashima
2024-02-03 3:00 ` [PATCH v1 net-next 11/16] af_unix: Assign a unique index to SCC Kuniyuki Iwashima
` (5 subsequent siblings)
15 siblings, 0 replies; 21+ messages in thread
From: Kuniyuki Iwashima @ 2024-02-03 3:00 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
Cc: Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
Once we run Tarjan's algorithm, we know whether cyclic references
exist. If there is no cycle, we set unix_graph_maybe_cyclic to
false and can skip the entire garbage collection next time.
When finalising SCC, we set unix_graph_maybe_cyclic to true if SCC
consists of multiple vertices.
Even if SCC is a single vertex, a cycle might exist as self-fd passing.
To detect the corner case, we can check all edges of the vertex, but
instead, we add a new field that counts the number of self-references
to finish the decision in O(1).
With this change, __unix_gc() is just a spin_lock dance in the normal
usage.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
include/net/af_unix.h | 1 +
net/unix/garbage.c | 27 ++++++++++++++++++++++++++-
2 files changed, 27 insertions(+), 1 deletion(-)
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index b3ba5e949d62..59ec8d7880ce 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -36,6 +36,7 @@ struct unix_vertex {
struct list_head entry;
struct list_head scc_entry;
unsigned long out_degree;
+ unsigned long self_degree;
unsigned long index;
unsigned long lowlink;
};
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 0f46df05a019..cbddca5d8dd1 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -106,12 +106,14 @@ void unix_init_vertex(struct unix_sock *u)
struct unix_vertex *vertex = &u->vertex;
vertex->out_degree = 0;
+ vertex->self_degree = 0;
INIT_LIST_HEAD(&vertex->edges);
INIT_LIST_HEAD(&vertex->entry);
INIT_LIST_HEAD(&vertex->scc_entry);
}
static bool unix_graph_updated;
+static bool unix_graph_maybe_cyclic;
static void unix_graph_update(struct unix_edge *edge)
{
@@ -122,6 +124,7 @@ static void unix_graph_update(struct unix_edge *edge)
return;
unix_graph_updated = true;
+ unix_graph_maybe_cyclic = true;
}
DEFINE_SPINLOCK(unix_gc_lock);
@@ -159,6 +162,9 @@ void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
edge->predecessor = &inflight->vertex;
edge->successor = successor;
+ if (edge->predecessor == edge->successor)
+ edge->predecessor->self_degree++;
+
if (!edge->predecessor->out_degree++) {
edge->predecessor->index = unix_vertex_unvisited_index;
@@ -199,6 +205,9 @@ void unix_del_edges(struct scm_fp_list *fpl)
if (!--edge->predecessor->out_degree)
list_del_init(&edge->predecessor->entry);
+
+ if (edge->predecessor == edge->successor)
+ edge->predecessor->self_degree--;
}
WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix);
@@ -218,6 +227,9 @@ void unix_update_edges(struct unix_sock *receiver)
list_for_each_entry(edge, &receiver->vertex.edges, embryo_entry) {
unix_graph_update(edge);
+ if (edge->predecessor == edge->successor)
+ edge->predecessor->self_degree--;
+
edge->successor = &receiver->vertex;
}
@@ -293,6 +305,14 @@ static void __unix_walk_scc(struct unix_vertex *vertex)
vertex->index = unix_vertex_grouped_index;
}
+ if (!list_is_singular(&scc)) {
+ unix_graph_maybe_cyclic = true;
+ } else {
+ vertex = list_first_entry(&scc, typeof(*vertex), scc_entry);
+ if (vertex->self_degree)
+ unix_graph_maybe_cyclic = true;
+ }
+
list_del(&scc);
}
@@ -308,6 +328,8 @@ static void __unix_walk_scc(struct unix_vertex *vertex)
static void unix_walk_scc(void)
{
+ unix_graph_maybe_cyclic = false;
+
while (!list_empty(&unix_unvisited_vertices)) {
struct unix_vertex *vertex;
@@ -485,6 +507,9 @@ static void __unix_gc(struct work_struct *work)
spin_lock(&unix_gc_lock);
+ if (!unix_graph_maybe_cyclic)
+ goto skip_gc;
+
if (unix_graph_updated)
unix_walk_scc();
else
@@ -573,7 +598,7 @@ static void __unix_gc(struct work_struct *work)
/* All candidates should have been detached by now. */
WARN_ON_ONCE(!list_empty(&gc_candidates));
-
+skip_gc:
/* Paired with READ_ONCE() in wait_for_unix_gc(). */
WRITE_ONCE(gc_in_progress, false);
--
2.30.2
^ permalink raw reply related [flat|nested] 21+ messages in thread* [PATCH v1 net-next 11/16] af_unix: Assign a unique index to SCC.
2024-02-03 3:00 [PATCH v1 net-next 00/16] af_unix: Reimplment GC Kuniyuki Iwashima
` (9 preceding siblings ...)
2024-02-03 3:00 ` [PATCH v1 net-next 10/16] af_unix: Skip GC if no cycle exists Kuniyuki Iwashima
@ 2024-02-03 3:00 ` Kuniyuki Iwashima
2024-02-03 3:00 ` [PATCH v1 net-next 12/16] af_unix: Detect dead SCC Kuniyuki Iwashima
` (4 subsequent siblings)
15 siblings, 0 replies; 21+ messages in thread
From: Kuniyuki Iwashima @ 2024-02-03 3:00 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
Cc: Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
The definition of the lowlink in Tarjan's algorithm is the
smallest index of a vertex that is reachable with at most one
back-edge in SCC.
If we start traversing from A in the following graph, the final
lowlink of D is 3.
A -> B -> D D = (4, 3) (index, lowlink)
^ | | C = (3, 1)
| V | B = (2, 1)
`--- C <--' A = (1, 1)
This is because the lowlink of D is updated with the index of C.
In the following patch, we detect a dead SCC by checking two
conditions for each vertex.
1) vertex has no edge directed to another SCC (no bridge)
2) vertex's out_degree is the same as the refcount of its file
If 1) is false, there is a receiver of all fds of the SCC.
To evaluate 1), we need to assign a unique index to a SCC and
assign it to all vertices in the SCC.
This patch changes the lowlink update logic so that in the above
example, the lowlink of D is updated with the lowlink of C.
A -> B -> D D = (4, 1) (index, lowlink)
^ | | C = (3, 1)
| V | B = (2, 1)
`--- C <--' A = (1, 1)
Then, all vertices in the same SCC have the same lowlink, and we
can quickly find the bridge if one exists.
However, it is no longer called lowlink, so we rename it to
scc_index.
Also, we add a global variable to hold the last index used in DFS
so that we do not reset the initial index in each DFS.
This patch can be squashed to the SCC detection patch but is
split deliberately for anyone wondering why lowlink is not used
as used in the original Tarjan's algorithm.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
include/net/af_unix.h | 2 +-
net/unix/garbage.c | 15 ++++++++-------
2 files changed, 9 insertions(+), 8 deletions(-)
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 59ec8d7880ce..66c8cf835625 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -38,7 +38,7 @@ struct unix_vertex {
unsigned long out_degree;
unsigned long self_degree;
unsigned long index;
- unsigned long lowlink;
+ unsigned long scc_index;
};
struct unix_edge {
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index cbddca5d8dd1..3d60a5379b7b 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -263,18 +263,18 @@ void unix_free_edges(struct scm_fp_list *fpl)
static LIST_HEAD(unix_visited_vertices);
static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
+static unsigned long unix_vertex_last_index = UNIX_VERTEX_INDEX_START;
static void __unix_walk_scc(struct unix_vertex *vertex)
{
- unsigned long index = UNIX_VERTEX_INDEX_START;
LIST_HEAD(vertex_stack);
struct unix_edge *edge;
LIST_HEAD(edge_stack);
next_vertex:
- vertex->index = index;
- vertex->lowlink = index;
- index++;
+ vertex->index = unix_vertex_last_index;
+ vertex->scc_index = unix_vertex_last_index;
+ unix_vertex_last_index++;
list_move(&vertex->scc_entry, &vertex_stack);
@@ -290,11 +290,11 @@ static void __unix_walk_scc(struct unix_vertex *vertex)
}
if (edge->successor->index != unix_vertex_grouped_index)
- vertex->lowlink = min(vertex->lowlink, edge->successor->index);
+ vertex->scc_index = min(vertex->scc_index, edge->successor->scc_index);
next_edge:
}
- if (vertex->index == vertex->lowlink) {
+ if (vertex->index == vertex->scc_index) {
LIST_HEAD(scc);
list_cut_position(&scc, &vertex_stack, &vertex->scc_entry);
@@ -321,13 +321,14 @@ static void __unix_walk_scc(struct unix_vertex *vertex)
list_del_init(&edge->stack_entry);
vertex = edge->predecessor;
- vertex->lowlink = min(vertex->lowlink, edge->successor->lowlink);
+ vertex->scc_index = min(vertex->scc_index, edge->successor->scc_index);
goto next_edge;
}
}
static void unix_walk_scc(void)
{
+ unix_vertex_last_index = UNIX_VERTEX_INDEX_START;
unix_graph_maybe_cyclic = false;
while (!list_empty(&unix_unvisited_vertices)) {
--
2.30.2
^ permalink raw reply related [flat|nested] 21+ messages in thread* [PATCH v1 net-next 12/16] af_unix: Detect dead SCC.
2024-02-03 3:00 [PATCH v1 net-next 00/16] af_unix: Reimplment GC Kuniyuki Iwashima
` (10 preceding siblings ...)
2024-02-03 3:00 ` [PATCH v1 net-next 11/16] af_unix: Assign a unique index to SCC Kuniyuki Iwashima
@ 2024-02-03 3:00 ` Kuniyuki Iwashima
2024-02-03 3:00 ` [PATCH v1 net-next 13/16] af_unix: Replace garbage collection algorithm Kuniyuki Iwashima
` (3 subsequent siblings)
15 siblings, 0 replies; 21+ messages in thread
From: Kuniyuki Iwashima @ 2024-02-03 3:00 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
Cc: Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
When iterating SCC, we call unix_vertex_dead() for each vertex
to check if the vertex is close()d and has no bridge to another
SCC.
If both conditions are true for every vertex in SCC, we can
execute garbage collection for all skb in the SCC.
The actual garbage collection is done in the following patch,
replacing the old implementation.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/unix/garbage.c | 34 +++++++++++++++++++++++++++++++++-
1 file changed, 33 insertions(+), 1 deletion(-)
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 3d60a5379b7b..528215527b23 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -261,6 +261,29 @@ void unix_free_edges(struct scm_fp_list *fpl)
kvfree(fpl->edges);
}
+static bool unix_vertex_dead(struct unix_vertex *vertex)
+{
+ struct unix_edge *edge;
+ struct unix_sock *u;
+ long total_ref;
+
+ list_for_each_entry(edge, &vertex->edges, entry) {
+ if (!edge->successor->out_degree)
+ return false;
+
+ if (edge->successor->scc_index != vertex->scc_index)
+ return false;
+ }
+
+ u = container_of(vertex, typeof(*u), vertex);
+ total_ref = file_count(u->sk.sk_socket->file);
+
+ if (total_ref != vertex->out_degree)
+ return false;
+
+ return true;
+}
+
static LIST_HEAD(unix_visited_vertices);
static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
static unsigned long unix_vertex_last_index = UNIX_VERTEX_INDEX_START;
@@ -295,6 +318,7 @@ static void __unix_walk_scc(struct unix_vertex *vertex)
}
if (vertex->index == vertex->scc_index) {
+ bool dead = true;
LIST_HEAD(scc);
list_cut_position(&scc, &vertex_stack, &vertex->scc_entry);
@@ -303,6 +327,9 @@ static void __unix_walk_scc(struct unix_vertex *vertex)
list_move_tail(&vertex->entry, &unix_visited_vertices);
vertex->index = unix_vertex_grouped_index;
+
+ if (dead)
+ dead = unix_vertex_dead(vertex);
}
if (!list_is_singular(&scc)) {
@@ -347,14 +374,19 @@ static void unix_walk_scc_fast(void)
{
while (!list_empty(&unix_unvisited_vertices)) {
struct unix_vertex *vertex;
+ bool dead = true;
LIST_HEAD(scc);
vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
list_add(&scc, &vertex->scc_entry);
- list_for_each_entry_reverse(vertex, &scc, scc_entry)
+ list_for_each_entry_reverse(vertex, &scc, scc_entry) {
list_move_tail(&vertex->entry, &unix_visited_vertices);
+ if (dead)
+ dead = unix_vertex_dead(vertex);
+ }
+
list_del(&scc);
}
--
2.30.2
^ permalink raw reply related [flat|nested] 21+ messages in thread* [PATCH v1 net-next 13/16] af_unix: Replace garbage collection algorithm.
2024-02-03 3:00 [PATCH v1 net-next 00/16] af_unix: Reimplment GC Kuniyuki Iwashima
` (11 preceding siblings ...)
2024-02-03 3:00 ` [PATCH v1 net-next 12/16] af_unix: Detect dead SCC Kuniyuki Iwashima
@ 2024-02-03 3:00 ` Kuniyuki Iwashima
2024-02-03 3:00 ` [PATCH v1 net-next 14/16] af_unix: Remove scm_fp_dup() in unix_attach_fds() Kuniyuki Iwashima
` (2 subsequent siblings)
15 siblings, 0 replies; 21+ messages in thread
From: Kuniyuki Iwashima @ 2024-02-03 3:00 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
Cc: Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
If we find a dead SCC during iteration, we call unix_collect_skb().
Then, we move all skb in the SCC to a global skb queue, and later
we purge the queue after unlocking unix_gc_lock.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
include/net/af_unix.h | 7 --
net/unix/af_unix.c | 12 --
net/unix/garbage.c | 267 +++++++-----------------------------------
3 files changed, 41 insertions(+), 245 deletions(-)
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 66c8cf835625..78ab1107ffb3 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -20,8 +20,6 @@ static inline struct unix_sock *unix_get_socket(struct file *filp)
extern spinlock_t unix_gc_lock;
extern unsigned int unix_tot_inflight;
-void unix_inflight(struct user_struct *user, struct file *fp);
-void unix_notinflight(struct user_struct *user, struct file *fp);
void unix_init_vertex(struct unix_sock *u);
void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
void unix_del_edges(struct scm_fp_list *fpl);
@@ -88,12 +86,7 @@ struct unix_sock {
struct sock *peer;
struct sock *listener;
struct unix_vertex vertex;
- struct list_head link;
- unsigned long inflight;
spinlock_t lock;
- unsigned long gc_flags;
-#define UNIX_GC_CANDIDATE 0
-#define UNIX_GC_MAYBE_CYCLE 1
struct socket_wq peer_wq;
wait_queue_entry_t peer_wake;
struct scm_stat scm_stat;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index dab5d8d96e87..0b70f7b5b5a8 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -993,12 +993,10 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
sk->sk_destruct = unix_sock_destructor;
u = unix_sk(sk);
u->listener = NULL;
- u->inflight = 0;
u->path.dentry = NULL;
u->path.mnt = NULL;
spin_lock_init(&u->lock);
unix_init_vertex(u);
- INIT_LIST_HEAD(&u->link);
mutex_init(&u->iolock); /* single task reading lock */
mutex_init(&u->bindlock); /* single task binding lock */
init_waitqueue_head(&u->peer_wait);
@@ -1806,8 +1804,6 @@ static inline bool too_many_unix_fds(struct task_struct *p)
static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
- int i;
-
if (too_many_unix_fds(current))
return -ETOOMANYREFS;
@@ -1819,9 +1815,6 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
if (!UNIXCB(skb).fp)
return -ENOMEM;
- for (i = scm->fp->count - 1; i >= 0; i--)
- unix_inflight(scm->fp->user, scm->fp->fp[i]);
-
if (unix_alloc_edges(UNIXCB(skb).fp))
return -ENOMEM;
@@ -1830,15 +1823,10 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
- int i;
-
scm->fp = UNIXCB(skb).fp;
UNIXCB(skb).fp = NULL;
unix_free_edges(scm->fp);
-
- for (i = scm->fp->count - 1; i >= 0; i--)
- unix_notinflight(scm->fp->user, scm->fp->fp[i]);
}
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 528215527b23..84c445c79f8c 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -284,6 +284,36 @@ static bool unix_vertex_dead(struct unix_vertex *vertex)
return true;
}
+static struct sk_buff_head hitlist;
+
+static void unix_collect_skb(struct list_head *scc)
+{
+ struct unix_vertex *vertex;
+
+ list_for_each_entry_reverse(vertex, scc, scc_entry) {
+ struct unix_sock *u = container_of(vertex, typeof(*u), vertex);
+ struct sk_buff_head *queue = &u->sk.sk_receive_queue;
+
+ spin_lock(&queue->lock);
+
+ if (u->sk.sk_state == TCP_LISTEN) {
+ struct sk_buff *skb;
+
+ skb_queue_walk(queue, skb) {
+ struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue;
+
+ spin_lock(&embryo_queue->lock);
+ skb_queue_splice_init(embryo_queue, &hitlist);
+ spin_unlock(&embryo_queue->lock);
+ }
+ } else {
+ skb_queue_splice_init(queue, &hitlist);
+ }
+
+ spin_unlock(&queue->lock);
+ }
+}
+
static LIST_HEAD(unix_visited_vertices);
static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
static unsigned long unix_vertex_last_index = UNIX_VERTEX_INDEX_START;
@@ -332,7 +362,9 @@ static void __unix_walk_scc(struct unix_vertex *vertex)
dead = unix_vertex_dead(vertex);
}
- if (!list_is_singular(&scc)) {
+ if (dead) {
+ unix_collect_skb(&scc);
+ } else if (!list_is_singular(&scc)) {
unix_graph_maybe_cyclic = true;
} else {
vertex = list_first_entry(&scc, typeof(*vertex), scc_entry);
@@ -387,255 +419,38 @@ static void unix_walk_scc_fast(void)
dead = unix_vertex_dead(vertex);
}
+ if (dead)
+ unix_collect_skb(&scc);
+
list_del(&scc);
}
list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
}
-static LIST_HEAD(gc_candidates);
-static LIST_HEAD(gc_inflight_list);
-
-/* Keep the number of times in flight count for the file
- * descriptor if it is for an AF_UNIX socket.
- */
-void unix_inflight(struct user_struct *user, struct file *filp)
-{
- struct unix_sock *u = unix_get_socket(filp);
-
- spin_lock(&unix_gc_lock);
-
- if (u) {
- if (!u->inflight) {
- WARN_ON_ONCE(!list_empty(&u->link));
- list_add_tail(&u->link, &gc_inflight_list);
- } else {
- WARN_ON_ONCE(list_empty(&u->link));
- }
- u->inflight++;
- }
-
- spin_unlock(&unix_gc_lock);
-}
-
-void unix_notinflight(struct user_struct *user, struct file *filp)
-{
- struct unix_sock *u = unix_get_socket(filp);
-
- spin_lock(&unix_gc_lock);
-
- if (u) {
- WARN_ON_ONCE(!u->inflight);
- WARN_ON_ONCE(list_empty(&u->link));
-
- u->inflight--;
- if (!u->inflight)
- list_del_init(&u->link);
- }
-
- spin_unlock(&unix_gc_lock);
-}
-
-static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
- struct sk_buff_head *hitlist)
-{
- struct sk_buff *skb;
- struct sk_buff *next;
-
- spin_lock(&x->sk_receive_queue.lock);
- skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
- /* Do we have file descriptors ? */
- if (UNIXCB(skb).fp) {
- bool hit = false;
- /* Process the descriptors of this socket */
- int nfd = UNIXCB(skb).fp->count;
- struct file **fp = UNIXCB(skb).fp->fp;
-
- while (nfd--) {
- /* Get the socket the fd matches if it indeed does so */
- struct unix_sock *u = unix_get_socket(*fp++);
-
- /* Ignore non-candidates, they could have been added
- * to the queues after starting the garbage collection
- */
- if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
- hit = true;
-
- func(u);
- }
- }
- if (hit && hitlist != NULL) {
- __skb_unlink(skb, &x->sk_receive_queue);
- __skb_queue_tail(hitlist, skb);
- }
- }
- }
- spin_unlock(&x->sk_receive_queue.lock);
-}
-
-static void scan_children(struct sock *x, void (*func)(struct unix_sock *),
- struct sk_buff_head *hitlist)
-{
- if (x->sk_state != TCP_LISTEN) {
- scan_inflight(x, func, hitlist);
- } else {
- struct sk_buff *skb;
- struct sk_buff *next;
- struct unix_sock *u;
- LIST_HEAD(embryos);
-
- /* For a listening socket collect the queued embryos
- * and perform a scan on them as well.
- */
- spin_lock(&x->sk_receive_queue.lock);
- skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
- u = unix_sk(skb->sk);
-
- /* An embryo cannot be in-flight, so it's safe
- * to use the list link.
- */
- WARN_ON_ONCE(!list_empty(&u->link));
- list_add_tail(&u->link, &embryos);
- }
- spin_unlock(&x->sk_receive_queue.lock);
-
- while (!list_empty(&embryos)) {
- u = list_entry(embryos.next, struct unix_sock, link);
- scan_inflight(&u->sk, func, hitlist);
- list_del_init(&u->link);
- }
- }
-}
-
-static void dec_inflight(struct unix_sock *usk)
-{
- usk->inflight--;
-}
-
-static void inc_inflight(struct unix_sock *usk)
-{
- usk->inflight++;
-}
-
-static void inc_inflight_move_tail(struct unix_sock *u)
-{
- u->inflight++;
-
- /* If this still might be part of a cycle, move it to the end
- * of the list, so that it's checked even if it was already
- * passed over
- */
- if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags))
- list_move_tail(&u->link, &gc_candidates);
-}
-
static bool gc_in_progress;
static void __unix_gc(struct work_struct *work)
{
- struct sk_buff_head hitlist;
- struct unix_sock *u, *next;
- LIST_HEAD(not_cycle_list);
- struct list_head cursor;
-
spin_lock(&unix_gc_lock);
- if (!unix_graph_maybe_cyclic)
+ if (!unix_graph_maybe_cyclic) {
+ spin_unlock(&unix_gc_lock);
goto skip_gc;
+ }
+
+ __skb_queue_head_init(&hitlist);
if (unix_graph_updated)
unix_walk_scc();
else
unix_walk_scc_fast();
- /* First, select candidates for garbage collection. Only
- * in-flight sockets are considered, and from those only ones
- * which don't have any external reference.
- *
- * Holding unix_gc_lock will protect these candidates from
- * being detached, and hence from gaining an external
- * reference. Since there are no possible receivers, all
- * buffers currently on the candidates' queues stay there
- * during the garbage collection.
- *
- * We also know that no new candidate can be added onto the
- * receive queues. Other, non candidate sockets _can_ be
- * added to queue, so we must make sure only to touch
- * candidates.
- */
- list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
- long total_refs;
-
- total_refs = file_count(u->sk.sk_socket->file);
-
- WARN_ON_ONCE(!u->inflight);
- WARN_ON_ONCE(total_refs < u->inflight);
- if (total_refs == u->inflight) {
- list_move_tail(&u->link, &gc_candidates);
- __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
- __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
- }
- }
-
- /* Now remove all internal in-flight reference to children of
- * the candidates.
- */
- list_for_each_entry(u, &gc_candidates, link)
- scan_children(&u->sk, dec_inflight, NULL);
-
- /* Restore the references for children of all candidates,
- * which have remaining references. Do this recursively, so
- * only those remain, which form cyclic references.
- *
- * Use a "cursor" link, to make the list traversal safe, even
- * though elements might be moved about.
- */
- list_add(&cursor, &gc_candidates);
- while (cursor.next != &gc_candidates) {
- u = list_entry(cursor.next, struct unix_sock, link);
-
- /* Move cursor to after the current position. */
- list_move(&cursor, &u->link);
-
- if (u->inflight) {
- list_move_tail(&u->link, ¬_cycle_list);
- __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
- scan_children(&u->sk, inc_inflight_move_tail, NULL);
- }
- }
- list_del(&cursor);
-
- /* Now gc_candidates contains only garbage. Restore original
- * inflight counters for these as well, and remove the skbuffs
- * which are creating the cycle(s).
- */
- skb_queue_head_init(&hitlist);
- list_for_each_entry(u, &gc_candidates, link)
- scan_children(&u->sk, inc_inflight, &hitlist);
-
- /* not_cycle_list contains those sockets which do not make up a
- * cycle. Restore these to the inflight list.
- */
- while (!list_empty(¬_cycle_list)) {
- u = list_entry(not_cycle_list.next, struct unix_sock, link);
- __clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
- list_move_tail(&u->link, &gc_inflight_list);
- }
-
spin_unlock(&unix_gc_lock);
- /* Here we are. Hitlist is filled. Die. */
__skb_queue_purge(&hitlist);
-
- spin_lock(&unix_gc_lock);
-
- /* All candidates should have been detached by now. */
- WARN_ON_ONCE(!list_empty(&gc_candidates));
skip_gc:
- /* Paired with READ_ONCE() in wait_for_unix_gc(). */
WRITE_ONCE(gc_in_progress, false);
-
- spin_unlock(&unix_gc_lock);
}
static DECLARE_WORK(unix_gc_work, __unix_gc);
--
2.30.2
^ permalink raw reply related [flat|nested] 21+ messages in thread* [PATCH v1 net-next 14/16] af_unix: Remove scm_fp_dup() in unix_attach_fds().
2024-02-03 3:00 [PATCH v1 net-next 00/16] af_unix: Reimplment GC Kuniyuki Iwashima
` (12 preceding siblings ...)
2024-02-03 3:00 ` [PATCH v1 net-next 13/16] af_unix: Replace garbage collection algorithm Kuniyuki Iwashima
@ 2024-02-03 3:00 ` Kuniyuki Iwashima
2024-02-03 3:00 ` [PATCH v1 net-next 15/16] af_unix: Remove lock dance in unix_peek_fds() Kuniyuki Iwashima
2024-02-03 3:00 ` [PATCH v1 net-next 16/16] selftest: af_unix: Test GC for SCM_RIGHTS Kuniyuki Iwashima
15 siblings, 0 replies; 21+ messages in thread
From: Kuniyuki Iwashima @ 2024-02-03 3:00 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
Cc: Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
When we passed fds, we used to bump each file's refcount twice before
linking the socket to a global list, gc_inflight_list. Otherwise, the
inflight socket could have been garbage-collected before queuing skb.
Previously, we linked the inflight socket to the list in advance and
later queued the skb holding the socket. So, there was a small race
window like this:
CPU 1 CPU 2 CPU 3
----- ----- -----
/* Here A's refcount is 1, and inflight count is 0 */
bump A's refcount to 2
bump A's inflight count to 1
link A to list
close(A) decreases
A's refcount to 1
GC starts and
mark A as candidate
as refcount == inflight count
However, we no longer publish an inflight socket to GC before queueing
skb, so we can just move scm->fp to skb without cloning.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/unix/af_unix.c | 9 ++-------
1 file changed, 2 insertions(+), 7 deletions(-)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 0b70f7b5b5a8..9022a3a5dccc 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1807,13 +1807,8 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
if (too_many_unix_fds(current))
return -ETOOMANYREFS;
- /* Need to duplicate file references for the sake of garbage
- * collection. Otherwise a socket in the fps might become a
- * candidate for GC while the skb is not yet queued.
- */
- UNIXCB(skb).fp = scm_fp_dup(scm->fp);
- if (!UNIXCB(skb).fp)
- return -ENOMEM;
+ UNIXCB(skb).fp = scm->fp;
+ scm->fp = NULL;
if (unix_alloc_edges(UNIXCB(skb).fp))
return -ENOMEM;
--
2.30.2
^ permalink raw reply related [flat|nested] 21+ messages in thread* [PATCH v1 net-next 15/16] af_unix: Remove lock dance in unix_peek_fds().
2024-02-03 3:00 [PATCH v1 net-next 00/16] af_unix: Reimplment GC Kuniyuki Iwashima
` (13 preceding siblings ...)
2024-02-03 3:00 ` [PATCH v1 net-next 14/16] af_unix: Remove scm_fp_dup() in unix_attach_fds() Kuniyuki Iwashima
@ 2024-02-03 3:00 ` Kuniyuki Iwashima
2024-02-03 3:00 ` [PATCH v1 net-next 16/16] selftest: af_unix: Test GC for SCM_RIGHTS Kuniyuki Iwashima
15 siblings, 0 replies; 21+ messages in thread
From: Kuniyuki Iwashima @ 2024-02-03 3:00 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
Cc: Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In the previous GC implementation, the shape of the inflight graph
was not expected to change while GC was in progress.
MSG_PEEK was tricky because it could install inflight fd silently
and transform the graph.
Let's say we peeked a fd, which was a listening socket, and accept()ed
some embryo sockets from it. The garbage collection algorithm would
have been confused as the set of sockets visited in scan_inflight()
were different within the same GC invocation.
That's why we placed spin_lock(&unix_gc_lock) and spin_unlock() in
unix_peek_fds() with a fat comment.
In a new implementation, we no longer garbage-collect the socket if
it exists in another queue, that is, if it has a bridge to another
SCC. Also, accept() will require the lock if it has edges.
Thus, we need not do the complicated lock dance.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
include/net/af_unix.h | 1 -
net/unix/af_unix.c | 42 ------------------------------------------
net/unix/garbage.c | 2 +-
3 files changed, 1 insertion(+), 44 deletions(-)
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 78ab1107ffb3..33ddfe27bf50 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -17,7 +17,6 @@ static inline struct unix_sock *unix_get_socket(struct file *filp)
}
#endif
-extern spinlock_t unix_gc_lock;
extern unsigned int unix_tot_inflight;
void unix_init_vertex(struct unix_sock *u);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 9022a3a5dccc..d55fc3e9875b 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1827,48 +1827,6 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
scm->fp = scm_fp_dup(UNIXCB(skb).fp);
-
- /*
- * Garbage collection of unix sockets starts by selecting a set of
- * candidate sockets which have reference only from being in flight
- * (total_refs == inflight_refs). This condition is checked once during
- * the candidate collection phase, and candidates are marked as such, so
- * that non-candidates can later be ignored. While inflight_refs is
- * protected by unix_gc_lock, total_refs (file count) is not, hence this
- * is an instantaneous decision.
- *
- * Once a candidate, however, the socket must not be reinstalled into a
- * file descriptor while the garbage collection is in progress.
- *
- * If the above conditions are met, then the directed graph of
- * candidates (*) does not change while unix_gc_lock is held.
- *
- * Any operations that changes the file count through file descriptors
- * (dup, close, sendmsg) does not change the graph since candidates are
- * not installed in fds.
- *
- * Dequeing a candidate via recvmsg would install it into an fd, but
- * that takes unix_gc_lock to decrement the inflight count, so it's
- * serialized with garbage collection.
- *
- * MSG_PEEK is special in that it does not change the inflight count,
- * yet does install the socket into an fd. The following lock/unlock
- * pair is to ensure serialization with garbage collection. It must be
- * done between incrementing the file count and installing the file into
- * an fd.
- *
- * If garbage collection starts after the barrier provided by the
- * lock/unlock, then it will see the elevated refcount and not mark this
- * as a candidate. If a garbage collection is already in progress
- * before the file count was incremented, then the lock/unlock pair will
- * ensure that garbage collection is finished before progressing to
- * installing the fd.
- *
- * (*) A -> B where B is on the queue of A or B is on the queue of C
- * which is on the queue of listening socket A.
- */
- spin_lock(&unix_gc_lock);
- spin_unlock(&unix_gc_lock);
}
static void unix_destruct_scm(struct sk_buff *skb)
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 84c445c79f8c..d9dd97d3d4a6 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -127,7 +127,7 @@ static void unix_graph_update(struct unix_edge *edge)
unix_graph_maybe_cyclic = true;
}
-DEFINE_SPINLOCK(unix_gc_lock);
+static DEFINE_SPINLOCK(unix_gc_lock);
static LIST_HEAD(unix_unvisited_vertices);
unsigned int unix_tot_inflight;
--
2.30.2
^ permalink raw reply related [flat|nested] 21+ messages in thread* [PATCH v1 net-next 16/16] selftest: af_unix: Test GC for SCM_RIGHTS.
2024-02-03 3:00 [PATCH v1 net-next 00/16] af_unix: Reimplment GC Kuniyuki Iwashima
` (14 preceding siblings ...)
2024-02-03 3:00 ` [PATCH v1 net-next 15/16] af_unix: Remove lock dance in unix_peek_fds() Kuniyuki Iwashima
@ 2024-02-03 3:00 ` Kuniyuki Iwashima
15 siblings, 0 replies; 21+ messages in thread
From: Kuniyuki Iwashima @ 2024-02-03 3:00 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
Cc: Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
This patch adds test cases to verify the new GC.
We run each test for 3 types of sockets.
* SOCK_DGRAM
* SOCK_STREAM with no embryo socket
* SOCK_STREAM with embryo sockets
Before and after running each test case, we ensure that there is
no AF_UNIX socket left in the netns by reading /proc/net/protocols.
We cannot use /proc/net/unix and UNIX_DIAG because the embryo socket
does not show up there.
Each test creates multiple sockets in an array. We pass sockets in
the even index using the peer sockets in the odd index.
So, send_fd(0, 1) actually sends fd[0] to fd[2] via fd[0 + 1].
Test 1 : A <-> A
Test 2 : A <-> B
Test 3 : A -> B -> C <- D
^.___|___.' ^
`---------'
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
tools/testing/selftests/net/.gitignore | 1 +
tools/testing/selftests/net/af_unix/Makefile | 2 +-
.../selftests/net/af_unix/scm_rights.c | 242 ++++++++++++++++++
3 files changed, 244 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/net/af_unix/scm_rights.c
diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 2f9d378edec3..d996a0ab0765 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -31,6 +31,7 @@ reuseport_dualstack
rxtimestamp
sctp_hello
scm_pidfd
+scm_rights
sk_bind_sendto_listen
sk_connect_zero_addr
socket
diff --git a/tools/testing/selftests/net/af_unix/Makefile b/tools/testing/selftests/net/af_unix/Makefile
index 221c387a7d7f..3b83c797650d 100644
--- a/tools/testing/selftests/net/af_unix/Makefile
+++ b/tools/testing/selftests/net/af_unix/Makefile
@@ -1,4 +1,4 @@
CFLAGS += $(KHDR_INCLUDES)
-TEST_GEN_PROGS := diag_uid test_unix_oob unix_connect scm_pidfd
+TEST_GEN_PROGS := diag_uid test_unix_oob unix_connect scm_pidfd scm_rights
include ../../lib.mk
diff --git a/tools/testing/selftests/net/af_unix/scm_rights.c b/tools/testing/selftests/net/af_unix/scm_rights.c
new file mode 100644
index 000000000000..2f21a6e12cb8
--- /dev/null
+++ b/tools/testing/selftests/net/af_unix/scm_rights.c
@@ -0,0 +1,242 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright Amazon.com Inc. or its affiliates. */
+#define _GNU_SOURCE
+#include <sched.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "../../kselftest_harness.h"
+
+FIXTURE(scm_rights)
+{
+ int fd[16];
+};
+
+FIXTURE_VARIANT(scm_rights)
+{
+ int type;
+ char name[16];
+ bool test_listener;
+};
+
+FIXTURE_VARIANT_ADD(scm_rights, dgram)
+{
+ .type = SOCK_DGRAM,
+ .name = "UNIX ",
+ .test_listener = false,
+};
+
+FIXTURE_VARIANT_ADD(scm_rights, stream)
+{
+ .type = SOCK_STREAM,
+ .name = "UNIX-STREAM ",
+ .test_listener = false,
+};
+
+FIXTURE_VARIANT_ADD(scm_rights, stream_listener)
+{
+ .type = SOCK_STREAM,
+ .name = "UNIX-STREAM ",
+ .test_listener = true,
+};
+
+static int count_sockets(struct __test_metadata *_metadata,
+ const FIXTURE_VARIANT(scm_rights) *variant)
+{
+ int sockets = -1, len, ret;
+ size_t unused;
+	char *line = NULL;
+ FILE *f;
+
+ f = fopen("/proc/net/protocols", "r");
+ ASSERT_NE(NULL, f);
+
+ len = strlen(variant->name);
+
+ while (getline(&line, &unused, f) != -1) {
+ int unused2;
+
+ if (strncmp(line, variant->name, len))
+ continue;
+
+ ret = sscanf(line + len, "%d %d", &unused2, &sockets);
+ ASSERT_EQ(2, ret);
+
+ break;
+ }
+
+ ret = fclose(f);
+ ASSERT_EQ(0, ret);
+
+ return sockets;
+}
+
+FIXTURE_SETUP(scm_rights)
+{
+ int ret;
+
+ ret = unshare(CLONE_NEWNET);
+ ASSERT_EQ(0, ret);
+
+ ret = count_sockets(_metadata, variant);
+ ASSERT_EQ(0, ret);
+}
+
+FIXTURE_TEARDOWN(scm_rights)
+{
+ int ret;
+
+ ret = count_sockets(_metadata, variant);
+ ASSERT_EQ(0, ret);
+}
+
+static void create_listeners(struct __test_metadata *_metadata,
+ FIXTURE_DATA(scm_rights) *self,
+ int n)
+{
+ struct sockaddr_un addr = {
+ .sun_family = AF_UNIX,
+ };
+ socklen_t addrlen;
+ int i, ret;
+
+ for (i = 0; i < n * 2; i += 2) {
+ self->fd[i] = socket(AF_UNIX, SOCK_STREAM, 0);
+ ASSERT_LE(0, self->fd[i]);
+
+ addrlen = sizeof(addr.sun_family);
+ ret = bind(self->fd[i], (struct sockaddr *)&addr, addrlen);
+ ASSERT_EQ(0, ret);
+
+ ret = listen(self->fd[i], -1);
+ ASSERT_EQ(0, ret);
+
+ addrlen = sizeof(addr);
+ ret = getsockname(self->fd[i], (struct sockaddr *)&addr, &addrlen);
+ ASSERT_EQ(0, ret);
+
+ self->fd[i + 1] = socket(AF_UNIX, SOCK_STREAM, 0);
+ ASSERT_LE(0, self->fd[i + 1]);
+
+ ret = connect(self->fd[i + 1], (struct sockaddr *)&addr, addrlen);
+ ASSERT_EQ(0, ret);
+ }
+}
+
+static void create_socketpairs(struct __test_metadata *_metadata,
+ FIXTURE_DATA(scm_rights) *self,
+ const FIXTURE_VARIANT(scm_rights) *variant,
+ int n)
+{
+ int i, ret;
+
+ for (i = 0; i < n * 2; i += 2) {
+ ret = socketpair(AF_UNIX, variant->type, 0, self->fd + i);
+ ASSERT_EQ(0, ret);
+ }
+}
+
+static void __create_sockets(struct __test_metadata *_metadata,
+ FIXTURE_DATA(scm_rights) *self,
+ const FIXTURE_VARIANT(scm_rights) *variant,
+ int n)
+{
+ if (variant->test_listener)
+ create_listeners(_metadata, self, n);
+ else
+ create_socketpairs(_metadata, self, variant, n);
+}
+
+static void __close_sockets(struct __test_metadata *_metadata,
+ FIXTURE_DATA(scm_rights) *self,
+ int n)
+{
+ int i, ret;
+
+ for (i = 0; i < n * 2; i++) {
+ ret = close(self->fd[i]);
+ ASSERT_EQ(0, ret);
+ }
+}
+
+void __send_fd(struct __test_metadata *_metadata,
+ const FIXTURE_DATA(scm_rights) *self,
+ int inflight, int receiver)
+{
+#define MSG "nop"
+#define MSGLEN 3
+ struct {
+ struct cmsghdr cmsghdr;
+ int fd;
+ } cmsg = {
+ .cmsghdr = {
+ .cmsg_len = CMSG_LEN(sizeof(cmsg.fd)),
+ .cmsg_level = SOL_SOCKET,
+ .cmsg_type = SCM_RIGHTS,
+ },
+ .fd = self->fd[inflight * 2],
+ };
+ struct iovec iov = {
+ .iov_base = MSG,
+ .iov_len = MSGLEN,
+ };
+ struct msghdr msg = {
+ .msg_name = NULL,
+ .msg_namelen = 0,
+ .msg_iov = &iov,
+ .msg_iovlen = 1,
+ .msg_control = &cmsg,
+ .msg_controllen = CMSG_SPACE(sizeof(cmsg.fd)),
+ };
+ int ret;
+
+ ret = sendmsg(self->fd[receiver * 2 + 1], &msg, 0);
+ ASSERT_EQ(MSGLEN, ret);
+}
+
+#define create_sockets(n) \
+ __create_sockets(_metadata, self, variant, n)
+#define close_sockets(n) \
+ __close_sockets(_metadata, self, n)
+#define send_fd(inflight, receiver) \
+ __send_fd(_metadata, self, inflight, receiver)
+
+TEST_F(scm_rights, self_ref)
+{
+ create_sockets(1);
+
+ send_fd(0, 0);
+
+ close_sockets(1);
+}
+
+TEST_F(scm_rights, triangle)
+{
+ create_sockets(3);
+
+ send_fd(0, 1);
+ send_fd(1, 2);
+ send_fd(2, 0);
+
+ close_sockets(3);
+}
+
+TEST_F(scm_rights, cross_edge)
+{
+ create_sockets(4);
+
+ send_fd(0, 1);
+ send_fd(1, 2);
+ send_fd(2, 0);
+ send_fd(1, 3);
+ send_fd(3, 2);
+
+ close_sockets(4);
+}
+
+TEST_HARNESS_MAIN
--
2.30.2
^ permalink raw reply related [flat|nested] 21+ messages in thread