From: "Clément Léger" <cleger@rivosinc.com>
To: qemu-devel@nongnu.org
Cc: "Daniel P. Berrangé" <berrange@redhat.com>,
"Peter Maydell" <peter.maydell@linaro.org>,
"Jason Wang" <jasowang@redhat.com>
Subject: Re: [PATCH v2] osdep: add a qemu_close_all_open_fd() helper
Date: Thu, 11 Jul 2024 15:34:26 +0200 [thread overview]
Message-ID: <1657b51b-243d-4607-8490-3695214da106@rivosinc.com> (raw)
In-Reply-To: <20240618111704.63092-1-cleger@rivosinc.com>
Gentle ping ?
Thanks,
Clément
On 18/06/2024 13:17, Clément Léger wrote:
> Since commit 03e471c41d8b ("qemu_init: increase NOFILE soft limit on
> POSIX"), the maximum number of file descriptors that can be opened is
> raised to nofile.rlim_max. On recent Debian distros, this yields a maximum
> of 1073741816 file descriptors. Now, when forking to start
> qemu-bridge-helper, this actually calls close() on the full possible file
> descriptor range (more precisely [3 - sysconf(_SC_OPEN_MAX)]) which
> takes a considerable amount of time. In order to reduce that time,
> factorize the existing code that closes all open file descriptors into a new
> qemu_close_all_open_fd() function. This function uses various methods
> to close all the open file descriptors ranging from the most efficient
> one to the least efficient one. It also accepts an ordered array of file
> descriptors that should not be closed, since this is required by the
> callers that call it after forking.
>
> Signed-off-by: Clément Léger <cleger@rivosinc.com>
>
> ----
>
> v2:
> - Factorize async_teardown.c close_fds implementation as well as tap.c ones
> - Apply checkpatch
> - v1: https://lore.kernel.org/qemu-devel/20240617162520.4045016-1-cleger@rivosinc.com/
>
> ---
> include/qemu/osdep.h | 8 +++
> net/tap.c | 31 ++++++-----
> system/async-teardown.c | 37 +------------
> util/osdep.c | 115 ++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 141 insertions(+), 50 deletions(-)
>
> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> index f61edcfdc2..9369a97d3d 100644
> --- a/include/qemu/osdep.h
> +++ b/include/qemu/osdep.h
> @@ -755,6 +755,14 @@ static inline void qemu_reset_optind(void)
>
> int qemu_fdatasync(int fd);
>
> +/**
> + * Close all open file descriptors except the ones supplied in the @skip array
> + *
> + * @skip: ordered array of distinct file descriptors that should not be closed
> + * @nskip: number of entries in the @skip array.
> + */
> +void qemu_close_all_open_fd(const int *skip, unsigned int nskip);
> +
> /**
> * Sync changes made to the memory mapped file back to the backing
> * storage. For POSIX compliant systems this will fallback
> diff --git a/net/tap.c b/net/tap.c
> index 51f7aec39d..6fc3939078 100644
> --- a/net/tap.c
> +++ b/net/tap.c
> @@ -385,6 +385,21 @@ static TAPState *net_tap_fd_init(NetClientState *peer,
> return s;
> }
>
> +static void close_all_fds_after_fork(int excluded_fd)
> +{
> + const int skip_fd[] = {0, 1, 2, 3, excluded_fd};
> + unsigned int nskip = ARRAY_SIZE(skip_fd);
> +
> + /*
> + * skip_fd must be an ordered array of distinct fds, exclude
> + * excluded_fd if already included in the [0 - 3] range
> + */
> + if (excluded_fd <= 3) {
> + nskip--;
> + }
> + qemu_close_all_open_fd(skip_fd, nskip);
> +}
> +
> static void launch_script(const char *setup_script, const char *ifname,
> int fd, Error **errp)
> {
> @@ -400,13 +415,7 @@ static void launch_script(const char *setup_script, const char *ifname,
> return;
> }
> if (pid == 0) {
> - int open_max = sysconf(_SC_OPEN_MAX), i;
> -
> - for (i = 3; i < open_max; i++) {
> - if (i != fd) {
> - close(i);
> - }
> - }
> + close_all_fds_after_fork(fd);
> parg = args;
> *parg++ = (char *)setup_script;
> *parg++ = (char *)ifname;
> @@ -490,17 +499,11 @@ static int net_bridge_run_helper(const char *helper, const char *bridge,
> return -1;
> }
> if (pid == 0) {
> - int open_max = sysconf(_SC_OPEN_MAX), i;
> char *fd_buf = NULL;
> char *br_buf = NULL;
> char *helper_cmd = NULL;
>
> - for (i = 3; i < open_max; i++) {
> - if (i != sv[1]) {
> - close(i);
> - }
> - }
> -
> + close_all_fds_after_fork(sv[1]);
> fd_buf = g_strdup_printf("%s%d", "--fd=", sv[1]);
>
> if (strrchr(helper, ' ') || strrchr(helper, '\t')) {
> diff --git a/system/async-teardown.c b/system/async-teardown.c
> index 396963c091..9148ee8d04 100644
> --- a/system/async-teardown.c
> +++ b/system/async-teardown.c
> @@ -26,40 +26,6 @@
>
> static pid_t the_ppid;
>
> -/*
> - * Close all open file descriptors.
> - */
> -static void close_all_open_fd(void)
> -{
> - struct dirent *de;
> - int fd, dfd;
> - DIR *dir;
> -
> -#ifdef CONFIG_CLOSE_RANGE
> - int r = close_range(0, ~0U, 0);
> - if (!r) {
> - /* Success, no need to try other ways. */
> - return;
> - }
> -#endif
> -
> - dir = opendir("/proc/self/fd");
> - if (!dir) {
> - /* If /proc is not mounted, there is nothing that can be done. */
> - return;
> - }
> - /* Avoid closing the directory. */
> - dfd = dirfd(dir);
> -
> - for (de = readdir(dir); de; de = readdir(dir)) {
> - fd = atoi(de->d_name);
> - if (fd != dfd) {
> - close(fd);
> - }
> - }
> - closedir(dir);
> -}
> -
> static void hup_handler(int signal)
> {
> /* Check every second if this process has been reparented. */
> @@ -85,9 +51,8 @@ static int async_teardown_fn(void *arg)
> /*
> * Close all file descriptors that might have been inherited from the
> * main qemu process when doing clone, needed to make libvirt happy.
> - * Not using close_range for increased compatibility with older kernels.
> */
> - close_all_open_fd();
> + qemu_close_all_open_fd(NULL, 0);
>
> /* Set up a handler for SIGHUP and unblock SIGHUP. */
> sigaction(SIGHUP, &sa, NULL);
> diff --git a/util/osdep.c b/util/osdep.c
> index 5d23bbfbec..f3710710e3 100644
> --- a/util/osdep.c
> +++ b/util/osdep.c
> @@ -625,3 +625,118 @@ int qemu_fdatasync(int fd)
> return fsync(fd);
> #endif
> }
> +
> +#ifdef CONFIG_LINUX
> +static bool qemu_close_all_open_fd_proc(const int *skip, unsigned int nskip)
> +{
> + struct dirent *de;
> + int fd, dfd;
> + bool close_fd;
> + DIR *dir;
> + int i;
> +
> + dir = opendir("/proc/self/fd");
> + if (!dir) {
> + /* If /proc is not mounted, there is nothing that can be done. */
> + return false;
> + }
> + /* Avoid closing the directory. */
> + dfd = dirfd(dir);
> +
> + for (de = readdir(dir); de; de = readdir(dir)) {
> + fd = atoi(de->d_name);
> + close_fd = true;
> + if (fd == dfd) {
> + close_fd = false;
> + } else {
> + for (i = 0; i < nskip; i++) {
> + if (fd == skip[i]) {
> + close_fd = false;
> + break;
> + }
> + }
> + }
> + if (close_fd) {
> + close(fd);
> + }
> + }
> + closedir(dir);
> +
> + return true;
> +}
> +#else
> +static bool qemu_close_all_open_fd_proc(const int *skip, unsigned int nskip)
> +{
> + return false;
> +}
> +#endif
> +
> +#ifdef CONFIG_CLOSE_RANGE
> +static bool qemu_close_all_open_fd_close_range(const int *skip,
> + unsigned int nskip)
> +{
> + int max_fd = sysconf(_SC_OPEN_MAX) - 1;
> + int first = 0, last = max_fd;
> + int cur_skip = 0, ret;
> +
> + do {
> + if (nskip) {
> + while (first == skip[cur_skip]) {
> + cur_skip++;
> + first++;
> + }
> + if (cur_skip < nskip) {
> + last = skip[cur_skip] - 1;
> + }
> + if (last > max_fd) {
> + last = max_fd;
> + /*
> + * We can directly skip the remaining skip fds since the current
> + * one is already above the maximum supported one.
> + */
> + cur_skip = nskip;
> + }
> + if (first > last) {
> + break;
> + }
> + }
> + ret = close_range(first, last, 0);
> + if (ret < 0) {
> + return false;
> + }
> + first = last + 1;
> + last = max_fd;
> + } while (cur_skip < nskip);
> +
> + return true;
> +}
> +#else
> +static bool qemu_close_all_open_fd_close_range(const int *skip,
> + unsigned int nskip)
> +{
> + return false;
> +}
> +#endif
> +
> +void qemu_close_all_open_fd(const int *skip, unsigned int nskip)
> +{
> + int open_max = sysconf(_SC_OPEN_MAX);
> + int cur_skip = 0, i;
> +
> + if (qemu_close_all_open_fd_close_range(skip, nskip)) {
> + return;
> + }
> +
> + if (qemu_close_all_open_fd_proc(skip, nskip)) {
> + return;
> + }
> +
> + /* Fallback */
> + for (i = 0; i < open_max; i++) {
> + if (i == skip[cur_skip]) {
> + cur_skip++;
> + continue;
> + }
> + close(i);
> + }
> +}
next prev parent reply other threads:[~2024-07-11 13:35 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-06-18 11:17 [PATCH v2] osdep: add a qemu_close_all_open_fd() helper Clément Léger
2024-07-11 13:34 ` Clément Léger [this message]
2024-07-11 18:43 ` Richard Henderson
2024-07-16 13:42 ` Clément Léger
2024-07-12 15:12 ` Daniel P. Berrangé
2024-07-16 12:37 ` Clément Léger
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1657b51b-243d-4607-8490-3695214da106@rivosinc.com \
--to=cleger@rivosinc.com \
--cc=berrange@redhat.com \
--cc=jasowang@redhat.com \
--cc=peter.maydell@linaro.org \
--cc=qemu-devel@nongnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).