* [PATCHES 0/3] Mem accounting and IRQ req cache
@ 2021-02-10 15:16 Jens Axboe
2021-02-10 15:16 ` [PATCH 1/3] io_uring: enable req cache for IRQ driven IO Jens Axboe
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: Jens Axboe @ 2021-02-10 15:16 UTC (permalink / raw)
To: io-uring
Hi,
This builds on the stuff that Pavel and I have been tossing around:
Patch 1: Enable req cache for the last class of requests, those that end
up completing from IRQ context. That's regular file/bdev reads
and writes.
Patch 2: Enable SLAB_ACCOUNT/memcg accounting for requests
Patch 3: Use memcg for the ring array accounting as well. That moves it
outside of rlimit memlock, though we retain memlock accounting
for registered buffers.
--
Jens Axboe
^ permalink raw reply [flat|nested] 4+ messages in thread
* [PATCH 1/3] io_uring: enable req cache for IRQ driven IO
2021-02-10 15:16 [PATCHES 0/3] Mem accounting and IRQ req cache Jens Axboe
@ 2021-02-10 15:16 ` Jens Axboe
2021-02-10 15:16 ` [PATCH 2/3] io_uring: enable kmemcg account for io_uring requests Jens Axboe
2021-02-10 15:16 ` [PATCH 3/3] io_uring: place ring SQ/CQ arrays under memcg memory limits Jens Axboe
2 siblings, 0 replies; 4+ messages in thread
From: Jens Axboe @ 2021-02-10 15:16 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe
This is the last class of requests that cannot utilize the req alloc
cache. Add a per-ctx req cache that is protected by the completion_lock,
and refill our submit side cache from it once it holds more than a
batch's worth of requests.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
fs/io_uring.c | 71 ++++++++++++++++++++++++++++++++++++---------------
1 file changed, 51 insertions(+), 20 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index e73ca37c6a3b..2c7ff0b1b086 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -272,7 +272,11 @@ struct io_sq_data {
struct io_comp_state {
struct io_kiocb *reqs[IO_COMPL_BATCH];
unsigned int nr;
+ unsigned int locked_free_nr;
+ /* inline/task_work completion list, under ->uring_lock */
struct list_head free_list;
+ /* IRQ completion list, under ->completion_lock */
+ struct list_head locked_free_list;
};
struct io_submit_state {
@@ -1033,6 +1037,9 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req, int nr);
static void io_double_put_req(struct io_kiocb *req);
+static void io_dismantle_req(struct io_kiocb *req);
+static void io_put_task(struct task_struct *task, int nr);
+static void io_queue_next(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void __io_queue_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
@@ -1353,6 +1360,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
init_llist_head(&ctx->rsrc_put_llist);
INIT_LIST_HEAD(&ctx->submit_state.comp.free_list);
+ INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list);
return ctx;
err:
kfree(ctx->cancel_hash);
@@ -1908,8 +1916,8 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
__io_cqring_fill_event(req, res, 0);
}
-static void io_req_complete_post(struct io_kiocb *req, long res,
- unsigned int cflags)
+static inline void io_req_complete_post(struct io_kiocb *req, long res,
+ unsigned int cflags)
{
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
@@ -1917,16 +1925,26 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
spin_lock_irqsave(&ctx->completion_lock, flags);
__io_cqring_fill_event(req, res, cflags);
io_commit_cqring(ctx);
+ /*
+ * If we're the last reference to this request, add to our locked
+ * free_list cache.
+ */
+ if (refcount_dec_and_test(&req->refs)) {
+ struct io_comp_state *cs = &ctx->submit_state.comp;
+
+ io_dismantle_req(req);
+ io_put_task(req->task, 1);
+ list_add(&req->compl.list, &cs->locked_free_list);
+ cs->locked_free_nr++;
+ } else
+ req = NULL;
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
-}
-
-static inline void io_req_complete_nostate(struct io_kiocb *req, long res,
- unsigned int cflags)
-{
- io_req_complete_post(req, res, cflags);
- io_put_req(req);
+ if (req) {
+ io_queue_next(req);
+ percpu_ref_put(&ctx->refs);
+ }
}
static void io_req_complete_state(struct io_kiocb *req, long res,
@@ -1944,7 +1962,7 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
if (issue_flags & IO_URING_F_COMPLETE_DEFER)
io_req_complete_state(req, res, cflags);
else
- io_req_complete_nostate(req, res, cflags);
+ io_req_complete_post(req, res, cflags);
}
static inline void io_req_complete(struct io_kiocb *req, long res)
@@ -1952,12 +1970,26 @@ static inline void io_req_complete(struct io_kiocb *req, long res)
__io_req_complete(req, 0, res, 0);
}
-static bool io_flush_cached_reqs(struct io_submit_state *state)
+static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
{
+ struct io_submit_state *state = &ctx->submit_state;
+ struct io_comp_state *cs = &state->comp;
struct io_kiocb *req = NULL;
- while (!list_empty(&state->comp.free_list)) {
- req = list_first_entry(&state->comp.free_list, struct io_kiocb,
+ /*
+ * If we have more than a batch's worth of requests in our IRQ side
+ * locked cache, grab the lock and move them over to our submission
+ * side cache.
+ */
+ if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH) {
+ spin_lock_irq(&ctx->completion_lock);
+ list_splice_init(&cs->locked_free_list, &cs->free_list);
+ cs->locked_free_nr = 0;
+ spin_unlock_irq(&ctx->completion_lock);
+ }
+
+ while (!list_empty(&cs->free_list)) {
+ req = list_first_entry(&cs->free_list, struct io_kiocb,
compl.list);
list_del(&req->compl.list);
state->reqs[state->free_reqs++] = req;
@@ -1978,7 +2010,7 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
int ret;
- if (io_flush_cached_reqs(state))
+ if (io_flush_cached_reqs(ctx))
goto got_req;
ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
@@ -8748,14 +8780,12 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
idr_destroy(&ctx->io_buffer_idr);
}
-static void io_req_cache_free(struct io_ring_ctx *ctx)
+static void io_req_cache_free(struct list_head *list)
{
- struct io_comp_state *cs = &ctx->submit_state.comp;
-
- while (!list_empty(&cs->free_list)) {
+ while (!list_empty(list)) {
struct io_kiocb *req;
- req = list_first_entry(&cs->free_list, struct io_kiocb, compl.list);
+ req = list_first_entry(list, struct io_kiocb, compl.list);
list_del(&req->compl.list);
kmem_cache_free(req_cachep, req);
}
@@ -8803,7 +8833,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
free_uid(ctx->user);
put_cred(ctx->creds);
kfree(ctx->cancel_hash);
- io_req_cache_free(ctx);
+ io_req_cache_free(&ctx->submit_state.comp.free_list);
+ io_req_cache_free(&ctx->submit_state.comp.locked_free_list);
kfree(ctx);
}
--
2.30.0
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH 2/3] io_uring: enable kmemcg account for io_uring requests
2021-02-10 15:16 [PATCHES 0/3] Mem accounting and IRQ req cache Jens Axboe
2021-02-10 15:16 ` [PATCH 1/3] io_uring: enable req cache for IRQ driven IO Jens Axboe
@ 2021-02-10 15:16 ` Jens Axboe
2021-02-10 15:16 ` [PATCH 3/3] io_uring: place ring SQ/CQ arrays under memcg memory limits Jens Axboe
2 siblings, 0 replies; 4+ messages in thread
From: Jens Axboe @ 2021-02-10 15:16 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe
This puts io_uring request allocations under memory cgroup accounting
and limits.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
fs/io_uring.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2c7ff0b1b086..bffed6aa5722 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -10350,7 +10350,8 @@ static int __init io_uring_init(void)
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
- req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+ req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
+ SLAB_ACCOUNT);
return 0;
};
__initcall(io_uring_init);
--
2.30.0
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH 3/3] io_uring: place ring SQ/CQ arrays under memcg memory limits
2021-02-10 15:16 [PATCHES 0/3] Mem accounting and IRQ req cache Jens Axboe
2021-02-10 15:16 ` [PATCH 1/3] io_uring: enable req cache for IRQ driven IO Jens Axboe
2021-02-10 15:16 ` [PATCH 2/3] io_uring: enable kmemcg account for io_uring requests Jens Axboe
@ 2021-02-10 15:16 ` Jens Axboe
2 siblings, 0 replies; 4+ messages in thread
From: Jens Axboe @ 2021-02-10 15:16 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe
Instead of imposing rlimit memlock limits for the rings themselves,
ensure that we account them properly under memcg with __GFP_ACCOUNT.
We retain rlimit memlock for registered buffers; this change only
affects the ring arrays themselves.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
fs/io_uring.c | 85 ++++++---------------------------------------------
1 file changed, 10 insertions(+), 75 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index bffed6aa5722..7a1e4ecf5f94 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1017,11 +1017,6 @@ static const struct io_op_def io_op_defs[] = {
},
};
-enum io_mem_account {
- ACCT_LOCKED,
- ACCT_PINNED,
-};
-
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
struct files_struct *files);
@@ -8355,25 +8350,16 @@ static inline int __io_account_mem(struct user_struct *user,
return 0;
}
-static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
- enum io_mem_account acct)
+static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
if (ctx->limit_mem)
__io_unaccount_mem(ctx->user, nr_pages);
- if (ctx->mm_account) {
- if (acct == ACCT_LOCKED) {
- mmap_write_lock(ctx->mm_account);
- ctx->mm_account->locked_vm -= nr_pages;
- mmap_write_unlock(ctx->mm_account);
- }else if (acct == ACCT_PINNED) {
- atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
- }
- }
+ if (ctx->mm_account)
+ atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}
-static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
- enum io_mem_account acct)
+static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
int ret;
@@ -8383,15 +8369,8 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
return ret;
}
- if (ctx->mm_account) {
- if (acct == ACCT_LOCKED) {
- mmap_write_lock(ctx->mm_account);
- ctx->mm_account->locked_vm += nr_pages;
- mmap_write_unlock(ctx->mm_account);
- } else if (acct == ACCT_PINNED) {
- atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
- }
- }
+ if (ctx->mm_account)
+ atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
return 0;
}
@@ -8411,7 +8390,7 @@ static void io_mem_free(void *ptr)
static void *io_mem_alloc(size_t size)
{
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
- __GFP_NORETRY;
+ __GFP_NORETRY | __GFP_ACCOUNT;
return (void *) __get_free_pages(gfp_flags, get_order(size));
}
@@ -8445,18 +8424,6 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
return off;
}
-static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
-{
- size_t pages;
-
- pages = (size_t)1 << get_order(
- rings_size(sq_entries, cq_entries, NULL));
- pages += (size_t)1 << get_order(
- array_size(sizeof(struct io_uring_sqe), sq_entries));
-
- return pages;
-}
-
static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
int i, j;
@@ -8471,7 +8438,7 @@ static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
unpin_user_page(imu->bvec[j].bv_page);
if (imu->acct_pages)
- io_unaccount_mem(ctx, imu->acct_pages, ACCT_PINNED);
+ io_unaccount_mem(ctx, imu->acct_pages);
kvfree(imu->bvec);
imu->nr_bvecs = 0;
}
@@ -8569,7 +8536,7 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
if (!imu->acct_pages)
return 0;
- ret = io_account_mem(ctx, imu->acct_pages, ACCT_PINNED);
+ ret = io_account_mem(ctx, imu->acct_pages);
if (ret)
imu->acct_pages = 0;
return ret;
@@ -8949,14 +8916,6 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
/* if we failed setting up the ctx, we might not have any rings */
io_iopoll_try_reap_events(ctx);
- /*
- * Do this upfront, so we won't have a grace period where the ring
- * is closed but resources aren't reaped yet. This can cause
- * spurious failure in setting up a new ring.
- */
- io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries),
- ACCT_LOCKED);
-
INIT_WORK(&ctx->exit_work, io_ring_exit_work);
/*
* Use system_unbound_wq to avoid spawning tons of event kworkers
@@ -9780,7 +9739,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
struct user_struct *user = NULL;
struct io_ring_ctx *ctx;
struct file *file;
- bool limit_mem;
int ret;
if (!entries)
@@ -9821,26 +9779,14 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
}
user = get_uid(current_user());
- limit_mem = !capable(CAP_IPC_LOCK);
-
- if (limit_mem) {
- ret = __io_account_mem(user,
- ring_pages(p->sq_entries, p->cq_entries));
- if (ret) {
- free_uid(user);
- return ret;
- }
- }
ctx = io_ring_ctx_alloc(p);
if (!ctx) {
- if (limit_mem)
- __io_unaccount_mem(user, ring_pages(p->sq_entries,
- p->cq_entries));
free_uid(user);
return -ENOMEM;
}
ctx->compat = in_compat_syscall();
+ ctx->limit_mem = !capable(CAP_IPC_LOCK);
ctx->user = user;
ctx->creds = get_current_cred();
#ifdef CONFIG_AUDIT
@@ -9876,17 +9822,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
goto err;
}
#endif
-
- /*
- * Account memory _before_ installing the file descriptor. Once
- * the descriptor is installed, it can get closed at any time. Also
- * do this before hitting the general error path, as ring freeing
- * will un-account as well.
- */
- io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries),
- ACCT_LOCKED);
- ctx->limit_mem = limit_mem;
-
ret = io_allocate_scq_urings(ctx, p);
if (ret)
goto err;
--
2.30.0
^ permalink raw reply related [flat|nested] 4+ messages in thread
end of thread, other threads:[~2021-02-10 15:17 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-02-10 15:16 [PATCHES 0/3] Mem accounting and IRQ req cache Jens Axboe
2021-02-10 15:16 ` [PATCH 1/3] io_uring: enable req cache for IRQ driven IO Jens Axboe
2021-02-10 15:16 ` [PATCH 2/3] io_uring: enable kmemcg account for io_uring requests Jens Axboe
2021-02-10 15:16 ` [PATCH 3/3] io_uring: place ring SQ/CQ arrays under memcg memory limits Jens Axboe
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.