* [PATCH v8 1/3] llist: Allow optional sentinel node terminated lockless list
2022-10-04 15:17 [PATCH v8 0/3] blk-cgroup: Optimize blkcg_rstat_flush() Waiman Long
@ 2022-10-04 15:17 ` Waiman Long
2022-10-04 15:17 ` [PATCH v8 2/3] blk-cgroup: Return -ENOMEM directly in blkcg_css_alloc() error path Waiman Long
` (2 subsequent siblings)
3 siblings, 0 replies; 7+ messages in thread
From: Waiman Long @ 2022-10-04 15:17 UTC (permalink / raw)
To: Tejun Heo, Jens Axboe
Cc: cgroups, linux-block, linux-kernel, Ming Lei, Andy Shevchenko,
Andrew Morton, Michal Koutný, Waiman Long
The lockless list API is useful for dealing with lists in a lockless
manner. However, one of the drawbacks of the existing API is that there
is not an easy way to determine if an entry has already been put into a
lockless list. This has to be tracked externally and the tracking will
not be atomic unless some external synchronization logic is in place.
This patch changes the internals of the lockless list code to allow it
to support a lockless list terminated by an internal sentinel value
(LLIST_END) instead of NULL. The advantage of this scheme is that
we can atomically determine if an entry has been put into a lockless
list by doing a NULL check of the next pointer of the llist_node. The
drawback is that a bit more code may be needed to handle both NULL and
the sentinel value. The real world performance impact of this change,
however, should be negligible.
To use a sentinel terminated lockless list, the following new API must
be used for initialization and deletion of a lockless list.
- SLLIST_HEAD_INIT() and init_sllist_head() for initialization
- sllist_del_all() and __sllist_del_all() for deletion
Other llist APIs are modified to handle both NULL and sentinel
terminated lockless lists.
Of course, the callers should clear the next pointer when an entry is
removed from a sentinel terminated lockless list. Note that the internal
LLIST_END sentinel value will never be returned. NULL will always be
returned if the lockless list is empty for backward compatibility.
Signed-off-by: Waiman Long <longman@redhat.com>
---
include/linux/llist.h | 100 +++++++++++++++++++++++++++++++++---------
lib/llist.c | 20 ++++++---
2 files changed, 95 insertions(+), 25 deletions(-)
diff --git a/include/linux/llist.h b/include/linux/llist.h
index 85bda2d02d65..c7380e9b98e2 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -2,7 +2,8 @@
#ifndef LLIST_H
#define LLIST_H
/*
- * Lock-less NULL terminated single linked list
+ * Lock-less NULL or sentinel terminated singly linked list
+ * --------------------------------------------------------
*
* Cases where locking is not needed:
* If there are multiple producers and multiple consumers, llist_add can be
@@ -44,6 +45,15 @@
* list can NOT be used in NMI handlers. So code that uses the list in
* an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
*
+ * A sentinel node terminated lock-less list allows lock-less list membership
+ * determination to be done atomically by doing a NULL check of the next
+ * pointer of the llist_node as it will never be NULL if it is in a lock-less
+ * list. The following APIs must be used for the initialization and deletion
+ * of a sentinel terminated lock-less list.
+ *
+ * - SLLIST_HEAD_INIT() and init_sllist_head() for initialization
+ * - sllist_del_all() and __sllist_del_all() for deletion
+ *
* Copyright 2010,2011 Intel Corp.
* Author: Huang Ying <ying.huang@intel.com>
*/
@@ -64,6 +74,16 @@ struct llist_node {
#define LLIST_HEAD_INIT(name) { NULL }
#define LLIST_HEAD(name) struct llist_head name = LLIST_HEAD_INIT(name)
+/*
+ * Sentinel terminated llist_head initializer
+ *
+ * LLIST_END is chosen to be 1 so that a check for both NULL and LLIST_END
+ * can be optimized by the compiler to a single unsigned integer comparison.
+ */
+#define LLIST_END ((struct llist_node *)1UL)
+#define SLLIST_HEAD_INIT(name) { LLIST_END }
+#define SLLIST_HEAD(name) struct llist_head name = SLLIST_HEAD_INIT(name)
+
/**
* init_llist_head - initialize lock-less list head
* @head: the head for your lock-less list
@@ -73,6 +93,15 @@ static inline void init_llist_head(struct llist_head *list)
list->first = NULL;
}
+/**
+ * init_sllist_head - initialize sentinel terminated lock-less list head
+ * @head: the head for your lock-less list
+ */
+static inline void init_sllist_head(struct llist_head *list)
+{
+ list->first = LLIST_END;
+}
+
/**
* llist_entry - get the struct of this entry
* @ptr: the &struct llist_node pointer.
@@ -83,21 +112,22 @@ static inline void init_llist_head(struct llist_head *list)
container_of(ptr, type, member)
/**
- * member_address_is_nonnull - check whether the member address is not NULL
+ * member_address_is_valid - check whether member addr is not NULL or sentinel
* @ptr: the object pointer (struct type * that contains the llist_node)
* @member: the name of the llist_node within the struct.
*
* This macro is conceptually the same as
- * &ptr->member != NULL
+ * (&ptr->member != NULL) && (&ptr->member != LLIST_END)
* but it works around the fact that compilers can decide that taking a member
- * address is never a NULL pointer.
+ * address is never a NULL or the sentinel pointer.
*
- * Real objects that start at a high address and have a member at NULL are
- * unlikely to exist, but such pointers may be returned e.g. by the
- * container_of() macro.
+ * Real objects that start at a high address and have a member at NULL or
+ * LLIST_END are unlikely to exist, but such pointers may be returned e.g.
+ * by the container_of() macro.
*/
-#define member_address_is_nonnull(ptr, member) \
- ((uintptr_t)(ptr) + offsetof(typeof(*(ptr)), member) != 0)
+#define member_address_is_valid(ptr, member) \
+ ({ uintptr_t __n = (uintptr_t)(ptr) + offsetof(typeof(*(ptr)), member);\
+ __n && (__n != (uintptr_t)LLIST_END); })
/**
* llist_for_each - iterate over some deleted entries of a lock-less list
@@ -114,7 +144,7 @@ static inline void init_llist_head(struct llist_head *list)
* reverse the order by yourself before traversing.
*/
#define llist_for_each(pos, node) \
- for ((pos) = (node); pos; (pos) = (pos)->next)
+ for ((pos) = (node); (pos) && (pos) != LLIST_END; (pos) = (pos)->next)
/**
* llist_for_each_safe - iterate over some deleted entries of a lock-less list
@@ -133,7 +163,8 @@ static inline void init_llist_head(struct llist_head *list)
* reverse the order by yourself before traversing.
*/
#define llist_for_each_safe(pos, n, node) \
- for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n))
+ for ((pos) = (node); (pos) && ((pos) != LLIST_END) && \
+ ((n) = (pos)->next, true); (pos) = (n))
/**
* llist_for_each_entry - iterate over some deleted entries of lock-less list of given type
@@ -152,7 +183,7 @@ static inline void init_llist_head(struct llist_head *list)
*/
#define llist_for_each_entry(pos, node, member) \
for ((pos) = llist_entry((node), typeof(*(pos)), member); \
- member_address_is_nonnull(pos, member); \
+ member_address_is_valid(pos, member); \
(pos) = llist_entry((pos)->member.next, typeof(*(pos)), member))
/**
@@ -172,11 +203,11 @@ static inline void init_llist_head(struct llist_head *list)
* you want to traverse from the oldest to the newest, you must
* reverse the order by yourself before traversing.
*/
-#define llist_for_each_entry_safe(pos, n, node, member) \
- for (pos = llist_entry((node), typeof(*pos), member); \
- member_address_is_nonnull(pos, member) && \
- (n = llist_entry(pos->member.next, typeof(*n), member), true); \
- pos = n)
+#define llist_for_each_entry_safe(pos, n, node, member) \
+ for (pos = llist_entry((node), typeof(*(pos)), member); \
+ member_address_is_valid(pos, member) && \
+ (n = llist_entry((pos)->member.next, typeof(*(n)), member),\
+ true); pos = n)
/**
* llist_empty - tests whether a lock-less list is empty
@@ -188,12 +219,16 @@ static inline void init_llist_head(struct llist_head *list)
*/
static inline bool llist_empty(const struct llist_head *head)
{
- return READ_ONCE(head->first) == NULL;
+ struct llist_node *first = READ_ONCE(head->first);
+
+ return !first || (first == LLIST_END);
}
static inline struct llist_node *llist_next(struct llist_node *node)
{
- return node->next;
+ struct llist_node *next = node->next;
+
+ return (next == LLIST_END) ? NULL : next;
}
extern bool llist_add_batch(struct llist_node *new_first,
@@ -204,9 +239,11 @@ static inline bool __llist_add_batch(struct llist_node *new_first,
struct llist_node *new_last,
struct llist_head *head)
{
+ bool empty = llist_empty(head);
+
new_last->next = head->first;
head->first = new_first;
- return new_last->next == NULL;
+ return empty;
}
/**
@@ -247,6 +284,29 @@ static inline struct llist_node *__llist_del_all(struct llist_head *head)
return first;
}
+/**
+ * sllist_del_all - delete all entries from sentinel terminated lock-less list
+ * @head: the head of lock-less list to delete all entries
+ *
+ * If list is empty, return NULL, otherwise, delete all entries and
+ * return the pointer to the first entry. The order of entries
+ * deleted is from the newest to the oldest added one.
+ */
+static inline struct llist_node *sllist_del_all(struct llist_head *head)
+{
+ struct llist_node *first = xchg(&head->first, LLIST_END);
+
+ return (first == LLIST_END) ? NULL : first;
+}
+
+static inline struct llist_node *__sllist_del_all(struct llist_head *head)
+{
+ struct llist_node *first = head->first;
+
+ head->first = LLIST_END;
+ return (first == LLIST_END) ? NULL : first;
+}
+
extern struct llist_node *llist_del_first(struct llist_head *head);
struct llist_node *llist_reverse_order(struct llist_node *head);
diff --git a/lib/llist.c b/lib/llist.c
index 611ce4881a87..1e782c9cafa8 100644
--- a/lib/llist.c
+++ b/lib/llist.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Lock-less NULL terminated single linked list
+ * Lock-less NULL or sentinel terminated singly linked lists
*
* The basic atomic operation of this list is cmpxchg on long. On
* architectures that don't have NMI-safe cmpxchg implementation, the
@@ -14,7 +14,6 @@
#include <linux/export.h>
#include <linux/llist.h>
-
/**
* llist_add_batch - add several linked entries in batch
* @new_first: first entry in batch to be added
@@ -32,7 +31,7 @@ bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
new_last->next = first = READ_ONCE(head->first);
} while (cmpxchg(&head->first, first, new_first) != first);
- return !first;
+ return !first || (first == LLIST_END);
}
EXPORT_SYMBOL_GPL(llist_add_batch);
@@ -56,7 +55,7 @@ struct llist_node *llist_del_first(struct llist_head *head)
entry = smp_load_acquire(&head->first);
for (;;) {
- if (entry == NULL)
+ if (!entry || (entry == LLIST_END))
return NULL;
old_entry = entry;
next = READ_ONCE(entry->next);
@@ -79,14 +78,25 @@ EXPORT_SYMBOL_GPL(llist_del_first);
struct llist_node *llist_reverse_order(struct llist_node *head)
{
struct llist_node *new_head = NULL;
+ struct llist_node *new_tail = head;
+
+ if (!head || (head == LLIST_END))
+ return NULL;
- while (head) {
+ while (head && (head != LLIST_END)) {
struct llist_node *tmp = head;
+
head = head->next;
tmp->next = new_head;
new_head = tmp;
}
+ /*
+ * Terminate list with the same NULL or sentinel terminator
+ */
+ if (head)
+ new_tail->next = LLIST_END;
+
return new_head;
}
EXPORT_SYMBOL_GPL(llist_reverse_order);
--
2.31.1
^ permalink raw reply related [flat|nested] 7+ messages in thread* [PATCH v8 3/3] blk-cgroup: Optimize blkcg_rstat_flush()
2022-10-04 15:17 [PATCH v8 0/3] blk-cgroup: Optimize blkcg_rstat_flush() Waiman Long
2022-10-04 15:17 ` [PATCH v8 1/3] llist: Allow optional sentinel node terminated lockless list Waiman Long
2022-10-04 15:17 ` [PATCH v8 2/3] blk-cgroup: Return -ENOMEM directly in blkcg_css_alloc() error path Waiman Long
@ 2022-10-04 15:17 ` Waiman Long
2022-10-04 18:49 ` Michal Koutný
[not found] ` <20221006101141.1832-1-hdanton@sina.com>
3 siblings, 1 reply; 7+ messages in thread
From: Waiman Long @ 2022-10-04 15:17 UTC (permalink / raw)
To: Tejun Heo, Jens Axboe
Cc: cgroups, linux-block, linux-kernel, Ming Lei, Andy Shevchenko,
Andrew Morton, Michal Koutný, Waiman Long
For a system with many CPUs and block devices, the time to do
blkcg_rstat_flush() from cgroup_rstat_flush() can be rather long. It
can be especially problematic as interrupts are disabled during the flush.
It was reported that it might take seconds to complete in some extreme
cases leading to hard lockup messages.
As it is likely that not all the percpu blkg_iostat_set's have been
updated since the last flush, those stale blkg_iostat_set's don't need
to be flushed in this case. This patch optimizes blkcg_rstat_flush()
by keeping a lockless list of recently updated blkg_iostat_set's in a
newly added percpu blkcg->lhead pointer.
The blkg_iostat_set is added to a sentinel lockless list on the update
side in blk_cgroup_bio_start(). It is removed from the sentinel lockless
list when flushed in blkcg_rstat_flush(). Due to racing, it is possible
that blkg_iostat_set's in the lockless list may have no new IO stats to
be flushed, but that is OK.
To protect against destruction of blkg, a percpu reference is gotten
when putting into the lockless list and put back when removed.
A blkg_iostat_set can determine if it is in a lockless list by checking
the content of its lnode.next pointer which will be non-NULL when in
a sentinel lockless list.
When booting up an instrumented test kernel with this patch on a
2-socket 96-thread system with cgroup v2, out of the 2051 calls to
cgroup_rstat_flush() after bootup, 1788 of the calls were exited
immediately because of empty lockless list. After an all-cpu kernel
build, the ratio became 6295424/6340513. That was more than 99%.
Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
block/blk-cgroup.c | 75 ++++++++++++++++++++++++++++++++++++++++++----
block/blk-cgroup.h | 9 ++++++
2 files changed, 78 insertions(+), 6 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 946592249795..63569b05db0d 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -59,6 +59,37 @@ static struct workqueue_struct *blkcg_punt_bio_wq;
#define BLKG_DESTROY_BATCH_SIZE 64
+/*
+ * Lockless lists for tracking IO stats update
+ *
+ * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg).
+ * There are multiple blkg's (one for each block device) attached to each
+ * blkcg. The rstat code keeps track of which cpu has IO stats updated,
+ * but it doesn't know which blkg has the updated stats. If there are many
+ * block devices in a system, the cost of iterating all the blkg's to flush
+ * out the IO stats can be high. To reduce such overhead, a set of percpu
+ * lockless lists (lhead) per blkcg are used to track the set of recently
+ * updated iostat_cpu's since the last flush. An iostat_cpu will be put
+ * onto the lockless list on the update side [blk_cgroup_bio_start()] if
+ * not there yet and then removed when being flushed [blkcg_rstat_flush()].
+ * References to blkg are gotten and then put back in the process to
+ * protect against blkg removal.
+ *
+ * Return: 0 if successful or -ENOMEM if allocation fails.
+ */
+static int init_blkcg_llists(struct blkcg *blkcg)
+{
+ int cpu;
+
+ blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);
+ if (!blkcg->lhead)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu)
+ init_sllist_head(per_cpu_ptr(blkcg->lhead, cpu));
+ return 0;
+}
+
/**
* blkcg_css - find the current css
*
@@ -236,8 +267,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
blkg->blkcg = blkcg;
u64_stats_init(&blkg->iostat.sync);
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
+ per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg;
+ }
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
@@ -864,7 +897,9 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
struct blkcg *blkcg = css_to_blkcg(css);
- struct blkcg_gq *blkg;
+ struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
+ struct llist_node *lnode;
+ struct blkg_iostat_set *bisc, *next_bisc;
/* Root-level stats are sourced from system-wide IO stats */
if (!cgroup_parent(css->cgroup))
@@ -872,12 +907,21 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
rcu_read_lock();
- hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+ lnode = sllist_del_all(lhead);
+ if (!lnode)
+ goto out;
+
+ /*
+ * Iterate only the iostat_cpu's queued in the lockless list.
+ */
+ llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) {
+ struct blkcg_gq *blkg = bisc->blkg;
struct blkcg_gq *parent = blkg->parent;
- struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
struct blkg_iostat cur;
unsigned int seq;
+ WRITE_ONCE(lnode->next, NULL);
+
/* fetch the current per-cpu values */
do {
seq = u64_stats_fetch_begin(&bisc->sync);
@@ -890,8 +934,10 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
if (parent && parent->parent)
blkcg_iostat_update(parent, &blkg->iostat.cur,
&blkg->iostat.last);
+ percpu_ref_put(&blkg->refcnt);
}
+out:
rcu_read_unlock();
}
@@ -1170,6 +1216,7 @@ static void blkcg_css_free(struct cgroup_subsys_state *css)
mutex_unlock(&blkcg_pol_mutex);
+ free_percpu(blkcg->lhead);
kfree(blkcg);
}
@@ -1189,6 +1236,9 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
goto unlock;
}
+ if (init_blkcg_llists(blkcg))
+ goto free_blkcg;
+
for (i = 0; i < BLKCG_MAX_POLS ; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
struct blkcg_policy_data *cpd;
@@ -1229,7 +1279,8 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
for (i--; i >= 0; i--)
if (blkcg->cpd[i])
blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
-
+ free_percpu(blkcg->lhead);
+free_blkcg:
if (blkcg != &blkcg_root)
kfree(blkcg);
unlock:
@@ -1990,6 +2041,7 @@ static int blk_cgroup_io_type(struct bio *bio)
void blk_cgroup_bio_start(struct bio *bio)
{
+ struct blkcg *blkcg = bio->bi_blkg->blkcg;
int rwd = blk_cgroup_io_type(bio), cpu;
struct blkg_iostat_set *bis;
unsigned long flags;
@@ -2008,9 +2060,20 @@ void blk_cgroup_bio_start(struct bio *bio)
}
bis->cur.ios[rwd]++;
+ /*
+ * If the iostat_cpu isn't in a lockless list, put it into the
+ * list to indicate that a stat update is pending.
+ */
+ if (!READ_ONCE(bis->lnode.next)) {
+ struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
+
+ llist_add(&bis->lnode, lhead);
+ percpu_ref_get(&bis->blkg->refcnt);
+ }
+
u64_stats_update_end_irqrestore(&bis->sync, flags);
if (cgroup_subsys_on_dfl(io_cgrp_subsys))
- cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
+ cgroup_rstat_updated(blkcg->css.cgroup, cpu);
put_cpu();
}
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index d2724d1dd7c9..0968b6c8ea12 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -18,6 +18,7 @@
#include <linux/cgroup.h>
#include <linux/kthread.h>
#include <linux/blk-mq.h>
+#include <linux/llist.h>
struct blkcg_gq;
struct blkg_policy_data;
@@ -43,6 +44,8 @@ struct blkg_iostat {
struct blkg_iostat_set {
struct u64_stats_sync sync;
+ struct llist_node lnode;
+ struct blkcg_gq *blkg;
struct blkg_iostat cur;
struct blkg_iostat last;
};
@@ -97,6 +100,12 @@ struct blkcg {
struct blkcg_policy_data *cpd[BLKCG_MAX_POLS];
struct list_head all_blkcgs_node;
+
+ /*
+ * List of updated percpu blkg_iostat_set's since the last flush.
+ */
+ struct llist_head __percpu *lhead;
+
#ifdef CONFIG_BLK_CGROUP_FC_APPID
char fc_app_id[FC_APPID_LEN];
#endif
--
2.31.1
^ permalink raw reply related [flat|nested] 7+ messages in thread