All of lore.kernel.org
 help / color / mirror / Atom feed
From: Vivek Goyal <vgoyal@redhat.com>
To: Tejun Heo <tj@kernel.org>
Cc: axboe@kernel.dk, hughd@google.com, avi@redhat.com,
	nate@cpanel.net, cl@linux-foundation.org,
	linux-kernel@vger.kernel.org, dpshah@google.com,
	ctalbott@google.com, rni@google.com,
	Andrew Morton <akpm@linux-foundation.org>
Subject: Re: [PATCHSET] mempool, percpu, blkcg: fix percpu stat allocation and remove stats_lock
Date: Wed, 29 Feb 2012 12:36:39 -0500	[thread overview]
Message-ID: <20120229173639.GB5930@redhat.com> (raw)
In-Reply-To: <20120227194321.GF27677@redhat.com>

On Mon, Feb 27, 2012 at 02:43:21PM -0500, Vivek Goyal wrote:
> On Mon, Feb 27, 2012 at 06:11:41PM +0900, Tejun Heo wrote:
> > On Sun, Feb 26, 2012 at 10:11:46PM -0500, Vivek Goyal wrote:
> > > Ok. This sounds better than embeding work_struct in blkg, I can embed it
> > > in request_queue and make the worker walk the list of blkg pending 
> > > alloc of stats. Will try that. Thanks for the idea.
> > 
> > We might not need to make it even per-queue.  Simple global list of
> > pending blkgs and single work item should work fine, I think.
> 
> Thanks for the suggestion Tejun. I have implemented it and below is the
> patch. I have done basic testing of boot and cgroup creation. Yet to test
> it over elevator switch path. Will do that once it is fixed. I will sign
> it after testing.
> 
> Do let me know if you want some changes in the patch.
> 
> Thanks
> Vivek
> 
> Allocate blkg per cpu stat from a worker thread.
> 
> Yet-to-be-signed-off-by: Vivek Goyal <vgoyal@redhat.com>

Came up with second version of patch. Minor cleanups. There were couple
of redundant condition checks.

Thanks
Vivek


Allocate blkg per cpu stats asynchrnously from a worker thread.

---
 block/blk-cgroup.c |  134 +++++++++++++++++++++++++++++++++++++++--------------
 block/blk-cgroup.h |    2 
 2 files changed, 101 insertions(+), 35 deletions(-)

Index: tejun-misc/block/blk-cgroup.h
===================================================================
--- tejun-misc.orig/block/blk-cgroup.h	2012-02-28 01:29:09.238256494 -0500
+++ tejun-misc/block/blk-cgroup.h	2012-02-28 01:29:12.000000000 -0500
@@ -180,6 +180,8 @@ struct blkio_group {
 	struct request_queue *q;
 	struct list_head q_node;
 	struct hlist_node blkcg_node;
+	/* List of blkg waiting for per cpu stats memory to be allocated */
+	struct list_head pending_alloc_node;
 	struct blkio_cgroup *blkcg;
 	/* Store cgroup path */
 	char path[128];
Index: tejun-misc/block/blk-cgroup.c
===================================================================
--- tejun-misc.orig/block/blk-cgroup.c	2012-02-28 01:29:09.239256494 -0500
+++ tejun-misc/block/blk-cgroup.c	2012-02-29 23:02:00.279293289 -0500
@@ -30,6 +30,12 @@ static LIST_HEAD(blkio_list);
 static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
 
+static DEFINE_SPINLOCK(pending_alloc_list_lock);
+static LIST_HEAD(pending_alloc_list);
+
+static void blkio_stat_alloc_fn(struct work_struct *);
+static DECLARE_WORK(blkio_stat_alloc_work, blkio_stat_alloc_fn);
+
 struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
 EXPORT_SYMBOL_GPL(blkio_root_cgroup);
 
@@ -391,6 +397,9 @@ void blkiocg_update_dispatch_stats(struc
 	struct blkio_group_stats_cpu *stats_cpu;
 	unsigned long flags;
 
+	if (pd->stats_cpu == NULL)
+		return;
+
 	/*
 	 * Disabling interrupts to provide mutual exclusion between two
 	 * writes on same cpu. It probably is not needed for 64bit. Not
@@ -443,6 +452,9 @@ void blkiocg_update_io_merged_stats(stru
 	struct blkio_group_stats_cpu *stats_cpu;
 	unsigned long flags;
 
+	if (pd->stats_cpu == NULL)
+		return;
+
 	/*
 	 * Disabling interrupts to provide mutual exclusion between two
 	 * writes on same cpu. It probably is not needed for 64bit. Not
@@ -460,6 +472,72 @@ void blkiocg_update_io_merged_stats(stru
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
 
+static void blkio_stat_alloc_fn(struct work_struct *work)
+{
+
+	void *stat_ptr = NULL;
+	struct blkio_group *blkg, *n;
+	int i;
+
+alloc_stats:
+	spin_lock_irq(&pending_alloc_list_lock);
+		if (list_empty(&pending_alloc_list)) {
+			/* Nothing to do */
+			spin_unlock_irq(&pending_alloc_list_lock);
+			return;
+		}
+	spin_unlock_irq(&pending_alloc_list_lock);
+
+	WARN_ON(stat_ptr != NULL);
+	stat_ptr = alloc_percpu(struct blkio_group_stats_cpu);
+
+	/* Retry. Should there be an upper limit on number of retries */
+	if (stat_ptr == NULL)
+		goto alloc_stats;
+
+	spin_lock_irq(&blkio_list_lock);
+	spin_lock(&pending_alloc_list_lock);
+
+	list_for_each_entry_safe(blkg, n, &pending_alloc_list,
+		pending_alloc_node) {
+		for (i = 0; i < BLKIO_NR_POLICIES; i++) {
+			struct blkio_policy_type *pol = blkio_policy[i];
+			struct blkg_policy_data *pd;
+
+			if (!pol)
+				continue;
+
+			if (!blkg->pd[i])
+				continue;
+
+			pd = blkg->pd[i];
+			if (pd->stats_cpu)
+				continue;
+
+			pd->stats_cpu = stat_ptr;
+			stat_ptr = NULL;
+			break;
+		}
+
+		if (i == BLKIO_NR_POLICIES - 1) {
+			/* We are done with this group */
+			list_del_init(&blkg->pending_alloc_node);
+			continue;
+		} else
+			/* Go allocate more memory */
+			break;
+	}
+	spin_unlock(&pending_alloc_list_lock);
+	spin_unlock_irq(&blkio_list_lock);
+
+	if (stat_ptr != NULL) {
+		/* Nobody needs memory anymore */
+		free_percpu(stat_ptr);
+		return;
+	} else
+		goto alloc_stats;
+}
+
 /**
  * blkg_free - free a blkg
  * @blkg: blkg to free
@@ -509,6 +587,7 @@ static struct blkio_group *blkg_alloc(st
 	spin_lock_init(&blkg->stats_lock);
 	blkg->q = q;
 	INIT_LIST_HEAD(&blkg->q_node);
+	INIT_LIST_HEAD(&blkg->pending_alloc_node);
 	blkg->blkcg = blkcg;
 	blkg->refcnt = 1;
 	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
@@ -530,13 +609,6 @@ static struct blkio_group *blkg_alloc(st
 
 		blkg->pd[i] = pd;
 		pd->blkg = blkg;
-
-		/* broken, read comment in the callsite */
-		pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
-		if (!pd->stats_cpu) {
-			blkg_free(blkg);
-			return NULL;
-		}
 	}
 
 	/* invoke per-policy init */
@@ -556,7 +628,7 @@ struct blkio_group *blkg_lookup_create(s
 				       bool for_root)
 	__releases(q->queue_lock) __acquires(q->queue_lock)
 {
-	struct blkio_group *blkg, *new_blkg;
+	struct blkio_group *blkg;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
 	lockdep_assert_held(q->queue_lock);
@@ -580,48 +652,29 @@ struct blkio_group *blkg_lookup_create(s
 
 	/*
 	 * Allocate and initialize.
-	 *
-	 * FIXME: The following is broken.  Percpu memory allocation
-	 * requires %GFP_KERNEL context and can't be performed from IO
-	 * path.  Allocation here should inherently be atomic and the
-	 * following lock dancing can be removed once the broken percpu
-	 * allocation is fixed.
 	 */
-	spin_unlock_irq(q->queue_lock);
-	rcu_read_unlock();
-
-	new_blkg = blkg_alloc(blkcg, q);
-
-	rcu_read_lock();
-	spin_lock_irq(q->queue_lock);
-
-	/* did bypass get turned on inbetween? */
-	if (unlikely(blk_queue_bypass(q)) && !for_root) {
-		blkg = ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
-		goto out;
-	}
-
-	/* did someone beat us to it? */
-	blkg = blkg_lookup(blkcg, q);
-	if (unlikely(blkg))
-		goto out;
+	blkg = blkg_alloc(blkcg, q);
 
 	/* did alloc fail? */
-	if (unlikely(!new_blkg)) {
+	if (unlikely(!blkg)) {
 		blkg = ERR_PTR(-ENOMEM);
 		goto out;
 	}
 
 	/* insert */
 	spin_lock(&blkcg->lock);
-	swap(blkg, new_blkg);
+	spin_lock(&pending_alloc_list_lock);
 
 	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
 	list_add(&blkg->q_node, &q->blkg_list);
+	list_add(&blkg->pending_alloc_node, &pending_alloc_list);
 
+	/* Queue per cpu stat allocation from worker thread. */
+	queue_work(system_nrt_wq, &blkio_stat_alloc_work);
+
+	spin_unlock(&pending_alloc_list_lock);
 	spin_unlock(&blkcg->lock);
 out:
-	blkg_free(new_blkg);
 	return blkg;
 }
 EXPORT_SYMBOL_GPL(blkg_lookup_create);
@@ -648,11 +701,16 @@ static void blkg_destroy(struct blkio_gr
 	lockdep_assert_held(q->queue_lock);
 	lockdep_assert_held(&blkcg->lock);
 
+	spin_lock(&pending_alloc_list_lock);
+
 	/* Something wrong if we are trying to remove same group twice */
 	WARN_ON_ONCE(list_empty(&blkg->q_node));
 	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
 	list_del_init(&blkg->q_node);
 	hlist_del_init_rcu(&blkg->blkcg_node);
+	list_del_init(&blkg->pending_alloc_node);
+
+	spin_unlock(&pending_alloc_list_lock);
 
 	/*
 	 * Put the reference taken at the time of creation so that when all
@@ -755,6 +813,9 @@ static void blkio_reset_stats_cpu(struct
 	struct blkg_policy_data *pd = blkg->pd[plid];
 	struct blkio_group_stats_cpu *stats_cpu;
 	int i, j, k;
+
+	if (pd->stats_cpu == NULL)
+		return;
 	/*
 	 * Note: On 64 bit arch this should not be an issue. This has the
 	 * possibility of returning some inconsistent value on 32bit arch
@@ -886,6 +947,9 @@ static uint64_t blkio_read_stat_cpu(stru
 	struct blkio_group_stats_cpu *stats_cpu;
 	u64 val = 0, tval;
 
+	if (pd->stats_cpu == NULL)
+		return val;
+
 	for_each_possible_cpu(cpu) {
 		unsigned int start;
 		stats_cpu = per_cpu_ptr(pd->stats_cpu, cpu);

  reply	other threads:[~2012-02-29 17:36 UTC|newest]

Thread overview: 56+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-02-23 22:30 [PATCHSET] mempool, percpu, blkcg: fix percpu stat allocation and remove stats_lock Tejun Heo
2012-02-23 22:30 ` [PATCH 1/8] mempool: factor out mempool_fill() Tejun Heo
2012-02-23 22:30 ` [PATCH 2/8] mempool: separate out __mempool_create() Tejun Heo
2012-02-23 22:30 ` [PATCH 3/8] mempool, percpu: implement percpu mempool Tejun Heo
2012-02-23 22:30 ` [PATCH 4/8] block: fix deadlock through percpu allocation in blk-cgroup Tejun Heo
2012-02-23 22:30 ` [PATCH 5/8] blkcg: don't use percpu for merged stats Tejun Heo
2012-02-23 22:30 ` [PATCH 6/8] blkcg: simplify stat reset Tejun Heo
2012-02-23 22:30 ` [PATCH 7/8] blkcg: restructure blkio_get_stat() Tejun Heo
2012-02-23 22:30 ` [PATCH 8/8] blkcg: remove blkio_group->stats_lock Tejun Heo
2012-02-23 22:43 ` [PATCHSET] mempool, percpu, blkcg: fix percpu stat allocation and remove stats_lock Andrew Morton
2012-02-23 23:01   ` Tejun Heo
2012-02-23 23:12     ` Tejun Heo
2012-02-23 23:22       ` Andrew Morton
2012-02-23 23:24         ` Tejun Heo
2012-02-24 14:20       ` Vivek Goyal
2012-02-25 21:44         ` Tejun Heo
2012-02-27  3:11           ` Vivek Goyal
2012-02-27  9:11             ` Tejun Heo
2012-02-27 19:43               ` Vivek Goyal
2012-02-29 17:36                 ` Vivek Goyal [this message]
2012-03-05 22:13                   ` Tejun Heo
2012-03-06 21:09                     ` Vivek Goyal
2012-03-06 21:20                       ` Andrew Morton
2012-03-06 21:34                         ` Vivek Goyal
2012-03-06 21:55                           ` Andrew Morton
2012-03-07 14:55                             ` Vivek Goyal
2012-03-07 17:05                               ` Tejun Heo
2012-03-07 19:13                                 ` Vivek Goyal
2012-03-07 19:22                                   ` Tejun Heo
2012-03-07 19:42                                     ` Vivek Goyal
2012-03-07 22:56                                       ` Tejun Heo
2012-03-07 23:08                                         ` Andrew Morton
2012-03-07 23:15                                           ` Tejun Heo
2012-03-07 23:05                               ` Andrew Morton
2012-03-08 17:57                                 ` Vivek Goyal
2012-03-08 18:08                                   ` Tejun Heo
2012-03-08 18:11                                     ` Tejun Heo
2012-03-08 18:22                                       ` Vivek Goyal
2012-03-08 18:27                                         ` Tejun Heo
2012-03-15 16:48                                           ` Vivek Goyal
2012-03-15 16:59                                             ` Tejun Heo
2012-03-20 11:50                                               ` Jens Axboe
2012-03-08 20:16                                     ` Vivek Goyal
2012-03-08 20:33                                       ` Vivek Goyal
2012-03-08 20:35                                         ` Tejun Heo
2012-03-08 19:06                                   ` Andrew Morton
2012-02-25  3:44       ` Vivek Goyal
2012-02-25 21:46         ` Tejun Heo
2012-02-25 22:21           ` Tejun Heo
2012-02-27 14:25             ` Vivek Goyal
2012-02-27 14:40               ` Vivek Goyal
2012-03-05 17:45                 ` Tejun Heo
2012-02-27 18:22       ` Vivek Goyal
2012-02-29 19:03         ` Vivek Goyal
2012-03-05 17:20           ` Tejun Heo
2012-03-05 18:03             ` Vivek Goyal

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20120229173639.GB5930@redhat.com \
    --to=vgoyal@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=avi@redhat.com \
    --cc=axboe@kernel.dk \
    --cc=cl@linux-foundation.org \
    --cc=ctalbott@google.com \
    --cc=dpshah@google.com \
    --cc=hughd@google.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=nate@cpanel.net \
    --cc=rni@google.com \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.