All of lore.kernel.org
 help / color / mirror / Atom feed
* per-cpu blk_plug_list
@ 2004-03-01 21:18 Chen, Kenneth W
  2004-03-01 22:06 ` Jens Axboe
                   ` (3 more replies)
  0 siblings, 4 replies; 8+ messages in thread
From: Chen, Kenneth W @ 2004-03-01 21:18 UTC (permalink / raw)
  To: linux-kernel; +Cc: Andrew Morton

[-- Attachment #1: Type: text/plain, Size: 3258 bytes --]

We were surprised to find global lock in I/O path still exists on 2.6
kernel.  This global lock triggers unacceptable lock contention and
prevents 2.6 kernel to scale on the I/O side.

blk_plug_list/blk_plug_lock manages plug/unplug action.  When you have
lots of cpu simultaneously submits I/O, there are lots of movement with
the device queue on and off that global list.  Our measurement showed
that blk_plug_lock contention prevents linux-2.6.3 kernel to scale pass
beyond 40 thousand I/O per second in the I/O submit path.

Looking at the vmstat and readprofile output with vanilla 2.6.3 kernel:

procs -----------memory---------- -----io---- -system-- ----cpu----
 r  b  swpd   free   buff  cache    bi    bo   in    cs   us sy id wa
409 484  0 22470560  10160 181792  89456  5867 46899 26383 7 93  0  0
427 488  0 22466176  10160 181792  90307  6057 46818 26155 7 93  0  0
448 469  0 22462208  10160 181792  84076  5769 48217 26782 7 93  0  0

575324 total                                      0.0818
225342 blk_run_queues                           391.2188
140164 __make_request                            38.4221
131765 scsi_request_fn                           55.6440
 21566 generic_unplug_device                     61.2670
  6050 get_request                                3.7812

All cpus are pegged at 93% kernel time spinning on the lock.
blk_plug_device() and blk_remove_plug() didn't show up in readprofile
because they spin with irq disabled and readprofile is based on timer
interrupt.  Nevertheless, it's obvious from blk_run_queues().

The plug management need to be converted into per-cpu.  The idea here
is to manage device queue on per-cpu plug list.  This would also allows
optimal queue kick-off since submit/kick-off typically occurs within
same system call. With our proof-of-concept patch, here are the results,
again vmstat and readprofile output:

procs -----------memory---------- -----io----   -system--    ----cpu----
 r  b  swpd   free   buff  cache    bi    bo     in    cs    us sy id wa
30 503  0 22506176   9536 172096  212849 20251 102537 177093 27 20  0 53
11 532  0 22505792   9536 172096  209073 21235 103374 177183 29 20  0 51
27 499  0 22505408   9536 172096  205334 21943 103883 176649 30 20  0 50

1988800 total                                      0.2828
1174759 cpu_idle                                2447.4146
156385 default_idle                             4887.0312
 72586 do_softirq                                108.0149
 62463 schedule                                   12.0492
 51913 scsi_request_fn                            21.9227
 34351 __make_request                              9.4164
 24370 dio_bio_end_io                             63.4635
 22652 scsi_end_request                           44.2422

It clearly relieves the global lock contention and the kernel time is
now in the expected range.  I/O has been improved to 110K random I/O
per second and is now only limited by the disks instead of kernel.

The workload that triggers this scalability issue is an I/O intensive
database workload running on a 32P system.

Comments on the patch are welcome, I'm sure there will be a couple of
iterations on the solution.

- Ken

[-- Attachment #2: percpu_blk_plug.patch --]
[-- Type: application/octet-stream, Size: 5539 bytes --]

diff -Nurp linux-2.6.3/drivers/block/ll_rw_blk.c linux-2.6.3.blk/drivers/block/ll_rw_blk.c
--- linux-2.6.3/drivers/block/ll_rw_blk.c	2004-02-17 19:57:16.000000000 -0800
+++ linux-2.6.3.blk/drivers/block/ll_rw_blk.c	2004-03-01 11:36:09.000000000 -0800
@@ -39,8 +39,7 @@ static kmem_cache_t *request_cachep;
 /*
  * plug management
  */
-static LIST_HEAD(blk_plug_list);
-static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
+static struct blk_plug_struct blk_plug_array[NR_CPUS];
 
 static wait_queue_head_t congestion_wqh[2];
 
@@ -1096,10 +1095,14 @@ void blk_plug_device(request_queue_t *q)
 	 */
 	if (!blk_queue_plugged(q)
 	    && !test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) {
-		spin_lock(&blk_plug_lock);
-		list_add_tail(&q->plug_list, &blk_plug_list);
+		struct blk_plug_struct * local_blk_plug =
+					 &blk_plug_array[smp_processor_id()];
+
+		spin_lock(&local_blk_plug->blk_plug_lock);
+		list_add_tail(&q->plug_list, &local_blk_plug->blk_plug_head);
+		q->blk_plug = local_blk_plug;
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
-		spin_unlock(&blk_plug_lock);
+		spin_unlock(&local_blk_plug->blk_plug_lock);
 	}
 }
 
@@ -1113,10 +1116,11 @@ int blk_remove_plug(request_queue_t *q)
 {
 	WARN_ON(!irqs_disabled());
 	if (blk_queue_plugged(q)) {
-		spin_lock(&blk_plug_lock);
+		struct blk_plug_struct * blk_plug = q->blk_plug;
+		spin_lock(&blk_plug->blk_plug_lock);
 		list_del_init(&q->plug_list);
 		del_timer(&q->unplug_timer);
-		spin_unlock(&blk_plug_lock);
+		spin_unlock(&blk_plug->blk_plug_lock);
 		return 1;
 	}
 
@@ -1237,6 +1241,26 @@ void blk_run_queue(struct request_queue 
 
 EXPORT_SYMBOL(blk_run_queue);
 
+/*
+ * blk_run_queues_cpu - fire all plugged queues on this cpu
+ */
+#define blk_plug_entry(entry) list_entry((entry), request_queue_t, plug_list)
+void blk_run_queues_cpu(int cpu)
+{
+	struct blk_plug_struct * cur_blk_plug = &blk_plug_array[cpu];
+	struct list_head * head = &cur_blk_plug->blk_plug_head;
+	spinlock_t *lock = &cur_blk_plug->blk_plug_lock;
+
+	spin_lock_irq(lock);
+	while (!list_empty(head)) {
+		request_queue_t *q = blk_plug_entry(head->next);
+		spin_unlock_irq(lock);
+		q->unplug_fn(q);
+		spin_lock_irq(lock);
+	}
+	spin_unlock_irq(lock);
+}
+
 /**
  * blk_run_queues - fire all plugged queues
  *
@@ -1245,30 +1269,12 @@ EXPORT_SYMBOL(blk_run_queue);
  *   are currently stopped are ignored. This is equivalent to the older
  *   tq_disk task queue run.
  **/
-#define blk_plug_entry(entry) list_entry((entry), request_queue_t, plug_list)
 void blk_run_queues(void)
 {
-	LIST_HEAD(local_plug_list);
-
-	spin_lock_irq(&blk_plug_lock);
-
-	/*
-	 * this will happen fairly often
-	 */
-	if (list_empty(&blk_plug_list))
-		goto out;
-
-	list_splice_init(&blk_plug_list, &local_plug_list);
-	
-	while (!list_empty(&local_plug_list)) {
-		request_queue_t *q = blk_plug_entry(local_plug_list.next);
+	int i;
 
-		spin_unlock_irq(&blk_plug_lock);
-		q->unplug_fn(q);
-		spin_lock_irq(&blk_plug_lock);
-	}
-out:
-	spin_unlock_irq(&blk_plug_lock);
+	for_each_cpu(i)
+		blk_run_queues_cpu(i);
 }
 
 EXPORT_SYMBOL(blk_run_queues);
@@ -2687,6 +2693,11 @@ int __init blk_dev_init(void)
 
 	for (i = 0; i < ARRAY_SIZE(congestion_wqh); i++)
 		init_waitqueue_head(&congestion_wqh[i]);
+
+	for (i = 0; i < NR_CPUS; i++) {
+		INIT_LIST_HEAD(&blk_plug_array[i].blk_plug_head);
+		spin_lock_init(&blk_plug_array[i].blk_plug_lock);
+	}
 	return 0;
 }
 
diff -Nurp linux-2.6.3/fs/direct-io.c linux-2.6.3.blk/fs/direct-io.c
--- linux-2.6.3/fs/direct-io.c	2004-02-17 19:58:16.000000000 -0800
+++ linux-2.6.3.blk/fs/direct-io.c	2004-03-01 11:36:09.000000000 -0800
@@ -329,7 +329,7 @@ static struct bio *dio_await_one(struct 
 		if (dio->bio_list == NULL) {
 			dio->waiter = current;
 			spin_unlock_irqrestore(&dio->bio_list_lock, flags);
-			blk_run_queues();
+			blk_run_queues_cpu(smp_processor_id());
 			io_schedule();
 			spin_lock_irqsave(&dio->bio_list_lock, flags);
 			dio->waiter = NULL;
@@ -960,7 +960,7 @@ direct_io_worker(int rw, struct kiocb *i
 		if (ret == 0)
 			ret = dio->result;	/* Bytes written */
 		finished_one_bio(dio);		/* This can free the dio */
-		blk_run_queues();
+		blk_run_queues_cpu(smp_processor_id());
 	} else {
 		finished_one_bio(dio);
 		ret2 = dio_await_completion(dio);
diff -Nurp linux-2.6.3/include/linux/blkdev.h linux-2.6.3.blk/include/linux/blkdev.h
--- linux-2.6.3/include/linux/blkdev.h	2004-02-17 19:57:20.000000000 -0800
+++ linux-2.6.3.blk/include/linux/blkdev.h	2004-03-01 11:36:09.000000000 -0800
@@ -267,6 +267,11 @@ struct blk_queue_tag {
 	atomic_t refcnt;		/* map can be shared */
 };
 
+struct blk_plug_struct {
+	struct list_head	blk_plug_head;
+	spinlock_t		blk_plug_lock;
+} ____cacheline_aligned;
+
 struct request_queue
 {
 	/*
@@ -316,6 +321,7 @@ struct request_queue
 	int			bounce_gfp;
 
 	struct list_head	plug_list;
+	struct blk_plug_struct	*blk_plug;	/* blk_plug it belongs to */
 
 	/*
 	 * various queue flags, see QUEUE_* below
diff -Nurp linux-2.6.3/include/linux/fs.h linux-2.6.3.blk/include/linux/fs.h
--- linux-2.6.3/include/linux/fs.h	2004-02-17 19:57:20.000000000 -0800
+++ linux-2.6.3.blk/include/linux/fs.h	2004-03-01 11:36:09.000000000 -0800
@@ -1152,6 +1152,7 @@ extern int blkdev_put(struct block_devic
 extern int bd_claim(struct block_device *, void *);
 extern void bd_release(struct block_device *);
 extern void blk_run_queues(void);
+extern void blk_run_queues_cpu(int);
 
 /* fs/char_dev.c */
 extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, char *);

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2004-03-03  3:54 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-03-01 21:18 per-cpu blk_plug_list Chen, Kenneth W
2004-03-01 22:06 ` Jens Axboe
2004-03-01 22:17 ` Andrew Morton
2004-03-01 22:31   ` Andrew Morton
     [not found] ` <B05667366EE6204181EABE9C1B1C0EB501F2AB4C@scsmsx401.sc.intel.com.suse.lists.linux.kernel>
2004-03-01 22:36   ` Andi Kleen
2004-03-01 22:39   ` per-cpu blk_plug_list II Andi Kleen
2004-03-03  3:44 ` per-cpu blk_plug_list Jesse Barnes
2004-03-03  3:55   ` Andrew Morton

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.