>From 8b3e5b1a45c47043e4ae3a066ca494d9211cc8be Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 16 May 2017 14:46:03 +0300 Subject: [PATCH] btrfs: Rework the metadata reclaim algorithm Currently the metadata reclaim algorithm works by allowing up to 2 commit cycles (where a commit cycle is defined as an iteration through all states in flush_space) for every space reservation ticket on the space_info->tickets list. Empirical testing shows that this improves the number of reservations which could be satisfied; however, it also leads to a very large number of transaction commits. Testing showed that for the same workload ticketed rework generates ~4.5k transactions whereas pre-ticketed work generated ~1.5k. This results in generic/027 runtime increasing almost 3x (from 300 to 800 seconds), mainly due to the 'rm' phase causing a lot more transaction commits. Current patch fixes this by modifying the algorithm in such a way so as to allow only 1 transaction commit for every ticket while allowing every other metadata cleaning operation to be performed twice. That is, the code stops flushing before the 2nd transaction commit commences. In addition to runtime measurements I also measured the number of reservations which are being satisfied. 
The test case is the following: #!/bin/bash create_file() { local dir=$1 local direct=$2 local i=0 mkdir -p $dir >/dev/null 2>&1 local STARTTIME=$(date +%s) while xfs_io -f $direct -c "pwrite 0 1k" $dir/file_$i >/dev/null 2>&1; do let i=$i+1 done local ENDTIME=$(date +%s) echo "Created $i files before returning error, time taken $(($ENDTIME - $STARTTIME))" } dir=/media/scratch/testdir/ loop=1 i=1 while [ $i -le $loop ]; do nr_worker=8 while [ $nr_worker -gt 0 ]; do # half buffered I/O half direct I/O if [ `expr $nr_worker % 2` -eq 0 ]; then create_file $dir/$nr_worker -d & else create_file $dir/$nr_worker & fi let nr_worker=$nr_worker-1 done wait STARTTIME=$(date +%s) rm -rf $dir ENDTIME=$(date +%s) echo "rming took $(($ENDTIME - $STARTTIME)) seconds" let i=$i+1 done Running this 10 times yields the following sums of successful writes for all 8 workers per iteration: Pre-ticketed Ticketed Patched 9647 9688 9492 9811 9805 9627 9838 9656 9569 9818 9809 9419 9684 9883 9495 9755 9861 9696 9791 9756 9362 9852 9714 9489 9880 9787 9657 9809 9863 9453 average 9788 9782 9525 100% 97.38% median 9810 9787 9495 100% 97.02% stdev 73 77 108 stdev/average 0.75% 0.79% 1.14% stdev/median 0.75% 0.79% 1.14% generic/027 runtime: 300s 800s 225s Signed-off-by: Nikolay Borisov --- fs/btrfs/extent-tree.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3ab1f88af038..8124a750fd6b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4973,8 +4973,8 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) struct btrfs_fs_info *fs_info; struct btrfs_space_info *space_info; u64 to_reclaim; - int flush_state; - int commit_cycles = 0; + int flush_state = FLUSH_DELAYED_ITEMS_NR; + bool committed = false; u64 last_tickets_id; fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); @@ -4991,8 +4991,7 @@ static void 
btrfs_async_reclaim_metadata_space(struct work_struct *work) last_tickets_id = space_info->tickets_id; spin_unlock(&space_info->lock); - flush_state = FLUSH_DELAYED_ITEMS_NR; - do { + while (!committed || flush_state < COMMIT_TRANS) { struct reserve_ticket *ticket; int ret; @@ -5013,21 +5012,21 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } else { last_tickets_id = space_info->tickets_id; flush_state = FLUSH_DELAYED_ITEMS_NR; - if (commit_cycles) - commit_cycles--; + if (committed) + committed = false; } + spin_unlock(&space_info->lock); if (flush_state > COMMIT_TRANS) { - commit_cycles++; - if (commit_cycles > 2) { - wake_all_tickets(&space_info->tickets); - space_info->flush = 0; - } else { - flush_state = FLUSH_DELAYED_ITEMS_NR; - } + committed = true; + flush_state = FLUSH_DELAYED_ITEMS_NR; } - spin_unlock(&space_info->lock); - } while (flush_state <= COMMIT_TRANS); + } + + spin_lock(&space_info->lock); + wake_all_tickets(&space_info->tickets); + space_info->flush = 0; + spin_unlock(&space_info->lock); } void btrfs_init_async_reclaim_work(struct work_struct *work) -- 2.7.4