* [PATCH 1/4] btrfs: rip out may_commit_transaction
2021-06-18 15:18 [PATCH 0/4][v2] btrfs: commit the transaction unconditionally for ensopc Josef Bacik
@ 2021-06-18 15:18 ` Josef Bacik
2021-06-18 15:18 ` [PATCH 2/4] btrfs: remove FLUSH_DELAYED_REFS from data enospc flushing Josef Bacik
` (3 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: Josef Bacik @ 2021-06-18 15:18 UTC (permalink / raw)
To: linux-btrfs, kernel-team
may_commit_transaction was introduced before the ticketing
infrastructure existed. There was a problem where we'd legitimately be
out of space, but every reservation would trigger a transaction commit
and then fail. Thus if you had 1000 things trying to make a
reservation, they'd all do the flushing loop and thus commit the
transaction 1000 times before they'd get their ENOSPC.
This helper was introduced to short circuit this, if there wasn't space
that could be reclaimed by committing the transaction then simply ENOSPC
out. This made true ENOSPC tests much faster as we didn't waste a bunch
of time.
However many of our bugs over the years have been from cases where we
didn't account for some space that would be reclaimed by committing a
transaction. The delayed refs rsv space, delayed rsv, many pinned bytes
miscalculations, etc. And in the meantime the original problem has been
solved with ticketing. We no longer will commit the transaction 1000
times. Instead we'll get 1000 waiters, we will go through the flushing
mechanisms, and if there's no progress after 2 loops we ENOSPC everybody
out. The ticketing infrastructure gives us a deterministic way to see
if we're making progress or not, thus we avoid a lot of extra work.
So simplify this step by simply unconditionally committing the
transaction. This removes what is arguably our most common source of
early ENOSPC bugs and will allow us to drastically simplify many of the
things we track because we simply won't need them with this stuff gone.
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
fs/btrfs/ctree.h | 1 -
fs/btrfs/space-info.c | 141 +++--------------------------------
include/trace/events/btrfs.h | 3 +-
3 files changed, 12 insertions(+), 133 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6131b58f779f..9860bfac9ace 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2786,7 +2786,6 @@ enum btrfs_flush_state {
ALLOC_CHUNK_FORCE = 9,
RUN_DELAYED_IPUTS = 10,
COMMIT_TRANS = 11,
- FORCE_COMMIT_TRANS = 12,
};
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 84f6601381a8..441a1b94806b 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -133,18 +133,13 @@
* operations, however they won't be usable until the transaction commits.
*
* COMMIT_TRANS
- * may_commit_transaction() is the ultimate arbiter on whether we commit the
- * transaction or not. In order to avoid constantly churning we do all the
- * above flushing first and then commit the transaction as the last resort.
- * However we need to take into account things like pinned space that would
- * be freed, plus any delayed work we may not have gotten rid of in the case
- * of metadata.
- *
- * FORCE_COMMIT_TRANS
- * For use by the preemptive flusher. We use this to bypass the ticketing
- * checks in may_commit_transaction, as we have more information about the
- * overall state of the system and may want to commit the transaction ahead
- * of actual ENOSPC conditions.
+ * This will commit the transaction. Historically we had a lot of logic
+ * surrounding whether or not we'd commit the transaction, but this was born
+ * out of a pre-tickets era where we could end up committing the transaction
+ * thousands of times in a row without making progress. Now thanks to our
+ * ticketing system we know if we're not making progress and can error
+ * everybody out after a few commits rather than burning the disk hoping for
+ * a different answer.
*
* OVERCOMMIT
*
@@ -621,109 +616,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
}
}
-/**
- * Possibly commit the transaction if its ok to
- *
- * @fs_info: the filesystem
- * @space_info: space_info we are checking for commit, either data or metadata
- *
- * This will check to make sure that committing the transaction will actually
- * get us somewhere and then commit the transaction if it does. Otherwise it
- * will return -ENOSPC.
- */
-static int may_commit_transaction(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info)
-{
- struct reserve_ticket *ticket = NULL;
- struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
- struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
- struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
- struct btrfs_trans_handle *trans;
- u64 reclaim_bytes = 0;
- u64 bytes_needed = 0;
- u64 cur_free_bytes = 0;
-
- trans = (struct btrfs_trans_handle *)current->journal_info;
- if (trans)
- return -EAGAIN;
-
- spin_lock(&space_info->lock);
- cur_free_bytes = btrfs_space_info_used(space_info, true);
- if (cur_free_bytes < space_info->total_bytes)
- cur_free_bytes = space_info->total_bytes - cur_free_bytes;
- else
- cur_free_bytes = 0;
-
- if (!list_empty(&space_info->priority_tickets))
- ticket = list_first_entry(&space_info->priority_tickets,
- struct reserve_ticket, list);
- else if (!list_empty(&space_info->tickets))
- ticket = list_first_entry(&space_info->tickets,
- struct reserve_ticket, list);
- if (ticket)
- bytes_needed = ticket->bytes;
-
- if (bytes_needed > cur_free_bytes)
- bytes_needed -= cur_free_bytes;
- else
- bytes_needed = 0;
- spin_unlock(&space_info->lock);
-
- if (!bytes_needed)
- return 0;
-
- trans = btrfs_join_transaction(fs_info->extent_root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- /*
- * See if there is enough pinned space to make this reservation, or if
- * we have block groups that are going to be freed, allowing us to
- * possibly do a chunk allocation the next loop through.
- */
- if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
- __percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes_needed,
- BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
- goto commit;
-
- /*
- * See if there is some space in the delayed insertion reserve for this
- * reservation. If the space_info's don't match (like for DATA or
- * SYSTEM) then just go enospc, reclaiming this space won't recover any
- * space to satisfy those reservations.
- */
- if (space_info != delayed_rsv->space_info)
- goto enospc;
-
- spin_lock(&delayed_rsv->lock);
- reclaim_bytes += delayed_rsv->reserved;
- spin_unlock(&delayed_rsv->lock);
-
- spin_lock(&delayed_refs_rsv->lock);
- reclaim_bytes += delayed_refs_rsv->reserved;
- spin_unlock(&delayed_refs_rsv->lock);
-
- spin_lock(&trans_rsv->lock);
- reclaim_bytes += trans_rsv->reserved;
- spin_unlock(&trans_rsv->lock);
-
- if (reclaim_bytes >= bytes_needed)
- goto commit;
- bytes_needed -= reclaim_bytes;
-
- if (__percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes_needed,
- BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
- goto enospc;
-
-commit:
- return btrfs_commit_transaction(trans);
-enospc:
- btrfs_end_transaction(trans);
- return -ENOSPC;
-}
-
/*
* Try to flush some data based on policy set by @state. This is only advisory
* and may fail for various reasons. The caller is supposed to examine the
@@ -801,9 +693,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
btrfs_wait_on_delayed_iputs(fs_info);
break;
case COMMIT_TRANS:
- ret = may_commit_transaction(fs_info, space_info);
- break;
- case FORCE_COMMIT_TRANS:
+ ASSERT(current->journal_info == NULL);
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -1193,7 +1083,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
(delayed_block_rsv->reserved +
delayed_refs_rsv->reserved)) {
to_reclaim = space_info->bytes_pinned;
- flush = FORCE_COMMIT_TRANS;
+ flush = COMMIT_TRANS;
} else if (delayed_block_rsv->reserved >
delayed_refs_rsv->reserved) {
to_reclaim = delayed_block_rsv->reserved;
@@ -1257,18 +1147,9 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
* is a negative delayed ref count for the extent and assume that the space
* will be freed, and thus increase ->total_bytes_pinned.
*
- * Running the delayed refs gives us the actual real view of what will be
- * freed at the transaction commit time. This stage will not actually free
- * space for us, it just makes sure that may_commit_transaction() has all of
- * the information it needs to make the right decision.
- *
* COMMIT_TRANS
- * This is where we reclaim all of the pinned space generated by the previous
- * two stages. We will not commit the transaction if we don't think we're
- * likely to satisfy our request, which means if our current free space +
- * total_bytes_pinned < reservation we will not commit. This is why the
- * previous states are actually important, to make sure we know for sure
- * whether committing the transaction will allow us to make progress.
+ * This is where we reclaim all of the pinned space generated by running the
+ * iputs.
*
* ALLOC_CHUNK_FORCE
* For data we start with alloc chunk force, however we could have been full
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 8144b8e345b5..a63a3d34b47b 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -100,8 +100,7 @@ struct btrfs_space_info;
EM( ALLOC_CHUNK, "ALLOC_CHUNK") \
EM( ALLOC_CHUNK_FORCE, "ALLOC_CHUNK_FORCE") \
EM( RUN_DELAYED_IPUTS, "RUN_DELAYED_IPUTS") \
- EM( COMMIT_TRANS, "COMMIT_TRANS") \
- EMe(FORCE_COMMIT_TRANS, "FORCE_COMMIT_TRANS")
+ EMe(COMMIT_TRANS, "COMMIT_TRANS")
/*
* First define the enums in the above macros to be exported to userspace via
--
2.26.3
^ permalink raw reply related [flat|nested] 6+ messages in thread* [PATCH 4/4] btrfs: rip out ->total_bytes_pinned
2021-06-18 15:18 [PATCH 0/4][v2] btrfs: commit the transaction unconditionally for ensopc Josef Bacik
` (2 preceding siblings ...)
2021-06-18 15:18 ` [PATCH 3/4] btrfs: rip the first_ticket_bytes logic from fail_all_tickets Josef Bacik
@ 2021-06-18 15:18 ` Josef Bacik
2021-06-22 11:11 ` [PATCH 0/4][v2] btrfs: commit the transaction unconditionally for ensopc David Sterba
4 siblings, 0 replies; 6+ messages in thread
From: Josef Bacik @ 2021-06-18 15:18 UTC (permalink / raw)
To: linux-btrfs, kernel-team; +Cc: Nikolay Borisov
We used this in may_commit_transaction() in order to determine if we
needed to commit the transaction. However we no longer have that logic
and thus have no use of this counter anymore, so delete it.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
fs/btrfs/block-group.c | 3 ---
fs/btrfs/delayed-ref.c | 26 --------------------------
fs/btrfs/disk-io.c | 3 ---
fs/btrfs/extent-tree.c | 15 ---------------
fs/btrfs/space-info.c | 7 -------
fs/btrfs/space-info.h | 30 ------------------------------
fs/btrfs/sysfs.c | 13 -------------
7 files changed, 97 deletions(-)
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 38885b29e6e5..e624e1d9840f 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1399,7 +1399,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
btrfs_space_info_update_bytes_pinned(fs_info, space_info,
-block_group->pinned);
space_info->bytes_readonly += block_group->pinned;
- __btrfs_mod_total_bytes_pinned(space_info, -block_group->pinned);
block_group->pinned = 0;
spin_unlock(&block_group->lock);
@@ -3062,8 +3061,6 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- __btrfs_mod_total_bytes_pinned(cache->space_info,
- num_bytes);
set_extent_dirty(&trans->transaction->pinned_extents,
bytenr, bytenr + num_bytes - 1,
GFP_NOFS | __GFP_NOFAIL);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index c92d9d4f5f46..06bc842ecdb3 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -641,7 +641,6 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs =
&trans->transaction->delayed_refs;
struct btrfs_fs_info *fs_info = trans->fs_info;
- u64 flags = btrfs_ref_head_to_space_flags(existing);
int old_ref_mod;
BUG_ON(existing->is_data != update->is_data);
@@ -711,26 +710,6 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
}
}
- /*
- * This handles the following conditions:
- *
- * 1. We had a ref mod of 0 or more and went negative, indicating that
- * we may be freeing space, so add our space to the
- * total_bytes_pinned counter.
- * 2. We were negative and went to 0 or positive, so no longer can say
- * that the space would be pinned, decrement our counter from the
- * total_bytes_pinned counter.
- * 3. We are now at 0 and have ->must_insert_reserved set, which means
- * this was a new allocation and then we dropped it, and thus must
- * add our space to the total_bytes_pinned counter.
- */
- if (existing->total_ref_mod < 0 && old_ref_mod >= 0)
- btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes);
- else if (existing->total_ref_mod >= 0 && old_ref_mod < 0)
- btrfs_mod_total_bytes_pinned(fs_info, flags, -existing->num_bytes);
- else if (existing->total_ref_mod == 0 && existing->must_insert_reserved)
- btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes);
-
spin_unlock(&existing->lock);
}
@@ -835,17 +814,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
head_ref = existing;
} else {
- u64 flags = btrfs_ref_head_to_space_flags(head_ref);
-
if (head_ref->is_data && head_ref->ref_mod < 0) {
delayed_refs->pending_csums += head_ref->num_bytes;
trans->delayed_ref_updates +=
btrfs_csum_bytes_to_leaves(trans->fs_info,
head_ref->num_bytes);
}
- if (head_ref->ref_mod < 0)
- btrfs_mod_total_bytes_pinned(trans->fs_info, flags,
- head_ref->num_bytes);
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
atomic_inc(&delayed_refs->num_entries);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 544bb7a82e57..8305c1cad195 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4696,9 +4696,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
cache->space_info->bytes_reserved -= head->num_bytes;
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- percpu_counter_add_batch(
- &cache->space_info->total_bytes_pinned,
- head->num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
btrfs_put_block_group(cache);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 421120d6a14b..d296483d148f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1804,19 +1804,6 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
}
- /*
- * We were dropping refs, or had a new ref and dropped it, and thus must
- * adjust down our total_bytes_pinned, the space may or may not have
- * been pinned and so is accounted for properly in the pinned space by
- * now.
- */
- if (head->total_ref_mod < 0 ||
- (head->total_ref_mod == 0 && head->must_insert_reserved)) {
- u64 flags = btrfs_ref_head_to_space_flags(head);
-
- btrfs_mod_total_bytes_pinned(fs_info, flags, -head->num_bytes);
- }
-
btrfs_delayed_refs_rsv_release(fs_info, nr_items);
}
@@ -2551,7 +2538,6 @@ static int pin_down_extent(struct btrfs_trans_handle *trans,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- __btrfs_mod_total_bytes_pinned(cache->space_info, num_bytes);
set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
return 0;
@@ -2762,7 +2748,6 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
cache->pinned -= len;
btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
space_info->max_extent_size = 0;
- __btrfs_mod_total_bytes_pinned(space_info, -len);
if (cache->ro) {
space_info->bytes_readonly += len;
readonly = true;
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index e969cba0f3b7..cc8634ee0f42 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -192,13 +192,6 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
if (!space_info)
return -ENOMEM;
- ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
- GFP_KERNEL);
- if (ret) {
- kfree(space_info);
- return ret;
- }
-
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
INIT_LIST_HEAD(&space_info->block_groups[i]);
init_rwsem(&space_info->groups_sem);
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 11eff2139aae..46a024f5aa70 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -43,18 +43,6 @@ struct btrfs_space_info {
u64 flags;
- /*
- * bytes_pinned is kept in line with what is actually pinned, as in
- * we've called update_block_group and dropped the bytes_used counter
- * and increased the bytes_pinned counter. However this means that
- * bytes_pinned does not reflect the bytes that will be pinned once the
- * delayed refs are flushed, so this counter is inc'ed every time we
- * call btrfs_free_extent so it is a realtime count of what will be
- * freed once the transaction is committed. It will be zeroed every
- * time the transaction commits.
- */
- struct percpu_counter total_bytes_pinned;
-
struct list_head list;
/* Protected by the spinlock 'lock'. */
struct list_head ro_bgs;
@@ -163,22 +151,4 @@ static inline void btrfs_space_info_free_bytes_may_use(
}
int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
enum btrfs_reserve_flush_enum flush);
-
-static inline void __btrfs_mod_total_bytes_pinned(
- struct btrfs_space_info *space_info,
- s64 mod)
-{
- percpu_counter_add_batch(&space_info->total_bytes_pinned, mod,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
-}
-
-static inline void btrfs_mod_total_bytes_pinned(struct btrfs_fs_info *fs_info,
- u64 flags, s64 mod)
-{
- struct btrfs_space_info *space_info = btrfs_find_space_info(fs_info, flags);
-
- ASSERT(space_info);
- __btrfs_mod_total_bytes_pinned(space_info, mod);
-}
-
#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 52c5311873d3..e68c0afb7ada 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -665,15 +665,6 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \
} \
BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field)
-static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj,
- struct kobj_attribute *a,
- char *buf)
-{
- struct btrfs_space_info *sinfo = to_space_info(kobj);
- s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned);
- return scnprintf(buf, PAGE_SIZE, "%lld\n", val);
-}
-
static ssize_t btrfs_space_info_show_enospc_events(struct kobject *kobj,
struct kobj_attribute *a,
char *buf)
@@ -693,8 +684,6 @@ SPACE_INFO_ATTR(bytes_readonly);
SPACE_INFO_ATTR(bytes_zone_unusable);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
-BTRFS_ATTR(space_info, total_bytes_pinned,
- btrfs_space_info_show_total_bytes_pinned);
BTRFS_ATTR(space_info, enospc_events,
btrfs_space_info_show_enospc_events);
@@ -709,7 +698,6 @@ static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(space_info, bytes_zone_unusable),
BTRFS_ATTR_PTR(space_info, disk_used),
BTRFS_ATTR_PTR(space_info, disk_total),
- BTRFS_ATTR_PTR(space_info, total_bytes_pinned),
BTRFS_ATTR_PTR(space_info, enospc_events),
NULL,
};
@@ -718,7 +706,6 @@ ATTRIBUTE_GROUPS(space_info);
static void space_info_release(struct kobject *kobj)
{
struct btrfs_space_info *sinfo = to_space_info(kobj);
- percpu_counter_destroy(&sinfo->total_bytes_pinned);
kfree(sinfo);
}
--
2.26.3
^ permalink raw reply related [flat|nested] 6+ messages in thread