linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: <gregkh@suse.de>
To: aneesh.kumar@linux.vnet.ibm.com, dev@jaysonking.com,
	gregkh@suse.de, linux-ext4@vger.kernel.org, tytso@mit.edu
Cc: <stable@kernel.org>, <stable-commits@vger.kernel.org>
Subject: patch ext4-make-sure-all-the-block-allocation-paths-reserve-blocks.patch added to 2.6.27-stable tree
Date: Mon, 19 Apr 2010 10:26:57 -0700	[thread overview]
Message-ID: <1271698017884@kroah.org> (raw)
In-Reply-To: <1268699165-17461-4-git-send-email-tytso@mit.edu>


This is a note to let you know that we have just queued up the patch titled

    Subject: ext4: Make sure all the block allocation paths reserve blocks

to the 2.6.27-stable tree.  Its filename is

    ext4-make-sure-all-the-block-allocation-paths-reserve-blocks.patch

A git repo of this tree can be found at 
    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary


>From tytso@mit.edu  Mon Apr 19 10:20:41 2010
From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 15 Mar 2010 20:25:57 -0400
Subject: ext4: Make sure all the block allocation paths reserve blocks
To: stable@kernel.org
Cc: Ext4 Developers List <linux-ext4@vger.kernel.org>, "Theodore Ts'o" <tytso@mit.edu>, "Jayson R. King" <dev@jaysonking.com>, "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Message-ID: <1268699165-17461-4-git-send-email-tytso@mit.edu>


From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

commit a30d542a0035b886ffaafd0057ced0a2b28c3a4f upstream.

With delayed allocation we need to make sure blocks are reserved before
we attempt to allocate them. Otherwise we get a block allocation failure
(ENOSPC) during writepages, which cannot be handled. This would mean
silent data loss (we do a printk stating data will be lost). This patch
updates the DIO and fallocate code paths to do block reservation before
block allocation. This is needed to make sure parallel DIO and fallocate
requests don't take blocks out of the delayed reserve space.

When the free blocks count goes below a threshold we switch to a slow path
which looks at other CPUs' accumulated percpu counter values.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Jayson R. King <dev@jaysonking.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

---
 fs/ext4/balloc.c  |   58 +++++++++++++++++++++++++++++++++++++++---------------
 fs/ext4/ext4.h    |   13 ++++++++++++
 fs/ext4/inode.c   |    5 ----
 fs/ext4/mballoc.c |   23 ++++++++++++---------
 4 files changed, 69 insertions(+), 30 deletions(-)

--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1754,6 +1754,32 @@ out:
 	return ret;
 }
 
+int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+						ext4_fsblk_t nblocks)
+{
+	s64 free_blocks;
+	ext4_fsblk_t root_blocks = 0;
+	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+
+	free_blocks = percpu_counter_read(fbc);
+
+	if (!capable(CAP_SYS_RESOURCE) &&
+		sbi->s_resuid != current->fsuid &&
+		(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+		root_blocks = ext4_r_blocks_count(sbi->s_es);
+
+	if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
+		free_blocks = percpu_counter_sum(&sbi->s_freeblocks_counter);
+
+	if (free_blocks < (root_blocks + nblocks))
+		/* we don't have free space */
+		return -ENOSPC;
+
+	/* reduce fs free blocks counter */
+	percpu_counter_sub(fbc, nblocks);
+	return 0;
+}
+
 /**
  * ext4_has_free_blocks()
  * @sbi:	in-core super block structure.
@@ -1775,18 +1801,17 @@ ext4_fsblk_t ext4_has_free_blocks(struct
 		sbi->s_resuid != current->fsuid &&
 		(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
 		root_blocks = ext4_r_blocks_count(sbi->s_es);
-#ifdef CONFIG_SMP
-	if (free_blocks - root_blocks < FBC_BATCH)
-		free_blocks =
-			percpu_counter_sum(&sbi->s_freeblocks_counter);
-#endif
+
+	if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
+		free_blocks = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
+
 	if (free_blocks <= root_blocks)
 		/* we don't have free space */
 		return 0;
 	if (free_blocks - root_blocks < nblocks)
 		return free_blocks - root_blocks;
 	return nblocks;
- }
+}
 
 
 /**
@@ -1865,14 +1890,11 @@ ext4_fsblk_t ext4_old_new_blocks(handle_
 		/*
 		 * With delalloc we already reserved the blocks
 		 */
-		*count = ext4_has_free_blocks(sbi, *count);
-	}
-	if (*count == 0) {
-		*errp = -ENOSPC;
-		return 0;	/*return with ENOSPC error */
+		if (ext4_claim_free_blocks(sbi, *count)) {
+			*errp = -ENOSPC;
+			return 0;	/*return with ENOSPC error */
+		}
 	}
-	num = *count;
-
 	/*
 	 * Check quota for allocation of this block.
 	 */
@@ -2067,9 +2089,13 @@ allocated:
 	le16_add_cpu(&gdp->bg_free_blocks_count, -num);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
-	if (!EXT4_I(inode)->i_delalloc_reserved_flag)
-		percpu_counter_sub(&sbi->s_freeblocks_counter, num);
-
+	if (!EXT4_I(inode)->i_delalloc_reserved_flag && (*count != num)) {
+		/*
+		 * we allocated less blocks than we
+		 * claimed. Add the difference back.
+		 */
+		percpu_counter_add(&sbi->s_freeblocks_counter, *count - num);
+	}
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
 		spin_lock(sb_bgl_lock(sbi, flex_group));
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1015,6 +1015,8 @@ extern ext4_fsblk_t ext4_new_blocks(hand
 					unsigned long *count, int *errp);
 extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, unsigned long *count, int *errp);
+extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+						ext4_fsblk_t nblocks);
 extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
 						ext4_fsblk_t nblocks);
 extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
@@ -1245,6 +1247,17 @@ do {								\
 		__ext4_std_error((sb), __func__, (errno));	\
 } while (0)
 
+#ifdef CONFIG_SMP
+/* Each CPU can accumulate FBC_BATCH blocks in their local
+ * counters. So we need to make sure we have free blocks more
+ * than FBC_BATCH  * nr_cpu_ids. Also add a window of 4 times.
+ */
+#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids))
+#else
+#define EXT4_FREEBLOCKS_WATERMARK 0
+#endif
+
+
 /*
  * Inodes and files operations
  */
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1564,13 +1564,10 @@ static int ext4_da_reserve_space(struct
 	md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
 	total = md_needed + nrblocks;
 
-	if (ext4_has_free_blocks(sbi, total) < total) {
+	if (ext4_claim_free_blocks(sbi, total)) {
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 		return -ENOSPC;
 	}
-	/* reduce fs free blocks counter */
-	percpu_counter_sub(&sbi->s_freeblocks_counter, total);
-
 	EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
 	EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
 
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3194,9 +3194,15 @@ ext4_mb_mark_diskspace_used(struct ext4_
 	 * at write_begin() time for delayed allocation
 	 * do not double accounting
 	 */
-	if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
-		percpu_counter_sub(&sbi->s_freeblocks_counter,
-					ac->ac_b_ex.fe_len);
+	if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED) &&
+			ac->ac_o_ex.fe_len != ac->ac_b_ex.fe_len) {
+		/*
+		 * we allocated less blocks than we calimed
+		 * Add the difference back
+		 */
+		percpu_counter_add(&sbi->s_freeblocks_counter,
+				ac->ac_o_ex.fe_len - ac->ac_b_ex.fe_len);
+	}
 
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi,
@@ -4649,14 +4655,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t
 		/*
 		 * With delalloc we already reserved the blocks
 		 */
-		ar->len = ext4_has_free_blocks(sbi, ar->len);
-	}
-
-	if (ar->len == 0) {
-		*errp = -ENOSPC;
-		return 0;
+		if (ext4_claim_free_blocks(sbi, ar->len)) {
+			*errp = -ENOSPC;
+			return 0;
+		}
 	}

  reply	other threads:[~2010-04-19 17:29 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-03-16  0:25 [PATCH 2.6.27.y 00/11] *** SUBJECT HERE *** Theodore Ts'o
2010-03-16  0:25 ` [PATCH 2.6.27.y 01/11] ext4: invalidate pages if delalloc block allocation fails Theodore Ts'o
2010-04-19 17:26   ` patch ext4-invalidate-pages-if-delalloc-block-allocation-fails.patch added to 2.6.27-stable tree gregkh
2010-03-16  0:25 ` [PATCH 2.6.27.y 02/11] percpu counter: clean up percpu_counter_sum_and_set() Theodore Ts'o
2010-04-19 17:27   ` patch percpu-counter-clean-up-percpu_counter_sum_and_set.patch added to 2.6.27-stable tree gregkh
2010-03-16  0:25 ` [PATCH 2.6.27.y 03/11] ext4: Make sure all the block allocation paths reserve blocks Theodore Ts'o
2010-04-19 17:26   ` gregkh [this message]
2010-03-16  0:25 ` [PATCH 2.6.27.y 04/11] ext4: Add percpu dirty block accounting Theodore Ts'o
2010-03-16 18:48   ` Andreas Dilger
2010-03-17  0:51     ` tytso
2010-04-19 17:26   ` patch ext4-add-percpu-dirty-block-accounting.patch added to 2.6.27-stable tree gregkh
2010-03-16  0:25 ` [PATCH 2.6.27.y 05/11] ext4: Retry block reservation Theodore Ts'o
2010-04-19 17:27   ` patch ext4-retry-block-reservation.patch added to 2.6.27-stable tree gregkh
2010-03-16  0:26 ` [PATCH 2.6.27.y 06/11] ext4: Retry block allocation if we have free blocks left Theodore Ts'o
2010-04-19 17:26   ` patch ext4-retry-block-allocation-if-we-have-free-blocks-left.patch added to 2.6.27-stable tree gregkh
2010-03-16  0:26 ` [PATCH 2.6.27.y 07/11] ext4: Use tag dirty lookup during mpage_da_submit_io Theodore Ts'o
2010-04-19 17:27   ` patch ext4-use-tag-dirty-lookup-during-mpage_da_submit_io.patch added to 2.6.27-stable tree gregkh
2010-03-16  0:26 ` [PATCH 2.6.27.y 08/11] vfs: Remove the range_cont writeback mode Theodore Ts'o
2010-04-19 17:27   ` patch vfs-remove-the-range_cont-writeback-mode.patch added to 2.6.27-stable tree gregkh
2010-03-16  0:26 ` [PATCH 2.6.27.y 09/11] vfs: Add no_nrwrite_index_update writeback control flag Theodore Ts'o
2010-04-19 17:27   ` patch vfs-add-no_nrwrite_index_update-writeback-control-flag.patch added to 2.6.27-stable tree gregkh
2010-03-16  0:26 ` [PATCH 2.6.27.y 10/11] ext4: Fix file fragmentation during large file write Theodore Ts'o
2010-04-19 17:26   ` patch ext4-fix-file-fragmentation-during-large-file-write.patch added to 2.6.27-stable tree gregkh
2010-03-16  0:26 ` [PATCH 2.6.27.y 11/11] ext4: Implement range_cyclic in ext4_da_writepages instead of write_cache_pages Theodore Ts'o
2010-04-19 17:26   ` patch ext4-implement-range_cyclic-in-ext4_da_writepages-instead-of-write_cache_pages.patch added to 2.6.27-stable tree gregkh
2010-03-17  3:10 ` [PATCH 2.6.27.y 00/11] *** SUBJECT HERE *** Jayson R. King

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1271698017884@kroah.org \
    --to=gregkh@suse.de \
    --cc=aneesh.kumar@linux.vnet.ibm.com \
    --cc=dev@jaysonking.com \
    --cc=linux-ext4@vger.kernel.org \
    --cc=stable-commits@vger.kernel.org \
    --cc=stable@kernel.org \
    --cc=tytso@mit.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).