From mboxrd@z Thu Jan 1 00:00:00 1970 From: Jan Kara Subject: [PATCH] Split JBD checkpoint list Date: Tue, 24 May 2005 21:03:23 +0200 Message-ID: <20050524190323.GA28720@atrey.karlin.mff.cuni.cz> Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="envbJBWh7q8WU6mo" Cc: linux-fsdevel@vger.kernel.org Return-path: Received: from atrey.karlin.mff.cuni.cz ([195.113.31.123]:56808 "EHLO atrey.karlin.mff.cuni.cz") by vger.kernel.org with ESMTP id S261421AbVEXTD2 (ORCPT ); Tue, 24 May 2005 15:03:28 -0400 To: sct@redhat.com Content-Disposition: inline Sender: linux-fsdevel-owner@vger.kernel.org List-Id: linux-fsdevel.vger.kernel.org --envbJBWh7q8WU6mo Content-Type: text/plain; charset=us-ascii Content-Disposition: inline Hello! Attached patch add new transaction checkpoint list containing buffers which were already submitted for IO as Stephen once suggested. The patch should close some possible false assertion failures and maybe gain a bit of performance under some load. I gave the patch some basic testing. Stephen, can you please check the patch? I want to run some more tests (any suggestions?) and also check that journalling correctness was preserved (probably by running some IO load under UML, killing UML processes randomly and veryfying that FS is OK after the journal replay). Honza --envbJBWh7q8WU6mo Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename="checkpoint-2.6.12-rc4.diff" diff -rupX /home/jack/.kerndiffexclude linux-2.6.12-rc4/fs/jbd/checkpoint.c linux-2.6.12-rc4-1-checkpointfix/fs/jbd/checkpoint.c --- linux-2.6.12-rc4/fs/jbd/checkpoint.c 2005-03-03 18:58:29.000000000 +0100 +++ linux-2.6.12-rc4-1-checkpointfix/fs/jbd/checkpoint.c 2005-05-24 20:30:48.000000000 +0200 @@ -24,24 +24,68 @@ #include /* - * Unlink a buffer from a transaction. + * Unlink a buffer from a transaction checkpoint list. * * Called with j_list_lock held. */ -static inline void __buffer_unlink(struct journal_head *jh) +static inline void __buffer_unlink_first(struct journal_head *jh) { transaction_t *transaction; transaction = jh->b_cp_transaction; - jh->b_cp_transaction = NULL; jh->b_cpnext->b_cpprev = jh->b_cpprev; jh->b_cpprev->b_cpnext = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) + if (transaction->t_checkpoint_list == jh) { transaction->t_checkpoint_list = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) - transaction->t_checkpoint_list = NULL; + if (transaction->t_checkpoint_list == jh) + transaction->t_checkpoint_list = NULL; + } +} + +/* + * Unlink a buffer from a transaction checkpoint(io) list. + * + * Called with j_list_lock held. + */ + +static inline void __buffer_unlink(struct journal_head *jh) +{ + transaction_t *transaction; + + transaction = jh->b_cp_transaction; + + __buffer_unlink_first(jh); + if (transaction->t_checkpoint_io_list == jh) { + transaction->t_checkpoint_io_list = jh->b_cpnext; + if (transaction->t_checkpoint_io_list == jh) + transaction->t_checkpoint_io_list = NULL; + } +} + +/* + * Move a buffer from the checkpoint list to the checkpoint io list + * + * Called with j_list_lock held + */ + +static inline void __buffer_relink_io(struct journal_head *jh) +{ + transaction_t *transaction; + + transaction = jh->b_cp_transaction; + __buffer_unlink_first(jh); + + if (!transaction->t_checkpoint_io_list) { + jh->b_cpnext = jh->b_cpprev = jh; + } else { + jh->b_cpnext = transaction->t_checkpoint_io_list; + jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; + jh->b_cpprev->b_cpnext = jh; + jh->b_cpnext->b_cpprev = jh; + } + transaction->t_checkpoint_io_list = jh; } /* @@ -117,7 +161,53 @@ static void jbd_sync_bh(journal_t *journ } /* - * Clean up a transaction's checkpoint list. + * Clean up transaction's list of buffers submitted for io. + * We wait for any pending IO to complete and remove any clean + * buffers. Note that take the buffers in the opposite ordering + * from the one in which they were submitted for IO. + * + * Called with j_list_lock held. + */ + +static void __wait_cp_io(journal_t *journal, transaction_t *transaction) +{ + struct journal_head *jh, *next_jh, *last_jh; + struct buffer_head *bh; + tid_t this_tid; + + this_tid = transaction->t_tid; +restart: + /* Didn't somebody cleaned up the transaction in the meanwhile */ + if (journal->j_checkpoint_transactions != transaction || + transaction->t_tid != this_tid) + return; + jh = transaction->t_checkpoint_io_list; + if (!jh) + return; + last_jh = jh->b_cpprev; + next_jh = jh; + do { + jh = next_jh; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + atomic_inc(&bh->b_count); + spin_unlock(&journal->j_list_lock); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + spin_lock(&journal->j_list_lock); + goto restart; + } + next_jh = jh->b_cpnext; + /* Now in whatever state the buffer currently is, we know that + * it has been written out and so we can drop it from the list */ + __journal_remove_checkpoint(jh); + } while (jh != last_jh); +} + +/* + * Clean up a transaction's checkpoint lists. * * We wait for any pending IO to complete and make sure any clean * buffers are removed from the transaction. @@ -188,9 +278,13 @@ static int __cleanup_transaction(journal } else { jbd_unlock_bh_state(bh); } - jh = next_jh; } while (jh != last_jh); + /* The 2nd condition is evaluated only if we didn't drop j_list_lock */ + if (!ret && transaction->t_checkpoint_io_list != NULL) + ret = 1; + __wait_cp_io(journal, transaction); + return ret; out_return_1: spin_lock(&journal->j_list_lock); @@ -247,6 +341,7 @@ static int __flush_buffer(journal_t *jou J_ASSERT_BH(bh, !buffer_jwrite(bh)); set_buffer_jwrite(bh); bhs[*batch_count] = bh; + __buffer_relink_io(jh); jbd_unlock_bh_state(bh); (*batch_count)++; if (*batch_count == NR_BATCH) { @@ -339,8 +434,10 @@ int log_do_checkpoint(journal_t *journal } } while (jh != last_jh && !retry); - if (batch_count) + if (batch_count) { __flush_batch(journal, bhs, &batch_count); + retry = 1; + } /* * If someone cleaned up this transaction while we slept, we're @@ -455,6 +552,45 @@ int cleanup_journal_tail(journal_t *jour /* Checkpoint list management */ /* + * journal_clean_one_cp_list + * + * Find all the written-back checkpoint buffers in the given list and release them. + * + * Called with the journal locked. + * Called with j_list_lock held. + * Returns number of bufers reaped (for debug) + */ + +static int journal_clean_one_cp_list(struct journal_head *jh) +{ + struct journal_head *last_jh; + struct journal_head *next_jh = jh; + int ret = 0; + + if (!jh) + return 0; + + last_jh = jh->b_cpprev; + do { + jh = next_jh; + next_jh = jh->b_cpnext; + /* Use trylock because of the ranking */ + if (jbd_trylock_bh_state(jh2bh(jh))) + ret += __try_to_free_cp_buf(jh); + /* + * This function only frees up some memory + * if possible so we dont have an obligation + * to finish processing. Bail out if preemption + * requested: + */ + if (need_resched()) + return ret; + } while (jh != last_jh); + + return ret; +} + +/* * journal_clean_checkpoint_list * * Find all the written-back checkpoint buffers in the journal and release them. @@ -476,31 +612,20 @@ int __journal_clean_checkpoint_list(jour last_transaction = transaction->t_cpprev; next_transaction = transaction; do { - struct journal_head *jh; - transaction = next_transaction; next_transaction = transaction->t_cpnext; - jh = transaction->t_checkpoint_list; - if (jh) { - struct journal_head *last_jh = jh->b_cpprev; - struct journal_head *next_jh = jh; - - do { - jh = next_jh; - next_jh = jh->b_cpnext; - /* Use trylock because of the ranknig */ - if (jbd_trylock_bh_state(jh2bh(jh))) - ret += __try_to_free_cp_buf(jh); - /* - * This function only frees up some memory - * if possible so we dont have an obligation - * to finish processing. Bail out if preemption - * requested: - */ - if (need_resched()) - goto out; - } while (jh != last_jh); - } + ret += journal_clean_one_cp_list(transaction-> + t_checkpoint_list); + if (need_resched()) + goto out; + /* It is essential that we are as careful as in the case of + t_checkpoint_list with removing the buffer from the list + as we can possibly see not yet submitted buffers on + io_list */ + ret += journal_clean_one_cp_list(transaction-> + t_checkpoint_io_list); + if (need_resched()) + goto out; } while (transaction != last_transaction); out: return ret; @@ -537,8 +662,10 @@ void __journal_remove_checkpoint(struct journal = transaction->t_journal; __buffer_unlink(jh); + jh->b_cp_transaction = NULL; - if (transaction->t_checkpoint_list != NULL) + if (transaction->t_checkpoint_list != NULL || + transaction->t_checkpoint_io_list != NULL) goto out; JBUFFER_TRACE(jh, "transaction has no more buffers"); @@ -627,6 +754,7 @@ void __journal_drop_transaction(journal_ J_ASSERT(transaction->t_shadow_list == NULL); J_ASSERT(transaction->t_log_list == NULL); J_ASSERT(transaction->t_checkpoint_list == NULL); + J_ASSERT(transaction->t_checkpoint_io_list == NULL); J_ASSERT(transaction->t_updates == 0); J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); diff -rupX /home/jack/.kerndiffexclude linux-2.6.12-rc4/include/linux/jbd.h linux-2.6.12-rc4-1-checkpointfix/include/linux/jbd.h --- linux-2.6.12-rc4/include/linux/jbd.h 2005-05-18 15:11:07.000000000 +0200 +++ linux-2.6.12-rc4-1-checkpointfix/include/linux/jbd.h 2005-05-22 23:15:09.000000000 +0200 @@ -499,6 +499,12 @@ struct transaction_s struct journal_head *t_checkpoint_list; /* + * Doubly-linked circular list of all buffers submitted for IO while + * checkpointing. [j_list_lock] + */ + struct journal_head *t_checkpoint_io_list; + + /* * Doubly-linked circular list of temporary buffers currently undergoing * IO in the log [j_list_lock] */ --envbJBWh7q8WU6mo--