From 2db938bee32e7469ca8ed9bfb3a05535f28c680d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 21 Feb 2011 17:25:37 +0100 Subject: jbd: Refine commit writeout logic Currently we write out all journal buffers in WRITE_SYNC mode. This improves performance for fsync heavy workloads but hinders performance when writes are mostly asynchronous, most noticably it slows down readers and users complain about slow desktop response etc. So submit writes as asynchronous in the normal case and only submit writes as WRITE_SYNC if we detect someone is waiting for current transaction commit. I've gathered some numbers to back this change. The first is the read latency test. It measures time to read 1 MB after several seconds of sleeping in presence of streaming writes. Top 10 times (out of 90) in us: Before After 2131586 697473 1709932 557487 1564598 535642 1480462 347573 1478579 323153 1408496 222181 1388960 181273 1329565 181070 1252486 172832 1223265 172278 Average: 619377 82180 So the improvement in both maximum and average latency is massive. I've measured fsync throughput by: fs_mark -n 100 -t 1 -s 16384 -d /mnt/fsync/ -S 1 -L 4 in presence of streaming reader. The numbers (fsyncs/s) are: Before After 9.9 6.3 6.8 6.0 6.3 6.2 5.8 6.1 So fsync performance seems unharmed by this change. Signed-off-by: Jan Kara --- fs/jbd/commit.c | 10 +++++++--- fs/jbd/journal.c | 2 ++ fs/jbd/transaction.c | 2 -- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index f2b9a571f4c..9d31e6a3920 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -298,6 +298,7 @@ void journal_commit_transaction(journal_t *journal) int tag_flag; int i; struct blk_plug plug; + int write_op = WRITE; /* * First job: lock down the current transaction and wait for @@ -413,13 +414,16 @@ void journal_commit_transaction(journal_t *journal) jbd_debug (3, "JBD: commit phase 2\n"); + if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid)) + write_op = WRITE_SYNC; + /* * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ blk_start_plug(&plug); err = journal_submit_data_buffers(journal, commit_transaction, - WRITE_SYNC); + write_op); blk_finish_plug(&plug); /* @@ -478,7 +482,7 @@ void journal_commit_transaction(journal_t *journal) blk_start_plug(&plug); - journal_write_revoke_records(journal, commit_transaction, WRITE_SYNC); + journal_write_revoke_records(journal, commit_transaction, write_op); /* * If we found any dirty or locked buffers, then we should have @@ -649,7 +653,7 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(WRITE_SYNC, bh); + submit_bh(write_op, bh); } cond_resched(); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 0971e921780..2047fd77bf3 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -563,6 +563,8 @@ int log_wait_commit(journal_t *journal, tid_t tid) spin_unlock(&journal->j_state_lock); #endif spin_lock(&journal->j_state_lock); + if (!tid_geq(journal->j_commit_waited, tid)) + journal->j_commit_waited = tid; while (tid_gt(tid, journal->j_commit_sequence)) { jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", tid, journal->j_commit_sequence); diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index b2a7e5244e3..febc10db5ce 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1433,8 +1433,6 @@ int journal_stop(handle_t *handle) } } - if (handle->h_sync) - transaction->t_synchronous_commit = 1; current->journal_info = NULL; spin_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); -- cgit v1.2.3 From ac0dd2474822fbd7d8e8cca12495e6b4760556cd Mon Sep 17 00:00:00 2001 From: Akira Fujita Date: Tue, 27 Mar 2012 16:09:16 +0900 Subject: ext3: remove max_debt in find_group_orlov() max_debt, involved variables and calculations are no longer needed, clean them up. Signed-off-by: Akira Fujita Signed-off-by: Jan Kara --- fs/ext3/ialloc.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index e3c39e4cec1..082afd78b10 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -180,8 +180,7 @@ error_return: * It's OK to put directory into a group unless * it has too many directories already (max_dirs) or * it has too few free inodes left (min_inodes) or - * it has too few free blocks left (min_blocks) or - * it's already running too large debt (max_debt). + * it has too few free blocks left (min_blocks). * Parent's group is preferred, if it doesn't satisfy these * conditions we search cyclically through the rest. If none * of the groups look good we just look for a group with more @@ -191,21 +190,16 @@ error_return: * when we allocate an inode, within 0--255. */ -#define INODE_COST 64 -#define BLOCK_COST 256 - static int find_group_orlov(struct super_block *sb, struct inode *parent) { int parent_group = EXT3_I(parent)->i_block_group; struct ext3_sb_info *sbi = EXT3_SB(sb); - struct ext3_super_block *es = sbi->s_es; int ngroups = sbi->s_groups_count; int inodes_per_group = EXT3_INODES_PER_GROUP(sb); unsigned int freei, avefreei; ext3_fsblk_t freeb, avefreeb; - ext3_fsblk_t blocks_per_dir; unsigned int ndirs; - int max_debt, max_dirs, min_inodes; + int max_dirs, min_inodes; ext3_grpblk_t min_blocks; int group = -1, i; struct ext3_group_desc *desc; @@ -242,20 +236,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) goto fallback; } - blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - freeb) / ndirs; - max_dirs = ndirs / ngroups + inodes_per_group / 16; min_inodes = avefreei - inodes_per_group / 4; min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4; - max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, (ext3_fsblk_t)BLOCK_COST); - if (max_debt * INODE_COST > inodes_per_group) - max_debt = inodes_per_group / INODE_COST; - if (max_debt > 255) - max_debt = 255; - if (max_debt == 0) - max_debt = 1; - for (i = 0; i < ngroups; i++) { group = (parent_group + i) % ngroups; desc = ext3_get_group_desc (sb, group, NULL); -- cgit v1.2.3 From f2b2242081314ee4385f3b49d92b0adff8324d80 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 21 Mar 2012 18:14:30 +0200 Subject: ext2: write superblock only once on unmount Currently on unmount if we are mounted R/W, we first write the superblock to the media if it is dirty, and then write it again, which is not optimal. This patch makes ext2 write the superblock on unmount less times. Signed-off-by: Artem Bityutskiy Signed-off-by: Jan Kara --- fs/ext2/super.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index e1025c7a437..12a7916cfa9 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -130,9 +130,6 @@ static void ext2_put_super (struct super_block * sb) dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); - if (sb->s_dirt) - ext2_write_super(sb); - ext2_xattr_put_super(sb); if (!(sb->s_flags & MS_RDONLY)) { struct ext2_super_block *es = sbi->s_es; -- cgit v1.2.3 From b838ec2232b764a4903707e212c62f681b32cd51 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 31 Mar 2012 14:22:10 +0200 Subject: ext2: Remove s_dirt handling Places which modify superblock feature / state fields mark the superblock buffer dirty so it is written out by flusher thread. Thus there's no need to set s_dirt there. The only other fields changing in the superblock are the numbers of free blocks, free inodes and s_wtime. There's no real need to write (or even compute) these periodically. Free blocks / inodes counters are recomputed on every mount from group counters anyway and value of s_wtime is only informational and imprecise anyway. So it should be enough to write these opportunistically on mount, remount, umount, and sync_fs times. Signed-off-by: Jan Kara --- fs/ext2/balloc.c | 2 -- fs/ext2/ialloc.c | 2 -- fs/ext2/super.c | 3 --- fs/ext2/xattr.c | 1 - 4 files changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index a8cbe1bc6ad..a9bba1e3920 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -165,7 +165,6 @@ static void release_blocks(struct super_block *sb, int count) struct ext2_sb_info *sbi = EXT2_SB(sb); percpu_counter_add(&sbi->s_freeblocks_counter, count); - sb->s_dirt = 1; } } @@ -180,7 +179,6 @@ static void group_adjust_blocks(struct super_block *sb, int group_no, free_blocks = le16_to_cpu(desc->bg_free_blocks_count); desc->bg_free_blocks_count = cpu_to_le16(free_blocks + count); spin_unlock(sb_bgl_lock(sbi, group_no)); - sb->s_dirt = 1; mark_buffer_dirty(bh); } } diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 8b15cf8cef3..c13eb7b91a1 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -81,7 +81,6 @@ static void ext2_release_inode(struct super_block *sb, int group, int dir) spin_unlock(sb_bgl_lock(EXT2_SB(sb), group)); if (dir) percpu_counter_dec(&EXT2_SB(sb)->s_dirs_counter); - sb->s_dirt = 1; mark_buffer_dirty(bh); } @@ -543,7 +542,6 @@ got: } spin_unlock(sb_bgl_lock(sbi, group)); - sb->s_dirt = 1; mark_buffer_dirty(bh2); if (test_opt(sb, GRPID)) { inode->i_mode = mode; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 12a7916cfa9..a43f9adcc81 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1158,7 +1158,6 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es, mark_buffer_dirty(EXT2_SB(sb)->s_sbh); if (wait) sync_dirty_buffer(EXT2_SB(sb)->s_sbh); - sb->s_dirt = 0; } /* @@ -1191,8 +1190,6 @@ void ext2_write_super(struct super_block *sb) { if (!(sb->s_flags & MS_RDONLY)) ext2_sync_fs(sb, 1); - else - sb->s_dirt = 0; } static int ext2_remount (struct super_block * sb, int * flags, char * data) diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 6dcafc7efdf..b6754dbbce3 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -339,7 +339,6 @@ static void ext2_xattr_update_super_block(struct super_block *sb) spin_lock(&EXT2_SB(sb)->s_lock); EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR); spin_unlock(&EXT2_SB(sb)->s_lock); - sb->s_dirt = 1; mark_buffer_dirty(EXT2_SB(sb)->s_sbh); } -- cgit v1.2.3 From f72cf5e223a28d3b3ea7dc9e40464fd534e359e8 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 3 Apr 2012 12:49:18 +0300 Subject: ext2: do not register write_super within VFS Jan Kara removed 'sb->s_dirt' VFS flag references, so we do not need to register the ext2 'ext2_write_super()' method in the VFS superblock operations, because 'sb->s_dirt' won't be ever set to 1 and VFS won't ever call '->write_super()' anyway. Thus, remove the method. Tested using xfstests. Signed-off-by: Artem Bityutskiy Signed-off-by: Jan Kara --- fs/ext2/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index a43f9adcc81..e0e8f45e9a7 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -302,7 +302,6 @@ static const struct super_operations ext2_sops = { .write_inode = ext2_write_inode, .evict_inode = ext2_evict_inode, .put_super = ext2_put_super, - .write_super = ext2_write_super, .sync_fs = ext2_sync_fs, .statfs = ext2_statfs, .remount_fs = ext2_remount, -- cgit v1.2.3 From 9754e39c7bc51328f145e933bfb0df47cd67b6e9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 7 Apr 2012 12:33:03 +0200 Subject: jbd: Split updating of journal superblock and marking journal empty There are three case of updating journal superblock. In the first case, we want to mark journal as empty (setting s_sequence to 0), in the second case we want to update log tail, in the third case we want to update s_errno. Split these cases into separate functions. It makes the code slightly more straightforward and later patches will make the distinction even more important. Signed-off-by: Jan Kara --- fs/jbd/checkpoint.c | 2 +- fs/jbd/commit.c | 2 +- fs/jbd/journal.c | 164 +++++++++++++++++++++++++++++++--------------------- 3 files changed, 99 insertions(+), 69 deletions(-) (limited to 'fs') diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 05f0754f2b4..80c85f3e087 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -540,7 +540,7 @@ int cleanup_journal_tail(journal_t *journal) journal->j_tail = blocknr; spin_unlock(&journal->j_state_lock); if (!(journal->j_flags & JFS_ABORT)) - journal_update_superblock(journal, 1); + journal_update_sb_log_tail(journal); return 0; } diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 9d31e6a3920..dba9cfd75f1 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -308,7 +308,7 @@ void journal_commit_transaction(journal_t *journal) /* Do we need to erase the effects of a prior journal_flush? */ if (journal->j_flags & JFS_FLUSHED) { jbd_debug(3, "super block updated\n"); - journal_update_superblock(journal, 1); + journal_update_sb_log_tail(journal); } else { jbd_debug(3, "superblock not updated\n"); } diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 2047fd77bf3..44c104abfb3 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -923,8 +923,22 @@ static int journal_reset(journal_t *journal) journal->j_max_transaction_buffers = journal->j_maxlen / 4; - /* Add the dynamic fields and write it to disk. */ - journal_update_superblock(journal, 1); + /* + * As a special case, if the on-disk copy is already marked as needing + * no recovery (s_start == 0), then we can safely defer the superblock + * update until the next commit by setting JFS_FLUSHED. This avoids + * attempting a write to a potential-readonly device. + */ + if (sb->s_start == 0) { + jbd_debug(1,"JBD: Skipping superblock update on recovered sb " + "(start %u, seq %d, errno %d)\n", + journal->j_tail, journal->j_tail_sequence, + journal->j_errno); + journal->j_flags |= JFS_FLUSHED; + } else { + /* Add the dynamic fields and write it to disk. */ + journal_update_sb_log_tail(journal); + } return journal_start_thread(journal); } @@ -1001,35 +1015,11 @@ int journal_create(journal_t *journal) return journal_reset(journal); } -/** - * void journal_update_superblock() - Update journal sb on disk. - * @journal: The journal to update. - * @wait: Set to '0' if you don't want to wait for IO completion. - * - * Update a journal's dynamic superblock fields and write it to disk, - * optionally waiting for the IO to complete. - */ -void journal_update_superblock(journal_t *journal, int wait) +static void journal_write_superblock(journal_t *journal) { - journal_superblock_t *sb = journal->j_superblock; struct buffer_head *bh = journal->j_sb_buffer; - /* - * As a special case, if the on-disk copy is already marked as needing - * no recovery (s_start == 0) and there are no outstanding transactions - * in the filesystem, then we can safely defer the superblock update - * until the next commit by setting JFS_FLUSHED. This avoids - * attempting a write to a potential-readonly device. - */ - if (sb->s_start == 0 && journal->j_tail_sequence == - journal->j_transaction_sequence) { - jbd_debug(1,"JBD: Skipping superblock update on recovered sb " - "(start %u, seq %d, errno %d)\n", - journal->j_tail, journal->j_tail_sequence, - journal->j_errno); - goto out; - } - + trace_journal_write_superblock(journal); if (buffer_write_io_error(bh)) { char b[BDEVNAME_SIZE]; /* @@ -1047,44 +1037,94 @@ void journal_update_superblock(journal_t *journal, int wait) set_buffer_uptodate(bh); } + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + if (buffer_write_io_error(bh)) { + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "JBD: I/O error detected " + "when updating journal superblock for %s.\n", + journal_dev_name(journal, b)); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } +} + +/** + * journal_update_sb_log_tail() - Update log tail in journal sb on disk. + * @journal: The journal to update. + * + * Update a journal's superblock information about log tail and write it to + * disk, waiting for the IO to complete. + */ +void journal_update_sb_log_tail(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + spin_lock(&journal->j_state_lock); jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); sb->s_start = cpu_to_be32(journal->j_tail); - sb->s_errno = cpu_to_be32(journal->j_errno); spin_unlock(&journal->j_state_lock); - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - if (wait) { - sync_dirty_buffer(bh); - if (buffer_write_io_error(bh)) { - char b[BDEVNAME_SIZE]; - printk(KERN_ERR "JBD: I/O error detected " - "when updating journal superblock for %s.\n", - journal_dev_name(journal, b)); - clear_buffer_write_io_error(bh); - set_buffer_uptodate(bh); - } - } else - write_dirty_buffer(bh, WRITE); + journal_write_superblock(journal); - trace_jbd_update_superblock_end(journal, wait); -out: - /* If we have just flushed the log (by marking s_start==0), then - * any future commit will have to be careful to update the - * superblock again to re-record the true start of the log. */ + /* Log is no longer empty */ + spin_lock(&journal->j_state_lock); + WARN_ON(!sb->s_sequence); + journal->j_flags &= ~JFS_FLUSHED; + spin_unlock(&journal->j_state_lock); +} + +/** + * mark_journal_empty() - Mark on disk journal as empty. + * @journal: The journal to update. + * + * Update a journal's dynamic superblock fields to show that journal is empty. + * Write updated superblock to disk waiting for IO to complete. + */ +static void mark_journal_empty(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; spin_lock(&journal->j_state_lock); - if (sb->s_start) - journal->j_flags &= ~JFS_FLUSHED; - else - journal->j_flags |= JFS_FLUSHED; + jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n", + journal->j_tail_sequence); + + sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); + sb->s_start = cpu_to_be32(0); + spin_unlock(&journal->j_state_lock); + + journal_write_superblock(journal); + + spin_lock(&journal->j_state_lock); + /* Log is empty */ + journal->j_flags |= JFS_FLUSHED; spin_unlock(&journal->j_state_lock); } +/** + * journal_update_sb_errno() - Update error in the journal. + * @journal: The journal to update. + * + * Update a journal's errno. Write updated superblock to disk waiting for IO + * to complete. + */ +static void journal_update_sb_errno(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + + spin_lock(&journal->j_state_lock); + jbd_debug(1, "JBD: updating superblock error (errno %d)\n", + journal->j_errno); + sb->s_errno = cpu_to_be32(journal->j_errno); + spin_unlock(&journal->j_state_lock); + + journal_write_superblock(journal); +} + /* * Read the superblock for a given journal, performing initial * validation of the format. @@ -1268,14 +1308,11 @@ int journal_destroy(journal_t *journal) if (journal->j_sb_buffer) { if (!is_journal_aborted(journal)) { - /* We can now mark the journal as empty. */ - journal->j_tail = 0; journal->j_tail_sequence = ++journal->j_transaction_sequence; - journal_update_superblock(journal, 1); - } else { + mark_journal_empty(journal); + } else err = -EIO; - } brelse(journal->j_sb_buffer); } @@ -1457,7 +1494,6 @@ int journal_flush(journal_t *journal) { int err = 0; transaction_t *transaction = NULL; - unsigned int old_tail; spin_lock(&journal->j_state_lock); @@ -1499,14 +1535,8 @@ int journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ + mark_journal_empty(journal); spin_lock(&journal->j_state_lock); - old_tail = journal->j_tail; - journal->j_tail = 0; - spin_unlock(&journal->j_state_lock); - journal_update_superblock(journal, 1); - spin_lock(&journal->j_state_lock); - journal->j_tail = old_tail; - J_ASSERT(!journal->j_running_transaction); J_ASSERT(!journal->j_committing_transaction); J_ASSERT(!journal->j_checkpoint_transactions); @@ -1547,7 +1577,7 @@ int journal_wipe(journal_t *journal, int write) err = journal_skip_recovery(journal); if (write) - journal_update_superblock(journal, 1); + mark_journal_empty(journal); no_recovery: return err; @@ -1615,7 +1645,7 @@ static void __journal_abort_soft (journal_t *journal, int errno) __journal_abort_hard(journal); if (errno) - journal_update_superblock(journal, 1); + journal_update_sb_errno(journal); } /** -- cgit v1.2.3 From 1ce8486dcc00c1e095af8d155fa4451936b89013 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 7 Apr 2012 12:50:13 +0200 Subject: jbd: protect all log tail updates with j_checkpoint_mutex There are some log tail updates that are not protected by j_checkpoint_mutex. Some of these are harmless because they happen during startup or shutdown but updates in journal_commit_transaction() and journal_flush() can really race with other log tail updates (e.g. someone doing journal_flush() with someone running cleanup_journal_tail()). So protect all log tail updates with j_checkpoint_mutex. Signed-off-by: Jan Kara --- fs/jbd/commit.c | 2 ++ fs/jbd/journal.c | 16 +++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index dba9cfd75f1..1b27f46e610 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -308,7 +308,9 @@ void journal_commit_transaction(journal_t *journal) /* Do we need to erase the effects of a prior journal_flush? */ if (journal->j_flags & JFS_FLUSHED) { jbd_debug(3, "super block updated\n"); + mutex_lock(&journal->j_checkpoint_mutex); journal_update_sb_log_tail(journal); + mutex_unlock(&journal->j_checkpoint_mutex); } else { jbd_debug(3, "superblock not updated\n"); } diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 44c104abfb3..b29c7678525 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -936,8 +936,11 @@ static int journal_reset(journal_t *journal) journal->j_errno); journal->j_flags |= JFS_FLUSHED; } else { + /* Lock here to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); /* Add the dynamic fields and write it to disk. */ journal_update_sb_log_tail(journal); + mutex_unlock(&journal->j_checkpoint_mutex); } return journal_start_thread(journal); } @@ -1061,6 +1064,7 @@ void journal_update_sb_log_tail(journal_t *journal) { journal_superblock_t *sb = journal->j_superblock; + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); spin_lock(&journal->j_state_lock); jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); @@ -1089,6 +1093,7 @@ static void mark_journal_empty(journal_t *journal) { journal_superblock_t *sb = journal->j_superblock; + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); spin_lock(&journal->j_state_lock); jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n", journal->j_tail_sequence); @@ -1293,6 +1298,8 @@ int journal_destroy(journal_t *journal) /* Force any old transactions to disk */ + /* We cannot race with anybody but must keep assertions happy */ + mutex_lock(&journal->j_checkpoint_mutex); /* Totally anal locking here... */ spin_lock(&journal->j_list_lock); while (journal->j_checkpoint_transactions != NULL) { @@ -1315,6 +1322,7 @@ int journal_destroy(journal_t *journal) err = -EIO; brelse(journal->j_sb_buffer); } + mutex_unlock(&journal->j_checkpoint_mutex); if (journal->j_inode) iput(journal->j_inode); @@ -1528,6 +1536,7 @@ int journal_flush(journal_t *journal) if (is_journal_aborted(journal)) return -EIO; + mutex_lock(&journal->j_checkpoint_mutex); cleanup_journal_tail(journal); /* Finally, mark the journal as really needing no recovery. @@ -1536,6 +1545,7 @@ int journal_flush(journal_t *journal) * commits of data to the journal will restore the current * s_start value. */ mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); spin_lock(&journal->j_state_lock); J_ASSERT(!journal->j_running_transaction); J_ASSERT(!journal->j_committing_transaction); @@ -1576,8 +1586,12 @@ int journal_wipe(journal_t *journal, int write) write ? "Clearing" : "Ignoring"); err = journal_skip_recovery(journal); - if (write) + if (write) { + /* Lock to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + } no_recovery: return err; -- cgit v1.2.3 From fd2cbd4dfa3db477dd6226d387d3f1911d36a6a9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 7 Apr 2012 11:05:19 +0200 Subject: jbd: Write journal superblock with WRITE_FUA after checkpointing If journal superblock is written only in disk's caches and other transaction starts reusing space of the transaction cleaned from the log, it can happen blocks of a new transaction reach the disk before journal superblock. When power failure happens in such case, subsequent journal replay would still try to replay the old transaction but some of it's blocks may be already overwritten by the new transaction. For this reason we must use WRITE_FUA when updating log tail and we must first write new log tail to disk and update in-memory information only after that. Signed-off-by: Jan Kara --- fs/jbd/checkpoint.c | 23 +++++++++----------- fs/jbd/commit.c | 9 +++++++- fs/jbd/journal.c | 60 ++++++++++++++++++++++++++++++++++------------------- 3 files changed, 57 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 80c85f3e087..08c03044abd 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -508,20 +508,19 @@ int cleanup_journal_tail(journal_t *journal) /* * We need to make sure that any blocks that were recently written out * --- perhaps by log_do_checkpoint() --- are flushed out before we - * drop the transactions from the journal. It's unlikely this will be - * necessary, especially with an appropriately sized journal, but we - * need this to guarantee correctness. Fortunately - * cleanup_journal_tail() doesn't get called all that often. + * drop the transactions from the journal. Similarly we need to be sure + * superblock makes it to disk before next transaction starts reusing + * freed space (otherwise we could replay some blocks of the new + * transaction thinking they belong to the old one). So we use + * WRITE_FLUSH_FUA. It's unlikely this will be necessary, especially + * with an appropriately sized journal, but we need this to guarantee + * correctness. Fortunately cleanup_journal_tail() doesn't get called + * all that often. */ - if (journal->j_flags & JFS_BARRIER) - blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + journal_update_sb_log_tail(journal, first_tid, blocknr, + WRITE_FLUSH_FUA); spin_lock(&journal->j_state_lock); - if (!tid_gt(first_tid, journal->j_tail_sequence)) { - spin_unlock(&journal->j_state_lock); - /* Someone else cleaned up journal so return 0 */ - return 0; - } /* OK, update the superblock to recover the freed space. * Physical blocks come first: have we wrapped beyond the end of * the log? */ @@ -539,8 +538,6 @@ int cleanup_journal_tail(journal_t *journal) journal->j_tail_sequence = first_tid; journal->j_tail = blocknr; spin_unlock(&journal->j_state_lock); - if (!(journal->j_flags & JFS_ABORT)) - journal_update_sb_log_tail(journal); return 0; } diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 1b27f46e610..52c15c77602 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -309,7 +309,14 @@ void journal_commit_transaction(journal_t *journal) if (journal->j_flags & JFS_FLUSHED) { jbd_debug(3, "super block updated\n"); mutex_lock(&journal->j_checkpoint_mutex); - journal_update_sb_log_tail(journal); + /* + * We hold j_checkpoint_mutex so tail cannot change under us. + * We don't need any special data guarantees for writing sb + * since journal is empty and it is ok for write to be + * flushed only with transaction commit. + */ + journal_update_sb_log_tail(journal, journal->j_tail_sequence, + journal->j_tail, WRITE_SYNC); mutex_unlock(&journal->j_checkpoint_mutex); } else { jbd_debug(3, "superblock not updated\n"); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index b29c7678525..425c2f2cf17 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -938,8 +938,16 @@ static int journal_reset(journal_t *journal) } else { /* Lock here to make assertions happy... */ mutex_lock(&journal->j_checkpoint_mutex); - /* Add the dynamic fields and write it to disk. */ - journal_update_sb_log_tail(journal); + /* + * Update log tail information. We use WRITE_FUA since new + * transaction will start reusing journal space and so we + * must make sure information about current log tail is on + * disk before that. + */ + journal_update_sb_log_tail(journal, + journal->j_tail_sequence, + journal->j_tail, + WRITE_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } return journal_start_thread(journal); @@ -1018,11 +1026,15 @@ int journal_create(journal_t *journal) return journal_reset(journal); } -static void journal_write_superblock(journal_t *journal) +static void journal_write_superblock(journal_t *journal, int write_op) { struct buffer_head *bh = journal->j_sb_buffer; + int ret; - trace_journal_write_superblock(journal); + trace_journal_write_superblock(journal, write_op); + if (!(journal->j_flags & JFS_BARRIER)) + write_op &= ~(REQ_FUA | REQ_FLUSH); + lock_buffer(bh); if (buffer_write_io_error(bh)) { char b[BDEVNAME_SIZE]; /* @@ -1040,40 +1052,46 @@ static void journal_write_superblock(journal_t *journal) set_buffer_uptodate(bh); } - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - sync_dirty_buffer(bh); + get_bh(bh); + bh->b_end_io = end_buffer_write_sync; + ret = submit_bh(write_op, bh); + wait_on_buffer(bh); if (buffer_write_io_error(bh)) { - char b[BDEVNAME_SIZE]; - printk(KERN_ERR "JBD: I/O error detected " - "when updating journal superblock for %s.\n", - journal_dev_name(journal, b)); clear_buffer_write_io_error(bh); set_buffer_uptodate(bh); + ret = -EIO; + } + if (ret) { + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "JBD: Error %d detected " + "when updating journal superblock for %s.\n", + ret, journal_dev_name(journal, b)); } } /** * journal_update_sb_log_tail() - Update log tail in journal sb on disk. * @journal: The journal to update. + * @tail_tid: TID of the new transaction at the tail of the log + * @tail_block: The first block of the transaction at the tail of the log + * @write_op: With which operation should we write the journal sb * * Update a journal's superblock information about log tail and write it to * disk, waiting for the IO to complete. */ -void journal_update_sb_log_tail(journal_t *journal) +void journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, + unsigned int tail_block, int write_op) { journal_superblock_t *sb = journal->j_superblock; BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); - spin_lock(&journal->j_state_lock); - jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n", - journal->j_tail, journal->j_tail_sequence, journal->j_errno); + jbd_debug(1,"JBD: updating superblock (start %u, seq %u)\n", + tail_block, tail_tid); - sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); - sb->s_start = cpu_to_be32(journal->j_tail); - spin_unlock(&journal->j_state_lock); + sb->s_sequence = cpu_to_be32(tail_tid); + sb->s_start = cpu_to_be32(tail_block); - journal_write_superblock(journal); + journal_write_superblock(journal, write_op); /* Log is no longer empty */ spin_lock(&journal->j_state_lock); @@ -1102,7 +1120,7 @@ static void mark_journal_empty(journal_t *journal) sb->s_start = cpu_to_be32(0); spin_unlock(&journal->j_state_lock); - journal_write_superblock(journal); + journal_write_superblock(journal, WRITE_FUA); spin_lock(&journal->j_state_lock); /* Log is empty */ @@ -1127,7 +1145,7 @@ static void journal_update_sb_errno(journal_t *journal) sb->s_errno = cpu_to_be32(journal->j_errno); spin_unlock(&journal->j_state_lock); - journal_write_superblock(journal); + journal_write_superblock(journal, WRITE_SYNC); } /* -- cgit v1.2.3 From d7e9711760a17aef3a94fc6dff4759fa5961de25 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 24 Apr 2012 17:08:41 +0200 Subject: quota: Fix double lock in add_dquot_ref() with CONFIG_QUOTA_DEBUG When CONFIG_QUOTA_DEBUG is enabled we call inode_get_rsv_space() from add_dquot_ref() while holding i_lock. But inode_get_rsv_space() is trying to get i_lock as well resulting in double lock. Fix the problem by moving inode_get_rsv_space() call out of i_lock. Reported-and-analyzed-by: Jie Liu Signed-off-by: Jan Kara --- fs/quota/dquot.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index d69a1d1d7e1..0dcdda305c9 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -907,14 +907,14 @@ static void add_dquot_ref(struct super_block *sb, int type) spin_unlock(&inode->i_lock); continue; } -#ifdef CONFIG_QUOTA_DEBUG - if (unlikely(inode_get_rsv_space(inode) > 0)) - reserved = 1; -#endif __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&inode_sb_list_lock); +#ifdef CONFIG_QUOTA_DEBUG + if (unlikely(inode_get_rsv_space(inode) > 0)) + reserved = 1; +#endif iput(old_inode); __dquot_initialize(inode, type); -- cgit v1.2.3 From 905c3937625d14fa4453a2643814b72b8a22d9ae Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 25 Apr 2012 21:24:30 +0200 Subject: ext3: Remove i_mutex use from ext3_quota_write() We don't need i_mutex in ext3_quota_write() because writes to quota file are serialized by dqio_mutex anyway. Changes to quota files outside of quota code are forbidded and enforced by NOATIME and IMMUTABLE bits. Signed-off-by: Jan Kara --- fs/ext3/super.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index cf0b5921cf0..7c08c93bb61 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -3000,7 +3000,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, (unsigned long long)off, (unsigned long long)len); return -EIO; } - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); bh = ext3_bread(handle, inode, blk, 1, &err); if (!bh) goto out; @@ -3024,10 +3023,8 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, } brelse(bh); out: - if (err) { - mutex_unlock(&inode->i_mutex); + if (err) return err; - } if (inode->i_size < off + len) { i_size_write(inode, off + len); EXT3_I(inode)->i_disksize = inode->i_size; @@ -3035,7 +3032,6 @@ out: inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; ext3_mark_inode_dirty(handle, inode); - mutex_unlock(&inode->i_mutex); return len; } -- cgit v1.2.3 From 0b7f7cefaea96ef25dbe54b8e16d2d61560d8b9d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 25 Apr 2012 21:26:54 +0200 Subject: ext4: Remove i_mutex use from ext4_quota_write() We don't need i_mutex in ext4_quota_write() because writes to quota file are serialized by dqio_mutex anyway. Changes to quota files outside of quota code are forbidded and enforced by NOATIME and IMMUTABLE bits. Signed-off-by: Jan Kara --- fs/ext4/super.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ceebaf853be..97938db9f39 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4760,7 +4760,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, return -EIO; } - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); bh = ext4_bread(handle, inode, blk, 1, &err); if (!bh) goto out; @@ -4776,16 +4775,13 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); out: - if (err) { - mutex_unlock(&inode->i_mutex); + if (err) return err; - } if (inode->i_size < off + len) { i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; ext4_mark_inode_dirty(handle, inode); } - mutex_unlock(&inode->i_mutex); return len; } -- cgit v1.2.3 From 67f1648d217c3b8165ca114c7838164f31e15790 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 25 Apr 2012 21:28:12 +0200 Subject: reiserfs: Remove i_mutex use from reiserfs_quota_write() We don't need i_mutex in reiserfs_quota_write() because writes to quota file are serialized by dqio_mutex anyway. Changes to quota files outside of quota code are forbidded and enforced by NOATIME and IMMUTABLE bits. Signed-off-by: Jan Kara --- fs/reiserfs/super.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 8b7616ef06d..c07b7d70944 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2270,7 +2270,6 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, (unsigned long long)off, (unsigned long long)len); return -EIO; } - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); while (towrite > 0) { tocopy = sb->s_blocksize - offset < towrite ? sb->s_blocksize - offset : towrite; @@ -2302,16 +2301,13 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) { - mutex_unlock(&inode->i_mutex); + if (len == towrite) return err; - } if (inode->i_size < off + len - towrite) i_size_write(inode, off + len - towrite); inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); - mutex_unlock(&inode->i_mutex); return len - towrite; } -- cgit v1.2.3 From e2a3fde75079efeb22cf1e5098d5689b664bf1d5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 25 Apr 2012 21:29:42 +0200 Subject: ext2: Remove i_mutex use from ext2_quota_write() We don't need i_mutex in ext2_quota_write() because writes to quota file are serialized by dqio_mutex anyway. Changes to quota files outside of quota code are forbidded and enforced by NOATIME and IMMUTABLE bits. Signed-off-by: Jan Kara --- fs/ext2/super.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index e0e8f45e9a7..7c1207c0e0c 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1434,7 +1434,6 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, struct buffer_head tmp_bh; struct buffer_head *bh; - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); while (towrite > 0) { tocopy = sb->s_blocksize - offset < towrite ? sb->s_blocksize - offset : towrite; @@ -1464,16 +1463,13 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) { - mutex_unlock(&inode->i_mutex); + if (len == towrite) return err; - } if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); - mutex_unlock(&inode->i_mutex); return len - towrite; } -- cgit v1.2.3 From f9ef178412cab442719c9ffdc659933f5f2fa87e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 25 Apr 2012 21:10:31 +0200 Subject: quota: Use precomputed value of sb_dqopt in dquot_quota_sync Signed-off-by: Jan Kara --- fs/quota/dquot.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 0dcdda305c9..6e429fdda17 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -638,7 +638,7 @@ int dquot_quota_sync(struct super_block *sb, int type, int wait) dqstats_inc(DQST_SYNCS); mutex_unlock(&dqopt->dqonoff_mutex); - if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)) + if (!wait || (dqopt->flags & DQUOT_QUOTA_SYS_FILE)) return 0; /* This is not very clever (and fast) but currently I don't know about @@ -652,18 +652,18 @@ int dquot_quota_sync(struct super_block *sb, int type, int wait) * Now when everything is written we can discard the pagecache so * that userspace sees the changes. */ - mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + mutex_lock(&dqopt->dqonoff_mutex); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; if (!sb_has_quota_active(sb, cnt)) continue; - mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, + mutex_lock_nested(&dqopt->files[cnt]->i_mutex, I_MUTEX_QUOTA); - truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); - mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex); + truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); + mutex_unlock(&dqopt->files[cnt]->i_mutex); } - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + mutex_unlock(&dqopt->dqonoff_mutex); return 0; } -- cgit v1.2.3 From a80b12c3d08dbbf15e6a551e481c32a2df4911f3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 25 Apr 2012 21:15:53 +0200 Subject: quota: Get rid of nested I_MUTEX_QUOTA locking subclass So far i_mutex was ranking above dqonoff_mutex and i_mutex on quota files was special and ranking below dqonoff_mutex (and several other locks). However there's no real need for i_mutex on quota files to be special. IO on quota files is serialized by dqio_mutex anyway so we don't need to take i_mutex when writing to quota files. Other places where we take i_mutex on quota file can accomodate standard i_mutex lock ranking, we only need to change the lock ranking to be dqonoff_mutex > i_mutex which is a matter of changing documentation because there's no place which would enforce ordering in the other direction. Signed-off-by: Jan Kara --- fs/quota/dquot.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 6e429fdda17..10cbe841cb7 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -116,15 +116,15 @@ * spinlock to internal buffers before writing. * * Lock ordering (including related VFS locks) is the following: - * i_mutex > dqonoff_sem > journal_lock > dqptr_sem > dquot->dq_lock > + * dqonoff_mutex > i_mutex > journal_lock > dqptr_sem > dquot->dq_lock > * dqio_mutex + * dqonoff_mutex > i_mutex comes from dquot_quota_sync, dquot_enable, etc. * The lock ordering of dqptr_sem imposed by quota code is only dqonoff_sem > * dqptr_sem. But filesystem has to count with the fact that functions such as * dquot_alloc_space() acquire dqptr_sem and they usually have to be called * from inside a transaction to keep filesystem consistency after a crash. Also * filesystems usually want to do some IO on dquot from ->mark_dirty which is * called with dqptr_sem held. - * i_mutex on quota files is special (it's below dqio_mutex) */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock); @@ -658,8 +658,7 @@ int dquot_quota_sync(struct super_block *sb, int type, int wait) continue; if (!sb_has_quota_active(sb, cnt)) continue; - mutex_lock_nested(&dqopt->files[cnt]->i_mutex, - I_MUTEX_QUOTA); + mutex_lock(&dqopt->files[cnt]->i_mutex); truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); mutex_unlock(&dqopt->files[cnt]->i_mutex); } @@ -2037,8 +2036,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) /* If quota was reenabled in the meantime, we have * nothing to do */ if (!sb_has_quota_loaded(sb, cnt)) { - mutex_lock_nested(&toputinode[cnt]->i_mutex, - I_MUTEX_QUOTA); + mutex_lock(&toputinode[cnt]->i_mutex); toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | S_NOATIME | S_NOQUOTA); truncate_inode_pages(&toputinode[cnt]->i_data, @@ -2133,7 +2131,7 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, /* We don't want quota and atime on quota files (deadlocks * possible) Also nobody should write to the file - we use * special IO operations which ignore the immutable bit. */ - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); + mutex_lock(&inode->i_mutex); oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; @@ -2180,7 +2178,7 @@ out_file_init: iput(inode); out_lock: if (oldflags != -1) { - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); + mutex_lock(&inode->i_mutex); /* Set the flags back (in the case of accidental quotaon() * on a wrong file we don't want to mess up the flags) */ inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); -- cgit v1.2.3 From d7dab39b6e16d5eea78ed3c705d2a2d0772b4f06 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 26 Apr 2012 13:10:39 -0500 Subject: ext3: return 32/64-bit dir name hash according to usage type This is based on commit d1f5273e9adb40724a85272f248f210dc4ce919a ext4: return 32/64-bit dir name hash according to usage type by Fan Yong Traditionally ext2/3/4 has returned a 32-bit hash value from llseek() to appease NFSv2, which can only handle a 32-bit cookie for seekdir() and telldir(). However, this causes problems if there are 32-bit hash collisions, since the NFSv2 server can get stuck resending the same entries from the directory repeatedly. Allow ext3 to return a full 64-bit hash (both major and minor) for telldir to decrease the chance of hash collisions. This patch does implement a new ext3_dir_llseek op, because with 64-bit hashes, nfs will attempt to seek to a hash "offset" which is much larger than ext3's s_maxbytes. So for dx dirs, we call generic_file_llseek_size() with the appropriate max hash value as the maximum seekable size. Otherwise we just pass through to generic_file_llseek(). Patch-updated-by: Bernd Schubert Patch-updated-by: Eric Sandeen (blame us if something is not correct) Signed-off-by: Eric Sandeen Signed-off-by: Jan Kara --- fs/ext3/dir.c | 167 +++++++++++++++++++++++++++++++++++++++++---------------- fs/ext3/ext3.h | 6 ++- fs/ext3/hash.c | 4 +- 3 files changed, 129 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index cc761ad8fa5..92490e9f85c 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -21,30 +21,15 @@ * */ +#include #include "ext3.h" static unsigned char ext3_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; -static int ext3_readdir(struct file *, void *, filldir_t); static int ext3_dx_readdir(struct file * filp, void * dirent, filldir_t filldir); -static int ext3_release_dir (struct inode * inode, - struct file * filp); - -const struct file_operations ext3_dir_operations = { - .llseek = generic_file_llseek, - .read = generic_read_dir, - .readdir = ext3_readdir, /* we take BKL. needed?*/ - .unlocked_ioctl = ext3_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ext3_compat_ioctl, -#endif - .fsync = ext3_sync_file, /* BKL held */ - .release = ext3_release_dir, -}; - static unsigned char get_dtype(struct super_block *sb, int filetype) { @@ -55,6 +40,25 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) return (ext3_filetype_table[filetype]); } +/** + * Check if the given dir-inode refers to an htree-indexed directory + * (or a directory which chould potentially get coverted to use htree + * indexing). + * + * Return 1 if it is a dx dir, 0 if not + */ +static int is_dx_dir(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, + EXT3_FEATURE_COMPAT_DIR_INDEX) && + ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || + ((inode->i_size >> sb->s_blocksize_bits) == 1))) + return 1; + + return 0; +} int ext3_check_dir_entry (const char * function, struct inode * dir, struct ext3_dir_entry_2 * de, @@ -94,18 +98,13 @@ static int ext3_readdir(struct file * filp, unsigned long offset; int i, stored; struct ext3_dir_entry_2 *de; - struct super_block *sb; int err; struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; int ret = 0; int dir_has_error = 0; - sb = inode->i_sb; - - if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, - EXT3_FEATURE_COMPAT_DIR_INDEX) && - ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || - ((inode->i_size >> sb->s_blocksize_bits) == 1))) { + if (is_dx_dir(inode)) { err = ext3_dx_readdir(filp, dirent, filldir); if (err != ERR_BAD_DX_DIR) { ret = err; @@ -227,22 +226,87 @@ out: return ret; } +static inline int is_32bit_api(void) +{ +#ifdef CONFIG_COMPAT + return is_compat_task(); +#else + return (BITS_PER_LONG == 32); +#endif +} + /* * These functions convert from the major/minor hash to an f_pos - * value. + * value for dx directories * - * Currently we only use major hash numer. This is unfortunate, but - * on 32-bit machines, the same VFS interface is used for lseek and - * llseek, so if we use the 64 bit offset, then the 32-bit versions of - * lseek/telldir/seekdir will blow out spectacularly, and from within - * the ext2 low-level routine, we don't know if we're being called by - * a 64-bit version of the system call or the 32-bit version of the - * system call. Worse yet, NFSv2 only allows for a 32-bit readdir - * cookie. Sigh. + * Upper layer (for example NFS) should specify FMODE_32BITHASH or + * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted + * directly on both 32-bit and 64-bit nodes, under such case, neither + * FMODE_32BITHASH nor FMODE_64BITHASH is specified. */ -#define hash2pos(major, minor) (major >> 1) -#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) -#define pos2min_hash(pos) (0) +static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return major >> 1; + else + return ((__u64)(major >> 1) << 32) | (__u64)minor; +} + +static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return (pos << 1) & 0xffffffff; + else + return ((pos >> 32) << 1) & 0xffffffff; +} + +static inline __u32 pos2min_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return 0; + else + return pos & 0xffffffff; +} + +/* + * Return 32- or 64-bit end-of-file for dx directories + */ +static inline loff_t ext3_get_htree_eof(struct file *filp) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return EXT3_HTREE_EOF_32BIT; + else + return EXT3_HTREE_EOF_64BIT; +} + + +/* + * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both + * non-htree and htree directories, where the "offset" is in terms + * of the filename hash value instead of the byte offset. + * + * Because we may return a 64-bit hash that is well beyond s_maxbytes, + * we need to pass the max hash as the maximum allowable offset in + * the htree directory case. + * + * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX) + * will be invalid once the directory was converted into a dx directory + */ +loff_t ext3_dir_llseek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + int dx_dir = is_dx_dir(inode); + + if (likely(dx_dir)) + return generic_file_llseek_size(file, offset, origin, + ext3_get_htree_eof(file)); + else + return generic_file_llseek(file, offset, origin); +} /* * This structure holds the nodes of the red-black tree used to store @@ -303,15 +367,16 @@ static void free_rb_tree_fname(struct rb_root *root) } -static struct dir_private_info *ext3_htree_create_dir_info(loff_t pos) +static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp, + loff_t pos) { struct dir_private_info *p; p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); if (!p) return NULL; - p->curr_hash = pos2maj_hash(pos); - p->curr_minor_hash = pos2min_hash(pos); + p->curr_hash = pos2maj_hash(filp, pos); + p->curr_minor_hash = pos2min_hash(filp, pos); return p; } @@ -401,7 +466,7 @@ static int call_filldir(struct file * filp, void * dirent, printk("call_filldir: called with null fname?!?\n"); return 0; } - curr_pos = hash2pos(fname->hash, fname->minor_hash); + curr_pos = hash2pos(filp, fname->hash, fname->minor_hash); while (fname) { error = filldir(dirent, fname->name, fname->name_len, curr_pos, @@ -426,13 +491,13 @@ static int ext3_dx_readdir(struct file * filp, int ret; if (!info) { - info = ext3_htree_create_dir_info(filp->f_pos); + info = ext3_htree_create_dir_info(filp, filp->f_pos); if (!info) return -ENOMEM; filp->private_data = info; } - if (filp->f_pos == EXT3_HTREE_EOF) + if (filp->f_pos == ext3_get_htree_eof(filp)) return 0; /* EOF */ /* Some one has messed with f_pos; reset the world */ @@ -440,8 +505,8 @@ static int ext3_dx_readdir(struct file * filp, free_rb_tree_fname(&info->root); info->curr_node = NULL; info->extra_fname = NULL; - info->curr_hash = pos2maj_hash(filp->f_pos); - info->curr_minor_hash = pos2min_hash(filp->f_pos); + info->curr_hash = pos2maj_hash(filp, filp->f_pos); + info->curr_minor_hash = pos2min_hash(filp, filp->f_pos); } /* @@ -473,7 +538,7 @@ static int ext3_dx_readdir(struct file * filp, if (ret < 0) return ret; if (ret == 0) { - filp->f_pos = EXT3_HTREE_EOF; + filp->f_pos = ext3_get_htree_eof(filp); break; } info->curr_node = rb_first(&info->root); @@ -493,7 +558,7 @@ static int ext3_dx_readdir(struct file * filp, info->curr_minor_hash = fname->minor_hash; } else { if (info->next_hash == ~0) { - filp->f_pos = EXT3_HTREE_EOF; + filp->f_pos = ext3_get_htree_eof(filp); break; } info->curr_hash = info->next_hash; @@ -512,3 +577,15 @@ static int ext3_release_dir (struct inode * inode, struct file * filp) return 0; } + +const struct file_operations ext3_dir_operations = { + .llseek = ext3_dir_llseek, + .read = generic_read_dir, + .readdir = ext3_readdir, + .unlocked_ioctl = ext3_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext3_compat_ioctl, +#endif + .fsync = ext3_sync_file, + .release = ext3_release_dir, +}; diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h index b6515fd7e56..fe5bef7914e 100644 --- a/fs/ext3/ext3.h +++ b/fs/ext3/ext3.h @@ -920,7 +920,11 @@ struct dx_hash_info u32 *seed; }; -#define EXT3_HTREE_EOF 0x7fffffff + +/* 32 and 64 bit signed EOF for dx directories */ +#define EXT3_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) +#define EXT3_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) + /* * Control parameters used by ext3_htree_next_block diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c index d10231ddcf8..ede315cdf12 100644 --- a/fs/ext3/hash.c +++ b/fs/ext3/hash.c @@ -198,8 +198,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) return -1; } hash = hash & ~1; - if (hash == (EXT3_HTREE_EOF << 1)) - hash = (EXT3_HTREE_EOF-1) << 1; + if (hash == (EXT3_HTREE_EOF_32BIT << 1)) + hash = (EXT3_HTREE_EOF_32BIT - 1) << 1; hinfo->hash = hash; hinfo->minor_hash = minor_hash; return 0; -- cgit v1.2.3 From aa9e939d5296edb7d188e18892775b9938fd2923 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Tue, 15 May 2012 07:56:06 +0800 Subject: ext2: remove the redundant comment for ext2_export_ops The comment is outdated and isn't particularly informative anyway - NULL meaning the default behavior is very common in kernel. And we really set about half of entries. So remove the whole comment for ext2_export_ops. Signed-off-by: Wang Sheng-Hui Signed-off-by: Jan Kara --- fs/ext2/super.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 7c1207c0e0c..c21f67f8854 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -352,11 +352,6 @@ static struct dentry *ext2_fh_to_parent(struct super_block *sb, struct fid *fid, ext2_nfs_get_inode); } -/* Yes, most of these are left as NULL!! - * A NULL value implies the default, which works with ext2-like file - * systems, but can be improved upon. - * Currently only get_parent is required. - */ static const struct export_operations ext2_export_ops = { .fh_to_dentry = ext2_fh_to_dentry, .fh_to_parent = ext2_fh_to_parent, -- cgit v1.2.3 From 0324876628a9c7faf8127e20af29373dc6dec876 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Fri, 18 May 2012 15:04:45 +0800 Subject: ext2: trivial fix to comment for ext2_free_blocks The function is ext2_free_blocks(), not ext2_free_blocks_sb(). Signed-off-by: Wang Sheng-Hui Signed-off-by: Jan Kara --- fs/ext2/balloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index a9bba1e3920..89991ffb006 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -477,7 +477,7 @@ void ext2_discard_reservation(struct inode *inode) } /** - * ext2_free_blocks_sb() -- Free given blocks and update quota and i_blocks + * ext2_free_blocks() -- Free given blocks and update quota and i_blocks * @inode: inode * @block: start physcial block to free * @count: number of blocks to free -- cgit v1.2.3