From b72f78cb63fb595af63fc781dced0a6fd354e572 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 8 Nov 2012 10:33:36 -0500 Subject: ext4: fix overhead calculations in ext4_stats, again "overhead" was a write-only variable in this function after commit 952fc18e; we set it to 0 for minixdf, or to sbi->s_overhead if !minixdf, but never read it again after that. We need to use it, not sbi->s_overhead, when subtracting out overhead for f_blocks, or we get the wrong answer for minixdf. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 80928f71685..1982d3cd913 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4790,7 +4790,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, sbi->s_overhead); + buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead); bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); /* prevent underflow in case that few free space is available */ -- cgit v1.2.3 From 6d138ced751d4e41e02c38ad55d1b3cd2913b150 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 8 Nov 2012 11:11:59 -0500 Subject: ext4: fix awful goto in ext4_mb_new_blocks() I think the whole function could be made prettier, but that goto really took the cake for too-clever-by-half. 
Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 526e5535860..27f421c8043 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4310,8 +4310,10 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, repeat: /* allocate space in core */ *errp = ext4_mb_regular_allocator(ac); - if (*errp) + if (*errp) { + ext4_discard_allocated_blocks(ac); goto errout; + } /* as we've just preallocated more space than * user requested orinally, we store allocated @@ -4333,10 +4335,10 @@ repeat: ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; goto repeat; - } else if (*errp) - errout: + } else if (*errp) { ext4_discard_allocated_blocks(ac); - else { + goto errout; + } else { block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); ar->len = ac->ac_b_ex.fe_len; } @@ -4347,6 +4349,7 @@ repeat: *errp = -ENOSPC; } +errout: if (*errp) { ac->ac_b_ex.fe_len = 0; ar->len = 0; -- cgit v1.2.3 From 37be2f59d3149b95afaeeeff94edde2c07f165d2 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 8 Nov 2012 11:22:46 -0500 Subject: ext4: remove ext4_handle_release_buffer() ext4_handle_release_buffer() was intended to remove journal write access from a buffer, but it doesn't actually do anything at all other than add a BUFFER_TRACE point, but it's not reliably used for that either. Remove all the associated dead code. 
Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" Reviewed-by: Carlos Maiolino --- fs/ext4/ext4_jbd2.h | 7 ------- fs/ext4/resize.c | 17 +++-------------- fs/ext4/xattr.c | 1 - fs/jbd2/journal.c | 1 - fs/jbd2/transaction.c | 11 ----------- include/linux/jbd2.h | 1 - 6 files changed, 3 insertions(+), 35 deletions(-) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 56d258c1830..7177f9b21cb 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -254,13 +254,6 @@ static inline void ext4_handle_sync(handle_t *handle) handle->h_sync = 1; } -static inline void ext4_handle_release_buffer(handle_t *handle, - struct buffer_head *bh) -{ - if (ext4_handle_valid(handle)) - jbd2_journal_release_buffer(handle, bh); -} - static inline int ext4_handle_is_aborted(handle_t *handle) { if (ext4_handle_valid(handle)) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 47bf06a2765..d99387b89ed 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -783,7 +783,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, err = ext4_journal_get_write_access(handle, gdb_bh); if (unlikely(err)) - goto exit_sbh; + goto exit_dind; err = ext4_journal_get_write_access(handle, dind); if (unlikely(err)) @@ -792,7 +792,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, /* ext4_reserve_inode_write() gets a reference on the iloc */ err = ext4_reserve_inode_write(handle, inode, &iloc); if (unlikely(err)) - goto exit_dindj; + goto exit_dind; n_group_desc = ext4_kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), @@ -846,12 +846,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, exit_inode: ext4_kvfree(n_group_desc); - /* ext4_handle_release_buffer(handle, iloc.bh); */ brelse(iloc.bh); -exit_dindj: - /* ext4_handle_release_buffer(handle, dind); */ -exit_sbh: - /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ exit_dind: brelse(dind); exit_bh: @@ -969,14 +964,8 @@ static int reserve_backup_gdb(handle_t *handle, struct 
inode *inode, } for (i = 0; i < reserved_gdb; i++) { - if ((err = ext4_journal_get_write_access(handle, primary[i]))) { - /* - int j; - for (j = 0; j < i; j++) - ext4_handle_release_buffer(handle, primary[j]); - */ + if ((err = ext4_journal_get_write_access(handle, primary[i]))) goto exit_bh; - } } if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 2cdb98d6298..b1adda1b750 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -794,7 +794,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, int offset = (char *)s->here - bs->bh->b_data; unlock_buffer(bs->bh); - ext4_handle_release_buffer(handle, bs->bh); if (ce) { mb_cache_entry_release(ce); ce = NULL; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 484b8d1c6cb..dbf41f9452d 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -60,7 +60,6 @@ EXPORT_SYMBOL(jbd2_journal_get_create_access); EXPORT_SYMBOL(jbd2_journal_get_undo_access); EXPORT_SYMBOL(jbd2_journal_set_triggers); EXPORT_SYMBOL(jbd2_journal_dirty_metadata); -EXPORT_SYMBOL(jbd2_journal_release_buffer); EXPORT_SYMBOL(jbd2_journal_forget); #if 0 EXPORT_SYMBOL(journal_sync_buffer); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index a74ba465954..deffd945c8e 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1207,17 +1207,6 @@ out: return ret; } -/* - * jbd2_journal_release_buffer: undo a get_write_access without any buffer - * updates, if the update decided in the end that it didn't need access. - * - */ -void -jbd2_journal_release_buffer(handle_t *handle, struct buffer_head *bh) -{ - BUFFER_TRACE(bh, "entry"); -} - /** * void jbd2_journal_forget() - bforget() for potentially-journaled buffers. 
* @handle: transaction handle diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 3efc43f3f16..de7f5568208 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1096,7 +1096,6 @@ extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *); void jbd2_journal_set_triggers(struct buffer_head *, struct jbd2_buffer_trigger_type *type); extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); -extern void jbd2_journal_release_buffer (handle_t *, struct buffer_head *); extern int jbd2_journal_forget (handle_t *, struct buffer_head *); extern void journal_sync_buffer (struct buffer_head *); extern void jbd2_journal_invalidatepage(journal_t *, -- cgit v1.2.3 From fd47d3e1c2949838e858379aaf2bc20647be2912 Mon Sep 17 00:00:00 2001 From: Behan Webster Date: Thu, 8 Nov 2012 11:24:46 -0500 Subject: jbd2: remove VLAIS usage from JBD2 code The use of variable length arrays in structs (VLAIS) in the Linux Kernel code precludes the use of compilers which don't implement VLAIS (for instance the Clang compiler). Since ctx is always a 32-bit CRC, hard coding a size of 4 bytes accomplishes the same thing without the use of VLAIS. 
This is the same technique already employed in fs/ext4/ext4.h Signed-off-by: Mark Charlebois Signed-off-by: Behan Webster Signed-off-by: "Theodore Ts'o" --- include/linux/jbd2.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index de7f5568208..1be23d9fdac 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1302,15 +1302,21 @@ static inline int jbd_space_needed(journal_t *journal) extern int jbd_blocks_per_page(struct inode *inode); +/* JBD uses a CRC32 checksum */ +#define JBD_MAX_CHECKSUM_SIZE 4 + static inline u32 jbd2_chksum(journal_t *journal, u32 crc, const void *address, unsigned int length) { struct { struct shash_desc shash; - char ctx[crypto_shash_descsize(journal->j_chksum_driver)]; + char ctx[JBD_MAX_CHECKSUM_SIZE]; } desc; int err; + BUG_ON(crypto_shash_descsize(journal->j_chksum_driver) > + JBD_MAX_CHECKSUM_SIZE); + desc.shash.tfm = journal->j_chksum_driver; desc.shash.flags = 0; *(u32 *)desc.ctx = crc; -- cgit v1.2.3 From d339450ccad1acb942fc880ca0b44c956e6d2762 Mon Sep 17 00:00:00 2001 From: Zhao Hongjiang Date: Thu, 8 Nov 2012 12:07:33 -0500 Subject: ext4: get rid of redundant code in ext4_fill_super() Signed-off-by: Zhao Hongjiang Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1982d3cd913..ea21231633e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3272,9 +3272,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sb->s_fs_info = sbi; sbi->s_sb = sb; - sbi->s_mount_opt = 0; - sbi->s_resuid = make_kuid(&init_user_ns, EXT4_DEF_RESUID); - sbi->s_resgid = make_kgid(&init_user_ns, EXT4_DEF_RESGID); sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sb_block = sb_block; if (sb->s_bdev->bd_part) -- cgit v1.2.3 From d8ec0c396083ef633a065629df1565246dbb2f33 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 8 Nov 2012 12:19:58 
-0500 Subject: ext4: remove unused assignment Signed-off-by: Alan Cox Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 27f421c8043..442caae80a9 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1373,7 +1373,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block, ex->fe_start += next; while (needed > ex->fe_len && - (buddy = mb_find_buddy(e4b, order, &max))) { + mb_find_buddy(e4b, order, &max)) { if (block + 1 >= max) break; -- cgit v1.2.3 From 79add3a3f795e688e35d5e703d5a8cfa8ef923ac Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 8 Nov 2012 13:28:29 -0500 Subject: ext4: notify when discard is not supported Notify user when mounting the file system with -o discard option, but the device does not support discard. Obviously we do not want to fail the mount or disable the options, because the underlying device might change in future even without file system remount. Reviewed-by: Carlos Maiolino Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ea21231633e..6729470ee1a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4015,6 +4015,14 @@ no_journal: } #endif /* CONFIG_QUOTA */ + if (test_opt(sb, DISCARD)) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + if (!blk_queue_discard(q)) + ext4_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but " + "the device does not support discard"); + } + ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, *sbi->s_es->s_mount_opts ? 
"; " : "", orig_data); -- cgit v1.2.3 From d71c1ae23aa3e7822715c63dc242de6d73002541 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 8 Nov 2012 14:04:52 -0500 Subject: ext4: warn when discard request fails other than EOPNOTSUPP We should warn user then the discard request fails. However we need to exclude -EOPNOTSUPP case since parts of the device might not support it while other parts can. So print the kernel warning when the error != -EOPNOTSUPP is returned from ext4_issue_discard(). We should also handle error cases in batched discard, again excluding EOPNOTSUPP. Reviewed-by: Carlos Maiolino Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 47 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 442caae80a9..1bf6fe785c4 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2607,9 +2607,17 @@ static void ext4_free_data_callback(struct super_block *sb, mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->efd_count, entry->efd_group, entry); - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, entry->efd_group, - entry->efd_start_cluster, entry->efd_count); + if (test_opt(sb, DISCARD)) { + err = ext4_issue_discard(sb, entry->efd_group, + entry->efd_start_cluster, + entry->efd_count); + if (err && err != -EOPNOTSUPP) + ext4_msg(sb, KERN_WARNING, "discard request in" + " group:%d block:%d count:%d failed" + " with %d", entry->efd_group, + entry->efd_start_cluster, + entry->efd_count, err); + } err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); /* we expect to find existing buddy because it's pinned */ @@ -4659,8 +4667,16 @@ do_more: * with group lock held. 
generate_buddy look at * them with group lock_held */ - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, block_group, bit, count); + if (test_opt(sb, DISCARD)) { + err = ext4_issue_discard(sb, block_group, bit, count); + if (err && err != -EOPNOTSUPP) + ext4_msg(sb, KERN_WARNING, "discard request in" + " group:%d block:%d count:%lu failed" + " with %d", block_group, bit, count, + err); + } + + ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); mb_free_blocks(inode, &e4b, bit, count_clusters); @@ -4854,10 +4870,11 @@ error_return: * one will allocate those blocks, mark it as used in buddy bitmap. This must * be called with under the group lock. */ -static void ext4_trim_extent(struct super_block *sb, int start, int count, +static int ext4_trim_extent(struct super_block *sb, int start, int count, ext4_group_t group, struct ext4_buddy *e4b) { struct ext4_free_extent ex; + int ret = 0; trace_ext4_trim_extent(sb, group, start, count); @@ -4873,9 +4890,10 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, */ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); - ext4_issue_discard(sb, group, start, count); + ret = ext4_issue_discard(sb, group, start, count); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); + return ret; } /** @@ -4904,7 +4922,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, void *bitmap; ext4_grpblk_t next, count = 0, free_count = 0; struct ext4_buddy e4b; - int ret; + int ret = 0; trace_ext4_trim_all_free(sb, group, start, max); @@ -4931,8 +4949,11 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, next = mb_find_next_bit(bitmap, max + 1, start); if ((next - start) >= minblocks) { - ext4_trim_extent(sb, start, - next - start, group, &e4b); + ret = ext4_trim_extent(sb, start, + next - start, group, &e4b); + if (ret && ret != -EOPNOTSUPP) + break; + ret = 0; count += next - start; } free_count += next - start; @@ -4953,8 +4974,10 
@@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, break; } - if (!ret) + if (!ret) { + ret = count; EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); + } out: ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); @@ -4962,7 +4985,7 @@ out: ext4_debug("trimmed %d blocks in the group %d\n", count, group); - return count; + return ret; } /** -- cgit v1.2.3 From b5645534ce84c21695c2f82d4d4f67cf2a67229a Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 14:33:43 -0500 Subject: ext4: print 'flags' in ext4_ext_handle_uninitialized_extents In trace_ext4_ext_handle_uninitialized_extents we don't care about the value of map->m_flags because this value is probably 0, and we prefer to get the value of flags because we can know how to handle this extent in this function. Reviewed-by: Lukas Czerner Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 4 ++-- include/trace/events/ext4.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7011ac96720..59e6e12e002 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3663,8 +3663,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, flags, allocated); ext4_ext_show_leaf(inode, path); - trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated, - newblock); + trace_ext4_ext_handle_uninitialized_extents(inode, map, flags, + allocated, newblock); /* get_block() before submit the IO, split the extent */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index d49b285385e..25914e3002c 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -1680,10 +1680,10 @@ DEFINE_EVENT(ext4__trim, ext4_trim_all_free, ); TRACE_EVENT(ext4_ext_handle_uninitialized_extents, - TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int flags, unsigned int 
allocated, ext4_fsblk_t newblock), - TP_ARGS(inode, map, allocated, newblock), + TP_ARGS(inode, map, flags, allocated, newblock), TP_STRUCT__entry( __field( dev_t, dev ) @@ -1699,7 +1699,7 @@ TRACE_EVENT(ext4_ext_handle_uninitialized_extents, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->flags = map->m_flags; + __entry->flags = flags; __entry->lblk = map->m_lblk; __entry->pblk = map->m_pblk; __entry->len = map->m_len; @@ -1707,7 +1707,7 @@ TRACE_EVENT(ext4_ext_handle_uninitialized_extents, __entry->newblk = newblock; ), - TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %d" + TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %x " "allocated %d newblock %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, -- cgit v1.2.3 From 19b303d8b5a0e8150a4697c01ca03e75a0a17469 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 14:34:04 -0500 Subject: ext4: print map->m_flags in trace_ext4_ext/ind_map_blocks_exit When we use trace_ext4_ext/ind_map_blocks_exit, print the value of map->m_flags in order that we can understand the extent's current status. Reviewed-by: Lukas Czerner Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 3 +-- fs/ext4/indirect.c | 3 +-- include/trace/events/ext4.h | 27 +++++++++++++-------------- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 59e6e12e002..7a64c193b2a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4284,8 +4284,7 @@ out2: kfree(path); } - trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, - newblock, map->m_len, err ? err : allocated); + trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated); return err ? 
err : allocated; } diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 792e388e7b4..292337f27c9 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -755,8 +755,7 @@ cleanup: partial--; } out: - trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, - map->m_pblk, map->m_len, err); + trace_ext4_ind_map_blocks_exit(inode, map, err); return err; } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 25914e3002c..d2a125a6db8 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -1519,10 +1519,9 @@ DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter, ); DECLARE_EVENT_CLASS(ext4__map_blocks_exit, - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, - ext4_fsblk_t pblk, unsigned int len, int ret), + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret), - TP_ARGS(inode, lblk, pblk, len, ret), + TP_ARGS(inode, map, ret), TP_STRUCT__entry( __field( dev_t, dev ) @@ -1530,37 +1529,37 @@ DECLARE_EVENT_CLASS(ext4__map_blocks_exit, __field( ext4_fsblk_t, pblk ) __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) + __field( unsigned int, flags ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->pblk = pblk; - __entry->lblk = lblk; - __entry->len = len; + __entry->pblk = map->m_pblk; + __entry->lblk = map->m_lblk; + __entry->len = map->m_len; + __entry->flags = map->m_flags; __entry->ret = ret; ), - TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u ret %d", + TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u flags %x ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->pblk, - __entry->len, __entry->ret) + __entry->len, __entry->flags, __entry->ret) ); DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit, - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, - ext4_fsblk_t pblk, unsigned len, int ret), + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int 
ret), - TP_ARGS(inode, lblk, pblk, len, ret) + TP_ARGS(inode, map, ret) ); DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit, - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, - ext4_fsblk_t pblk, unsigned len, int ret), + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret), - TP_ARGS(inode, lblk, pblk, len, ret) + TP_ARGS(inode, map, ret) ); TRACE_EVENT(ext4_ext_load_extent, -- cgit v1.2.3 From 37794732467dd998a34bfce19738ad3ef1f37507 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 14:47:52 -0500 Subject: ext4: fix missing call to trace_ext4_ext_map_blocks_exit When ext4_ext_handle_uninitialized_extents() is called, we return directly from ext4_ext_map_blocks(). The trace_ext4_ext_map_blocks_exit trace point isn't called, and the user doesn't see any result. This patch tries to fix this problem. Meanwhile, ext4_ext_handle_uninitialized_extents() returns errors or the number of allocated blocks, so the 'ret' variable can be removed thanks to the previous modifications. 
Signed-off-by: Zheng Liu --- fs/ext4/extents.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7a64c193b2a..dce97de6a40 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3911,7 +3911,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_extent newex, *ex, *ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0; - int free_on_err = 0, err = 0, depth, ret; + int free_on_err = 0, err = 0, depth; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; @@ -4007,10 +4007,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ee_len, ee_start); goto out; } - ret = ext4_ext_handle_uninitialized_extents( + allocated = ext4_ext_handle_uninitialized_extents( handle, inode, map, path, flags, allocated, newblock); - return ret; + goto out3; } } @@ -4284,6 +4284,7 @@ out2: kfree(path); } +out3: trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated); return err ? 
err : allocated; -- cgit v1.2.3 From 8d8c1825709020c73b5e66f96c114f6a1f6461e7 Mon Sep 17 00:00:00 2001 From: Anatol Pomozov Date: Thu, 8 Nov 2012 14:53:35 -0500 Subject: ext4: use 'inode' variable that is already dereferenced Tested: xfs tests Reviewed-by: Zheng Liu Signed-off-by: Anatol Pomozov Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 68e896e12a6..0fd16e653eb 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -111,7 +111,7 @@ static int ext4_end_io(ext4_io_end_t *io) inode_dio_done(inode); /* Wake up anyone waiting on unwritten extent conversion */ if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) - wake_up_all(ext4_ioend_wq(io->inode)); + wake_up_all(ext4_ioend_wq(inode)); return ret; } -- cgit v1.2.3 From 8b0f165f790c897fa744e7fed6f0bfeb6eb6f494 Mon Sep 17 00:00:00 2001 From: Anatol Pomozov Date: Thu, 8 Nov 2012 15:07:16 -0500 Subject: ext4: remove code duplication in ext4_get_block_write_nolock() 729f52c6be51013 introduced function ext4_get_block_write_nolock() that is very similar to _ext4_get_block(). Eliminate code duplication by passing different flags to _ext4_get_block() Tested: xfs tests Reviewed-by: Zheng Liu Signed-off-by: Anatol Pomozov Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 63 ++++++++++++++++++++++----------------------------------- 1 file changed, 24 insertions(+), 39 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b3c243b9afa..f84bfd6d186 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -683,7 +683,7 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; - if (flags && !handle) { + if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) { /* Direct IO write... 
*/ if (map.m_len > DIO_MAX_BLOCKS) map.m_len = DIO_MAX_BLOCKS; @@ -880,6 +880,8 @@ static int do_journal_get_write_access(handle_t *handle, static int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); +static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -2850,29 +2852,12 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock, } static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int flags) + struct buffer_head *bh_result, int create) { - handle_t *handle = ext4_journal_current_handle(); - struct ext4_map_blocks map; - int ret = 0; - - ext4_debug("ext4_get_block_write_nolock: inode %lu, flag %d\n", - inode->i_ino, flags); - - flags = EXT4_GET_BLOCKS_NO_LOCK; - - map.m_lblk = iblock; - map.m_len = bh_result->b_size >> inode->i_blkbits; - - ret = ext4_map_blocks(handle, inode, &map, flags); - if (ret > 0) { - map_bh(bh_result, inode->i_sb, map.m_pblk); - bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) | - map.m_flags; - bh_result->b_size = inode->i_sb->s_blocksize * map.m_len; - ret = 0; - } - return ret; + ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n", + inode->i_ino, create); + return _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_NO_LOCK); } static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, @@ -3003,6 +2988,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, loff_t final_size = offset + count; if (rw == WRITE && final_size <= inode->i_size) { int overwrite = 0; + get_block_t *get_block_func = NULL; + int dio_flags = 0; BUG_ON(iocb->private == NULL); @@ -3056,22 +3043,20 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, 
ext4_inode_aio_set(inode, io_end); } - if (overwrite) - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext4_get_block_write_nolock, - ext4_end_io_dio, - NULL, - 0); - else - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext4_get_block_write, - ext4_end_io_dio, - NULL, - DIO_LOCKING); + if (overwrite) { + get_block_func = ext4_get_block_write_nolock; + } else { + get_block_func = ext4_get_block_write; + dio_flags = DIO_LOCKING; + } + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + get_block_func, + ext4_end_io_dio, + NULL, + dio_flags); + if (iocb->private) ext4_inode_aio_set(inode, NULL); /* -- cgit v1.2.3 From 24ec19b0ae83a385ad9c55520716da671274b96c Mon Sep 17 00:00:00 2001 From: Eugene Shatokhin Date: Thu, 8 Nov 2012 15:11:11 -0500 Subject: ext4: fix memory leak in ext4_xattr_set_acl()'s error path In ext4_xattr_set_acl(), if ext4_journal_start() returns an error, posix_acl_release() will not be called for 'acl' which may result in a memory leak. This patch fixes that. 
Reviewed-by: Lukas Czerner Signed-off-by: Eugene Shatokhin Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/acl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index d3c5b88fd89..e6e0d988439 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -423,8 +423,10 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, retry: handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto release_and_out; + } error = ext4_set_acl(handle, inode, type, acl); ext4_journal_stop(handle); if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) -- cgit v1.2.3 From 07aa2ea13814ea60d12f7330b6d5ccfdb0c3ba4d Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 8 Nov 2012 15:16:54 -0500 Subject: ext4: fix error handling in ext4_fill_super() There are some places in ext4_fill_super() where we would not return a proper error code if something fails. The confusion is probably caused by the fact that we have two "kind-of" return variables, 'ret' and 'err'. 'ret' is used to return the error code from ext4_fill_super(), whereas 'err' is used to store return values from other functions within ext4_fill_super(). However some places were missing the obligatory 'ret = err'. We could put the assignment where it is missing, but we can have a better "future proof" solution. Or we could convert the code to use just one variable, but it would require more rewrites. This commit fixes the problem by returning the value of the 'err' variable if it is set and 'ret' otherwise in the error handling branch of ext4_fill_super(). The reasoning is that the 'ret' value is often set to the default "-EINVAL" or an explicit value, whereas 'err' is used to store return values from other functions and should otherwise be zero. 
https://bugzilla.kernel.org/show_bug.cgi?id=48431 Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6729470ee1a..18e89fafebd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3256,7 +3256,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) unsigned int i; int needs_recovery, has_huge_files, has_bigalloc; __u64 blocks_count; - int err; + int err = 0; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; ext4_group_t first_not_zeroed; @@ -3282,6 +3282,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) for (cp = sb->s_id; (cp = strchr(cp, '/'));) *cp = '!'; + /* -EINVAL is default */ ret = -EINVAL; blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); if (!blocksize) { @@ -3659,7 +3660,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) " too large to mount safely on this system"); if (sizeof(sector_t) < 8) ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); - ret = err; goto failed_mount; } @@ -3767,7 +3767,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); - ret = err; goto failed_mount3; } @@ -3894,8 +3893,8 @@ no_journal: if (es->s_overhead_clusters) sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); else { - ret = ext4_calculate_overhead(sb); - if (ret) + err = ext4_calculate_overhead(sb); + if (err) goto failed_mount_wq; } @@ -3907,6 +3906,7 @@ no_journal: alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); if (!EXT4_SB(sb)->dio_unwritten_wq) { printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); + ret = -ENOMEM; goto failed_mount_wq; } @@ -4009,8 +4009,8 @@ no_journal: /* Enable quota usage during mount. 
*/ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && !(sb->s_flags & MS_RDONLY)) { - ret = ext4_enable_quotas(sb); - if (ret) + err = ext4_enable_quotas(sb); + if (err) goto failed_mount7; } #endif /* CONFIG_QUOTA */ @@ -4089,7 +4089,7 @@ out_fail: kfree(sbi); out_free_orig: kfree(orig_data); - return ret; + return err ? err : ret; } /* -- cgit v1.2.3 From c0677e6d0f9d991adff972b8d06cb83de1f8ee8e Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 15:18:54 -0500 Subject: ext4: add data structures for the extent status tree This patch adds two structures that supports extent status tree, extent_status and ext4_es_tree. Currently extent_status is used to track a delay extent for an inode, which record the start block and the length of the delay extent. ext4_es_tree is used to store all extent_status for an inode in memory. Signed-off-by: Yongqiang Yang Signed-off-by: Allison Henderson Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 6 ++++++ fs/ext4/extents_status.h | 25 +++++++++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 fs/ext4/extents_status.h diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3c20de1d59d..bcc634b26d4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -811,6 +811,8 @@ struct ext4_ext_cache { __u32 ec_len; /* must be 32bit to return holes */ }; +#include "extents_status.h" + /* * fourth extended file system inode data in memory */ @@ -888,6 +890,10 @@ struct ext4_inode_info { struct list_head i_prealloc_list; spinlock_t i_prealloc_lock; + /* extents status tree */ + struct ext4_es_tree i_es_tree; + rwlock_t i_es_lock; + /* ialloc */ ext4_group_t i_last_alloc_group; diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h new file mode 100644 index 00000000000..8be2ab9c942 --- /dev/null +++ b/fs/ext4/extents_status.h @@ -0,0 +1,25 @@ +/* + * fs/ext4/extents_status.h + * + * Written by Yongqiang Yang + * Modified by + * Allison Henderson + * Zheng Liu + * + 
*/ + +#ifndef _EXT4_EXTENTS_STATUS_H +#define _EXT4_EXTENTS_STATUS_H + +struct extent_status { + struct rb_node rb_node; + ext4_lblk_t start; /* first block extent covers */ + ext4_lblk_t len; /* length of extent in block */ +}; + +struct ext4_es_tree { + struct rb_root root; + struct extent_status *cache_es; /* recently accessed extent */ +}; + +#endif /* _EXT4_EXTENTS_STATUS_H */ -- cgit v1.2.3 From 654598bef3731c9ae9b068ac35e6b69674c02841 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 21:57:20 -0500 Subject: ext4: add operations on extent status tree This patch adds operations on a extent status tree. CC: Lukas Czerner Signed-off-by: Yongqiang Yang Signed-off-by: Allison Henderson Signed-off-by: Hugh Dickins Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/Makefile | 2 +- fs/ext4/extents_status.c | 492 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/extents_status.h | 20 ++ 3 files changed, 513 insertions(+), 1 deletion(-) create mode 100644 fs/ext4/extents_status.c diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 56fd8f86593..41f22be2ffa 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ - mmp.o indirect.o + mmp.o indirect.o extents_status.o ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c new file mode 100644 index 00000000000..02c09be3d77 --- /dev/null +++ b/fs/ext4/extents_status.c @@ -0,0 +1,492 @@ +/* + * fs/ext4/extents_status.c + * + * Written by Yongqiang Yang + * Modified by + * Allison Henderson + * Hugh Dickins + * Zheng Liu + * + * Ext4 extents status tree core functions. 
+ */ +#include +#include "ext4.h" +#include "extents_status.h" +#include "ext4_extents.h" + +/* + * According to previous discussion in Ext4 Developer Workshop, we + * will introduce a new structure called io tree to track all extent + * status in order to solve some problems that we have met + * (e.g. Reservation space warning), and provide extent-level locking. + * Delay extent tree is the first step to achieve this goal. It was + * originally built by Yongqiang Yang. At that time it was called delay + * extent tree, whose goal was only to track delay extents in memory to + * simplify the implementation of fiemap and bigalloc, and introduce + * lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called + * delay extent tree in the following comment. But to better + * describe what it does, it has been renamed to extent status tree. + * + * Currently the first step has been done. All delay extents are + * tracked in the tree. It maintains the delay extent when a delay + * allocation is issued, and when the delay extent is written out or + * invalidated. Therefore the implementations of fiemap and bigalloc + * are simplified, and SEEK_DATA/SEEK_HOLE are introduced. + * + * The following comment describes the implementation of extent + * status tree and future work. + */ + +/* + * extents status tree implementation for ext4. + * + * + * ========================================================================== + * Extents status encompass delayed extents and extent locks + * + * 1. Why delayed extent implementation? + * + * Without delayed extent, ext4 identifies a delayed extent by looking + * up page cache; this has several deficiencies - complicated, buggy, + * and inefficient code. + * + * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need + * to know if a block or a range of blocks belongs to a delayed + * extent. + * + * Let us have a look at how they work without delayed extents implementation.
+ * -- FIEMAP + * FIEMAP looks up page cache to distinguish delayed allocations from holes. + * + * -- SEEK_HOLE/DATA + * SEEK_HOLE/DATA has the same problem as FIEMAP. + * + * -- bigalloc + * bigalloc looks up page cache to figure out if a block is + * already under delayed allocation or not to determine whether + * quota reserving is needed for the cluster. + * + * -- punch hole + * punch hole looks up page cache to identify a delayed extent. + * + * -- writeout + * Writeout looks up whole page cache to see if a buffer is + * mapped. If there are not very many delayed buffers, then it is + * time consuming. + * + * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA, + * bigalloc and writeout can figure out if a block or a range of + * blocks is under delayed allocation (belongs to a delayed extent) or + * not by searching the delayed extent tree. + * + * + * ========================================================================== + * 2. ext4 delayed extents implementation + * + * -- delayed extent + * A delayed extent is a range of blocks which are contiguous + * logically and under delayed allocation. Unlike extent in + * ext4, delayed extent in ext4 is an in-memory struct; there is + * no corresponding on-disk data. There is no limit on length of + * delayed extent, so a delayed extent can contain as many blocks + * as long as they are contiguous logically. + * + * -- delayed extent tree + * Every inode has a delayed extent tree and all blocks under + * delayed allocation are added to the tree as delayed extents. + * Delayed extents in the tree are ordered by logical block no. + * + * -- operations on a delayed extent tree + * There are three operations on a delayed extent tree: find next + * delayed extent, adding a space (a range of blocks) and removing + * a space. + * + * -- race on a delayed extent tree + * Delayed extent tree is protected by inode->i_es_lock. + * + * + * ========================================================================== + * 3.
performance analysis + * -- overhead + * 1. There is a cache extent for write access, so if writes are + * not very random, adding space operaions are in O(1) time. + * + * -- gain + * 2. Code is much simpler, more readable, more maintainable and + * more efficient. + * + * + * ========================================================================== + * 4. TODO list + * -- Track all extent status + * + * -- Improve get block process + * + * -- Extent-level locking + */ + +static struct kmem_cache *ext4_es_cachep; + +int __init ext4_init_es(void) +{ + ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT); + if (ext4_es_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ext4_exit_es(void) +{ + if (ext4_es_cachep) + kmem_cache_destroy(ext4_es_cachep); +} + +void ext4_es_init_tree(struct ext4_es_tree *tree) +{ + tree->root = RB_ROOT; + tree->cache_es = NULL; +} + +#ifdef ES_DEBUG__ +static void ext4_es_print_tree(struct inode *inode) +{ + struct ext4_es_tree *tree; + struct rb_node *node; + + printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino); + tree = &EXT4_I(inode)->i_es_tree; + node = rb_first(&tree->root); + while (node) { + struct extent_status *es; + es = rb_entry(node, struct extent_status, rb_node); + printk(KERN_DEBUG " [%u/%u)", es->start, es->len); + node = rb_next(node); + } + printk(KERN_DEBUG "\n"); +} +#else +#define ext4_es_print_tree(inode) +#endif + +static inline ext4_lblk_t extent_status_end(struct extent_status *es) +{ + BUG_ON(es->start + es->len < es->start); + return es->start + es->len - 1; +} + +/* + * search through the tree for an delayed extent with a given offset. If + * it can't be found, try to find next extent. 
+ */ +static struct extent_status *__es_tree_search(struct rb_root *root, + ext4_lblk_t offset) +{ + struct rb_node *node = root->rb_node; + struct extent_status *es = NULL; + + while (node) { + es = rb_entry(node, struct extent_status, rb_node); + if (offset < es->start) + node = node->rb_left; + else if (offset > extent_status_end(es)) + node = node->rb_right; + else + return es; + } + + if (es && offset < es->start) + return es; + + if (es && offset > extent_status_end(es)) { + node = rb_next(&es->rb_node); + return node ? rb_entry(node, struct extent_status, rb_node) : + NULL; + } + + return NULL; +} + +/* + * ext4_es_find_extent: find the 1st delayed extent covering @es->start + * if it exists, otherwise, the next extent after @es->start. + * + * @inode: the inode which owns delayed extents + * @es: delayed extent that we found + * + * Returns the first block of the next extent after es, otherwise + * EXT_MAX_BLOCKS if no delay extent is found. + * Delayed extent is returned via @es. 
+ */ +ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es) +{ + struct ext4_es_tree *tree = NULL; + struct extent_status *es1 = NULL; + struct rb_node *node; + ext4_lblk_t ret = EXT_MAX_BLOCKS; + + read_lock(&EXT4_I(inode)->i_es_lock); + tree = &EXT4_I(inode)->i_es_tree; + + /* find delay extent in cache firstly */ + if (tree->cache_es) { + es1 = tree->cache_es; + if (in_range(es->start, es1->start, es1->len)) { + es_debug("%u cached by [%u/%u)\n", + es->start, es1->start, es1->len); + goto out; + } + } + + es->len = 0; + es1 = __es_tree_search(&tree->root, es->start); + +out: + if (es1) { + tree->cache_es = es1; + es->start = es1->start; + es->len = es1->len; + node = rb_next(&es1->rb_node); + if (node) { + es1 = rb_entry(node, struct extent_status, rb_node); + ret = es1->start; + } + } + + read_unlock(&EXT4_I(inode)->i_es_lock); + return ret; +} + +static struct extent_status * +ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len) +{ + struct extent_status *es; + es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); + if (es == NULL) + return NULL; + es->start = start; + es->len = len; + return es; +} + +static void ext4_es_free_extent(struct extent_status *es) +{ + kmem_cache_free(ext4_es_cachep, es); +} + +static struct extent_status * +ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es) +{ + struct extent_status *es1; + struct rb_node *node; + + node = rb_prev(&es->rb_node); + if (!node) + return es; + + es1 = rb_entry(node, struct extent_status, rb_node); + if (es->start == extent_status_end(es1) + 1) { + es1->len += es->len; + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(es); + es = es1; + } + + return es; +} + +static struct extent_status * +ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es) +{ + struct extent_status *es1; + struct rb_node *node; + + node = rb_next(&es->rb_node); + if (!node) + return es; + + es1 = rb_entry(node, struct extent_status, rb_node); 
+ if (es1->start == extent_status_end(es) + 1) { + es->len += es1->len; + rb_erase(node, &tree->root); + ext4_es_free_extent(es1); + } + + return es; +} + +static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset, + ext4_lblk_t len) +{ + struct rb_node **p = &tree->root.rb_node; + struct rb_node *parent = NULL; + struct extent_status *es; + ext4_lblk_t end = offset + len - 1; + + BUG_ON(end < offset); + es = tree->cache_es; + if (es && offset == (extent_status_end(es) + 1)) { + es_debug("cached by [%u/%u)\n", es->start, es->len); + es->len += len; + es = ext4_es_try_to_merge_right(tree, es); + goto out; + } else if (es && es->start == end + 1) { + es_debug("cached by [%u/%u)\n", es->start, es->len); + es->start = offset; + es->len += len; + es = ext4_es_try_to_merge_left(tree, es); + goto out; + } else if (es && es->start <= offset && + end <= extent_status_end(es)) { + es_debug("cached by [%u/%u)\n", es->start, es->len); + goto out; + } + + while (*p) { + parent = *p; + es = rb_entry(parent, struct extent_status, rb_node); + + if (offset < es->start) { + if (es->start == end + 1) { + es->start = offset; + es->len += len; + es = ext4_es_try_to_merge_left(tree, es); + goto out; + } + p = &(*p)->rb_left; + } else if (offset > extent_status_end(es)) { + if (offset == extent_status_end(es) + 1) { + es->len += len; + es = ext4_es_try_to_merge_right(tree, es); + goto out; + } + p = &(*p)->rb_right; + } else { + if (extent_status_end(es) <= end) + es->len = offset - es->start + len; + goto out; + } + } + + es = ext4_es_alloc_extent(offset, len); + if (!es) + return -ENOMEM; + rb_link_node(&es->rb_node, parent, p); + rb_insert_color(&es->rb_node, &tree->root); + +out: + tree->cache_es = es; + return 0; +} + +/* + * ext4_es_insert_extent() adds a space to a delayed extent tree. + * Caller holds inode->i_es_lock. + * + * ext4_es_insert_extent is called by ext4_da_write_begin and + * ext4_es_remove_extent. + * + * Return 0 on success, error code on failure. 
+ */ +int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset, + ext4_lblk_t len) +{ + struct ext4_es_tree *tree; + int err = 0; + + es_debug("add [%u/%u) to extent status tree of inode %lu\n", + offset, len, inode->i_ino); + + write_lock(&EXT4_I(inode)->i_es_lock); + tree = &EXT4_I(inode)->i_es_tree; + err = __es_insert_extent(tree, offset, len); + write_unlock(&EXT4_I(inode)->i_es_lock); + + ext4_es_print_tree(inode); + + return err; +} + +/* + * ext4_es_remove_extent() removes a space from a delayed extent tree. + * Caller holds inode->i_es_lock. + * + * Return 0 on success, error code on failure. + */ +int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset, + ext4_lblk_t len) +{ + struct rb_node *node; + struct ext4_es_tree *tree; + struct extent_status *es; + struct extent_status orig_es; + ext4_lblk_t len1, len2, end; + int err = 0; + + es_debug("remove [%u/%u) from extent status tree of inode %lu\n", + offset, len, inode->i_ino); + + end = offset + len - 1; + BUG_ON(end < offset); + write_lock(&EXT4_I(inode)->i_es_lock); + tree = &EXT4_I(inode)->i_es_tree; + es = __es_tree_search(&tree->root, offset); + if (!es) + goto out; + if (es->start > end) + goto out; + + /* Simply invalidate cache_es. */ + tree->cache_es = NULL; + + orig_es.start = es->start; + orig_es.len = es->len; + len1 = offset > es->start ? offset - es->start : 0; + len2 = extent_status_end(es) > end ? 
+ extent_status_end(es) - end : 0; + if (len1 > 0) + es->len = len1; + if (len2 > 0) { + if (len1 > 0) { + err = __es_insert_extent(tree, end + 1, len2); + if (err) { + es->start = orig_es.start; + es->len = orig_es.len; + goto out; + } + } else { + es->start = end + 1; + es->len = len2; + } + goto out; + } + + if (len1 > 0) { + node = rb_next(&es->rb_node); + if (node) + es = rb_entry(node, struct extent_status, rb_node); + else + es = NULL; + } + + while (es && extent_status_end(es) <= end) { + node = rb_next(&es->rb_node); + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(es); + if (!node) { + es = NULL; + break; + } + es = rb_entry(node, struct extent_status, rb_node); + } + + if (es && es->start < end + 1) { + len1 = extent_status_end(es) - end; + es->start = end + 1; + es->len = len1; + } + +out: + write_unlock(&EXT4_I(inode)->i_es_lock); + ext4_es_print_tree(inode); + return err; +} diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 8be2ab9c942..077f82db092 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -11,6 +11,15 @@ #ifndef _EXT4_EXTENTS_STATUS_H #define _EXT4_EXTENTS_STATUS_H +/* + * Turn on ES_DEBUG__ to get lots of info about extent status operations. + */ +#ifdef ES_DEBUG__ +#define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define es_debug(fmt, ...) 
no_printk(fmt, ##__VA_ARGS__) +#endif + struct extent_status { struct rb_node rb_node; ext4_lblk_t start; /* first block extent covers */ @@ -22,4 +31,15 @@ struct ext4_es_tree { struct extent_status *cache_es; /* recently accessed extent */ }; +extern int __init ext4_init_es(void); +extern void ext4_exit_es(void); +extern void ext4_es_init_tree(struct ext4_es_tree *tree); + +extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t len); +extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t len); +extern ext4_lblk_t ext4_es_find_extent(struct inode *inode, + struct extent_status *es); + #endif /* _EXT4_EXTENTS_STATUS_H */ -- cgit v1.2.3 From 9a26b66175e1c221f39bbe09e2e1d0a31a14ba6d Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 21:57:30 -0500 Subject: ext4: initialize extent status tree Let ext4 initialize extent status tree of an inode. Signed-off-by: Yongqiang Yang Signed-off-by: Allison Henderson Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 18e89fafebd..6791d091fbc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -943,6 +943,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); + ext4_es_init_tree(&ei->i_es_tree); + rwlock_init(&ei->i_es_lock); ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; -- cgit v1.2.3 From 51865fda28e585bdcc164474ff6438a9ccdbfada Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 21:57:32 -0500 Subject: ext4: let ext4 maintain extent status tree This patch lets ext4 maintain extent status tree. Currently it only tracks delay extent status in extent status tree. 
When a delay allocation is issued, the related delay extent will be inserted into extent status tree. When a delay extent is written out or invalidated, it will be removed from this tree. Signed-off-by: Yongqiang Yang Signed-off-by: Allison Henderson Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 4 ++++ fs/ext4/indirect.c | 1 + fs/ext4/inode.c | 38 +++++++++++++++++++++++++++++++++++--- fs/ext4/super.c | 12 +++++++++++- 4 files changed, 51 insertions(+), 4 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index dce97de6a40..67660fa2a7e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4344,6 +4344,8 @@ void ext4_ext_truncate(struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); + err = ext4_es_remove_extent(inode, last_block, + EXT_MAX_BLOCKS - last_block); err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); /* In a multi-transaction truncate, we only make the final @@ -4971,6 +4973,8 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) ext4_ext_invalidate_cache(inode); ext4_discard_preallocations(inode); + err = ext4_es_remove_extent(inode, first_block, + stop_block - first_block); err = ext4_ext_remove_space(inode, first_block, stop_block - 1); ext4_ext_invalidate_cache(inode); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 292337f27c9..f6663c3a946 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1411,6 +1411,7 @@ void ext4_ind_truncate(struct inode *inode) down_write(&ei->i_data_sem); ext4_discard_preallocations(inode); + ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); /* * The orphan list entry will now protect us from any crash which diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f84bfd6d186..1e92349272e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -574,7 +574,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, 
up_read((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, map); + int ret; + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + /* delayed alloc may be allocated by fallocate and + * coverted to initialized by directIO. + * we need to handle delayed extent here. + */ + down_write((&EXT4_I(inode)->i_data_sem)); + goto delayed_mapped; + } + ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -656,8 +665,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * set the BH_Da_Mapped bit on them. Its important to do this * under the protection of i_data_sem. */ - if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { + int ret; set_buffers_da_mapped(inode, map); +delayed_mapped: + /* delayed allocation blocks has been allocated */ + ret = ext4_es_remove_extent(inode, map->m_lblk, + map->m_len); + if (ret < 0) + retval = ret; + } } up_write((&EXT4_I(inode)->i_data_sem)); @@ -1303,6 +1320,7 @@ static void ext4_da_page_release_reservation(struct page *page, struct inode *inode = page->mapping->host; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int num_clusters; + ext4_fsblk_t lblk; head = page_buffers(page); bh = head; @@ -1317,11 +1335,15 @@ static void ext4_da_page_release_reservation(struct page *page, curr_off = next_off; } while ((bh = bh->b_this_page) != head); + if (to_release) { + lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + ext4_es_remove_extent(inode, lblk, to_release); + } + /* If we have released all the blocks belonging to a cluster, then we * need to release the reserved space for that cluster. 
*/ num_clusters = EXT4_NUM_B2C(sbi, to_release); while (num_clusters > 0) { - ext4_fsblk_t lblk; lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + ((num_clusters - 1) << sbi->s_cluster_bits); if (sbi->s_cluster_ratio == 1 || @@ -1502,9 +1524,15 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) struct pagevec pvec; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; + ext4_lblk_t start, last; index = mpd->first_page; end = mpd->next_page - 1; + + start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); + ext4_es_remove_extent(inode, start, last - start + 1); + while (index <= end) { nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); if (nr_pages == 0) @@ -1816,6 +1844,10 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, goto out_unlock; } + retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len); + if (retval) + goto out_unlock; + /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served * and it should not appear on the bh->b_state. 
*/ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6791d091fbc..ad6cd8aeb94 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -50,6 +50,7 @@ #include "xattr.h" #include "acl.h" #include "mballoc.h" +#include "ext4_extents.h" #define CREATE_TRACE_POINTS #include @@ -1033,6 +1034,7 @@ void ext4_clear_inode(struct inode *inode) clear_inode(inode); dquot_drop(inode); ext4_discard_preallocations(inode); + ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); @@ -5296,9 +5298,14 @@ static int __init ext4_init_fs(void) init_waitqueue_head(&ext4__ioend_wq[i]); } - err = ext4_init_pageio(); + err = ext4_init_es(); if (err) return err; + + err = ext4_init_pageio(); + if (err) + goto out7; + err = ext4_init_system_zone(); if (err) goto out6; @@ -5348,6 +5355,9 @@ out5: ext4_exit_system_zone(); out6: ext4_exit_pageio(); +out7: + ext4_exit_es(); + return err; } -- cgit v1.2.3 From 992e9fdd7b3f656ab8aea895f0038336950774ed Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 21:57:33 -0500 Subject: ext4: add some tracepoints in extent status tree This patch adds some tracepoints in extent status tree. 
Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents_status.c | 8 ++++ include/trace/events/ext4.h | 101 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 02c09be3d77..564d981a2fc 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -14,6 +14,8 @@ #include "extents_status.h" #include "ext4_extents.h" +#include + /* * According to previous discussion in Ext4 Developer Workshop, we * will introduce a new structure called io tree to track all extent @@ -224,6 +226,8 @@ ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es) struct rb_node *node; ext4_lblk_t ret = EXT_MAX_BLOCKS; + trace_ext4_es_find_extent_enter(inode, es->start); + read_lock(&EXT4_I(inode)->i_es_lock); tree = &EXT4_I(inode)->i_es_tree; @@ -253,6 +257,8 @@ out: } read_unlock(&EXT4_I(inode)->i_es_lock); + + trace_ext4_es_find_extent_exit(inode, es, ret); return ret; } @@ -393,6 +399,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset, struct ext4_es_tree *tree; int err = 0; + trace_ext4_es_insert_extent(inode, offset, len); es_debug("add [%u/%u) to extent status tree of inode %lu\n", offset, len, inode->i_ino); @@ -422,6 +429,7 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset, ext4_lblk_t len1, len2, end; int err = 0; + trace_ext4_es_remove_extent(inode, offset, len); es_debug("remove [%u/%u) from extent status tree of inode %lu\n", offset, len, inode->i_ino); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index d2a125a6db8..f6372b01136 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -15,6 +15,7 @@ struct ext4_inode_info; struct mpage_da_data; struct ext4_map_blocks; struct ext4_extent; +struct extent_status; #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) @@ -2054,6 +2055,106 @@ 
TRACE_EVENT(ext4_ext_remove_space_done, (unsigned short) __entry->eh_entries) ); +TRACE_EVENT(ext4_es_insert_extent, + TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t len), + + TP_ARGS(inode, start, len), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( loff_t, start ) + __field( loff_t, len ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->start = start; + __entry->len = len; + ), + + TP_printk("dev %d,%d ino %lu es [%lld/%lld)", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->start, __entry->len) +); + +TRACE_EVENT(ext4_es_remove_extent, + TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t len), + + TP_ARGS(inode, start, len), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( loff_t, start ) + __field( loff_t, len ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->start = start; + __entry->len = len; + ), + + TP_printk("dev %d,%d ino %lu es [%lld/%lld)", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->start, __entry->len) +); + +TRACE_EVENT(ext4_es_find_extent_enter, + TP_PROTO(struct inode *inode, ext4_lblk_t start), + + TP_ARGS(inode, start), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ext4_lblk_t, start ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->start = start; + ), + + TP_printk("dev %d,%d ino %lu start %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, __entry->start) +); + +TRACE_EVENT(ext4_es_find_extent_exit, + TP_PROTO(struct inode *inode, struct extent_status *es, + ext4_lblk_t ret), + + TP_ARGS(inode, es, ret), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ext4_lblk_t, start ) + __field( ext4_lblk_t, len ) + __field( 
ext4_lblk_t, ret ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->start = es->start; + __entry->len = es->len; + __entry->ret = ret; + ), + + TP_printk("dev %d,%d ino %lu es [%u/%u) ret %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->start, __entry->len, __entry->ret) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 7d1b1fbc95ebf41fee246dde437a77921f3bfec5 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 21:57:35 -0500 Subject: ext4: reimplement ext4_find_delay_alloc_range on extent status tree Signed-off-by: Yongqiang Yang Signed-off-by: Allison Henderson Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 4 -- fs/ext4/ext4_extents.h | 3 +- fs/ext4/extents.c | 117 ++++++++----------------------------------------- fs/ext4/inode.c | 53 +--------------------- 4 files changed, 20 insertions(+), 157 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bcc634b26d4..246e38f3915 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2451,14 +2451,10 @@ enum ext4_state_bits { * never, ever appear in a buffer_head's state * flag. See EXT4_MAP_FROM_CLUSTER to see where * this is used. */ - BH_Da_Mapped, /* Delayed allocated block that now has a mapping. This - * flag is set when ext4_map_blocks is called on a - * delayed allocated block to get its real mapping. 
*/ }; BUFFER_FNS(Uninit, uninit) TAS_BUFFER_FNS(Uninit, uninit) -BUFFER_FNS(Da_Mapped, da_mapped) /* * Add new method to test wether block and inode bitmaps are properly diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index cb1b2c91996..603bb114735 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -314,7 +314,6 @@ extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path *); extern void ext4_ext_drop_refs(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); -extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, - int search_hint_reverse); +extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 67660fa2a7e..e0bedd1a4ac 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3461,115 +3461,34 @@ out: /** * ext4_find_delalloc_range: find delayed allocated block in the given range. * - * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns - * whether there are any buffers marked for delayed allocation. It returns '1' - * on the first delalloc'ed buffer head found. If no buffer head in the given - * range is marked for delalloc, it returns 0. - * lblk_start should always be <= lblk_end. - * search_hint_reverse is to indicate that searching in reverse from lblk_end to - * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed - * block sooner). This is useful when blocks are truncated sequentially from - * lblk_start towards lblk_end. + * Return 1 if there is a delalloc block in the range, otherwise 0. 
*/ static int ext4_find_delalloc_range(struct inode *inode, ext4_lblk_t lblk_start, - ext4_lblk_t lblk_end, - int search_hint_reverse) + ext4_lblk_t lblk_end) { - struct address_space *mapping = inode->i_mapping; - struct buffer_head *head, *bh = NULL; - struct page *page; - ext4_lblk_t i, pg_lblk; - pgoff_t index; - - if (!test_opt(inode->i_sb, DELALLOC)) - return 0; - - /* reverse search wont work if fs block size is less than page size */ - if (inode->i_blkbits < PAGE_CACHE_SHIFT) - search_hint_reverse = 0; + struct extent_status es; - if (search_hint_reverse) - i = lblk_end; + es.start = lblk_start; + ext4_es_find_extent(inode, &es); + if (es.len == 0) + return 0; /* there is no delay extent in this tree */ + else if (es.start <= lblk_start && lblk_start < es.start + es.len) + return 1; + else if (lblk_start <= es.start && es.start <= lblk_end) + return 1; else - i = lblk_start; - - index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - - while ((i >= lblk_start) && (i <= lblk_end)) { - page = find_get_page(mapping, index); - if (!page) - goto nextpage; - - if (!page_has_buffers(page)) - goto nextpage; - - head = page_buffers(page); - if (!head) - goto nextpage; - - bh = head; - pg_lblk = index << (PAGE_CACHE_SHIFT - - inode->i_blkbits); - do { - if (unlikely(pg_lblk < lblk_start)) { - /* - * This is possible when fs block size is less - * than page size and our cluster starts/ends in - * middle of the page. So we need to skip the - * initial few blocks till we reach the 'lblk' - */ - pg_lblk++; - continue; - } - - /* Check if the buffer is delayed allocated and that it - * is not yet mapped. (when da-buffers are mapped during - * their writeout, their da_mapped bit is set.) 
- */ - if (buffer_delay(bh) && !buffer_da_mapped(bh)) { - page_cache_release(page); - trace_ext4_find_delalloc_range(inode, - lblk_start, lblk_end, - search_hint_reverse, - 1, i); - return 1; - } - if (search_hint_reverse) - i--; - else - i++; - } while ((i >= lblk_start) && (i <= lblk_end) && - ((bh = bh->b_this_page) != head)); -nextpage: - if (page) - page_cache_release(page); - /* - * Move to next page. 'i' will be the first lblk in the next - * page. - */ - if (search_hint_reverse) - index--; - else - index++; - i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - } - - trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end, - search_hint_reverse, 0, 0); - return 0; + return 0; } -int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, - int search_hint_reverse) +int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t lblk_start, lblk_end; lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); lblk_end = lblk_start + sbi->s_cluster_ratio - 1; - return ext4_find_delalloc_range(inode, lblk_start, lblk_end, - search_hint_reverse); + return ext4_find_delalloc_range(inode, lblk_start, lblk_end); } /** @@ -3630,7 +3549,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); lblk_to = lblk_from + c_offset - 1; - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) allocated_clusters--; } @@ -3640,7 +3559,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, lblk_from = lblk_start + num_blks; lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) allocated_clusters--; } @@ -3927,7 +3846,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (ext4_ext_in_cache(inode, 
map->m_lblk, &newex)) { if (!newex.ee_start_lo && !newex.ee_start_hi) { if ((sbi->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) + ext4_find_delalloc_cluster(inode, map->m_lblk)) map->m_flags |= EXT4_MAP_FROM_CLUSTER; if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { @@ -4015,7 +3934,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, } if ((sbi->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) + ext4_find_delalloc_cluster(inode, map->m_lblk)) map->m_flags |= EXT4_MAP_FROM_CLUSTER; /* diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1e92349272e..7f9ccc1381a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -483,49 +483,6 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, return num; } -/* - * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map. - */ -static void set_buffers_da_mapped(struct inode *inode, - struct ext4_map_blocks *map) -{ - struct address_space *mapping = inode->i_mapping; - struct pagevec pvec; - int i, nr_pages; - pgoff_t index, end; - - index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - end = (map->m_lblk + map->m_len - 1) >> - (PAGE_CACHE_SHIFT - inode->i_blkbits); - - pagevec_init(&pvec, 0); - while (index <= end) { - nr_pages = pagevec_lookup(&pvec, mapping, index, - min(end - index + 1, - (pgoff_t)PAGEVEC_SIZE)); - if (nr_pages == 0) - break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - struct buffer_head *bh, *head; - - if (unlikely(page->mapping != mapping) || - !PageDirty(page)) - break; - - if (page_has_buffers(page)) { - bh = head = page_buffers(page); - do { - set_buffer_da_mapped(bh); - bh = bh->b_this_page; - } while (bh != head); - } - index++; - } - pagevec_release(&pvec); - } -} - /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. 
@@ -661,13 +618,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); - /* If we have successfully mapped the delayed allocated blocks, - * set the BH_Da_Mapped bit on them. Its important to do this - * under the protection of i_data_sem. - */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { int ret; - set_buffers_da_mapped(inode, map); delayed_mapped: /* delayed allocation blocks has been allocated */ ret = ext4_es_remove_extent(inode, map->m_lblk, @@ -1330,7 +1282,6 @@ static void ext4_da_page_release_reservation(struct page *page, if ((offset <= curr_off) && (buffer_delay(bh))) { to_release++; clear_buffer_delay(bh); - clear_buffer_da_mapped(bh); } curr_off = next_off; } while ((bh = bh->b_this_page) != head); @@ -1347,7 +1298,7 @@ static void ext4_da_page_release_reservation(struct page *page, lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + ((num_clusters - 1) << sbi->s_cluster_bits); if (sbi->s_cluster_ratio == 1 || - !ext4_find_delalloc_cluster(inode, lblk, 1)) + !ext4_find_delalloc_cluster(inode, lblk)) ext4_da_release_space(inode, 1); num_clusters--; @@ -1453,8 +1404,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, clear_buffer_delay(bh); bh->b_blocknr = pblock; } - if (buffer_da_mapped(bh)) - clear_buffer_da_mapped(bh); if (buffer_unwritten(bh) || buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); -- cgit v1.2.3 From b3aff3e3f61d13586fd46d1ee6f7353ab3050b6d Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 21:57:37 -0500 Subject: ext4: reimplement fiemap using extent status tree Signed-off-by: Yongqiang Yang Signed-off-by: Allison Henderson Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 184 +++++++----------------------------------------------- 1 file changed, 21 insertions(+), 163 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 
e0bedd1a4ac..d3dd6182c07 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4499,193 +4499,51 @@ static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, struct ext4_ext_cache *newex, struct ext4_extent *ex, void *data) { + struct extent_status es; __u64 logical; __u64 physical; __u64 length; __u32 flags = 0; + ext4_lblk_t next_del; int ret = 0; struct fiemap_extent_info *fieinfo = data; unsigned char blksize_bits; - blksize_bits = inode->i_sb->s_blocksize_bits; - logical = (__u64)newex->ec_block << blksize_bits; + es.start = newex->ec_block; + next_del = ext4_es_find_extent(inode, &es); + next = min(next_del, next); if (newex->ec_start == 0) { /* * No extent in extent-tree contains block @newex->ec_start, * then the block may stay in 1)a hole or 2)delayed-extent. - * - * Holes or delayed-extents are processed as follows. - * 1. lookup dirty pages with specified range in pagecache. - * If no page is got, then there is no delayed-extent and - * return with EXT_CONTINUE. - * 2. find the 1st mapped buffer, - * 3. check if the mapped buffer is both in the request range - * and a delayed buffer. If not, there is no delayed-extent, - * then return. - * 4. a delayed-extent is found, the extent will be collected. */ - ext4_lblk_t end = 0; - pgoff_t last_offset; - pgoff_t offset; - pgoff_t index; - pgoff_t start_index = 0; - struct page **pages = NULL; - struct buffer_head *bh = NULL; - struct buffer_head *head = NULL; - unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *); - - pages = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (pages == NULL) - return -ENOMEM; - - offset = logical >> PAGE_SHIFT; -repeat: - last_offset = offset; - head = NULL; - ret = find_get_pages_tag(inode->i_mapping, &offset, - PAGECACHE_TAG_DIRTY, nr_pages, pages); - - if (!(flags & FIEMAP_EXTENT_DELALLOC)) { - /* First time, try to find a mapped buffer. */ - if (ret == 0) { -out: - for (index = 0; index < ret; index++) - page_cache_release(pages[index]); - /* just a hole. 
*/ - kfree(pages); - return EXT_CONTINUE; - } - index = 0; - -next_page: - /* Try to find the 1st mapped buffer. */ - end = ((__u64)pages[index]->index << PAGE_SHIFT) >> - blksize_bits; - if (!page_has_buffers(pages[index])) - goto out; - head = page_buffers(pages[index]); - if (!head) - goto out; - - index++; - bh = head; - do { - if (end >= newex->ec_block + - newex->ec_len) - /* The buffer is out of - * the request range. - */ - goto out; - - if (buffer_mapped(bh) && - end >= newex->ec_block) { - start_index = index - 1; - /* get the 1st mapped buffer. */ - goto found_mapped_buffer; - } - - bh = bh->b_this_page; - end++; - } while (bh != head); - - /* No mapped buffer in the range found in this page, - * We need to look up next page. - */ - if (index >= ret) { - /* There is no page left, but we need to limit - * newex->ec_len. - */ - newex->ec_len = end - newex->ec_block; - goto out; - } - goto next_page; - } else { - /*Find contiguous delayed buffers. */ - if (ret > 0 && pages[0]->index == last_offset) - head = page_buffers(pages[0]); - bh = head; - index = 1; - start_index = 0; + if (es.len == 0) + /* A hole found. */ + return EXT_CONTINUE; + + if (es.start > newex->ec_block) { + /* A hole found. */ + newex->ec_len = min(es.start - newex->ec_block, + newex->ec_len); + return EXT_CONTINUE; } -found_mapped_buffer: - if (bh != NULL && buffer_delay(bh)) { - /* 1st or contiguous delayed buffer found. */ - if (!(flags & FIEMAP_EXTENT_DELALLOC)) { - /* - * 1st delayed buffer found, record - * the start of extent. - */ - flags |= FIEMAP_EXTENT_DELALLOC; - newex->ec_block = end; - logical = (__u64)end << blksize_bits; - } - /* Find contiguous delayed buffers. 
*/ - do { - if (!buffer_delay(bh)) - goto found_delayed_extent; - bh = bh->b_this_page; - end++; - } while (bh != head); - - for (; index < ret; index++) { - if (!page_has_buffers(pages[index])) { - bh = NULL; - break; - } - head = page_buffers(pages[index]); - if (!head) { - bh = NULL; - break; - } - - if (pages[index]->index != - pages[start_index]->index + index - - start_index) { - /* Blocks are not contiguous. */ - bh = NULL; - break; - } - bh = head; - do { - if (!buffer_delay(bh)) - /* Delayed-extent ends. */ - goto found_delayed_extent; - bh = bh->b_this_page; - end++; - } while (bh != head); - } - } else if (!(flags & FIEMAP_EXTENT_DELALLOC)) - /* a hole found. */ - goto out; - -found_delayed_extent: - newex->ec_len = min(end - newex->ec_block, - (ext4_lblk_t)EXT_INIT_MAX_LEN); - if (ret == nr_pages && bh != NULL && - newex->ec_len < EXT_INIT_MAX_LEN && - buffer_delay(bh)) { - /* Have not collected an extent and continue. */ - for (index = 0; index < ret; index++) - page_cache_release(pages[index]); - goto repeat; - } - - for (index = 0; index < ret; index++) - page_cache_release(pages[index]); - kfree(pages); + flags |= FIEMAP_EXTENT_DELALLOC; + newex->ec_len = es.start + es.len - newex->ec_block; } - physical = (__u64)newex->ec_start << blksize_bits; - length = (__u64)newex->ec_len << blksize_bits; - if (ex && ext4_ext_is_uninitialized(ex)) flags |= FIEMAP_EXTENT_UNWRITTEN; if (next == EXT_MAX_BLOCKS) flags |= FIEMAP_EXTENT_LAST; + blksize_bits = inode->i_sb->s_blocksize_bits; + logical = (__u64)newex->ec_block << blksize_bits; + physical = (__u64)newex->ec_start << blksize_bits; + length = (__u64)newex->ec_len << blksize_bits; + ret = fiemap_fill_next_extent(fieinfo, logical, physical, length, flags); if (ret < 0) -- cgit v1.2.3 From c8c0df241cc2719b1262e627f999638411934f60 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 21:57:40 -0500 Subject: ext4: introduce lseek SEEK_DATA/SEEK_HOLE support This patch makes ext4 really support 
SEEK_DATA/SEEK_HOLE flags. Block-mapped and extent-mapped files are fully implemented together because ext4_map_blocks hides these differences. After applying this patch, it will cause a failure in xfstest #285 when the file is block-mapped because block-mapped files do not support fallocate(2). I had tried to use ext4_ext_walk_space() to retrieve the offset for an extent-mapped file. But finally I decided to keep using ext4_map_blocks() to support SEEK_DATA/SEEK_HOLE because ext4_map_blocks() can hide the difference between block-mapped files and extent-mapped files. Moreover, in the next step, the extent status tree will track all extent status, and we can get all mappings from this tree. So I think that using ext4_map_blocks() is a better choice. CC: Hugh Dickins Signed-off-by: Jie Liu Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/file.c | 334 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 332 insertions(+), 2 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index bf3966bccd3..2f5759eb9f8 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -285,6 +286,324 @@ static int ext4_file_open(struct inode * inode, struct file * filp) return dquot_file_open(inode, filp); } +/* + * Here we use ext4_map_blocks() to get a block mapping for a extent-based + * file rather than ext4_ext_walk_space() because we can introduce + * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped file at the same + * function. When extent status tree has been fully implemented, it will + * track all extent status for a file and we can directly use it to + * retrieve the offset for SEEK_DATA/SEEK_HOLE.
+ */ + +/* + * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we would need to + * lookup page cache to check whether or not there has some data between + * [startoff, endoff] because, if this range contains an unwritten extent, + * we determine this extent as a data or a hole according to whether the + * page cache has data or not. + */ +static int ext4_find_unwritten_pgoff(struct inode *inode, + int origin, + struct ext4_map_blocks *map, + loff_t *offset) +{ + struct pagevec pvec; + unsigned int blkbits; + pgoff_t index; + pgoff_t end; + loff_t endoff; + loff_t startoff; + loff_t lastoff; + int found = 0; + + blkbits = inode->i_sb->s_blocksize_bits; + startoff = *offset; + lastoff = startoff; + endoff = (map->m_lblk + map->m_len) << blkbits; + + index = startoff >> PAGE_CACHE_SHIFT; + end = endoff >> PAGE_CACHE_SHIFT; + + pagevec_init(&pvec, 0); + do { + int i, num; + unsigned long nr_pages; + + num = min_t(pgoff_t, end - index, PAGEVEC_SIZE); + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, + (pgoff_t)num); + if (nr_pages == 0) { + if (origin == SEEK_DATA) + break; + + BUG_ON(origin != SEEK_HOLE); + /* + * If this is the first time to go into the loop and + * offset is not beyond the end offset, it will be a + * hole at this offset + */ + if (lastoff == startoff || lastoff < endoff) + found = 1; + break; + } + + /* + * If this is the first time to go into the loop and + * offset is smaller than the first page offset, it will be a + * hole at this offset. + */ + if (lastoff == startoff && origin == SEEK_HOLE && + lastoff < page_offset(pvec.pages[0])) { + found = 1; + break; + } + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + /* + * If the current offset is not beyond the end of given + * range, it will be a hole. 
+ */ + if (lastoff < endoff && origin == SEEK_HOLE && + page->index > end) { + found = 1; + *offset = lastoff; + goto out; + } + + lock_page(page); + + if (unlikely(page->mapping != inode->i_mapping)) { + unlock_page(page); + continue; + } + + if (!page_has_buffers(page)) { + unlock_page(page); + continue; + } + + if (page_has_buffers(page)) { + lastoff = page_offset(page); + bh = head = page_buffers(page); + do { + if (buffer_uptodate(bh) || + buffer_unwritten(bh)) { + if (origin == SEEK_DATA) + found = 1; + } else { + if (origin == SEEK_HOLE) + found = 1; + } + if (found) { + *offset = max_t(loff_t, + startoff, lastoff); + unlock_page(page); + goto out; + } + lastoff += bh->b_size; + bh = bh->b_this_page; + } while (bh != head); + } + + lastoff = page_offset(page) + PAGE_SIZE; + unlock_page(page); + } + + /* + * The no. of pages is less than our desired, that would be a + * hole in there. + */ + if (nr_pages < num && origin == SEEK_HOLE) { + found = 1; + *offset = lastoff; + break; + } + + index = pvec.pages[i - 1]->index + 1; + pagevec_release(&pvec); + } while (index <= end); + +out: + pagevec_release(&pvec); + return found; +} + +/* + * ext4_seek_data() retrieves the offset for SEEK_DATA. 
+ */ +static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) +{ + struct inode *inode = file->f_mapping->host; + struct ext4_map_blocks map; + struct extent_status es; + ext4_lblk_t start, last, end; + loff_t dataoff, isize; + int blkbits; + int ret = 0; + + mutex_lock(&inode->i_mutex); + + isize = i_size_read(inode); + if (offset >= isize) { + mutex_unlock(&inode->i_mutex); + return -ENXIO; + } + + blkbits = inode->i_sb->s_blocksize_bits; + start = offset >> blkbits; + last = start; + end = isize >> blkbits; + dataoff = offset; + + do { + map.m_lblk = last; + map.m_len = end - last + 1; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { + if (last != start) + dataoff = last << blkbits; + break; + } + + /* + * If there is a delay extent at this offset, + * it will be as a data. + */ + es.start = last; + (void)ext4_es_find_extent(inode, &es); + if (last >= es.start && + last < es.start + es.len) { + if (last != start) + dataoff = last << blkbits; + break; + } + + /* + * If there is a unwritten extent at this offset, + * it will be as a data or a hole according to page + * cache that has data or not. + */ + if (map.m_flags & EXT4_MAP_UNWRITTEN) { + int unwritten; + unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, + &map, &dataoff); + if (unwritten) + break; + } + + last++; + dataoff = last << blkbits; + } while (last <= end); + + mutex_unlock(&inode->i_mutex); + + if (dataoff > isize) + return -ENXIO; + + if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) + return -EINVAL; + if (dataoff > maxsize) + return -EINVAL; + + if (dataoff != file->f_pos) { + file->f_pos = dataoff; + file->f_version = 0; + } + + return dataoff; +} + +/* + * ext4_seek_hole() retrieves the offset for SEEK_HOLE. 
+ */ +static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) +{ + struct inode *inode = file->f_mapping->host; + struct ext4_map_blocks map; + struct extent_status es; + ext4_lblk_t start, last, end; + loff_t holeoff, isize; + int blkbits; + int ret = 0; + + mutex_lock(&inode->i_mutex); + + isize = i_size_read(inode); + if (offset >= isize) { + mutex_unlock(&inode->i_mutex); + return -ENXIO; + } + + blkbits = inode->i_sb->s_blocksize_bits; + start = offset >> blkbits; + last = start; + end = isize >> blkbits; + holeoff = offset; + + do { + map.m_lblk = last; + map.m_len = end - last + 1; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { + last += ret; + holeoff = last << blkbits; + continue; + } + + /* + * If there is a delay extent at this offset, + * we will skip this extent. + */ + es.start = last; + (void)ext4_es_find_extent(inode, &es); + if (last >= es.start && + last < es.start + es.len) { + last = es.start + es.len; + holeoff = last << blkbits; + continue; + } + + /* + * If there is a unwritten extent at this offset, + * it will be as a data or a hole according to page + * cache that has data or not. 
+ */ + if (map.m_flags & EXT4_MAP_UNWRITTEN) { + int unwritten; + unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, + &map, &holeoff); + if (!unwritten) { + last += ret; + holeoff = last << blkbits; + continue; + } + } + + /* find a hole */ + break; + } while (last <= end); + + mutex_unlock(&inode->i_mutex); + + if (holeoff > isize) + holeoff = isize; + + if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) + return -EINVAL; + if (holeoff > maxsize) + return -EINVAL; + + if (holeoff != file->f_pos) { + file->f_pos = holeoff; + file->f_version = 0; + } + + return holeoff; +} + /* * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values * by calling generic_file_llseek_size() with the appropriate maxbytes @@ -300,8 +619,19 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin) else maxbytes = inode->i_sb->s_maxbytes; - return generic_file_llseek_size(file, offset, origin, - maxbytes, i_size_read(inode)); + switch (origin) { + case SEEK_SET: + case SEEK_CUR: + case SEEK_END: + return generic_file_llseek_size(file, offset, origin, + maxbytes, i_size_read(inode)); + case SEEK_DATA: + return ext4_seek_data(file, offset, maxbytes); + case SEEK_HOLE: + return ext4_seek_hole(file, offset, maxbytes); + } + + return -EINVAL; } const struct file_operations ext4_file_operations = { -- cgit v1.2.3 From dffe9d8da715bed4d395883add90a2d150d85729 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 10 Nov 2012 22:20:05 -0500 Subject: ext4: do not use ext4_error() when there is no space in dir leaf for csum If there is no space for a checksum in a directory leaf node, previously we would use EXT4_ERROR_INODE() which would mark the file system as inconsistent. While it would be nice to use e2fsck -D, it certainly isn't required, so just print a warning using ext4_warning(). Signed-off-by: "Theodore Ts'o" Cc: "Darrick J. 
Wong" --- fs/ext4/namei.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 6d600a69fc9..580af3dfc0e 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -261,6 +261,12 @@ static __le32 ext4_dirent_csum(struct inode *inode, return cpu_to_le32(csum); } +static void warn_no_space_for_csum(struct inode *inode) +{ + ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for " + "checksum. Please run e2fsck -D.", inode->i_ino); +} + int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) { struct ext4_dir_entry_tail *t; @@ -271,8 +277,7 @@ int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) t = get_dirent_tail(inode, dirent); if (!t) { - EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir " - "leaf for checksum. Please run e2fsck -D."); + warn_no_space_for_csum(inode); return 0; } @@ -294,8 +299,7 @@ static void ext4_dirent_csum_set(struct inode *inode, t = get_dirent_tail(inode, dirent); if (!t) { - EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir " - "leaf for checksum. Please run e2fsck -D."); + warn_no_space_for_csum(inode); return; } @@ -377,8 +381,7 @@ static int ext4_dx_csum_verify(struct inode *inode, count = le16_to_cpu(c->count); if (count_offset + (limit * sizeof(struct dx_entry)) > EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { - EXT4_ERROR_INODE(inode, "metadata_csum set but no space for " - "tree checksum found. Run e2fsck -D."); + warn_no_space_for_csum(inode); return 1; } t = (struct dx_tail *)(((struct dx_entry *)c) + limit); @@ -408,8 +411,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) count = le16_to_cpu(c->count); if (count_offset + (limit * sizeof(struct dx_entry)) > EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { - EXT4_ERROR_INODE(inode, "metadata_csum set but no space for " - "tree checksum. 
Run e2fsck -D."); + warn_no_space_for_csum(inode); return; } t = (struct dx_tail *)(((struct dx_entry *)c) + limit); -- cgit v1.2.3 From c6af8803cd4f56aa62a47448c55030d4905b6783 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 12 Nov 2012 23:51:02 -0500 Subject: ext4: don't verify checksums of dx non-leaf nodes during fallback scan During a directory entry lookup of a hashed directory, if the hash-based lookup functions fail and we fall back to a linear scan, don't try to verify the dirent checksum on the internal nodes of the hash tree because they don't store a checksum in a hidden dirent like the leaf nodes do. Reported-by: George Spelvin Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 580af3dfc0e..88e9a2c7e32 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1146,6 +1146,21 @@ static inline int search_dirblock(struct buffer_head *bh, return 0; } +static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, + struct ext4_dir_entry *de) +{ + struct super_block *sb = dir->i_sb; + + if (!is_dx(dir)) + return 0; + if (block == 0) + return 1; + if (de->inode == 0 && + ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) == + sb->s_blocksize) + return 1; + return 0; +} /* * ext4_find_entry() @@ -1246,6 +1261,8 @@ restart: goto next; } if (!buffer_verified(bh) && + !is_dx_internal_node(dir, block, + (struct ext4_dir_entry *)bh->b_data) && !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) { EXT4_ERROR_INODE(dir, "checksumming directory " -- cgit v1.2.3 From 66bea92c69477a75a5d37b9bfed5773c92a3c4b4 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 14 Nov 2012 22:22:05 -0500 Subject: ext4: init pagevec in ext4_da_block_invalidatepages ext4_da_block_invalidatepages is missing a pagevec_init(), which means that pvec->cold contains random garbage. 
This affects whether the page goes to the front or back of the LRU when ->cold makes it to free_hot_cold_page(). Reviewed-by: Lukas Czerner Reviewed-by: Carlos Maiolino Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7f9ccc1381a..52f7ff2f2e7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1482,6 +1482,7 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); ext4_es_remove_extent(inode, start, last - start + 1); + pagevec_init(&pvec, 0); while (index <= end) { nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); if (nr_pages == 0) -- cgit v1.2.3 From f3b59291a69d0b734be1fc8be489fef2dd846d3d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 15 Nov 2012 23:08:57 -0500 Subject: ext4: remove calls to ext4_jbd2_file_inode() from delalloc write path The calls to ext4_jbd2_file_inode() are needed to guarantee that we do not expose stale data in the data=ordered mode. However, they are not necessary because in all of the cases where we have newly allocated blocks in the delayed allocation write path, we immediately submit the dirty pages for I/O. Hence, we can avoid the overhead of adding the inode to the list of inodes whose data pages will be flushed out to disk completely during the next commit operation.
Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 52f7ff2f2e7..cf5d30a7cce 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1636,15 +1636,6 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) for (i = 0; i < map.m_len; i++) unmap_underlying_metadata(bdev, map.m_pblk + i); - - if (ext4_should_order_data(mpd->inode)) { - err = ext4_jbd2_file_inode(handle, mpd->inode); - if (err) { - /* Only if the journal is aborted */ - mpd->retval = err; - goto submit_io; - } - } } /* @@ -2592,17 +2583,8 @@ static int ext4_da_write_end(struct file *file, if (copied && new_i_size > EXT4_I(inode)->i_disksize) { if (ext4_da_should_update_i_disksize(page, end)) { down_write(&EXT4_I(inode)->i_data_sem); - if (new_i_size > EXT4_I(inode)->i_disksize) { - /* - * Updating i_disksize when extending file - * without needing block allocation - */ - if (ext4_should_order_data(inode)) - ret = ext4_jbd2_file_inode(handle, - inode); - + if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; - } up_write(&EXT4_I(inode)->i_data_sem); /* We need to mark inode dirty even if * new_i_size is less that inode->i_size -- cgit v1.2.3 From 91dd8c114499e9818f2d5919ef0b9eee61810220 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 28 Nov 2012 12:32:26 -0500 Subject: ext4: prevent race while walking extent tree for fiemap Currently ext4_ext_walk_space() only takes i_data_sem for read when searching for the extent at given block with ext4_ext_find_extent(). Then it drops the lock and the extent tree can be changed at will. However later on we're searching for the 'next' extent, but the extent tree might already have changed, so the information might not be accurate. In fact we can hit BUG_ON(end <= start) if the extent got inserted into the tree after the one we found and before the block we were searching for. 
This has been reproduced by running xfstests 225 in loop on s390x architecture, but theoretically we could hit this on any other architecture as well, but probably not as often. Moreover the extent currently in delayed allocation might be allocated after we search the extent tree and before we search extent status tree delayed buffers resulting in those delayed buffers being completely missed, even though completely written and allocated. We fix all those problems in several steps: 1. remove unnecessary callback indirection 2. rename functions ext4_ext_walk_space -> ext4_fill_fiemap_extents ext4_ext_fiemap_cb -> ext4_find_delayed_extent 3. move fiemap_fill_next_extent() into ext4_fill_fiemap_extents() 4. hold the i_data_sem for: ext4_ext_find_extent() ext4_ext_next_allocated_block() ext4_find_delayed_extent() 5. call fiemap_fill_next_extent after releasing the i_data_sem 6. move path reinitialization into the critical section. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_extents.h | 14 ----- fs/ext4/extents.c | 136 +++++++++++++++++++++++++++---------------------- 2 files changed, 76 insertions(+), 74 deletions(-) diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 603bb114735..173b6c54532 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -143,20 +143,6 @@ struct ext4_ext_path { * structure for external API */ -/* - * to be called by ext4_ext_walk_space() - * negative retcode - error - * positive retcode - signal for ext4_ext_walk_space(), see below - * callback must return valid extent (passed or newly created) - */ -typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t, - struct ext4_ext_cache *, - struct ext4_extent *, void *); - -#define EXT_CONTINUE 0 -#define EXT_BREAK 1 -#define EXT_REPEAT 2 - /* * Maximum number of logical blocks in a file; ext4_extent's ee_block is * __le32. 
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d3dd6182c07..fbe7dc28424 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -109,6 +109,9 @@ static int ext4_split_extent_at(handle_t *handle, int split_flag, int flags); +static int ext4_find_delayed_extent(struct inode *inode, + struct ext4_ext_cache *newex); + static int ext4_ext_truncate_extend_restart(handle_t *handle, struct inode *inode, int needed) @@ -1959,27 +1962,33 @@ cleanup: return err; } -static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, - ext4_lblk_t num, ext_prepare_callback func, - void *cbdata) +static int ext4_fill_fiemap_extents(struct inode *inode, + ext4_lblk_t block, ext4_lblk_t num, + struct fiemap_extent_info *fieinfo) { struct ext4_ext_path *path = NULL; struct ext4_ext_cache cbex; struct ext4_extent *ex; - ext4_lblk_t next, start = 0, end = 0; + ext4_lblk_t next, next_del, start = 0, end = 0; ext4_lblk_t last = block + num; - int depth, exists, err = 0; - - BUG_ON(func == NULL); - BUG_ON(inode == NULL); + int exists, depth = 0, err = 0; + unsigned int flags = 0; + unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; while (block < last && block != EXT_MAX_BLOCKS) { num = last - block; /* find extent for this block */ down_read(&EXT4_I(inode)->i_data_sem); + + if (path && ext_depth(inode) != depth) { + /* depth was changed. 
we have to realloc path */ + kfree(path); + path = NULL; + } + path = ext4_ext_find_extent(inode, block, path); - up_read(&EXT4_I(inode)->i_data_sem); if (IS_ERR(path)) { + up_read(&EXT4_I(inode)->i_data_sem); err = PTR_ERR(path); path = NULL; break; @@ -1987,13 +1996,16 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, depth = ext_depth(inode); if (unlikely(path[depth].p_hdr == NULL)) { + up_read(&EXT4_I(inode)->i_data_sem); EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); err = -EIO; break; } ex = path[depth].p_ext; next = ext4_ext_next_allocated_block(path); + ext4_ext_drop_refs(path); + flags = 0; exists = 0; if (!ex) { /* there is no extent yet, so try to allocate @@ -2037,30 +2049,54 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, cbex.ec_block = le32_to_cpu(ex->ee_block); cbex.ec_len = ext4_ext_get_actual_len(ex); cbex.ec_start = ext4_ext_pblock(ex); + if (ext4_ext_is_uninitialized(ex)) + flags |= FIEMAP_EXTENT_UNWRITTEN; } + /* + * Find delayed extent and update cbex accordingly. We call + * it even in !exists case to find out whether cbex is the + * last existing extent or not. 
+ */ + next_del = ext4_find_delayed_extent(inode, &cbex); + if (!exists && next_del) { + exists = 1; + flags |= FIEMAP_EXTENT_DELALLOC; + } + up_read(&EXT4_I(inode)->i_data_sem); + if (unlikely(cbex.ec_len == 0)) { EXT4_ERROR_INODE(inode, "cbex.ec_len == 0"); err = -EIO; break; } - err = func(inode, next, &cbex, ex, cbdata); - ext4_ext_drop_refs(path); - - if (err < 0) - break; - if (err == EXT_REPEAT) - continue; - else if (err == EXT_BREAK) { - err = 0; - break; + /* This is possible iff next == next_del == EXT_MAX_BLOCKS */ + if (next == next_del) { + flags |= FIEMAP_EXTENT_LAST; + if (unlikely(next_del != EXT_MAX_BLOCKS || + next != EXT_MAX_BLOCKS)) { + EXT4_ERROR_INODE(inode, + "next extent == %u, next " + "delalloc extent = %u", + next, next_del); + err = -EIO; + break; + } } - if (ext_depth(inode) != depth) { - /* depth was changed. we have to realloc path */ - kfree(path); - path = NULL; + if (exists) { + err = fiemap_fill_next_extent(fieinfo, + (__u64)cbex.ec_block << blksize_bits, + (__u64)cbex.ec_start << blksize_bits, + (__u64)cbex.ec_len << blksize_bits, + flags); + if (err < 0) + break; + if (err == 1) { + err = 0; + break; + } } block = cbex.ec_block + cbex.ec_len; @@ -4493,26 +4529,23 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, } /* - * Callback function called for each extent to gather FIEMAP information. + * If newex is not existing extent (newex->ec_start equals zero) find + * delayed extent at start of newex and update newex accordingly and + * return start of the next delayed extent. + * + * If newex is existing extent (newex->ec_start is not equal zero) + * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed + * extent found. Leave newex unmodified. 
*/ -static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, - struct ext4_ext_cache *newex, struct ext4_extent *ex, - void *data) +static int ext4_find_delayed_extent(struct inode *inode, + struct ext4_ext_cache *newex) { struct extent_status es; - __u64 logical; - __u64 physical; - __u64 length; - __u32 flags = 0; ext4_lblk_t next_del; - int ret = 0; - struct fiemap_extent_info *fieinfo = data; - unsigned char blksize_bits; es.start = newex->ec_block; next_del = ext4_es_find_extent(inode, &es); - next = min(next_del, next); if (newex->ec_start == 0) { /* * No extent in extent-tree contains block @newex->ec_start, @@ -4520,37 +4553,19 @@ static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, */ if (es.len == 0) /* A hole found. */ - return EXT_CONTINUE; + return 0; if (es.start > newex->ec_block) { /* A hole found. */ newex->ec_len = min(es.start - newex->ec_block, newex->ec_len); - return EXT_CONTINUE; + return 0; } - flags |= FIEMAP_EXTENT_DELALLOC; newex->ec_len = es.start + es.len - newex->ec_block; } - if (ex && ext4_ext_is_uninitialized(ex)) - flags |= FIEMAP_EXTENT_UNWRITTEN; - - if (next == EXT_MAX_BLOCKS) - flags |= FIEMAP_EXTENT_LAST; - - blksize_bits = inode->i_sb->s_blocksize_bits; - logical = (__u64)newex->ec_block << blksize_bits; - physical = (__u64)newex->ec_start << blksize_bits; - length = (__u64)newex->ec_len << blksize_bits; - - ret = fiemap_fill_next_extent(fieinfo, logical, physical, - length, flags); - if (ret < 0) - return ret; - if (ret == 1) - return EXT_BREAK; - return EXT_CONTINUE; + return next_del; } /* fiemap flags we can handle specified here */ #define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) @@ -4772,6 +4787,7 @@ out_mutex: mutex_unlock(&inode->i_mutex); return err; } + int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { @@ -4799,11 +4815,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, len_blks = ((ext4_lblk_t) 
last_blk) - start_blk + 1; /* - * Walk the extent tree gathering extent information. - * ext4_ext_fiemap_cb will push extents back to user. + * Walk the extent tree gathering extent information + * and pushing extents back to the user. */ - error = ext4_ext_walk_space(inode, start_blk, len_blks, - ext4_ext_fiemap_cb, fieinfo); + error = ext4_fill_fiemap_extents(inode, start_blk, + len_blks, fieinfo); } return error; -- cgit v1.2.3 From 06348679c9f69b3b031cf84c1f5f9f2488fc1f7d Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 28 Nov 2012 12:33:22 -0500 Subject: ext4: simple cleanup in fiemap codepath This commit is simple cleanup of fiemap codepath which has not been included in previous commit to make the changes clearer. In this commit we rename cbex variable to newex in ext4_fill_fiemap_extents() because callback is no longer present Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index fbe7dc28424..56251466750 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1967,7 +1967,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, struct fiemap_extent_info *fieinfo) { struct ext4_ext_path *path = NULL; - struct ext4_ext_cache cbex; + struct ext4_ext_cache newex; struct ext4_extent *ex; ext4_lblk_t next, next_del, start = 0, end = 0; ext4_lblk_t last = block + num; @@ -2042,31 +2042,31 @@ static int ext4_fill_fiemap_extents(struct inode *inode, BUG_ON(end <= start); if (!exists) { - cbex.ec_block = start; - cbex.ec_len = end - start; - cbex.ec_start = 0; + newex.ec_block = start; + newex.ec_len = end - start; + newex.ec_start = 0; } else { - cbex.ec_block = le32_to_cpu(ex->ee_block); - cbex.ec_len = ext4_ext_get_actual_len(ex); - cbex.ec_start = ext4_ext_pblock(ex); + newex.ec_block = le32_to_cpu(ex->ee_block); + newex.ec_len = ext4_ext_get_actual_len(ex); + newex.ec_start = 
ext4_ext_pblock(ex); if (ext4_ext_is_uninitialized(ex)) flags |= FIEMAP_EXTENT_UNWRITTEN; } /* - * Find delayed extent and update cbex accordingly. We call - * it even in !exists case to find out whether cbex is the + * Find delayed extent and update newex accordingly. We call + * it even in !exists case to find out whether newex is the * last existing extent or not. */ - next_del = ext4_find_delayed_extent(inode, &cbex); + next_del = ext4_find_delayed_extent(inode, &newex); if (!exists && next_del) { exists = 1; flags |= FIEMAP_EXTENT_DELALLOC; } up_read(&EXT4_I(inode)->i_data_sem); - if (unlikely(cbex.ec_len == 0)) { - EXT4_ERROR_INODE(inode, "cbex.ec_len == 0"); + if (unlikely(newex.ec_len == 0)) { + EXT4_ERROR_INODE(inode, "newex.ec_len == 0"); err = -EIO; break; } @@ -2087,9 +2087,9 @@ static int ext4_fill_fiemap_extents(struct inode *inode, if (exists) { err = fiemap_fill_next_extent(fieinfo, - (__u64)cbex.ec_block << blksize_bits, - (__u64)cbex.ec_start << blksize_bits, - (__u64)cbex.ec_len << blksize_bits, + (__u64)newex.ec_block << blksize_bits, + (__u64)newex.ec_start << blksize_bits, + (__u64)newex.ec_len << blksize_bits, flags); if (err < 0) break; @@ -2099,7 +2099,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, } } - block = cbex.ec_block + cbex.ec_len; + block = newex.ec_block + newex.ec_len; } if (path) { -- cgit v1.2.3 From 766f44d46a726cb59f52a75c5c87425a10c4bade Mon Sep 17 00:00:00 2001 From: Vahram Martirosyan Date: Wed, 28 Nov 2012 12:44:16 -0500 Subject: ext4: fixed potential NULL dereference in ext4_calculate_overhead() The memset operation before check can cause a BUG if the memory allocation failed. Since we are using get_zeroed_page, there is no need to use memset anyway. Found by the Spruce system in cooperation with the KEDR Framework.
Signed-off-by: Vahram Martirosyan Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ad6cd8aeb94..66a4e20424c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3206,7 +3206,6 @@ int ext4_calculate_overhead(struct super_block *sb) ext4_fsblk_t overhead = 0; char *buf = (char *) get_zeroed_page(GFP_KERNEL); - memset(buf, 0, PAGE_SIZE); if (!buf) return -ENOMEM; -- cgit v1.2.3 From 4a092d737955301da22b9d5e07f5036da821a932 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 28 Nov 2012 13:03:30 -0500 Subject: ext4: rationalize ext4_extents.h inclusion Previously, ext4_extents.h was being included at the end of ext4.h, which was bad for a number of reasons: (a) it was not being included in the expected place, and (b) it caused the header to be included multiple times. There were #ifdef's to prevent this from causing any problems, but it still was unnecessary. By moving the function declarations that were in ext4_extents.h to ext4.h, which is standard practice for where the function declarations for the rest of ext4.h can be found, we can remove ext4_extents.h from being included in ext4.h at all, and then we can only include ext4_extents.h where it is needed in ext4's source files. It should be possible to move a few more things into ext4.h, and further reduce the number of source files that need to #include ext4_extents.h, but that's a cleanup for another day. 
Reported-by: Sachin Kamat Reported-by: Wei Yongjun Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 34 ++++++++++++++++++++++++++++++++-- fs/ext4/ext4_extents.h | 25 ------------------------- fs/ext4/extents.c | 1 + fs/ext4/indirect.c | 1 + fs/ext4/migrate.c | 1 + fs/ext4/move_extent.c | 1 + fs/ext4/page-io.c | 1 - fs/ext4/super.c | 3 +-- 8 files changed, 37 insertions(+), 30 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 246e38f3915..2e9ffa9100b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -57,6 +57,16 @@ #define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif +/* + * Turn on EXT_DEBUG to get lots of info about extents operations. + */ +#define EXT_DEBUG__ +#ifdef EXT_DEBUG +#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + #define EXT4_ERROR_INODE(inode, fmt, a...) \ ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) @@ -2399,6 +2409,9 @@ extern int ext4_check_blockref(const char *, unsigned int, struct inode *, __le32 *, unsigned int); /* extents.c */ +struct ext4_ext_path; +struct ext4_extent; + extern int ext4_ext_tree_init(handle_t *handle, struct inode *); extern int ext4_ext_writepage_trans_blocks(struct inode *, int); extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, @@ -2416,8 +2429,27 @@ extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, ssize_t len); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); +extern int ext4_ext_calc_metadata_amount(struct inode *inode, + ext4_lblk_t lblocks); +extern int ext4_extent_tree_init(handle_t *, struct inode *); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +extern int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2); +extern int ext4_ext_insert_extent(handle_t *, 
struct inode *, + struct ext4_ext_path *, + struct ext4_extent *, int); +extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path *); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); +extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); + + /* move_extent.c */ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, @@ -2505,6 +2537,4 @@ extern void ext4_resize_end(struct super_block *sb); #endif /* __KERNEL__ */ -#include "ext4_extents.h" - #endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 173b6c54532..487fda12bc0 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -42,16 +42,6 @@ */ #define CHECK_BINSEARCH__ -/* - * Turn on EXT_DEBUG to get lots of info about extents operations. - */ -#define EXT_DEBUG__ -#ifdef EXT_DEBUG -#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) -#else -#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) -#endif - /* * If EXT_STATS is defined then stats numbers are collected. * These number will be displayed at umount time. 
@@ -286,20 +276,5 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, 0xffff); } -extern int ext4_ext_calc_metadata_amount(struct inode *inode, - ext4_lblk_t lblocks); -extern int ext4_extent_tree_init(handle_t *, struct inode *); -extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, - int num, - struct ext4_ext_path *path); -extern int ext4_can_extents_be_merged(struct inode *inode, - struct ext4_extent *ex1, - struct ext4_extent *ex2); -extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); -extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, - struct ext4_ext_path *); -extern void ext4_ext_drop_refs(struct ext4_ext_path *); -extern int ext4_ext_check_inode(struct inode *inode); -extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 56251466750..1dc19a7b449 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -41,6 +41,7 @@ #include #include #include "ext4_jbd2.h" +#include "ext4_extents.h" #include diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index f6663c3a946..20862f96e8a 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -22,6 +22,7 @@ #include "ext4_jbd2.h" #include "truncate.h" +#include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */ #include diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index f1bb32ec016..db8226d595f 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -14,6 +14,7 @@ #include #include "ext4_jbd2.h" +#include "ext4_extents.h" /* * The contiguous blocks details which can be diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 292daeeed45..d9cc5ee42f5 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -18,6 +18,7 @@ #include #include "ext4_jbd2.h" #include "ext4.h" +#include "ext4_extents.h" /** * get_ext_path - Find an extent path for designated logical 
block number. diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 0fd16e653eb..0016fbca2a4 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -27,7 +27,6 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "ext4_extents.h" static struct kmem_cache *io_page_cachep, *io_end_cachep; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 66a4e20424c..856206f255a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -45,12 +45,11 @@ #include #include "ext4.h" -#include "ext4_extents.h" +#include "ext4_extents.h" /* Needed for trace points definition */ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "mballoc.h" -#include "ext4_extents.h" #define CREATE_TRACE_POINTS #include -- cgit v1.2.3 From 69c499d152a7fe2c4443e5ddd91568ad5a79145a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 29 Nov 2012 21:13:48 -0500 Subject: ext4: restructure ext4_ext_direct_IO() Remove a level of indentation by moving the DIO read and extending write case to the beginning of the file. This results in no actual programmatic changes to the file, but makes it easier to read/understand. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 211 +++++++++++++++++++++++++++----------------------------- 1 file changed, 103 insertions(+), 108 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cf5d30a7cce..91a24967b8a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2927,10 +2927,10 @@ retry: * fall back to buffered IO. * * For holes, we fallocate those blocks, mark them as uninitialized - * If those blocks were preallocated, we mark sure they are splited, but + * If those blocks were preallocated, we mark sure they are split, but * still keep the range to write as uninitialized. * - * The unwrritten extents will be converted to written when DIO is completed. + * The unwritten extents will be converted to written when DIO is completed. 
* For async direct IO, since the IO may still pending when return, we * set up an end_io call back function, which will do the conversion * when async direct IO completed. @@ -2948,125 +2948,120 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, struct inode *inode = file->f_mapping->host; ssize_t ret; size_t count = iov_length(iov, nr_segs); - + int overwrite = 0; + get_block_t *get_block_func = NULL; + int dio_flags = 0; loff_t final_size = offset + count; - if (rw == WRITE && final_size <= inode->i_size) { - int overwrite = 0; - get_block_t *get_block_func = NULL; - int dio_flags = 0; - BUG_ON(iocb->private == NULL); + /* Use the old path for reads and writes beyond i_size. */ + if (rw != WRITE || final_size > inode->i_size) + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); - /* If we do a overwrite dio, i_mutex locking can be released */ - overwrite = *((int *)iocb->private); + BUG_ON(iocb->private == NULL); - if (overwrite) { - atomic_inc(&inode->i_dio_count); - down_read(&EXT4_I(inode)->i_data_sem); - mutex_unlock(&inode->i_mutex); - } + /* If we do a overwrite dio, i_mutex locking can be released */ + overwrite = *((int *)iocb->private); - /* - * We could direct write to holes and fallocate. - * - * Allocated blocks to fill the hole are marked as uninitialized - * to prevent parallel buffered read to expose the stale data - * before DIO complete the data IO. - * - * As to previously fallocated extents, ext4 get_block - * will just simply mark the buffer mapped but still - * keep the extents uninitialized. - * - * for non AIO case, we will convert those unwritten extents - * to written after return back from blockdev_direct_IO. - * - * for async DIO, the conversion needs to be defered when - * the IO is completed. The ext4 end_io callback function - * will be called to take care of the conversion work. - * Here for async case, we allocate an io_end structure to - * hook to the iocb. 
- */ - iocb->private = NULL; - ext4_inode_aio_set(inode, NULL); - if (!is_sync_kiocb(iocb)) { - ext4_io_end_t *io_end = - ext4_init_io_end(inode, GFP_NOFS); - if (!io_end) { - ret = -ENOMEM; - goto retake_lock; - } - io_end->flag |= EXT4_IO_END_DIRECT; - iocb->private = io_end; - /* - * we save the io structure for current async - * direct IO, so that later ext4_map_blocks() - * could flag the io structure whether there - * is a unwritten extents needs to be converted - * when IO is completed. - */ - ext4_inode_aio_set(inode, io_end); - } + if (overwrite) { + atomic_inc(&inode->i_dio_count); + down_read(&EXT4_I(inode)->i_data_sem); + mutex_unlock(&inode->i_mutex); + } - if (overwrite) { - get_block_func = ext4_get_block_write_nolock; - } else { - get_block_func = ext4_get_block_write; - dio_flags = DIO_LOCKING; + /* + * We could direct write to holes and fallocate. + * + * Allocated blocks to fill the hole are marked as + * uninitialized to prevent parallel buffered read to expose + * the stale data before DIO complete the data IO. + * + * As to previously fallocated extents, ext4 get_block will + * just simply mark the buffer mapped but still keep the + * extents uninitialized. + * + * For non AIO case, we will convert those unwritten extents + * to written after return back from blockdev_direct_IO. + * + * For async DIO, the conversion needs to be deferred when the + * IO is completed. The ext4 end_io callback function will be + * called to take care of the conversion work. Here for async + * case, we allocate an io_end structure to hook to the iocb. 
+ */ + iocb->private = NULL; + ext4_inode_aio_set(inode, NULL); + if (!is_sync_kiocb(iocb)) { + ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_end) { + ret = -ENOMEM; + goto retake_lock; } - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - get_block_func, - ext4_end_io_dio, - NULL, - dio_flags); - - if (iocb->private) - ext4_inode_aio_set(inode, NULL); + io_end->flag |= EXT4_IO_END_DIRECT; + iocb->private = io_end; /* - * The io_end structure takes a reference to the inode, - * that structure needs to be destroyed and the - * reference to the inode need to be dropped, when IO is - * complete, even with 0 byte write, or failed. - * - * In the successful AIO DIO case, the io_end structure will be - * desctroyed and the reference to the inode will be dropped - * after the end_io call back function is called. - * - * In the case there is 0 byte write, or error case, since - * VFS direct IO won't invoke the end_io call back function, - * we need to free the end_io structure here. + * we save the io structure for current async direct + * IO, so that later ext4_map_blocks() could flag the + * io structure whether there is a unwritten extents + * needs to be converted when IO is completed. 
*/ - if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { - ext4_free_io_end(iocb->private); - iocb->private = NULL; - } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, - EXT4_STATE_DIO_UNWRITTEN)) { - int err; - /* - * for non AIO case, since the IO is already - * completed, we could do the conversion right here - */ - err = ext4_convert_unwritten_extents(inode, - offset, ret); - if (err < 0) - ret = err; - ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); - } + ext4_inode_aio_set(inode, io_end); + } - retake_lock: - /* take i_mutex locking again if we do a ovewrite dio */ - if (overwrite) { - inode_dio_done(inode); - up_read(&EXT4_I(inode)->i_data_sem); - mutex_lock(&inode->i_mutex); - } + if (overwrite) { + get_block_func = ext4_get_block_write_nolock; + } else { + get_block_func = ext4_get_block_write; + dio_flags = DIO_LOCKING; + } + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + get_block_func, + ext4_end_io_dio, + NULL, + dio_flags); + + if (iocb->private) + ext4_inode_aio_set(inode, NULL); + /* + * The io_end structure takes a reference to the inode, that + * structure needs to be destroyed and the reference to the + * inode need to be dropped, when IO is complete, even with 0 + * byte write, or failed. + * + * In the successful AIO DIO case, the io_end structure will + * be destroyed and the reference to the inode will be dropped + * after the end_io call back function is called. + * + * In the case there is 0 byte write, or error case, since VFS + * direct IO won't invoke the end_io call back function, we + * need to free the end_io structure here. 
+ */ + if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { + ext4_free_io_end(iocb->private); + iocb->private = NULL; + } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN)) { + int err; + /* + * for non AIO case, since the IO is already + * completed, we could do the conversion right here + */ + err = ext4_convert_unwritten_extents(inode, + offset, ret); + if (err < 0) + ret = err; + ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + } - return ret; +retake_lock: + /* take i_mutex locking again if we do a ovewrite dio */ + if (overwrite) { + inode_dio_done(inode); + up_read(&EXT4_I(inode)->i_data_sem); + mutex_lock(&inode->i_mutex); } - /* for write the the end of file case, we fall back to old way */ - return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); + return ret; } static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, -- cgit v1.2.3 From aeb1e5d69a5be592e86a926be73efb38c55af404 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 29 Nov 2012 21:21:22 -0500 Subject: ext4: fix possible use after free with metadata csum Commit fa77dcfafeaa introduces block bitmap checksum calculation into ext4_new_inode() in the case that block group was uninitialized. However we brelse() the bitmap buffer before we attempt to checksum it so we have no guarantee that the buffer is still there. Fix this by releasing the buffer after the possible checksum computation. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" Acked-by: Darrick J. 
Wong Cc: stable@vger.kernel.org --- fs/ext4/ialloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 3a100e7a62a..c7efa88d714 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -762,7 +762,6 @@ got: BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); - brelse(block_bitmap_bh); /* recheck and clear flag under lock if we still need to */ ext4_lock_group(sb, group); @@ -775,6 +774,7 @@ got: ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); + brelse(block_bitmap_bh); if (err) goto fail; -- cgit v1.2.3 From 152a7b0a808a00601328feba2001cbb2b530f771 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sun, 2 Dec 2012 11:13:24 -0500 Subject: ext4: move extra inode read to a new function Currently, in ext4_iget we do a simple check to see whether any information exists starting at the end of i_extra_isize. With inline data added, this procedure is more complicated. So move it to a new function named ext4_iget_extra_inode.
Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 91a24967b8a..befa005711a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3700,6 +3700,16 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, } } +static inline void ext4_iget_extra_inode(struct inode *inode, + struct ext4_inode *raw_inode, + struct ext4_inode_info *ei) +{ + __le32 *magic = (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; + if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) + ext4_set_inode_state(inode, EXT4_STATE_XATTR); +} + struct inode *ext4_iget(struct super_block *sb, unsigned long ino) { struct ext4_iloc iloc; @@ -3842,11 +3852,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { - __le32 *magic = (void *)raw_inode + - EXT4_GOOD_OLD_INODE_SIZE + - ei->i_extra_isize; - if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) - ext4_set_inode_state(inode, EXT4_STATE_XATTR); + ext4_iget_extra_inode(inode, raw_inode, ei); } } -- cgit v1.2.3 From 879b38257bf2b6fa8406693a3b5b5a0649e7c594 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Wed, 5 Dec 2012 10:28:46 -0500 Subject: ext4: export inline xattr functions The inline data feature will need some inline xattr functions, so export them from fs/ext4/xattr.c so that inline.c can use them. 
Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/xattr.c | 39 ++++++-------------------------------- fs/ext4/xattr.h | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 33 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index b1adda1b750..a47dc3883a2 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -61,11 +61,6 @@ #include "xattr.h" #include "acl.h" -#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) -#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) -#define BFIRST(bh) ENTRY(BHDR(bh)+1) -#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) - #ifdef EXT4_XATTR_DEBUG # define ea_idebug(inode, f...) do { \ printk(KERN_DEBUG "inode %s:%lu: ", \ @@ -312,7 +307,7 @@ cleanup: return error; } -static int +int ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { @@ -581,21 +576,6 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, return (*min_offs - ((void *)last - base) - sizeof(__u32)); } -struct ext4_xattr_info { - int name_index; - const char *name; - const void *value; - size_t value_len; -}; - -struct ext4_xattr_search { - struct ext4_xattr_entry *first; - void *base; - void *end; - struct ext4_xattr_entry *here; - int not_found; -}; - static int ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) { @@ -949,14 +929,8 @@ bad_block: #undef header } -struct ext4_xattr_ibody_find { - struct ext4_xattr_search s; - struct ext4_iloc iloc; -}; - -static int -ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) +int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; @@ -984,10 +958,9 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, return 0; } -static int 
-ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, - struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) +int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_search *s = &is->s; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 91f31ca7d9a..40ca7a6f5ee 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -65,6 +65,32 @@ struct ext4_xattr_entry { EXT4_I(inode)->i_extra_isize)) #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) +#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) +#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) +#define BFIRST(bh) ENTRY(BHDR(bh)+1) +#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + + +struct ext4_xattr_info { + int name_index; + const char *name; + const void *value; + size_t value_len; +}; + +struct ext4_xattr_search { + struct ext4_xattr_entry *first; + void *base; + void *end; + struct ext4_xattr_entry *here; + int not_found; +}; + +struct ext4_xattr_ibody_find { + struct ext4_xattr_search s; + struct ext4_iloc iloc; +}; + # ifdef CONFIG_EXT4_FS_XATTR extern const struct xattr_handler ext4_xattr_user_handler; @@ -90,6 +116,15 @@ extern void ext4_exit_xattr(void); extern const struct xattr_handler *ext4_xattr_handlers[]; +extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is); +extern int ext4_xattr_ibody_get(struct inode *inode, int name_index, + const char *name, + void *buffer, size_t buffer_size); +extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is); + # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -143,6 +178,29 @@ ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, #define ext4_xattr_handlers NULL +static inline int +ext4_xattr_ibody_find(struct inode *inode, struct 
ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) +{ + return -EOPNOTSUPP; +} + +static inline int +ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) +{ + return -EOPNOTSUPP; +} + +static inline int +ext4_xattr_ibody_get(struct inode *inode, int name_index, + const char *name, + void *buffer, size_t buffer_size) +{ + return -EOPNOTSUPP; +} + # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.3 From 67cf5b09a46f72e048501b84996f2f77bc42e947 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:04:46 -0500 Subject: ext4: add the basic function for inline data support Implement inline data with xattr. Now we use "system.data" to store xattr, and the xattr will be extended if the i_size is increased while we don't release the space during truncate. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/Makefile | 2 +- fs/ext4/ext4.h | 10 +- fs/ext4/inline.c | 466 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/inode.c | 5 +- fs/ext4/xattr.h | 54 +++++++ 5 files changed, 534 insertions(+), 3 deletions(-) create mode 100644 fs/ext4/inline.c diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 41f22be2ffa..3d96d569853 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -9,6 +9,6 @@ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ mmp.o indirect.o extents_status.o -ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o inline.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2e9ffa9100b..c827e47d556 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -402,6 +402,7 @@ struct flex_groups { #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ #define 
EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ #define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ @@ -458,6 +459,7 @@ enum { EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ + EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ }; @@ -504,6 +506,7 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(EXTENTS); CHECK_FLAG_VALUE(EA_INODE); CHECK_FLAG_VALUE(EOFBLOCKS); + CHECK_FLAG_VALUE(INLINE_DATA); CHECK_FLAG_VALUE(RESERVED); } @@ -918,6 +921,10 @@ struct ext4_inode_info { /* on-disk additional length */ __u16 i_extra_isize; + /* Indicate the inline data space. */ + u16 i_inline_off; + u16 i_inline_size; + #ifdef CONFIG_QUOTA /* quota space reservation, managed internally by quota code */ qsize_t i_reserved_quota; @@ -1376,6 +1383,7 @@ enum { EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read nolocking */ + EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -1497,7 +1505,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ #define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ -#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x8000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ 
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c new file mode 100644 index 00000000000..bec68b36483 --- /dev/null +++ b/fs/ext4/inline.c @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2012 Taobao. + * Written by Tao Ma + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" + +#define EXT4_XATTR_SYSTEM_DATA "data" +#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) + +int ext4_get_inline_size(struct inode *inode) +{ + if (EXT4_I(inode)->i_inline_off) + return EXT4_I(inode)->i_inline_size; + + return 0; +} + +static int get_max_inline_xattr_value_size(struct inode *inode, + struct ext4_iloc *iloc) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry; + struct ext4_inode *raw_inode; + int free, min_offs; + + min_offs = EXT4_SB(inode->i_sb)->s_inode_size - + EXT4_GOOD_OLD_INODE_SIZE - + EXT4_I(inode)->i_extra_isize - + sizeof(struct ext4_xattr_ibody_header); + + /* + * We need to subtract another sizeof(__u32) since an in-inode xattr + * needs an empty 4 bytes to indicate the gap between the xattr entry + * and the name/value pair. + */ + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) + return EXT4_XATTR_SIZE(min_offs - + EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) - + EXT4_XATTR_ROUND - sizeof(__u32)); + + raw_inode = ext4_raw_inode(iloc); + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + + /* Compute min_offs. 
*/ + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_block && entry->e_value_size) { + size_t offs = le16_to_cpu(entry->e_value_offs); + if (offs < min_offs) + min_offs = offs; + } + } + free = min_offs - + ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32); + + if (EXT4_I(inode)->i_inline_off) { + entry = (struct ext4_xattr_entry *) + ((void *)raw_inode + EXT4_I(inode)->i_inline_off); + + free += le32_to_cpu(entry->e_value_size); + goto out; + } + + free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)); + + if (free > EXT4_XATTR_ROUND) + free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND); + else + free = 0; + +out: + return free; +} + +/* + * Get the maximum size we now can store in an inode. + * If we can't find the space for a xattr entry, don't use the space + * of the extents since we have no space to indicate the inline data. + */ +int ext4_get_max_inline_size(struct inode *inode) +{ + int error, max_inline_size; + struct ext4_iloc iloc; + + if (EXT4_I(inode)->i_extra_isize == 0) + return 0; + + error = ext4_get_inode_loc(inode, &iloc); + if (error) { + ext4_error_inode(inode, __func__, __LINE__, 0, + "can't get inode location %lu", + inode->i_ino); + return 0; + } + + down_read(&EXT4_I(inode)->xattr_sem); + max_inline_size = get_max_inline_xattr_value_size(inode, &iloc); + up_read(&EXT4_I(inode)->xattr_sem); + + brelse(iloc.bh); + + if (!max_inline_size) + return 0; + + return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE; +} + +int ext4_has_inline_data(struct inode *inode) +{ + return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && + EXT4_I(inode)->i_inline_off; +} + +/* + * this function does not take xattr_sem, which is OK because it is + * currently only used in a code path coming form ext4_iget, before + * the new inode has been unlocked + */ +int ext4_find_inline_data_nolock(struct inode *inode) +{ + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { 
+ .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; + int error; + + if (EXT4_I(inode)->i_extra_isize == 0) + return 0; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + return error; + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto out; + + if (!is.s.not_found) { + EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - + (void *)ext4_raw_inode(&is.iloc)); + EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + + le32_to_cpu(is.s.here->e_value_size); + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + } +out: + brelse(is.iloc.bh); + return error; +} + +static int ext4_read_inline_data(struct inode *inode, void *buffer, + unsigned int len, + struct ext4_iloc *iloc) +{ + struct ext4_xattr_entry *entry; + struct ext4_xattr_ibody_header *header; + int cp_len = 0; + struct ext4_inode *raw_inode; + + if (!len) + return 0; + + BUG_ON(len > EXT4_I(inode)->i_inline_size); + + cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ? + len : EXT4_MIN_INLINE_DATA_SIZE; + + raw_inode = ext4_raw_inode(iloc); + memcpy(buffer, (void *)(raw_inode->i_block), cp_len); + + len -= cp_len; + buffer += cp_len; + + if (!len) + goto out; + + header = IHDR(inode, raw_inode); + entry = (struct ext4_xattr_entry *)((void *)raw_inode + + EXT4_I(inode)->i_inline_off); + len = min_t(unsigned int, len, + (unsigned int)le32_to_cpu(entry->e_value_size)); + + memcpy(buffer, + (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len); + cp_len += len; + +out: + return cp_len; +} + +/* + * write the buffer to the inline inode. + * If 'create' is set, we don't need to do the extra copy in the xattr + * value since it is already handled by ext4_xattr_ibody_set. That saves + * us one memcpy. 
+ */ +void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, + void *buffer, loff_t pos, unsigned int len) +{ + struct ext4_xattr_entry *entry; + struct ext4_xattr_ibody_header *header; + struct ext4_inode *raw_inode; + int cp_len = 0; + + BUG_ON(!EXT4_I(inode)->i_inline_off); + BUG_ON(pos + len > EXT4_I(inode)->i_inline_size); + + raw_inode = ext4_raw_inode(iloc); + buffer += pos; + + if (pos < EXT4_MIN_INLINE_DATA_SIZE) { + cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ? + EXT4_MIN_INLINE_DATA_SIZE - pos : len; + memcpy((void *)raw_inode->i_block + pos, buffer, cp_len); + + len -= cp_len; + buffer += cp_len; + pos += cp_len; + } + + if (!len) + return; + + pos -= EXT4_MIN_INLINE_DATA_SIZE; + header = IHDR(inode, raw_inode); + entry = (struct ext4_xattr_entry *)((void *)raw_inode + + EXT4_I(inode)->i_inline_off); + + memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos, + buffer, len); +} + +static int ext4_create_inline_data(handle_t *handle, + struct inode *inode, unsigned len) +{ + int error; + void *value = NULL; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + return error; + + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto out; + + if (len > EXT4_MIN_INLINE_DATA_SIZE) { + value = (void *)empty_zero_page; + len -= EXT4_MIN_INLINE_DATA_SIZE; + } else { + value = ""; + len = 0; + } + + /* Insert the the xttr entry. 
*/ + i.value = value; + i.value_len = len; + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto out; + + BUG_ON(!is.s.not_found); + + error = ext4_xattr_ibody_set(handle, inode, &i, &is); + if (error) { + if (error == -ENOSPC) + ext4_clear_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA); + goto out; + } + + memset((void *)ext4_raw_inode(&is.iloc)->i_block, + 0, EXT4_MIN_INLINE_DATA_SIZE); + + EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - + (void *)ext4_raw_inode(&is.iloc)); + EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE; + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); + get_bh(is.iloc.bh); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + +out: + brelse(is.iloc.bh); + return error; +} + +static int ext4_update_inline_data(handle_t *handle, struct inode *inode, + unsigned int len) +{ + int error; + void *value = NULL; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; + + /* If the old space is ok, write the data directly. */ + if (len <= EXT4_I(inode)->i_inline_size) + return 0; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + return error; + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto out; + + BUG_ON(is.s.not_found); + + len -= EXT4_MIN_INLINE_DATA_SIZE; + value = kzalloc(len, GFP_NOFS); + if (!value) + goto out; + + error = ext4_xattr_ibody_get(inode, i.name_index, i.name, + value, len); + if (error == -ENODATA) + goto out; + + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto out; + + /* Update the xttr entry. 
*/ + i.value = value; + i.value_len = len; + + error = ext4_xattr_ibody_set(handle, inode, &i, &is); + if (error) + goto out; + + EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - + (void *)ext4_raw_inode(&is.iloc)); + EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + + le32_to_cpu(is.s.here->e_value_size); + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + get_bh(is.iloc.bh); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + +out: + kfree(value); + brelse(is.iloc.bh); + return error; +} + +int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, + unsigned int len) +{ + int ret, size; + struct ext4_inode_info *ei = EXT4_I(inode); + + if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) + return -ENOSPC; + + size = ext4_get_max_inline_size(inode); + if (size < len) + return -ENOSPC; + + down_write(&EXT4_I(inode)->xattr_sem); + + if (ei->i_inline_off) + ret = ext4_update_inline_data(handle, inode, len); + else + ret = ext4_create_inline_data(handle, inode, len); + + up_write(&EXT4_I(inode)->xattr_sem); + + return ret; +} + +static int ext4_destroy_inline_data_nolock(handle_t *handle, + struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_xattr_ibody_find is = { + .s = { .not_found = 0, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + .value = NULL, + .value_len = 0, + }; + int error; + + if (!ei->i_inline_off) + return 0; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + return error; + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto out; + + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto out; + + error = ext4_xattr_ibody_set(handle, inode, &i, &is); + if (error) + goto out; + + memset((void *)ext4_raw_inode(&is.iloc)->i_block, + 0, EXT4_MIN_INLINE_DATA_SIZE); + + if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EXTENTS)) { + if 
(S_ISDIR(inode->i_mode) || + S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode); + } + } + ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); + + get_bh(is.iloc.bh); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + + EXT4_I(inode)->i_inline_off = 0; + EXT4_I(inode)->i_inline_size = 0; + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); +out: + brelse(is.iloc.bh); + if (error == -ENODATA) + error = 0; + return error; +} + +int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) +{ + int ret; + + down_write(&EXT4_I(inode)->xattr_sem); + ret = ext4_destroy_inline_data_nolock(handle, inode); + up_write(&EXT4_I(inode)->xattr_sem); + + return ret; +} diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index befa005711a..e23f114e2cf 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3706,8 +3706,10 @@ static inline void ext4_iget_extra_inode(struct inode *inode, { __le32 *magic = (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; - if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) + if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { ext4_set_inode_state(inode, EXT4_STATE_XATTR); + ext4_find_inline_data_nolock(inode); + } } struct inode *ext4_iget(struct super_block *sb, unsigned long ino) @@ -3780,6 +3782,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ + ei->i_inline_off = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. 
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 40ca7a6f5ee..7ae0d05156e 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -21,6 +21,7 @@ #define EXT4_XATTR_INDEX_TRUSTED 4 #define EXT4_XATTR_INDEX_LUSTRE 5 #define EXT4_XATTR_INDEX_SECURITY 6 +#define EXT4_XATTR_INDEX_SYSTEM 7 struct ext4_xattr_header { __le32 h_magic; /* magic number for identification */ @@ -125,6 +126,19 @@ extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is); +extern int ext4_has_inline_data(struct inode *inode); +extern int ext4_get_inline_size(struct inode *inode); +extern int ext4_get_max_inline_size(struct inode *inode); +extern int ext4_find_inline_data_nolock(struct inode *inode); +extern void ext4_write_inline_data(struct inode *inode, + struct ext4_iloc *iloc, + void *buffer, loff_t pos, + unsigned int len); +extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, + unsigned int len); +extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, + unsigned int len); +extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -201,6 +215,46 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, return -EOPNOTSUPP; } +static inline int ext4_find_inline_data_nolock(struct inode *inode) +{ + return 0; +} + +static inline int ext4_has_inline_data(struct inode *inode) +{ + return 0; +} + +static inline int ext4_get_inline_size(struct inode *inode) +{ + return 0; +} + +static inline int ext4_get_max_inline_size(struct inode *inode) +{ + return 0; +} + +static inline void ext4_write_inline_data(struct inode *inode, + struct ext4_iloc *iloc, + void *buffer, loff_t pos, + unsigned int len) +{ + return; +} + +static inline int ext4_init_inline_data(handle_t *handle, + struct inode *inode, + unsigned int len) +{ + return 0; +} + +static inline int ext4_destroy_inline_data(handle_t *handle, + struct inode 
*inode) +{ + return 0; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.3 From 46c7f254543dedcf134ad05091ed2b935a9a597d Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:04:52 -0500 Subject: ext4: add read support for inline data Let readpage and readpages handle the case when we want to read an inlined file. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/inline.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/inode.c | 31 +++++++++++++++++++++++++++- fs/ext4/xattr.h | 7 +++++++ 3 files changed, 98 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index bec68b36483..e4a41d5d06d 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -454,6 +454,67 @@ out: return error; } +static int ext4_read_inline_page(struct inode *inode, struct page *page) +{ + void *kaddr; + int ret = 0; + size_t len; + struct ext4_iloc iloc; + + BUG_ON(!PageLocked(page)); + BUG_ON(!ext4_has_inline_data(inode)); + BUG_ON(page->index); + + if (!EXT4_I(inode)->i_inline_off) { + ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.", + inode->i_ino); + goto out; + } + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + goto out; + + len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode)); + kaddr = kmap_atomic(page); + ret = ext4_read_inline_data(inode, kaddr, len, &iloc); + flush_dcache_page(page); + kunmap_atomic(kaddr); + zero_user_segment(page, len, PAGE_CACHE_SIZE); + SetPageUptodate(page); + brelse(iloc.bh); + +out: + return ret; +} + +int ext4_readpage_inline(struct inode *inode, struct page *page) +{ + int ret = 0; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + up_read(&EXT4_I(inode)->xattr_sem); + return -EAGAIN; + } + + /* + * Current inline data can only exist in the 1st page, + * So for all the other pages, just set them uptodate. 
+ */ + if (!page->index) + ret = ext4_read_inline_page(inode, page); + else if (!PageUptodate(page)) { + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + } + + up_read(&EXT4_I(inode)->xattr_sem); + + unlock_page(page); + return ret >= 0 ? 0 : ret; +} + int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { int ret; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e23f114e2cf..1668abf8054 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -649,6 +649,9 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, int ret = 0, started = 0; int dio_credits; + if (ext4_has_inline_data(inode)) + return -ERANGE; + map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; @@ -2687,6 +2690,12 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) journal_t *journal; int err; + /* + * We can get here for an inline file via the FIBMAP ioctl + */ + if (ext4_has_inline_data(inode)) + return 0; + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && test_opt(inode->i_sb, DELALLOC)) { /* @@ -2732,14 +2741,30 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) static int ext4_readpage(struct file *file, struct page *page) { + int ret = -EAGAIN; + struct inode *inode = page->mapping->host; + trace_ext4_readpage(page); - return mpage_readpage(page, ext4_get_block); + + if (ext4_has_inline_data(inode)) + ret = ext4_readpage_inline(inode, page); + + if (ret == -EAGAIN) + return mpage_readpage(page, ext4_get_block); + + return ret; } static int ext4_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { + struct inode *inode = mapping->host; + + /* If the file has inline data, no need to do readpages. 
*/ + if (ext4_has_inline_data(inode)) + return 0; + return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); } @@ -3078,6 +3103,10 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, if (ext4_should_journal_data(inode)) return 0; + /* Let buffer I/O handle the inline data case. */ + if (ext4_has_inline_data(inode)) + return 0; + trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 7ae0d05156e..646c9b9be8e 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -139,6 +139,8 @@ extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, unsigned int len); extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); + +extern int ext4_readpage_inline(struct inode *inode, struct page *page); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -255,6 +257,11 @@ static inline int ext4_destroy_inline_data(handle_t *handle, { return 0; } + +static inline int ext4_readpage_inline(struct inode *inode, struct page *page) +{ + return 0; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.3 From f19d5870cbf72d4cb2a8e1f749dff97af99b071e Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:51 -0500 Subject: ext4: add normal write support for inline data For a normal write case (not journalled write, not delayed allocation), we write to the inline if the file is small and convert it to an extent based file when the write is larger than the max inline size. 
Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 11 +++ fs/ext4/extents.c | 9 ++- fs/ext4/inline.c | 233 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/inode.c | 103 ++++++++++++++---------- fs/ext4/xattr.h | 26 ++++++ 5 files changed, 340 insertions(+), 42 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c827e47d556..9f4efc6c37b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2018,8 +2018,19 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int, int *); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int, int *); +int ext4_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); +int ext4_walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)); +int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh); extern struct inode *ext4_iget(struct super_block *, unsigned long); extern int ext4_write_inode(struct inode *, struct writeback_control *); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 1dc19a7b449..f2659f51b23 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -42,6 +42,7 @@ #include #include "ext4_jbd2.h" #include "ext4_extents.h" +#include "xattr.h" #include @@ -2310,7 +2311,13 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) { int index; - int depth = ext_depth(inode); + int depth; + + /* If we are converting the inline data, only one is needed here. 
*/ + if (ext4_has_inline_data(inode)) + return 1; + + depth = ext_depth(inode); if (chunk) index = depth * 2; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index e4a41d5d06d..320ff6fe5d8 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -14,6 +14,7 @@ #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" +#include "truncate.h" #define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) @@ -515,6 +516,238 @@ int ext4_readpage_inline(struct inode *inode, struct page *page) return ret >= 0 ? 0 : ret; } +static int ext4_convert_inline_data_to_extent(struct address_space *mapping, + struct inode *inode, + unsigned flags) +{ + int ret, needed_blocks; + handle_t *handle = NULL; + int retries = 0, sem_held = 0; + struct page *page = NULL; + unsigned from, to; + struct ext4_iloc iloc; + + if (!ext4_has_inline_data(inode)) { + /* + * clear the flag so that no new write + * will trap here again. + */ + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + return 0; + } + + needed_blocks = ext4_writepage_trans_blocks(inode); + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + +retry: + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + goto out; + } + + /* We cannot recurse into the filesystem as the transaction is already + * started */ + flags |= AOP_FLAG_NOFS; + + page = grab_cache_page_write_begin(mapping, 0, flags); + if (!page) { + ret = -ENOMEM; + goto out; + } + + down_write(&EXT4_I(inode)->xattr_sem); + sem_held = 1; + /* If some one has already done this for us, just exit. 
*/ + if (!ext4_has_inline_data(inode)) { + ret = 0; + goto out; + } + + from = 0; + to = ext4_get_inline_size(inode); + if (!PageUptodate(page)) { + ret = ext4_read_inline_page(inode, page); + if (ret < 0) + goto out; + } + + ret = ext4_destroy_inline_data_nolock(handle, inode); + if (ret) + goto out; + + if (ext4_should_dioread_nolock(inode)) + ret = __block_write_begin(page, from, to, ext4_get_block_write); + else + ret = __block_write_begin(page, from, to, ext4_get_block); + + if (!ret && ext4_should_journal_data(inode)) { + ret = ext4_walk_page_buffers(handle, page_buffers(page), + from, to, NULL, + do_journal_get_write_access); + } + + if (ret) { + unlock_page(page); + page_cache_release(page); + ext4_orphan_add(handle, inode); + up_write(&EXT4_I(inode)->xattr_sem); + sem_held = 0; + ext4_journal_stop(handle); + handle = NULL; + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might + * still be on the orphan list; we need to + * make sure the inode is removed from the + * orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + block_commit_write(page, from, to); +out: + if (page) { + unlock_page(page); + page_cache_release(page); + } + if (sem_held) + up_write(&EXT4_I(inode)->xattr_sem); + if (handle) + ext4_journal_stop(handle); + brelse(iloc.bh); + return ret; +} + +/* + * Try to write data in the inode. + * If the inode has inline data, check whether the new write can be + * in the inode also. If not, create the page the handle, move the data + * to the page make it update and let the later codes create extent for it. 
+ */ +int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep) +{ + int ret; + handle_t *handle; + struct page *page; + struct ext4_iloc iloc; + + if (pos + len > ext4_get_max_inline_size(inode)) + goto convert; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + /* + * The possible write could happen in the inode, + * so try to reserve the space in inode first. + */ + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + goto out; + } + + ret = ext4_prepare_inline_data(handle, inode, pos + len); + if (ret && ret != -ENOSPC) + goto out; + + /* We don't have space in inline inode, so convert it to extent. */ + if (ret == -ENOSPC) { + ext4_journal_stop(handle); + brelse(iloc.bh); + goto convert; + } + + flags |= AOP_FLAG_NOFS; + + page = grab_cache_page_write_begin(mapping, 0, flags); + if (!page) { + ret = -ENOMEM; + goto out; + } + + *pagep = page; + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + ret = 0; + unlock_page(page); + page_cache_release(page); + goto out_up_read; + } + + if (!PageUptodate(page)) { + ret = ext4_read_inline_page(inode, page); + if (ret < 0) + goto out_up_read; + } + + ret = 1; + handle = NULL; +out_up_read: + up_read(&EXT4_I(inode)->xattr_sem); +out: + if (handle) + ext4_journal_stop(handle); + brelse(iloc.bh); + return ret; +convert: + return ext4_convert_inline_data_to_extent(mapping, + inode, flags); +} + +int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, + unsigned copied, struct page *page) +{ + int ret; + void *kaddr; + struct ext4_iloc iloc; + + if (unlikely(copied < len)) { + if (!PageUptodate(page)) { + copied = 0; + goto out; + } + } + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) { + ext4_std_error(inode->i_sb, ret); + copied = 0; + goto out; + } + + down_write(&EXT4_I(inode)->xattr_sem); + 
BUG_ON(!ext4_has_inline_data(inode)); + + kaddr = kmap_atomic(page); + ext4_write_inline_data(inode, &iloc, kaddr, pos, len); + kunmap_atomic(kaddr); + SetPageUptodate(page); + /* clear page dirty so that writepages wouldn't work for us. */ + ClearPageDirty(page); + + up_write(&EXT4_I(inode)->xattr_sem); + brelse(iloc.bh); +out: + return copied; +} + + int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { int ret; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1668abf8054..70c8d5f323f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -770,13 +770,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, return NULL; } -static int walk_page_buffers(handle_t *handle, - struct buffer_head *head, - unsigned from, - unsigned to, - int *partial, - int (*fn)(handle_t *handle, - struct buffer_head *bh)) +int ext4_walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)) { struct buffer_head *bh; unsigned block_start, block_end; @@ -826,8 +826,8 @@ static int walk_page_buffers(handle_t *handle, * is elevated. We'll still have enough credits for the tiny quotafile * write. 
*/ -static int do_journal_get_write_access(handle_t *handle, - struct buffer_head *bh) +int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh) { int dirty = buffer_dirty(bh); int ret; @@ -850,8 +850,6 @@ static int do_journal_get_write_access(handle_t *handle, return ret; } -static int ext4_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); static int ext4_write_begin(struct file *file, struct address_space *mapping, @@ -876,6 +874,17 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, + flags, pagep); + if (ret < 0) + goto out; + if (ret == 1) { + ret = 0; + goto out; + } + } + retry: handle = ext4_journal_start(inode, needed_blocks); if (IS_ERR(handle)) { @@ -893,6 +902,7 @@ retry: ret = -ENOMEM; goto out; } + *pagep = page; if (ext4_should_dioread_nolock(inode)) @@ -901,8 +911,9 @@ retry: ret = __block_write_begin(page, pos, len, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { - ret = walk_page_buffers(handle, page_buffers(page), - from, to, NULL, do_journal_get_write_access); + ret = ext4_walk_page_buffers(handle, page_buffers(page), + from, to, NULL, + do_journal_get_write_access); } if (ret) { @@ -957,7 +968,12 @@ static int ext4_generic_write_end(struct file *file, struct inode *inode = mapping->host; handle_t *handle = ext4_journal_current_handle(); - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + if (ext4_has_inline_data(inode)) + copied = ext4_write_inline_data_end(inode, pos, len, + copied, page); + else + copied = block_write_end(file, mapping, pos, + len, copied, page, fsdata); /* * No need to use i_size_read() 
here, the i_size @@ -1114,8 +1130,8 @@ static int ext4_journalled_write_end(struct file *file, page_zero_new_buffers(page, from+copied, to); } - ret = walk_page_buffers(handle, page_buffers(page), from, - to, &partial, write_end_fn); + ret = ext4_walk_page_buffers(handle, page_buffers(page), from, + to, &partial, write_end_fn); if (!partial) SetPageUptodate(page); new_i_size = pos + copied; @@ -1903,7 +1919,7 @@ static int __ext4_journalled_writepage(struct page *page, ClearPageChecked(page); page_bufs = page_buffers(page); BUG_ON(!page_bufs); - walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); + ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); /* As soon as we unlock the page, it can go away, but we have * references to buffers so we are safe */ unlock_page(page); @@ -1916,11 +1932,11 @@ static int __ext4_journalled_writepage(struct page *page, BUG_ON(!ext4_handle_valid(handle)); - ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, - do_journal_get_write_access); + ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, + do_journal_get_write_access); - err = walk_page_buffers(handle, page_bufs, 0, len, NULL, - write_end_fn); + err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, + write_end_fn); if (ret == 0) ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; @@ -1928,7 +1944,7 @@ static int __ext4_journalled_writepage(struct page *page, if (!ret) ret = err; - walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); + ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); ext4_set_inode_state(inode, EXT4_STATE_JDATA); out: return ret; @@ -2007,8 +2023,8 @@ static int ext4_writepage(struct page *page, commit_write = 1; } page_bufs = page_buffers(page); - if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, - ext4_bh_delay_or_unwritten)) { + if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_delay_or_unwritten)) { /* * We don't want to do block allocation, so 
redirty * the page and return. We may reach here when we do @@ -2831,7 +2847,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait) * We allocate an uinitialized extent if blocks haven't been allocated. * The extent will be converted to initialized after the IO is complete. */ -static int ext4_get_block_write(struct inode *inode, sector_t iblock, +int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", @@ -3738,7 +3754,8 @@ static inline void ext4_iget_extra_inode(struct inode *inode, if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { ext4_set_inode_state(inode, EXT4_STATE_XATTR); ext4_find_inline_data_nolock(inode); - } + } else + EXT4_I(inode)->i_inline_off = 0; } struct inode *ext4_iget(struct super_block *sb, unsigned long ino) @@ -3907,17 +3924,19 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_file_acl); ret = -EIO; goto bad_inode; - } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - (S_ISLNK(inode->i_mode) && - !ext4_inode_is_fast_symlink(inode))) - /* Validate extent which is part of inode */ - ret = ext4_ext_check_inode(inode); - } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - (S_ISLNK(inode->i_mode) && - !ext4_inode_is_fast_symlink(inode))) { - /* Validate block references which are part of inode */ - ret = ext4_ind_check_inode(inode); + } else if (!ext4_has_inline_data(inode)) { + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode)))) + /* Validate extent which is part of inode */ + ret = ext4_ext_check_inode(inode); + } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode))) { + /* Validate block references which are part 
of inode */ + ret = ext4_ind_check_inode(inode); + } } if (ret) goto bad_inode; @@ -4104,9 +4123,10 @@ static int ext4_do_update_inode(handle_t *handle, cpu_to_le32(new_encode_dev(inode->i_rdev)); raw_inode->i_block[2] = 0; } - } else + } else if (!ext4_has_inline_data(inode)) { for (block = 0; block < EXT4_N_BLOCKS; block++) raw_inode->i_block[block] = ei->i_data[block]; + } raw_inode->i_disk_version = cpu_to_le32(inode->i_version); if (ei->i_extra_isize) { @@ -4793,8 +4813,9 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) * journal_start/journal_stop which can block and take a long time */ if (page_has_buffers(page)) { - if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_unmapped)) { + if (!ext4_walk_page_buffers(NULL, page_buffers(page), + 0, len, NULL, + ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ wait_on_page_writeback(page); ret = VM_FAULT_LOCKED; @@ -4815,7 +4836,7 @@ retry_alloc: } ret = __block_page_mkwrite(vma, vmf, get_block); if (!ret && ext4_should_journal_data(inode)) { - if (walk_page_buffers(handle, page_buffers(page), 0, + if (ext4_walk_page_buffers(handle, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { unlock_page(page); ret = VM_FAULT_SIGBUS; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 646c9b9be8e..db567220623 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -141,6 +141,15 @@ extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); extern int ext4_readpage_inline(struct inode *inode, struct page *page); +extern int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep); +extern int ext4_write_inline_data_end(struct inode *inode, + loff_t pos, unsigned len, + unsigned copied, + struct page *page); # else /* CONFIG_EXT4_FS_XATTR */ static inline 
int @@ -262,6 +271,23 @@ static inline int ext4_readpage_inline(struct inode *inode, struct page *page) { return 0; } + +static inline int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep) +{ + return 0; +} + +static inline int ext4_write_inline_data_end(struct inode *inode, + loff_t pos, unsigned len, + unsigned copied, + struct page *page) +{ + return 0; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.3 From 3fdcfb668fd78ec92d9bc2daddf1d41e2a8a30bb Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:57 -0500 Subject: ext4: add journalled write support for inline data Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/inline.c | 24 ++++++++++++++++++++ fs/ext4/inode.c | 69 ++++++++++++++++++++++++++++++++++++++++---------------- fs/ext4/xattr.h | 12 ++++++++++ 3 files changed, 85 insertions(+), 20 deletions(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 320ff6fe5d8..01274b1e7d4 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -747,6 +747,30 @@ out: return copied; } +struct buffer_head * +ext4_journalled_write_inline_data(struct inode *inode, + unsigned len, + struct page *page) +{ + int ret; + void *kaddr; + struct ext4_iloc iloc; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) { + ext4_std_error(inode->i_sb, ret); + return NULL; + } + + down_write(&EXT4_I(inode)->xattr_sem); + kaddr = kmap_atomic(page); + ext4_write_inline_data(inode, &iloc, kaddr, 0, len); + kunmap_atomic(kaddr); + up_write(&EXT4_I(inode)->xattr_sem); + + return iloc.bh; +} + int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 70c8d5f323f..5c91622cfe0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1124,16 +1124,21 @@ static int ext4_journalled_write_end(struct file *file, BUG_ON(!ext4_handle_valid(handle)); - if (copied < len) { - 
if (!PageUptodate(page)) - copied = 0; - page_zero_new_buffers(page, from+copied, to); - } + if (ext4_has_inline_data(inode)) + copied = ext4_write_inline_data_end(inode, pos, len, + copied, page); + else { + if (copied < len) { + if (!PageUptodate(page)) + copied = 0; + page_zero_new_buffers(page, from+copied, to); + } - ret = ext4_walk_page_buffers(handle, page_buffers(page), from, - to, &partial, write_end_fn); - if (!partial) - SetPageUptodate(page); + ret = ext4_walk_page_buffers(handle, page_buffers(page), from, + to, &partial, write_end_fn); + if (!partial) + SetPageUptodate(page); + } new_i_size = pos + copied; if (new_i_size > inode->i_size) i_size_write(inode, pos+copied); @@ -1911,15 +1916,29 @@ static int __ext4_journalled_writepage(struct page *page, { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; - struct buffer_head *page_bufs; + struct buffer_head *page_bufs = NULL; handle_t *handle = NULL; - int ret = 0; - int err; + int ret = 0, err = 0; + int inline_data = ext4_has_inline_data(inode); + struct buffer_head *inode_bh = NULL; ClearPageChecked(page); - page_bufs = page_buffers(page); - BUG_ON(!page_bufs); - ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); + + if (inline_data) { + BUG_ON(page->index != 0); + BUG_ON(len > ext4_get_max_inline_size(inode)); + inode_bh = ext4_journalled_write_inline_data(inode, len, page); + if (inode_bh == NULL) + goto out; + } else { + page_bufs = page_buffers(page); + if (!page_bufs) { + BUG(); + goto out; + } + ext4_walk_page_buffers(handle, page_bufs, 0, len, + NULL, bget_one); + } /* As soon as we unlock the page, it can go away, but we have * references to buffers so we are safe */ unlock_page(page); @@ -1932,11 +1951,18 @@ static int __ext4_journalled_writepage(struct page *page, BUG_ON(!ext4_handle_valid(handle)); - ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, - do_journal_get_write_access); + if (inline_data) { + ret = 
ext4_journal_get_write_access(handle, inode_bh); - err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, - write_end_fn); + err = ext4_handle_dirty_metadata(handle, inode, inode_bh); + + } else { + ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, + do_journal_get_write_access); + + err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, + write_end_fn); + } if (ret == 0) ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; @@ -1944,9 +1970,12 @@ static int __ext4_journalled_writepage(struct page *page, if (!ret) ret = err; - ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); + if (!ext4_has_inline_data(inode)) + ext4_walk_page_buffers(handle, page_bufs, 0, len, + NULL, bput_one); ext4_set_inode_state(inode, EXT4_STATE_JDATA); out: + brelse(inode_bh); return ret; } diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index db567220623..7095ac13fbc 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -150,6 +150,10 @@ extern int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page); +extern struct buffer_head * +ext4_journalled_write_inline_data(struct inode *inode, + unsigned len, + struct page *page); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -288,6 +292,14 @@ static inline int ext4_write_inline_data_end(struct inode *inode, { return 0; } + +static inline struct buffer_head * +ext4_journalled_write_inline_data(struct inode *inode, + unsigned len, + struct page *page) +{ + return NULL; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.3 From 9c3569b50f12e47cc5e907b5e37e4a45c0c10b43 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:57 -0500 Subject: ext4: add delalloc support for inline data For delayed allocation mode, we write to inline data if the file is small enough. 
And in case of we write to some offset larger than the inline size, the 1st page is dirtied, so that ext4_da_writepages can handle the conversion. When the 1st page is initialized with blocks, the inline part is removed. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 4 ++ fs/ext4/inline.c | 177 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/inode.c | 63 +++++++++++++++++--- fs/ext4/xattr.h | 27 +++++++++ 4 files changed, 262 insertions(+), 9 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9f4efc6c37b..268636af7f5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2022,6 +2022,8 @@ int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); +int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create); int ext4_walk_page_buffers(handle_t *handle, struct buffer_head *head, unsigned from, @@ -2031,6 +2033,8 @@ int ext4_walk_page_buffers(handle_t *handle, struct buffer_head *bh)); int do_journal_get_write_access(handle_t *handle, struct buffer_head *bh); +#define FALL_BACK_TO_NONDELALLOC 1 +#define CONVERT_INLINE_DATA 2 extern struct inode *ext4_iget(struct super_block *, unsigned long); extern int ext4_write_inode(struct inode *, struct writeback_control *); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 01274b1e7d4..65f7ffb5437 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -771,6 +771,183 @@ ext4_journalled_write_inline_data(struct inode *inode, return iloc.bh; } +/* + * Try to make the page cache and handle ready for the inline data case. + * We can call this function in 2 cases: + * 1. The inode is created and the first write exceeds inline size. We can + * clear the inode state safely. + * 2. 
The inode has inline data, then we need to read the data, make it + * update and dirty so that ext4_da_writepages can handle it. We don't + * need to start the journal since the file's metatdata isn't changed now. + */ +static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, + struct inode *inode, + unsigned flags, + void **fsdata) +{ + int ret = 0, inline_size; + struct page *page; + + page = grab_cache_page_write_begin(mapping, 0, flags); + if (!page) + return -ENOMEM; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + goto out; + } + + inline_size = ext4_get_inline_size(inode); + + if (!PageUptodate(page)) { + ret = ext4_read_inline_page(inode, page); + if (ret < 0) + goto out; + } + + ret = __block_write_begin(page, 0, inline_size, + ext4_da_get_block_prep); + if (ret) { + ext4_truncate_failed_write(inode); + goto out; + } + + SetPageDirty(page); + SetPageUptodate(page); + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + *fsdata = (void *)CONVERT_INLINE_DATA; + +out: + up_read(&EXT4_I(inode)->xattr_sem); + if (page) { + unlock_page(page); + page_cache_release(page); + } + return ret; +} + +/* + * Prepare the write for the inline data. + * If the the data can be written into the inode, we just read + * the page and make it uptodate, and start the journal. + * Otherwise read the page, makes it dirty so that it can be + * handle in writepages(the i_disksize update is left to the + * normal ext4_da_write_end). 
+ */ +int ext4_da_write_inline_data_begin(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata) +{ + int ret, inline_size; + handle_t *handle; + struct page *page; + struct ext4_iloc iloc; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + goto out; + } + + inline_size = ext4_get_max_inline_size(inode); + + ret = -ENOSPC; + if (inline_size >= pos + len) { + ret = ext4_prepare_inline_data(handle, inode, pos + len); + if (ret && ret != -ENOSPC) + goto out; + } + + if (ret == -ENOSPC) { + ret = ext4_da_convert_inline_data_to_extent(mapping, + inode, + flags, + fsdata); + goto out; + } + + /* + * We cannot recurse into the filesystem as the transaction + * is already started. + */ + flags |= AOP_FLAG_NOFS; + + page = grab_cache_page_write_begin(mapping, 0, flags); + if (!page) { + ret = -ENOMEM; + goto out; + } + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + ret = 0; + goto out_release_page; + } + + if (!PageUptodate(page)) { + ret = ext4_read_inline_page(inode, page); + if (ret < 0) + goto out_release_page; + } + + up_read(&EXT4_I(inode)->xattr_sem); + *pagep = page; + handle = NULL; + brelse(iloc.bh); + return 1; +out_release_page: + up_read(&EXT4_I(inode)->xattr_sem); + unlock_page(page); + page_cache_release(page); +out: + if (handle) + ext4_journal_stop(handle); + brelse(iloc.bh); + return ret; +} + +int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, + unsigned len, unsigned copied, + struct page *page) +{ + int i_size_changed = 0; + + copied = ext4_write_inline_data_end(inode, pos, len, copied, page); + + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_mutex. 
+ * + * But it's important to update i_size while still holding page lock: + * page writeout could otherwise come in and zero beyond i_size. + */ + if (pos+copied > inode->i_size) { + i_size_write(inode, pos+copied); + i_size_changed = 1; + } + unlock_page(page); + page_cache_release(page); + + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. + */ + if (i_size_changed) + mark_inode_dirty(inode); + + return copied; +} int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5c91622cfe0..f16ae02599c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1790,7 +1790,19 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, * file system block. */ down_read((&EXT4_I(inode)->i_data_sem)); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + if (ext4_has_inline_data(inode)) { + /* + * We will soon create blocks for this page, and let + * us pretend as if the blocks aren't allocated yet. + * In case of clusters, we have to handle the work + * of mapping from cluster so that the reserved space + * is calculated properly. + */ + if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) && + ext4_find_delalloc_cluster(inode, map->m_lblk)) + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + retval = 0; + } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) retval = ext4_ext_map_blocks(NULL, inode, map, 0); else retval = ext4_ind_map_blocks(NULL, inode, map, 0); @@ -1841,8 +1853,8 @@ out_unlock: * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev * initialized properly. 
*/ -static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) +int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) { struct ext4_map_blocks map; int ret = 0; @@ -2119,7 +2131,8 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) * mpage_da_map_and_submit to map a single contiguous memory region * and then write them. */ -static int write_cache_pages_da(struct address_space *mapping, +static int write_cache_pages_da(handle_t *handle, + struct address_space *mapping, struct writeback_control *wbc, struct mpage_da_data *mpd, pgoff_t *done_index) @@ -2198,6 +2211,17 @@ static int write_cache_pages_da(struct address_space *mapping, wait_on_page_writeback(page); BUG_ON(PageWriteback(page)); + /* + * If we have inline data and arrive here, it means that + * we will soon create the block for the 1st page, so + * we'd better clear the inline data here. + */ + if (ext4_has_inline_data(inode)) { + BUG_ON(ext4_test_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA)); + ext4_destroy_inline_data(handle, inode); + } + if (mpd->next_page != page->index) mpd->first_page = page->index; mpd->next_page = page->index + 1; @@ -2404,7 +2428,8 @@ retry: * contiguous region of logical blocks that need * blocks to be allocated by ext4 and submit them. 
*/ - ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); + ret = write_cache_pages_da(handle, mapping, + wbc, &mpd, &done_index); /* * If we have a contiguous extent of pages and we * haven't done the I/O yet, map the blocks and submit @@ -2468,7 +2493,6 @@ out_writepages: return ret; } -#define FALL_BACK_TO_NONDELALLOC 1 static int ext4_nonda_switch(struct super_block *sb) { s64 free_blocks, dirty_blocks; @@ -2525,6 +2549,19 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, } *fsdata = (void *)0; trace_ext4_da_write_begin(inode, pos, len, flags); + + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + ret = ext4_da_write_inline_data_begin(mapping, inode, + pos, len, flags, + pagep, fsdata); + if (ret < 0) + goto out; + if (ret == 1) { + ret = 0; + goto out; + } + } + retry: /* * With delayed allocation, we don't log the i_disksize update @@ -2626,10 +2663,10 @@ static int ext4_da_write_end(struct file *file, * changes. So let's piggyback the i_disksize mark_inode_dirty * into that. 
*/ - new_i_size = pos + copied; if (copied && new_i_size > EXT4_I(inode)->i_disksize) { - if (ext4_da_should_update_i_disksize(page, end)) { + if (ext4_has_inline_data(inode) || + ext4_da_should_update_i_disksize(page, end)) { down_write(&EXT4_I(inode)->i_data_sem); if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; @@ -2641,8 +2678,16 @@ static int ext4_da_write_end(struct file *file, ext4_mark_inode_dirty(handle, inode); } } - ret2 = generic_write_end(file, mapping, pos, len, copied, + + if (write_mode != CONVERT_INLINE_DATA && + ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && + ext4_has_inline_data(inode)) + ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied, + page); + else + ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + copied = ret2; if (ret2 < 0) ret = ret2; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 7095ac13fbc..37e66f86764 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -154,6 +154,15 @@ extern struct buffer_head * ext4_journalled_write_inline_data(struct inode *inode, unsigned len, struct page *page); +extern int ext4_da_write_inline_data_begin(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata); +extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, + unsigned len, unsigned copied, + struct page *page); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -300,6 +309,24 @@ ext4_journalled_write_inline_data(struct inode *inode, { return NULL; } + +static inline int +ext4_da_write_inline_data_begin(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata) +{ + return 0; +} + +static inline int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, + unsigned len, unsigned copied, + struct page *page) +{ + return 0; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef 
CONFIG_EXT4_FS_SECURITY -- cgit v1.2.3 From a774f9c20e08643fc0e6c48b0419ad7657ed0c04 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:57 -0500 Subject: ext4: make ext4_init_dot_dotdot for inline dir usage Currently, the initialization of dot and dotdot is encapsulated in ext4_mkdir and is also bound to dir_block. So create a new function named ext4_init_new_dir and the initialization is moved to ext4_init_dot_dotdot. Now it will be called either in the normal non-inline case (rec_len of ".." will cover the whole block) or when we are converting an inline dir to a block (rec_len of ".." will be the real length). The start of the next entry is also returned for inline dir usage. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 4 ++ fs/ext4/namei.c | 115 ++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 75 insertions(+), 44 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 268636af7f5..cf840146ce8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2415,6 +2415,10 @@ extern void ext4_unwritten_wait(struct inode *inode); extern const struct inode_operations ext4_dir_inode_operations; extern const struct inode_operations ext4_special_inode_operations; extern struct dentry *ext4_get_parent(struct dentry *child); +extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len); /* symlink.c */ extern const struct inode_operations ext4_symlink_inode_operations; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 88e9a2c7e32..edb9f10c145 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2230,21 +2230,87 @@ retry: return err; } -static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int
dotdot_real_len) +{ + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; + de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), + blocksize); + strcpy(de->name, "."); + ext4_set_de_type(inode->i_sb, de, S_IFDIR); + + de = ext4_next_entry(de, blocksize); + de->inode = cpu_to_le32(parent_ino); + de->name_len = 2; + if (!dotdot_real_len) + de->rec_len = ext4_rec_len_to_disk(blocksize - + (csum_size + EXT4_DIR_REC_LEN(1)), + blocksize); + else + de->rec_len = ext4_rec_len_to_disk( + EXT4_DIR_REC_LEN(de->name_len), blocksize); + strcpy(de->name, ".."); + ext4_set_de_type(inode->i_sb, de, S_IFDIR); + + return ext4_next_entry(de, blocksize); +} + +static int ext4_init_new_dir(handle_t *handle, struct inode *dir, + struct inode *inode) { - handle_t *handle; - struct inode *inode; struct buffer_head *dir_block = NULL; struct ext4_dir_entry_2 *de; struct ext4_dir_entry_tail *t; unsigned int blocksize = dir->i_sb->s_blocksize; int csum_size = 0; - int err, retries = 0; + int err; if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) csum_size = sizeof(struct ext4_dir_entry_tail); + inode->i_size = EXT4_I(inode)->i_disksize = blocksize; + dir_block = ext4_bread(handle, inode, 0, 1, &err); + if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { + if (!err) { + err = -EIO; + ext4_error(inode->i_sb, + "Directory hole detected on inode %lu\n", + inode->i_ino); + } + goto out; + } + BUFFER_TRACE(dir_block, "get_write_access"); + err = ext4_journal_get_write_access(handle, dir_block); + if (err) + goto out; + de = (struct ext4_dir_entry_2 *)dir_block->b_data; + ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); + set_nlink(inode, 2); + if (csum_size) { + t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize); + initialize_dirent_tail(t, blocksize); + } + + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); + if (err) + goto out; + 
set_buffer_verified(dir_block); +out: + brelse(dir_block); + return err; +} + +static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + handle_t *handle; + struct inode *inode; + int err, retries = 0; + if (EXT4_DIR_LINK_MAX(dir)) return -EMLINK; @@ -2268,47 +2334,9 @@ retry: inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; - inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; - if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { - if (!err) { - err = -EIO; - ext4_error(inode->i_sb, - "Directory hole detected on inode %lu\n", - inode->i_ino); - } - goto out_clear_inode; - } - BUFFER_TRACE(dir_block, "get_write_access"); - err = ext4_journal_get_write_access(handle, dir_block); + err = ext4_init_new_dir(handle, dir, inode); if (err) goto out_clear_inode; - de = (struct ext4_dir_entry_2 *) dir_block->b_data; - de->inode = cpu_to_le32(inode->i_ino); - de->name_len = 1; - de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), - blocksize); - strcpy(de->name, "."); - ext4_set_de_type(dir->i_sb, de, S_IFDIR); - de = ext4_next_entry(de, blocksize); - de->inode = cpu_to_le32(dir->i_ino); - de->rec_len = ext4_rec_len_to_disk(blocksize - - (csum_size + EXT4_DIR_REC_LEN(1)), - blocksize); - de->name_len = 2; - strcpy(de->name, ".."); - ext4_set_de_type(dir->i_sb, de, S_IFDIR); - set_nlink(inode, 2); - - if (csum_size) { - t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize); - initialize_dirent_tail(t, blocksize); - } - - BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); - if (err) - goto out_clear_inode; - set_buffer_verified(dir_block); err = ext4_mark_inode_dirty(handle, inode); if (!err) err = ext4_add_entry(handle, dentry, inode); @@ -2328,7 +2356,6 @@ out_clear_inode: unlock_new_inode(inode); d_instantiate(dentry, inode); out_stop: - brelse(dir_block); ext4_journal_stop(handle); if (err == -ENOSPC && 
ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; -- cgit v1.2.3 From 226ba972b0863783ad377f741f6ff0538f31ab00 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:58 -0500 Subject: ext4: refactor __ext4_check_dir_entry() to accept start and size The __ext4_check_dir_entry() function is used to check whether the de is over the block boundary. Now with inline data, it could be within the block boundary while still exceeding the inode size. So change this function to check for overflow more precisely. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/dir.c | 16 ++++++++-------- fs/ext4/ext4.h | 7 ++++--- fs/ext4/namei.c | 13 +++++++++---- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 8e07d2a5a13..7c9d08b0f2f 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -72,7 +72,7 @@ static int is_dx_dir(struct inode *inode) int __ext4_check_dir_entry(const char *function, unsigned int line, struct inode *dir, struct file *filp, struct ext4_dir_entry_2 *de, - struct buffer_head *bh, + struct buffer_head *bh, char *buf, int size, unsigned int offset) { const char *error_msg = NULL; @@ -85,9 +85,8 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, error_msg = "rec_len % 4 != 0"; else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; - else if (unlikely(((char *) de - bh->b_data) + rlen > - dir->i_sb->s_blocksize)) - error_msg = "directory entry across blocks"; + else if (unlikely(((char *) de - buf) + rlen > size)) + error_msg = "directory entry across range"; else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; @@ -98,14 +97,14 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, ext4_error_file(filp, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u(%u), " "inode=%u, rec_len=%d, name_len=%d", - error_msg,
(unsigned) (offset % bh->b_size), + error_msg, (unsigned) (offset % size), offset, le32_to_cpu(de->inode), rlen, de->name_len); else ext4_error_inode(dir, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u(%u), " "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset % bh->b_size), + error_msg, (unsigned) (offset % size), offset, le32_to_cpu(de->inode), rlen, de->name_len); @@ -221,8 +220,9 @@ revalidate: while (!error && filp->f_pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); - if (ext4_check_dir_entry(inode, filp, de, - bh, offset)) { + if (ext4_check_dir_entry(inode, filp, de, bh, + bh->b_data, bh->b_size, + offset)) { /* * On error, skip the f_pos to the next block */ diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cf840146ce8..59cbf498fd5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1960,10 +1960,11 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, struct file *, struct ext4_dir_entry_2 *, - struct buffer_head *, unsigned int); -#define ext4_check_dir_entry(dir, filp, de, bh, offset) \ + struct buffer_head *, char *, int, + unsigned int); +#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ - (de), (bh), (offset))) + (de), (bh), (buf), (size), (offset))) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index edb9f10c145..10da2d50a5d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -892,6 +892,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, EXT4_DIR_REC_LEN(0)); for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { if (ext4_check_dir_entry(dir, NULL, de, bh, + bh->b_data, bh->b_size, (block<i_sb)) + ((char *)de - bh->b_data))) { /* On 
error, skip the f_pos to the next block. */ @@ -1130,7 +1131,8 @@ static inline int search_dirblock(struct buffer_head *bh, if ((char *) de + namelen <= dlimit && ext4_match (namelen, name, de)) { /* found a match - just to be sure, do a full check */ - if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) + if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, + bh->b_size, offset)) return -1; *res_dir = de; return 1; @@ -1643,7 +1645,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *)bh->b_data; top = bh->b_data + (blocksize - csum_size) - reclen; while ((char *) de <= top) { - if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) + if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, + bh->b_size, offset)) return -EIO; if (ext4_match(namelen, name, de)) return -EEXIST; @@ -2076,7 +2079,8 @@ static int ext4_delete_entry(handle_t *handle, pde = NULL; de = (struct ext4_dir_entry_2 *) bh->b_data; while (i < bh->b_size - csum_size) { - if (ext4_check_dir_entry(dir, NULL, de, bh, i)) + if (ext4_check_dir_entry(dir, NULL, de, bh, + bh->b_data, bh->b_size, i)) return -EIO; if (de == de_del) { BUFFER_TRACE(bh, "get_write_access"); @@ -2439,7 +2443,8 @@ static int empty_dir(struct inode *inode) set_buffer_verified(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; } - if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) { + if (ext4_check_dir_entry(inode, NULL, de, bh, + bh->b_data, bh->b_size, offset)) { de = (struct ext4_dir_entry_2 *)(bh->b_data + sb->s_blocksize); offset = (offset | (sb->s_blocksize - 1)) + 1; -- cgit v1.2.3 From 978fef914a2e6b8ad5672d0a39f9201b7aa7c396 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:58 -0500 Subject: ext4: create __ext4_insert_dentry for dir entry insertion The old add_dirent_to_buf handles all the work related to the work of adding dir entry to a dir block. 
Now we have inline data, so create 2 new function __ext4_find_dest_de and __ext4_insert_dentry that do the real work and let add_dirent_to_buf call them. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 15 ++++++++ fs/ext4/namei.c | 105 +++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 80 insertions(+), 40 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 59cbf498fd5..8e9e94cf1bc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1969,6 +1969,21 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent); extern void ext4_htree_free_dir_info(struct dir_private_info *p); +extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + const char *name, int namelen, + struct ext4_dir_entry_2 **dest_de); +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + const char *name, int namelen); +static inline void ext4_update_dx_flag(struct inode *inode) +{ + if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX)) + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); +} /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 10da2d50a5d..bb9259d20b5 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1084,13 +1084,6 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) dx_set_count(entries, count + 1); } -static void ext4_update_dx_flag(struct inode *inode) -{ - if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_COMPAT_DIR_INDEX)) - ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); -} - /* * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure. 
* @@ -1614,6 +1607,63 @@ errout: return NULL; } +int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + const char *name, int namelen, + struct ext4_dir_entry_2 **dest_de) +{ + struct ext4_dir_entry_2 *de; + unsigned short reclen = EXT4_DIR_REC_LEN(namelen); + int nlen, rlen; + unsigned int offset = 0; + char *top; + + de = (struct ext4_dir_entry_2 *)buf; + top = buf + buf_size - reclen; + while ((char *) de <= top) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + buf, buf_size, offset)) + return -EIO; + if (ext4_match(namelen, name, de)) + return -EEXIST; + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + if ((de->inode ? rlen - nlen : rlen) >= reclen) + break; + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } + if ((char *) de > top) + return -ENOSPC; + + *dest_de = de; + return 0; +} + +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + const char *name, int namelen) +{ + + int nlen, rlen; + + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + if (de->inode) { + struct ext4_dir_entry_2 *de1 = + (struct ext4_dir_entry_2 *)((char *)de + nlen); + de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size); + de->rec_len = ext4_rec_len_to_disk(nlen, buf_size); + de = de1; + } + de->file_type = EXT4_FT_UNKNOWN; + de->inode = cpu_to_le32(inode->i_ino); + ext4_set_de_type(inode->i_sb, de, inode->i_mode); + de->name_len = namelen; + memcpy(de->name, name, namelen); +} /* * Add a new entry into a directory (leaf) block. 
If de is non-NULL, * it points to a directory entry which is guaranteed to be large @@ -1629,12 +1679,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, struct inode *dir = dentry->d_parent->d_inode; const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; - unsigned int offset = 0; unsigned int blocksize = dir->i_sb->s_blocksize; unsigned short reclen; - int nlen, rlen, err; - char *top; int csum_size = 0; + int err; if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) @@ -1642,23 +1690,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, reclen = EXT4_DIR_REC_LEN(namelen); if (!de) { - de = (struct ext4_dir_entry_2 *)bh->b_data; - top = bh->b_data + (blocksize - csum_size) - reclen; - while ((char *) de <= top) { - if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, - bh->b_size, offset)) - return -EIO; - if (ext4_match(namelen, name, de)) - return -EEXIST; - nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); - if ((de->inode? 
rlen - nlen: rlen) >= reclen) - break; - de = (struct ext4_dir_entry_2 *)((char *)de + rlen); - offset += rlen; - } - if ((char *) de > top) - return -ENOSPC; + err = ext4_find_dest_de(dir, inode, + bh, bh->b_data, blocksize - csum_size, + name, namelen, &de); + if (err) + return err; } BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); @@ -1668,19 +1704,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, } /* By now the buffer is marked for journaling */ - nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); - if (de->inode) { - struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); - de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize); - de->rec_len = ext4_rec_len_to_disk(nlen, blocksize); - de = de1; - } - de->file_type = EXT4_FT_UNKNOWN; - de->inode = cpu_to_le32(inode->i_ino); - ext4_set_de_type(dir->i_sb, de, inode->i_mode); - de->name_len = namelen; - memcpy(de->name, name, namelen); + ext4_insert_dentry(inode, de, blocksize, name, namelen); + /* * XXX shouldn't update any times until successful * completion of syscall, but too many callers depend -- cgit v1.2.3 From 3c47d54170b6a678875566b1b8d6dcf57904e49b Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:59 -0500 Subject: ext4: let add_dir_entry handle inline data properly This patch let add_dir_entry handle the inline data case. So the dir is initialized as inline dir first and then we can try to add some files to it, when the inline space can't hold all the entries, a dir block will be created and the dir entry will be moved to it. Also for an inlined dir, "." and ".." are removed and we only use 4 bytes to store the parent inode number. These 2 entries will be added when we convert an inline dir to a block-based one. [ Folded in patch from Dan Carpenter to remove an unused variable. 
] Signed-off-by: Tao Ma Signed-off-by: Dan Carpenter Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 10 ++ fs/ext4/inline.c | 377 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/namei.c | 34 +++-- fs/ext4/xattr.h | 19 +++ 4 files changed, 430 insertions(+), 10 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8e9e94cf1bc..689ce1d696b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1616,6 +1616,11 @@ struct ext4_dir_entry_tail { __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ }; +#define EXT4_DIRENT_TAIL(block, blocksize) \ + ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ + ((blocksize) - \ + sizeof(struct ext4_dir_entry_tail)))) + /* * Ext4 directory file types. Only the low 3 bits are used. The * other bits are reserved for now. @@ -2435,6 +2440,11 @@ extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, struct ext4_dir_entry_2 *de, int blocksize, int csum_size, unsigned int parent_ino, int dotdot_real_len); +extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t, + unsigned int blocksize); +extern int ext4_handle_dirty_dirent_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh); /* symlink.c */ extern const struct inode_operations ext4_symlink_inode_operations; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 65f7ffb5437..bf732281873 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -18,6 +18,7 @@ #define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) +#define EXT4_INLINE_DOTDOT_SIZE 4 int ext4_get_inline_size(struct inode *inode) { @@ -949,6 +950,382 @@ int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, return copied; } +#ifdef INLINE_DIR_DEBUG +void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, + void *inline_start, int inline_size) +{ + int offset; + unsigned short de_len; + struct ext4_dir_entry_2 *de = inline_start; + void *dlimit = inline_start + inline_size; + 
+ trace_printk("inode %lu\n", dir->i_ino); + offset = 0; + while ((void *)de < dlimit) { + de_len = ext4_rec_len_from_disk(de->rec_len, inline_size); + trace_printk("de: off %u rlen %u name %*.s nlen %u ino %u\n", + offset, de_len, de->name_len, de->name, + de->name_len, le32_to_cpu(de->inode)); + if (ext4_check_dir_entry(dir, NULL, de, bh, + inline_start, inline_size, offset)) + BUG(); + + offset += de_len; + de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); + } +} +#else +#define ext4_show_inline_dir(dir, bh, inline_start, inline_size) +#endif + +/* + * Add a new entry into a inline dir. + * It will return -ENOSPC if no space is available, and -EIO + * and -EEXIST if directory entry already exists. + */ +static int ext4_add_dirent_to_inline(handle_t *handle, + struct dentry *dentry, + struct inode *inode, + struct ext4_iloc *iloc, + void *inline_start, int inline_size) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned short reclen; + int err; + struct ext4_dir_entry_2 *de; + + reclen = EXT4_DIR_REC_LEN(namelen); + err = ext4_find_dest_de(dir, inode, iloc->bh, + inline_start, inline_size, + name, namelen, &de); + if (err) + return err; + + err = ext4_journal_get_write_access(handle, iloc->bh); + if (err) + return err; + ext4_insert_dentry(inode, de, inline_size, name, namelen); + + ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); + + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend + * on this. + * + * XXX similarly, too many callers depend on + * ext4_new_inode() setting the times, but error + * recovery deletes the inode, so the worst that can + * happen is that the times are slightly out of date + * and/or different from the directory change time. 
+ */ + dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + ext4_update_dx_flag(dir); + dir->i_version++; + ext4_mark_inode_dirty(handle, dir); + return 1; +} + +static void *ext4_get_inline_xattr_pos(struct inode *inode, + struct ext4_iloc *iloc) +{ + struct ext4_xattr_entry *entry; + struct ext4_xattr_ibody_header *header; + + BUG_ON(!EXT4_I(inode)->i_inline_off); + + header = IHDR(inode, ext4_raw_inode(iloc)); + entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) + + EXT4_I(inode)->i_inline_off); + + return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs); +} + +/* Set the final de to cover the whole block. */ +static void ext4_update_final_de(void *de_buf, int old_size, int new_size) +{ + struct ext4_dir_entry_2 *de, *prev_de; + void *limit; + int de_len; + + de = (struct ext4_dir_entry_2 *)de_buf; + if (old_size) { + limit = de_buf + old_size; + do { + prev_de = de; + de_len = ext4_rec_len_from_disk(de->rec_len, old_size); + de_buf += de_len; + de = (struct ext4_dir_entry_2 *)de_buf; + } while (de_buf < limit); + + prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size - + old_size, new_size); + } else { + /* this is just created, so create an empty entry. 
*/ + de->inode = 0; + de->rec_len = ext4_rec_len_to_disk(new_size, new_size); + } +} + +static int ext4_update_inline_dir(handle_t *handle, struct inode *dir, + struct ext4_iloc *iloc) +{ + int ret; + int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; + int new_size = get_max_inline_xattr_value_size(dir, iloc); + + if (new_size - old_size <= EXT4_DIR_REC_LEN(1)) + return -ENOSPC; + + ret = ext4_update_inline_data(handle, dir, + new_size + EXT4_MIN_INLINE_DATA_SIZE); + if (ret) + return ret; + + ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size, + EXT4_I(dir)->i_inline_size - + EXT4_MIN_INLINE_DATA_SIZE); + dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size; + return 0; +} + +static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, + struct ext4_iloc *iloc, + void *buf, int inline_size) +{ + ext4_create_inline_data(handle, inode, inline_size); + ext4_write_inline_data(inode, iloc, buf, 0, inline_size); + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); +} + +static int ext4_finish_convert_inline_dir(handle_t *handle, + struct inode *inode, + struct buffer_head *dir_block, + void *buf, + int inline_size) +{ + int err, csum_size = 0, header_size = 0; + struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_tail *t; + void *target = dir_block->b_data; + + /* + * First create "." and ".." and then copy the dir information + * back to the block. 
+ */ + de = (struct ext4_dir_entry_2 *)target; + de = ext4_init_dot_dotdot(inode, de, + inode->i_sb->s_blocksize, csum_size, + le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1); + header_size = (void *)de - target; + + memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, + inline_size - EXT4_INLINE_DOTDOT_SIZE); + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + inode->i_size = inode->i_sb->s_blocksize; + i_size_write(inode, inode->i_sb->s_blocksize); + EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; + ext4_update_final_de(dir_block->b_data, + inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size, + inode->i_sb->s_blocksize - csum_size); + + if (csum_size) { + t = EXT4_DIRENT_TAIL(dir_block->b_data, + inode->i_sb->s_blocksize); + initialize_dirent_tail(t, inode->i_sb->s_blocksize); + } + set_buffer_uptodate(dir_block); + err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); + if (err) + goto out; + set_buffer_verified(dir_block); +out: + return err; +} + +static int ext4_convert_inline_data_nolock(handle_t *handle, + struct inode *inode, + struct ext4_iloc *iloc) +{ + int error; + void *buf = NULL; + struct buffer_head *data_bh = NULL; + struct ext4_map_blocks map; + int inline_size; + + inline_size = ext4_get_inline_size(inode); + buf = kmalloc(inline_size, GFP_NOFS); + if (!buf) { + error = -ENOMEM; + goto out; + } + + error = ext4_read_inline_data(inode, buf, inline_size, iloc); + if (error < 0) + goto out; + + error = ext4_destroy_inline_data_nolock(handle, inode); + if (error) + goto out; + + map.m_lblk = 0; + map.m_len = 1; + map.m_flags = 0; + error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE); + if (error < 0) + goto out_restore; + if (!(map.m_flags & EXT4_MAP_MAPPED)) { + error = -EIO; + goto out_restore; + } + + data_bh = sb_getblk(inode->i_sb, map.m_pblk); + if (!data_bh) { + error = -EIO; + goto out_restore; + } + + 
lock_buffer(data_bh); + error = ext4_journal_get_create_access(handle, data_bh); + if (error) { + unlock_buffer(data_bh); + error = -EIO; + goto out_restore; + } + memset(data_bh->b_data, 0, inode->i_sb->s_blocksize); + + if (!S_ISDIR(inode->i_mode)) { + memcpy(data_bh->b_data, buf, inline_size); + set_buffer_uptodate(data_bh); + error = ext4_handle_dirty_metadata(handle, + inode, data_bh); + } else { + error = ext4_finish_convert_inline_dir(handle, inode, data_bh, + buf, inline_size); + } + + unlock_buffer(data_bh); +out_restore: + if (error) + ext4_restore_inline_data(handle, inode, iloc, buf, inline_size); + +out: + brelse(data_bh); + kfree(buf); + return error; +} + +/* + * Try to add the new entry to the inline data. + * If succeeds, return 0. If not, extended the inline dir and copied data to + * the new created block. + */ +int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + int ret, inline_size; + void *inline_start; + struct ext4_iloc iloc; + struct inode *dir = dentry->d_parent->d_inode; + + ret = ext4_get_inode_loc(dir, &iloc); + if (ret) + return ret; + + down_write(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) + goto out; + + inline_start = (void *)ext4_raw_inode(&iloc)->i_block + + EXT4_INLINE_DOTDOT_SIZE; + inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; + + ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, + inline_start, inline_size); + if (ret != -ENOSPC) + goto out; + + /* check whether it can be inserted to inline xattr space. 
*/ + inline_size = EXT4_I(dir)->i_inline_size - + EXT4_MIN_INLINE_DATA_SIZE; + if (!inline_size) { + /* Try to use the xattr space.*/ + ret = ext4_update_inline_dir(handle, dir, &iloc); + if (ret && ret != -ENOSPC) + goto out; + + inline_size = EXT4_I(dir)->i_inline_size - + EXT4_MIN_INLINE_DATA_SIZE; + } + + if (inline_size) { + inline_start = ext4_get_inline_xattr_pos(dir, &iloc); + + ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, + inline_start, inline_size); + + if (ret != -ENOSPC) + goto out; + } + + /* + * The inline space is filled up, so create a new block for it. + * As the extent tree will be created, we have to save the inline + * dir first. + */ + ret = ext4_convert_inline_data_nolock(handle, dir, &iloc); + +out: + ext4_mark_inode_dirty(handle, dir); + up_write(&EXT4_I(dir)->xattr_sem); + brelse(iloc.bh); + return ret; +} + +/* + * Try to create the inline data for the new dir. + * If it succeeds, return 0, otherwise return the error. + * In case of ENOSPC, the caller should create the normal disk layout dir. + */ +int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, + struct inode *inode) +{ + int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE; + struct ext4_iloc iloc; + struct ext4_dir_entry_2 *de; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + ret = ext4_prepare_inline_data(handle, inode, inline_size); + if (ret) + goto out; + + /* + * For inline dir, we only save the inode information for the ".." + * and create a fake dentry to cover the left space. 
+ */ + de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; + de->inode = cpu_to_le32(parent->i_ino); + de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE); + de->inode = 0; + de->rec_len = ext4_rec_len_to_disk( + inline_size - EXT4_INLINE_DOTDOT_SIZE, + inline_size); + set_nlink(inode, 2); + inode->i_size = EXT4_I(inode)->i_disksize = inline_size; +out: + brelse(iloc.bh); + return ret; +} + int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { int ret; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index bb9259d20b5..3cde36bd802 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -202,13 +202,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); /* checksumming functions */ -#define EXT4_DIRENT_TAIL(block, blocksize) \ - ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ - ((blocksize) - \ - sizeof(struct ext4_dir_entry_tail)))) - -static void initialize_dirent_tail(struct ext4_dir_entry_tail *t, - unsigned int blocksize) +void initialize_dirent_tail(struct ext4_dir_entry_tail *t, + unsigned int blocksize) { memset(t, 0, sizeof(struct ext4_dir_entry_tail)); t->det_rec_len = ext4_rec_len_to_disk( @@ -307,9 +302,9 @@ static void ext4_dirent_csum_set(struct inode *inode, (void *)t - (void *)dirent); } -static inline int ext4_handle_dirty_dirent_node(handle_t *handle, - struct inode *inode, - struct buffer_head *bh) +int ext4_handle_dirty_dirent_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) { ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); return ext4_handle_dirty_metadata(handle, inode, bh); @@ -1878,6 +1873,17 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, blocksize = sb->s_blocksize; if (!dentry->d_name.len) return -EINVAL; + + if (ext4_has_inline_data(dir)) { + retval = ext4_try_add_inline_entry(handle, dentry, inode); + if (retval < 0) + return retval; + if (retval == 1) { + retval = 0; + return retval; + 
} + } + if (is_dx(dir)) { retval = ext4_dx_add_entry(handle, dentry, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) @@ -2301,6 +2307,14 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) csum_size = sizeof(struct ext4_dir_entry_tail); + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + err = ext4_try_create_inline_dir(handle, dir, inode); + if (err < 0 && err != -ENOSPC) + goto out; + if (!err) + goto out; + } + inode->i_size = EXT4_I(inode)->i_disksize = blocksize; dir_block = ext4_bread(handle, inode, 0, 1, &err); if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 37e66f86764..397ef4bbaf1 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -163,6 +163,11 @@ extern int ext4_da_write_inline_data_begin(struct address_space *mapping, extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page); +extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); +extern int ext4_try_create_inline_dir(handle_t *handle, + struct inode *parent, + struct inode *inode); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -327,6 +332,20 @@ static inline int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, { return 0; } + +static inline int ext4_try_add_inline_entry(handle_t *handle, + struct dentry *dentry, + struct inode *inode) +{ + return 0; +} + +static inline int ext4_try_create_inline_dir(handle_t *handle, + struct inode *parent, + struct inode *inode) +{ + return 0; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.3 From 65d165d9366dbf783d0102177006d47c8859ba31 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:59 -0500 Subject: ext4: let ext4_readdir handle inline data For "." and "..", we just call filldir by ourselves instead of iterating the real dir entry. 
Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/dir.c | 25 +++++----- fs/ext4/ext4.h | 12 +++++ fs/ext4/inline.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/xattr.h | 9 ++++ 4 files changed, 169 insertions(+), 13 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 7c9d08b0f2f..b8d877f6c1f 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -27,23 +27,11 @@ #include #include #include "ext4.h" - -static unsigned char ext4_filetype_table[] = { - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK -}; +#include "xattr.h" static int ext4_dx_readdir(struct file *filp, void *dirent, filldir_t filldir); -static unsigned char get_dtype(struct super_block *sb, int filetype) -{ - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || - (filetype >= EXT4_FT_MAX)) - return DT_UNKNOWN; - - return (ext4_filetype_table[filetype]); -} - /** * Check if the given dir-inode refers to an htree-indexed directory * (or a directory which chould potentially get coverted to use htree @@ -68,6 +56,9 @@ static int is_dx_dir(struct inode *inode) * Return 0 if the directory entry is OK, and 1 if there is a problem * * Note: this is the opposite of what ext2 and ext3 historically returned... + * + * bh passed here can be an inode block or a dir data block, depending + * on the inode inline data flag. 
*/ int __ext4_check_dir_entry(const char *function, unsigned int line, struct inode *dir, struct file *filp, @@ -124,6 +115,14 @@ static int ext4_readdir(struct file *filp, int ret = 0; int dir_has_error = 0; + if (ext4_has_inline_data(inode)) { + int has_inline_data = 1; + ret = ext4_read_inline_dir(filp, dirent, filldir, + &has_inline_data); + if (has_inline_data) + return ret; + } + if (is_dx_dir(inode)) { err = ext4_dx_readdir(filp, dirent, filldir); if (err != ERR_BAD_DX_DIR) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 689ce1d696b..e3a74658c63 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1989,6 +1989,18 @@ static inline void ext4_update_dx_flag(struct inode *inode) EXT4_FEATURE_COMPAT_DIR_INDEX)) ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); } +static unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static inline unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || + (filetype >= EXT4_FT_MAX)) + return DT_UNKNOWN; + + return ext4_filetype_table[filetype]; +} /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index bf732281873..471504133c7 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1288,6 +1288,142 @@ out: return ret; } +int ext4_read_inline_dir(struct file *filp, + void *dirent, filldir_t filldir, + int *has_inline_data) +{ + int error = 0; + unsigned int offset, parent_ino; + int i, stored; + struct ext4_dir_entry_2 *de; + struct super_block *sb; + struct inode *inode = filp->f_path.dentry->d_inode; + int ret, inline_size = 0; + struct ext4_iloc iloc; + void *dir_buf = NULL; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + up_read(&EXT4_I(inode)->xattr_sem); + *has_inline_data = 0; + goto out; + } + + 
inline_size = ext4_get_inline_size(inode); + dir_buf = kmalloc(inline_size, GFP_NOFS); + if (!dir_buf) { + ret = -ENOMEM; + up_read(&EXT4_I(inode)->xattr_sem); + goto out; + } + + ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); + up_read(&EXT4_I(inode)->xattr_sem); + if (ret < 0) + goto out; + + sb = inode->i_sb; + stored = 0; + parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); + + while (!error && !stored && filp->f_pos < inode->i_size) { +revalidate: + /* + * If the version has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the inline + * dir to make sure. + */ + if (filp->f_version != inode->i_version) { + for (i = 0; + i < inode->i_size && i < offset;) { + if (!i) { + /* skip "." and ".." if needed. */ + i += EXT4_INLINE_DOTDOT_SIZE; + continue; + } + de = (struct ext4_dir_entry_2 *) + (dir_buf + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. 
*/ + if (ext4_rec_len_from_disk(de->rec_len, + inline_size) < EXT4_DIR_REC_LEN(1)) + break; + i += ext4_rec_len_from_disk(de->rec_len, + inline_size); + } + offset = i; + filp->f_pos = offset; + filp->f_version = inode->i_version; + } + + while (!error && filp->f_pos < inode->i_size) { + if (filp->f_pos == 0) { + error = filldir(dirent, ".", 1, 0, inode->i_ino, + DT_DIR); + if (error) + break; + stored++; + + error = filldir(dirent, "..", 2, 0, parent_ino, + DT_DIR); + if (error) + break; + stored++; + + filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE; + continue; + } + + de = (struct ext4_dir_entry_2 *)(dir_buf + offset); + if (ext4_check_dir_entry(inode, filp, de, + iloc.bh, dir_buf, + inline_size, offset)) { + ret = stored; + goto out; + } + offset += ext4_rec_len_from_disk(de->rec_len, + inline_size); + if (le32_to_cpu(de->inode)) { + /* We might block in the next section + * if the data destination is + * currently swapped out. So, use a + * version stamp to detect whether or + * not the directory has been modified + * during the copy operation. + */ + u64 version = filp->f_version; + + error = filldir(dirent, de->name, + de->name_len, + filp->f_pos, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type)); + if (error) + break; + if (version != filp->f_version) + goto revalidate; + stored++; + } + filp->f_pos += ext4_rec_len_from_disk(de->rec_len, + inline_size); + } + offset = 0; + } +out: + kfree(dir_buf); + brelse(iloc.bh); + return ret; +} + /* * Try to create the inline data for the new dir. * If it succeeds, return 0, otherwise return the error. 
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 397ef4bbaf1..539e6a08c95 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -168,6 +168,9 @@ extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, extern int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, struct inode *inode); +extern int ext4_read_inline_dir(struct file *filp, + void *dirent, filldir_t filldir, + int *has_inline_data); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -346,6 +349,12 @@ static inline int ext4_try_create_inline_dir(handle_t *handle, { return 0; } +static inline int ext4_read_inline_dir(struct file *filp, + void *dirent, filldir_t filldir, + int *has_inline_data) +{ + return 0; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.3 From 7335cd3b41b1e704608ca46159641ca9cb598121 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:59 -0500 Subject: ext4: create a new function search_dir search_dirblock is used to search a dir block, but the code is almost the same for searching an inline dir. So create a new function search_dir and let search_dirblock call it.
Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 7 +++++++ fs/ext4/namei.c | 26 +++++++++++++++++++------- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e3a74658c63..a971b65bf5c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2122,6 +2122,13 @@ extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); +extern int search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir); /* resize.c */ extern int ext4_group_add(struct super_block *sb, diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 3cde36bd802..d50684b9149 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1005,6 +1005,16 @@ errout: return (err); } +static inline int search_dirblock(struct buffer_head *bh, + struct inode *dir, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir) +{ + return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, + d_name, offset, res_dir); +} + /* * Directory block splitting, compacting @@ -1098,11 +1108,13 @@ static inline int ext4_match (int len, const char * const name, /* * Returns 0 if not found, -1 on failure, and 1 on success */ -static inline int search_dirblock(struct buffer_head *bh, - struct inode *dir, - const struct qstr *d_name, - unsigned int offset, - struct ext4_dir_entry_2 ** res_dir) +int search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir) { struct ext4_dir_entry_2 * de; char * dlimit; @@ -1110,8 +1122,8 @@ static inline int search_dirblock(struct buffer_head *bh, const char *name = d_name->name; int namelen = 
d_name->len; - de = (struct ext4_dir_entry_2 *) bh->b_data; - dlimit = bh->b_data + dir->i_sb->s_blocksize; + de = (struct ext4_dir_entry_2 *)search_buf; + dlimit = search_buf + buf_size; while ((char *) de < dlimit) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ -- cgit v1.2.3 From e8e948e7802a2ab05c146d3e72a39b93b5718236 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:06:00 -0500 Subject: ext4: let ext4_find_entry handle inline data Create a new function ext4_find_inline_entry() to handle the case of inline data. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/inline.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/namei.c | 10 +++++++++- fs/ext4/xattr.h | 13 +++++++++++++ 3 files changed, 70 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 471504133c7..0a8f5a86549 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1462,6 +1462,54 @@ out: return ret; } +struct buffer_head *ext4_find_inline_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data) +{ + int ret; + struct ext4_iloc iloc; + void *inline_start; + int inline_size; + + if (ext4_get_inode_loc(dir, &iloc)) + return NULL; + + down_read(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) { + *has_inline_data = 0; + goto out; + } + + inline_start = (void *)ext4_raw_inode(&iloc)->i_block + + EXT4_INLINE_DOTDOT_SIZE; + inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; + ret = search_dir(iloc.bh, inline_start, inline_size, + dir, d_name, 0, res_dir); + if (ret == 1) + goto out_find; + if (ret < 0) + goto out; + + if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE) + goto out; + + inline_start = ext4_get_inline_xattr_pos(dir, &iloc); + inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; + + ret = search_dir(iloc.bh, inline_start, inline_size, + dir, d_name, 0, res_dir); + if (ret 
== 1) + goto out_find; + +out: + brelse(iloc.bh); + iloc.bh = NULL; +out_find: + up_read(&EXT4_I(dir)->xattr_sem); + return iloc.bh; +} + int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { int ret; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index d50684b9149..b498cafed12 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1015,7 +1015,6 @@ static inline int search_dirblock(struct buffer_head *bh, d_name, offset, res_dir); } - /* * Directory block splitting, compacting */ @@ -1198,6 +1197,15 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, namelen = d_name->len; if (namelen > EXT4_NAME_LEN) return NULL; + + if (ext4_has_inline_data(dir)) { + int has_inline_data = 1; + ret = ext4_find_inline_entry(dir, d_name, res_dir, + &has_inline_data); + if (has_inline_data) + return ret; + } + if ((namelen <= 2) && (name[0] == '.') && (name[1] == '.' || name[1] == '\0')) { /* diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 539e6a08c95..c6f3dea88d6 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -171,6 +171,10 @@ extern int ext4_try_create_inline_dir(handle_t *handle, extern int ext4_read_inline_dir(struct file *filp, void *dirent, filldir_t filldir, int *has_inline_data); +extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -355,6 +359,15 @@ static inline int ext4_read_inline_dir(struct file *filp, { return 0; } + +static inline struct buffer_head * +ext4_find_inline_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data) +{ + return NULL; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.3 From 05019a9e7f025133f20c67677c9c8551eca3c6dc Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:06:00 -0500 Subject: ext4: make ext4_delete_entry generic Currently 
ext4_delete_entry() is used only for dir entry removing from a dir block. So let us create a new function ext4_generic_delete_entry and this function takes a entry_buf and a buf_size so that it can be used for inline data. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 7 ++++++ fs/ext4/namei.c | 72 ++++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 53 insertions(+), 26 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a971b65bf5c..6cfe546282d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2129,6 +2129,13 @@ extern int search_dir(struct buffer_head *bh, const struct qstr *d_name, unsigned int offset, struct ext4_dir_entry_2 **res_dir); +extern int ext4_generic_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + void *entry_buf, + int buf_size, + int csum_size); /* resize.c */ extern int ext4_group_add(struct super_block *sb, diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b498cafed12..c10fc2631ff 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2109,37 +2109,29 @@ cleanup: } /* - * ext4_delete_entry deletes a directory entry by merging it with the - * previous entry + * ext4_generic_delete_entry deletes a directory entry by merging it + * with the previous entry */ -static int ext4_delete_entry(handle_t *handle, - struct inode *dir, - struct ext4_dir_entry_2 *de_del, - struct buffer_head *bh) +int ext4_generic_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + void *entry_buf, + int buf_size, + int csum_size) { struct ext4_dir_entry_2 *de, *pde; unsigned int blocksize = dir->i_sb->s_blocksize; - int csum_size = 0; - int i, err; - - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) - csum_size = sizeof(struct ext4_dir_entry_tail); + int i; i = 0; pde = NULL; - de = (struct ext4_dir_entry_2 *) bh->b_data; - while (i < bh->b_size - 
csum_size) { + de = (struct ext4_dir_entry_2 *)entry_buf; + while (i < buf_size - csum_size) { if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, bh->b_size, i)) return -EIO; if (de == de_del) { - BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); - if (unlikely(err)) { - ext4_std_error(dir->i_sb, err); - return err; - } if (pde) pde->rec_len = ext4_rec_len_to_disk( ext4_rec_len_from_disk(pde->rec_len, @@ -2150,12 +2142,6 @@ static int ext4_delete_entry(handle_t *handle, else de->inode = 0; dir->i_version++; - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_dirent_node(handle, dir, bh); - if (unlikely(err)) { - ext4_std_error(dir->i_sb, err); - return err; - } return 0; } i += ext4_rec_len_from_disk(de->rec_len, blocksize); @@ -2165,6 +2151,40 @@ static int ext4_delete_entry(handle_t *handle, return -ENOENT; } +static int ext4_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh) +{ + int err, csum_size = 0; + + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) + goto out; + + err = ext4_generic_delete_entry(handle, dir, de_del, + bh, bh->b_data, + dir->i_sb->s_blocksize, csum_size); + if (err) + goto out; + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_dirent_node(handle, dir, bh); + if (unlikely(err)) + goto out; + + return 0; +out: + if (err != -ENOENT) + ext4_std_error(dir->i_sb, err); + return err; +} + /* * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2, * since this indicates that nlinks count was previously 1. 
-- cgit v1.2.3 From 9f40fe54635b7533f51993d0f5e7f014fc14d33a Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:06:00 -0500 Subject: ext4: let ext4_delete_entry() handle inline data Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/inline.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/namei.c | 8 ++++++++ fs/ext4/xattr.h | 13 +++++++++++++ 3 files changed, 76 insertions(+) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 0a8f5a86549..f5e9c0e6d73 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1510,6 +1510,61 @@ out_find: return iloc.bh; } +int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data) +{ + int err, inline_size; + struct ext4_iloc iloc; + void *inline_start; + + err = ext4_get_inode_loc(dir, &iloc); + if (err) + return err; + + down_write(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) { + *has_inline_data = 0; + goto out; + } + + if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) < + EXT4_MIN_INLINE_DATA_SIZE) { + inline_start = (void *)ext4_raw_inode(&iloc)->i_block + + EXT4_INLINE_DOTDOT_SIZE; + inline_size = EXT4_MIN_INLINE_DATA_SIZE - + EXT4_INLINE_DOTDOT_SIZE; + } else { + inline_start = ext4_get_inline_xattr_pos(dir, &iloc); + inline_size = ext4_get_inline_size(dir) - + EXT4_MIN_INLINE_DATA_SIZE; + } + + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto out; + + err = ext4_generic_delete_entry(handle, dir, de_del, bh, + inline_start, inline_size, 0); + if (err) + goto out; + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_mark_inode_dirty(handle, dir); + if (unlikely(err)) + goto out; + + ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size); +out: + up_write(&EXT4_I(dir)->xattr_sem); + brelse(iloc.bh); + if (err != -ENOENT) + ext4_std_error(dir->i_sb, err); + return err; +} + int ext4_destroy_inline_data(handle_t 
*handle, struct inode *inode) { int ret; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index c10fc2631ff..a32228a73df 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2158,6 +2158,14 @@ static int ext4_delete_entry(handle_t *handle, { int err, csum_size = 0; + if (ext4_has_inline_data(dir)) { + int has_inline_data = 1; + err = ext4_delete_inline_entry(handle, dir, de_del, bh, + &has_inline_data); + if (has_inline_data) + return err; + } + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) csum_size = sizeof(struct ext4_dir_entry_tail); diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index c6f3dea88d6..f86e424d75e 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -175,6 +175,11 @@ extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *has_inline_data); +extern int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -368,6 +373,14 @@ ext4_find_inline_entry(struct inode *dir, { return NULL; } +static inline int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data) +{ + return 0; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.3 From 61f86638d8a656101bb0f9c41c55d9685f8a2357 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:06:01 -0500 Subject: ext4: let empty_dir handle inline dir empty_dir is used when deleting a dir. So it should handle inline dir properly. 
Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/inline.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/namei.c | 8 +++++ fs/ext4/xattr.h | 6 ++++ 3 files changed, 104 insertions(+) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index f5e9c0e6d73..e5da458faba 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1565,6 +1565,96 @@ out: return err; } +/* + * Get the inline dentry at offset. + */ +static inline struct ext4_dir_entry_2 * +ext4_get_inline_entry(struct inode *inode, + struct ext4_iloc *iloc, + unsigned int offset, + void **inline_start, + int *inline_size) +{ + void *inline_pos; + + BUG_ON(offset > ext4_get_inline_size(inode)); + + if (offset < EXT4_MIN_INLINE_DATA_SIZE) { + inline_pos = (void *)ext4_raw_inode(iloc)->i_block; + *inline_size = EXT4_MIN_INLINE_DATA_SIZE; + } else { + inline_pos = ext4_get_inline_xattr_pos(inode, iloc); + offset -= EXT4_MIN_INLINE_DATA_SIZE; + *inline_size = ext4_get_inline_size(inode) - + EXT4_MIN_INLINE_DATA_SIZE; + } + + if (inline_start) + *inline_start = inline_pos; + return (struct ext4_dir_entry_2 *)(inline_pos + offset); +} + +int empty_inline_dir(struct inode *dir, int *has_inline_data) +{ + int err, inline_size; + struct ext4_iloc iloc; + void *inline_pos; + unsigned int offset; + struct ext4_dir_entry_2 *de; + int ret = 1; + + err = ext4_get_inode_loc(dir, &iloc); + if (err) { + EXT4_ERROR_INODE(dir, "error %d getting inode %lu block", + err, dir->i_ino); + return 1; + } + + down_read(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) { + *has_inline_data = 0; + goto out; + } + + de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; + if (!le32_to_cpu(de->inode)) { + ext4_warning(dir->i_sb, + "bad inline directory (dir #%lu) - no `..'", + dir->i_ino); + ret = 1; + goto out; + } + + offset = EXT4_INLINE_DOTDOT_SIZE; + while (offset < dir->i_size) { + de = ext4_get_inline_entry(dir, &iloc, offset, + &inline_pos, &inline_size); + if 
(ext4_check_dir_entry(dir, NULL, de, + iloc.bh, inline_pos, + inline_size, offset)) { + ext4_warning(dir->i_sb, + "bad inline directory (dir #%lu) - " + "inode %u, rec_len %u, name_len %d" + "inline size %d\n", + dir->i_ino, le32_to_cpu(de->inode), + le16_to_cpu(de->rec_len), de->name_len, + inline_size); + ret = 1; + goto out; + } + if (le32_to_cpu(de->inode)) { + ret = 0; + goto out; + } + offset += ext4_rec_len_from_disk(de->rec_len, inline_size); + } + +out: + up_read(&EXT4_I(dir)->xattr_sem); + brelse(iloc.bh); + return ret; +} + int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { int ret; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a32228a73df..e3e20d0aa29 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2464,6 +2464,14 @@ static int empty_dir(struct inode *inode) struct super_block *sb; int err = 0; + if (ext4_has_inline_data(inode)) { + int has_inline_data = 1; + + err = empty_inline_dir(inode, &has_inline_data); + if (has_inline_data) + return err; + } + sb = inode->i_sb; if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index f86e424d75e..7747bbcebb3 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -180,6 +180,7 @@ extern int ext4_delete_inline_entry(handle_t *handle, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, int *has_inline_data); +extern int empty_inline_dir(struct inode *dir, int *has_inline_data); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -381,6 +382,11 @@ static inline int ext4_delete_inline_entry(handle_t *handle, { return 0; } + +static inline int empty_inline_dir(struct inode *dir, int *has_inline_data) +{ + return 0; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.3 From 32f7f22c0b52e8189fef83986b16dc7abe95f2c4 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:06:01 -0500 Subject: ext4: let ext4_rename handle inline dir In case 
we rename a directory, ext4_rename has to read the dir block and change its dotdot's information. The old ext4_rename encapsulated the dir_block read into itself. So this patch adds a new function ext4_get_first_dir_block() which gets the dir buffer information so the ext4_rename can handle it properly. As it will also change the parent inode number, we return the parent_de so that ext4_rename() can handle it more easily. ext4_find_entry is also changed so that the caller(rename) can tell whether the found entry is an inlined one or not and journaling the corresponding buffer head. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/inline.c | 15 ++++++++ fs/ext4/namei.c | 109 +++++++++++++++++++++++++++++++++++++------------------ fs/ext4/xattr.h | 11 ++++++ 3 files changed, 100 insertions(+), 35 deletions(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index e5da458faba..fc362998092 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1424,6 +1424,21 @@ out: return ret; } +struct buffer_head *ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval) +{ + struct ext4_iloc iloc; + + *retval = ext4_get_inode_loc(inode, &iloc); + if (*retval) + return NULL; + + *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; + + return iloc.bh; +} + /* * Try to create the inline data for the new dir. * If it succeeds, return 0, otherwise return the error. 
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index e3e20d0aa29..b37c2183983 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1176,7 +1176,8 @@ static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, */ static struct buffer_head * ext4_find_entry (struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 ** res_dir) + struct ext4_dir_entry_2 **res_dir, + int *inlined) { struct super_block *sb; struct buffer_head *bh_use[NAMEI_RA_SIZE]; @@ -1202,8 +1203,11 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, int has_inline_data = 1; ret = ext4_find_inline_entry(dir, d_name, res_dir, &has_inline_data); - if (has_inline_data) + if (has_inline_data) { + if (inlined) + *inlined = 1; return ret; + } } if ((namelen <= 2) && (name[0] == '.') && @@ -1390,7 +1394,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi if (dentry->d_name.len > EXT4_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - bh = ext4_find_entry(dir, &dentry->d_name, &de); + bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); inode = NULL; if (bh) { __u32 ino = le32_to_cpu(de->inode); @@ -1424,7 +1428,7 @@ struct dentry *ext4_get_parent(struct dentry *child) struct ext4_dir_entry_2 * de; struct buffer_head *bh; - bh = ext4_find_entry(child->d_inode, &dotdot, &de); + bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); @@ -2725,7 +2729,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) return PTR_ERR(handle); retval = -ENOENT; - bh = ext4_find_entry(dir, &dentry->d_name, &de); + bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (!bh) goto end_rmdir; @@ -2790,7 +2794,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) ext4_handle_sync(handle); retval = -ENOENT; - bh = ext4_find_entry(dir, &dentry->d_name, &de); + bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (!bh) goto end_unlink; @@ -2972,8 +2976,39 @@ retry: 
return err; } -#define PARENT_INO(buffer, size) \ - (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode) + +/* + * Try to find buffer head where contains the parent block. + * It should be the inode block if it is inlined or the 1st block + * if it is a normal dir. + */ +static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, + struct inode *inode, + int *retval, + struct ext4_dir_entry_2 **parent_de, + int *inlined) +{ + struct buffer_head *bh; + + if (!ext4_has_inline_data(inode)) { + if (!(bh = ext4_bread(handle, inode, 0, 0, retval))) { + if (!*retval) { + *retval = -EIO; + ext4_error(inode->i_sb, + "Directory hole detected on inode %lu\n", + inode->i_ino); + } + return NULL; + } + *parent_de = ext4_next_entry( + (struct ext4_dir_entry_2 *)bh->b_data, + inode->i_sb->s_blocksize); + return bh; + } + + *inlined = 1; + return ext4_get_first_inline_block(inode, parent_de, retval); +} /* * Anybody can rename anything with this: the permission checks are left to the @@ -2987,6 +3022,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct buffer_head *old_bh, *new_bh, *dir_bh; struct ext4_dir_entry_2 *old_de, *new_de; int retval, force_da_alloc = 0; + int inlined = 0, new_inlined = 0; + struct ext4_dir_entry_2 *parent_de; dquot_initialize(old_dir); dquot_initialize(new_dir); @@ -3006,7 +3043,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) ext4_handle_sync(handle); - old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); + old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL); /* * Check for inode number is _not_ due to possible IO errors. 
* We might rmdir the source, keep it as pwd of some process @@ -3019,7 +3056,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; new_inode = new_dentry->d_inode; - new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de); + new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, + &new_de, &new_inlined); if (new_bh) { if (!new_inode) { brelse(new_bh); @@ -3033,22 +3071,17 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; } retval = -EIO; - if (!(dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval))) { - if (!retval) { - retval = -EIO; - ext4_error(old_inode->i_sb, - "Directory hole detected on inode %lu\n", - old_inode->i_ino); - } + dir_bh = ext4_get_first_dir_block(handle, old_inode, + &retval, &parent_de, + &inlined); + if (!dir_bh) goto end_rename; - } - if (!buffer_verified(dir_bh) && + if (!inlined && !buffer_verified(dir_bh) && !ext4_dirent_csum_verify(old_inode, (struct ext4_dir_entry *)dir_bh->b_data)) goto end_rename; set_buffer_verified(dir_bh); - if (le32_to_cpu(PARENT_INO(dir_bh->b_data, - old_dir->i_sb->s_blocksize)) != old_dir->i_ino) + if (le32_to_cpu(parent_de->inode) != old_dir->i_ino) goto end_rename; retval = -EMLINK; if (!new_inode && new_dir != old_dir && @@ -3077,10 +3110,13 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_current_time(new_dir); ext4_mark_inode_dirty(handle, new_dir); BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); - retval = ext4_handle_dirty_dirent_node(handle, new_dir, new_bh); - if (unlikely(retval)) { - ext4_std_error(new_dir->i_sb, retval); - goto end_rename; + if (!new_inlined) { + retval = ext4_handle_dirty_dirent_node(handle, + new_dir, new_bh); + if (unlikely(retval)) { + ext4_std_error(new_dir->i_sb, retval); + goto end_rename; + } } brelse(new_bh); new_bh = NULL; @@ -3108,7 +3144,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct buffer_head 
*old_bh2; struct ext4_dir_entry_2 *old_de2; - old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2); + old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, + &old_de2, NULL); if (old_bh2) { retval = ext4_delete_entry(handle, old_dir, old_de2, old_bh2); @@ -3128,17 +3165,19 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); ext4_update_dx_flag(old_dir); if (dir_bh) { - PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = - cpu_to_le32(new_dir->i_ino); + parent_de->inode = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); - if (is_dx(old_inode)) { - retval = ext4_handle_dirty_dx_node(handle, - old_inode, - dir_bh); + if (!inlined) { + if (is_dx(old_inode)) { + retval = ext4_handle_dirty_dx_node(handle, + old_inode, + dir_bh); + } else { + retval = ext4_handle_dirty_dirent_node(handle, + old_inode, dir_bh); + } } else { - retval = ext4_handle_dirty_dirent_node(handle, - old_inode, - dir_bh); + retval = ext4_mark_inode_dirty(handle, old_inode); } if (retval) { ext4_std_error(old_dir->i_sb, retval); diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 7747bbcebb3..f6c3ca6dae4 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -181,6 +181,9 @@ extern int ext4_delete_inline_entry(handle_t *handle, struct buffer_head *bh, int *has_inline_data); extern int empty_inline_dir(struct inode *dir, int *has_inline_data); +extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -387,6 +390,14 @@ static inline int empty_inline_dir(struct inode *dir, int *has_inline_data) { return 0; } + +static inline struct buffer_head * +ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval) +{ + return NULL; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY 
-- cgit v1.2.3 From 941919856c11d4dd11d4fcabb4dab58bd2b146bf Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:06:02 -0500 Subject: ext4: let fiemap work with inline data fiemap is used to find the disk layout of a file, as for inline data, let us just pretend like a file with just one extent. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 9 +++++++++ fs/ext4/inline.c | 35 +++++++++++++++++++++++++++++++++++ fs/ext4/xattr.h | 10 ++++++++++ 3 files changed, 54 insertions(+) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f2659f51b23..70dc6fc53a0 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4802,6 +4802,15 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ext4_lblk_t start_blk; int error = 0; + if (ext4_has_inline_data(inode)) { + int has_inline = 1; + + error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline); + + if (has_inline) + return error; + } + /* fallback to generic here if not in extents fmt */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return generic_block_fiemap(inode, fieinfo, start, len, diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index fc362998092..bf5f7780388 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -15,6 +15,7 @@ #include "ext4.h" #include "xattr.h" #include "truncate.h" +#include #define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) @@ -1680,3 +1681,37 @@ int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) return ret; } + +int ext4_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + int *has_inline) +{ + __u64 physical = 0; + __u64 length; + __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST; + int error = 0; + struct ext4_iloc iloc; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + *has_inline = 0; + goto out; + } + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + goto out; 
+ + physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; + physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; + physical += offsetof(struct ext4_inode, i_block); + length = i_size_read(inode); + + if (physical) + error = fiemap_fill_next_extent(fieinfo, 0, physical, + length, flags); + brelse(iloc.bh); +out: + up_read(&EXT4_I(inode)->xattr_sem); + return (error < 0 ? error : 0); +} diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index f6c3ca6dae4..5c7e55edfe6 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -184,6 +184,9 @@ extern int empty_inline_dir(struct inode *dir, int *has_inline_data); extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, struct ext4_dir_entry_2 **parent_de, int *retval); +extern int ext4_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + int *has_inline); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -398,6 +401,13 @@ ext4_get_first_inline_block(struct inode *inode, { return NULL; } + +static inline int ext4_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + int *has_inline) +{ + return 0; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.3 From 0d812f77b36c16dff692390508155de2c7f95ea3 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:06:02 -0500 Subject: ext4: evict inline data out if we need to store xattr in inode Now that we store data in the inode, in case we need to store some xattrs and inode doesn't have enough space, Andreas suggested that we should keep the xattr(metadata) in and data should be pushed out. So this patch does the work.
Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/inline.c | 48 +++++++++++++++++++++++++++++++++++++++++++----- fs/ext4/xattr.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++---- fs/ext4/xattr.h | 9 ++++++--- 3 files changed, 99 insertions(+), 12 deletions(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index bf5f7780388..cec651e2646 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -207,8 +207,8 @@ out: /* * write the buffer to the inline inode. * If 'create' is set, we don't need to do the extra copy in the xattr - * value since it is already handled by ext4_xattr_ibody_set. That saves - * us one memcpy. + * value since it is already handled by ext4_xattr_ibody_inline_set. + * That saves us one memcpy. */ void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, void *buffer, loff_t pos, unsigned int len) @@ -285,7 +285,7 @@ static int ext4_create_inline_data(handle_t *handle, BUG_ON(!is.s.not_found); - error = ext4_xattr_ibody_set(handle, inode, &i, &is); + error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); if (error) { if (error == -ENOSPC) ext4_clear_inode_state(inode, @@ -354,7 +354,7 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, i.value = value; i.value_len = len; - error = ext4_xattr_ibody_set(handle, inode, &i, &is); + error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); if (error) goto out; @@ -427,7 +427,7 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, if (error) goto out; - error = ext4_xattr_ibody_set(handle, inode, &i, &is); + error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); if (error) goto out; @@ -1715,3 +1715,41 @@ out: up_read(&EXT4_I(inode)->xattr_sem); return (error < 0 ? error : 0); } + +/* + * Called during xattr set, and if we can sparse space 'needed', + * just create the extent tree evict the data to the outer block. 
+ * + * We use jbd2 instead of page cache to move data to the 1st block + * so that the whole transaction can be committed as a whole and + * the data isn't lost because of the delayed page cache write. + */ +int ext4_try_to_evict_inline_data(handle_t *handle, + struct inode *inode, + int needed) +{ + int error; + struct ext4_xattr_entry *entry; + struct ext4_xattr_ibody_header *header; + struct ext4_inode *raw_inode; + struct ext4_iloc iloc; + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + + raw_inode = ext4_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + entry = (struct ext4_xattr_entry *)((void *)raw_inode + + EXT4_I(inode)->i_inline_off); + if (EXT4_XATTR_LEN(entry->e_name_len) + + EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)) < needed) { + error = -ENOSPC; + goto out; + } + + error = ext4_convert_inline_data_nolock(handle, inode, &iloc); +out: + brelse(iloc.bh); + return error; +} diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index a47dc3883a2..2251769a3c5 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -958,9 +958,47 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, return 0; } -int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, - struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) +int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_search *s = &is->s; + int error; + + if (EXT4_I(inode)->i_extra_isize == 0) + return -ENOSPC; + error = ext4_xattr_set_entry(i, s); + if (error) { + if (error == -ENOSPC && + ext4_has_inline_data(inode)) { + error = ext4_try_to_evict_inline_data(handle, inode, + EXT4_XATTR_LEN(strlen(i->name) + + EXT4_XATTR_SIZE(i->value_len))); + if (error) + return error; + error = ext4_xattr_ibody_find(inode, i, is); + if (error) + return error; + error = ext4_xattr_set_entry(i, s); + } + if (error) + 
return error; + } + header = IHDR(inode, ext4_raw_inode(&is->iloc)); + if (!IS_LAST_ENTRY(s->first)) { + header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); + ext4_set_inode_state(inode, EXT4_STATE_XATTR); + } else { + header->h_magic = cpu_to_le32(0); + ext4_clear_inode_state(inode, EXT4_STATE_XATTR); + } + return 0; +} + +static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_search *s = &is->s; @@ -1116,9 +1154,17 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name, { handle_t *handle; int error, retries = 0; + int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); retry: - handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + /* + * In case of inline data, we may push out the data to a block, + * So reserve the journal space first. + */ + if (ext4_has_inline_data(inode)) + credits += ext4_writepage_trans_blocks(inode) + 1; + + handle = ext4_journal_start(inode, credits); if (IS_ERR(handle)) { error = PTR_ERR(handle); } else { diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 5c7e55edfe6..1be243aab01 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -122,9 +122,9 @@ extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, extern int ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size); -extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, - struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is); +extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is); extern int ext4_has_inline_data(struct inode *inode); extern int ext4_get_inline_size(struct inode *inode); @@ -187,6 +187,9 @@ extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, extern int ext4_inline_data_fiemap(struct 
inode *inode, struct fiemap_extent_info *fieinfo, int *has_inline); +extern int ext4_try_to_evict_inline_data(handle_t *handle, + struct inode *inode, + int needed); # else /* CONFIG_EXT4_FS_XATTR */ static inline int -- cgit v1.2.3 From aef1c8513c1f8ae076e22ea2a57eff5835578e75 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:06:02 -0500 Subject: ext4: let ext4_truncate handle inline data correctly Signed-off-by: Robin Dong Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/inline.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/inode.c | 8 +++++ fs/ext4/xattr.h | 9 ++++++ 3 files changed, 107 insertions(+) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index cec651e2646..727edb8d57e 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1753,3 +1753,93 @@ out: brelse(iloc.bh); return error; } + +void ext4_inline_data_truncate(struct inode *inode, int *has_inline) +{ + handle_t *handle; + int inline_size, value_len, needed_blocks; + size_t i_size; + void *value = NULL; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; + + + needed_blocks = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) + return; + + down_write(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + *has_inline = 0; + ext4_journal_stop(handle); + return; + } + + if (ext4_orphan_add(handle, inode)) + goto out; + + if (ext4_get_inode_loc(inode, &is.iloc)) + goto out; + + down_write(&EXT4_I(inode)->i_data_sem); + i_size = inode->i_size; + inline_size = ext4_get_inline_size(inode); + EXT4_I(inode)->i_disksize = i_size; + + if (i_size < inline_size) { + /* Clear the content in the xattr space. 
*/ + if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) { + if (ext4_xattr_ibody_find(inode, &i, &is)) + goto out_error; + + BUG_ON(is.s.not_found); + + value_len = le32_to_cpu(is.s.here->e_value_size); + value = kmalloc(value_len, GFP_NOFS); + if (!value) + goto out_error; + + if (ext4_xattr_ibody_get(inode, i.name_index, i.name, + value, value_len)) + goto out_error; + + i.value = value; + i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ? + i_size - EXT4_MIN_INLINE_DATA_SIZE : 0; + if (ext4_xattr_ibody_inline_set(handle, inode, &i, &is)) + goto out_error; + } + + /* Clear the content within i_blocks. */ + if (i_size < EXT4_MIN_INLINE_DATA_SIZE) + memset(ext4_raw_inode(&is.iloc)->i_block + i_size, 0, + EXT4_MIN_INLINE_DATA_SIZE - i_size); + + EXT4_I(inode)->i_inline_size = i_size < + EXT4_MIN_INLINE_DATA_SIZE ? + EXT4_MIN_INLINE_DATA_SIZE : i_size; + } + +out_error: + up_write(&EXT4_I(inode)->i_data_sem); +out: + brelse(is.iloc.bh); + up_write(&EXT4_I(inode)->xattr_sem); + kfree(value); + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + + ext4_journal_stop(handle); + return; +} diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f16ae02599c..cb1c1ab2720 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3594,6 +3594,14 @@ void ext4_truncate(struct inode *inode) if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); + if (ext4_has_inline_data(inode)) { + int has_inline = 1; + + ext4_inline_data_truncate(inode, &has_inline); + if (has_inline) + return; + } + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ext4_ext_truncate(inode); else diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 1be243aab01..1a71a97e14a 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -190,6 +190,8 @@ extern int ext4_inline_data_fiemap(struct inode 
*inode, extern int ext4_try_to_evict_inline_data(handle_t *handle, struct inode *inode, int needed); +extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); + # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -411,6 +413,13 @@ static inline int ext4_inline_data_fiemap(struct inode *inode, { return 0; } + +static inline void ext4_inline_data_truncate(struct inode *inode, + int *has_inline) +{ + return; +} + # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.3 From 0c8d414f163f5d35e43a4de7a6e5ee8c253fcccf Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:06:03 -0500 Subject: ext4: let fallocate handle inline data correctly If we are punching hole in a file, we will return ENOTSUPP. As for the fallocation of some extents, we will convert the inline data to a normal extent based file first. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 4 ++++ fs/ext4/inline.c | 39 +++++++++++++++++++++++++++++++++++++++ fs/ext4/xattr.h | 5 +++++ 3 files changed, 48 insertions(+) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 70dc6fc53a0..d45ff3faefc 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4399,6 +4399,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (mode & FALLOC_FL_PUNCH_HOLE) return ext4_punch_hole(file, offset, len); + ret = ext4_convert_inline_data(inode); + if (ret) + return ret; + trace_ext4_fallocate_enter(inode, offset, len, mode); map.m_lblk = offset >> blkbits; /* diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 727edb8d57e..53b2f65091d 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1843,3 +1843,42 @@ out: ext4_journal_stop(handle); return; } + +int ext4_convert_inline_data(struct inode *inode) +{ + int error, needed_blocks; + handle_t *handle; + struct ext4_iloc iloc; + + if (!ext4_has_inline_data(inode)) { + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + return 0; + } + + 
needed_blocks = ext4_writepage_trans_blocks(inode); + + iloc.bh = NULL; + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto out_free; + } + + down_write(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + up_write(&EXT4_I(inode)->xattr_sem); + goto out; + } + + error = ext4_convert_inline_data_nolock(handle, inode, &iloc); + up_write(&EXT4_I(inode)->xattr_sem); +out: + ext4_journal_stop(handle); +out_free: + brelse(iloc.bh); + return error; +} diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 1a71a97e14a..4222388c772 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -192,6 +192,7 @@ extern int ext4_try_to_evict_inline_data(handle_t *handle, int needed); extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); +extern int ext4_convert_inline_data(struct inode *inode); # else /* CONFIG_EXT4_FS_XATTR */ static inline int @@ -420,6 +421,10 @@ static inline void ext4_inline_data_truncate(struct inode *inode, return; } +static inline int ext4_convert_inline_data(struct inode *inode) +{ + return 0; +} # endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY -- cgit v1.2.3 From f08225d176a5736363beea653b9b3fb9400c1255 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:06:03 -0500 Subject: ext4: enable ext4 inline support Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 ++- fs/ext4/ialloc.c | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6cfe546282d..b90e2720b82 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1529,7 +1529,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ - EXT4_FEATURE_INCOMPAT_MMP) + EXT4_FEATURE_INCOMPAT_MMP | \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA) 
#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index c7efa88d714..3f32c801244 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -902,6 +902,10 @@ got: ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; + ei->i_inline_off = 0; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA)) + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + ret = inode; dquot_initialize(inode); err = dquot_alloc_inode(inode); -- cgit v1.2.3 From 64744e03c6871e5e4678478bab1b8c3ba6cca395 Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Mon, 10 Dec 2012 14:06:03 -0500 Subject: ext4: use sync_inode_metadata() when syncing inode metadata We have a dedicated interface to sync inode metadata. Use it to simplify ext4's code some. Signed-off-by: Guo Chao Signed-off-by: "Theodore Ts'o" Reviewed-by: Lukas Czerner --- fs/ext4/fsync.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index be1d89f385b..dfbc1fe9667 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -44,7 +44,6 @@ */ static int ext4_sync_parent(struct inode *inode) { - struct writeback_control wbc; struct dentry *dentry = NULL; struct inode *next; int ret = 0; @@ -66,10 +65,7 @@ static int ext4_sync_parent(struct inode *inode) ret = sync_mapping_buffers(inode->i_mapping); if (ret) break; - memset(&wbc, 0, sizeof(wbc)); - wbc.sync_mode = WB_SYNC_ALL; - wbc.nr_to_write = 0; /* only write out the inode */ - ret = sync_inode(inode, &wbc); + ret = sync_inode_metadata(inode, 1); if (ret) break; } -- cgit v1.2.3 From a789f49c9272e81f4f52487e94820182d0a2d2ff Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Mon, 10 Dec 2012 14:06:04 -0500 Subject: ext4: remove redundant code in ext4_alloc_inode() inode_init_always() will initialize inode->i_data.writeback_index anyway, no need to do this in ext4_alloc_inode(). 
Signed-off-by: Guo Chao Signed-off-by: "Theodore Ts'o" Reviewed-by: Lukas Czerner --- fs/ext4/super.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 856206f255a..c2ea525e85c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -939,7 +939,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) return NULL; ei->vfs_inode.i_version = 1; - ei->vfs_inode.i_data.writeback_index = 0; memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); -- cgit v1.2.3 From 6b280c913ee02a1a41b020a74c41584f2fca582a Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Mon, 10 Dec 2012 14:06:04 -0500 Subject: ext4: remove redundant initialization in ext4_fill_super() We use kzalloc() to allocate sbi, no need to zero its field. Signed-off-by: Guo Chao Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c2ea525e85c..e1e216f8e9b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3798,7 +3798,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); - sbi->s_resize_flags = 0; sb->s_root = NULL; -- cgit v1.2.3 From 187fd030d801b02b0daeb010dbf7c0113be3156d Mon Sep 17 00:00:00 2001 From: Zhi Yong Wu Date: Mon, 10 Dec 2012 14:06:04 -0500 Subject: ext4: remove unused variable from ext4_ext_in_cache() Signed-off-by: "Theodore Ts'o" Signed-off-by: Zhi Yong Wu Reviewed-by: Zheng Liu --- fs/ext4/extents.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d45ff3faefc..26af22832a8 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2194,7 +2194,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, struct ext4_extent *ex) { struct ext4_ext_cache *cex; - struct ext4_sb_info *sbi; int ret = 0; /* @@ -2202,7 
+2201,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); cex = &EXT4_I(inode)->i_cached_extent; - sbi = EXT4_SB(inode->i_sb); /* has cache valid data? */ if (cex->ec_len == 0) -- cgit v1.2.3 From 939da1084458246d2e29dd921c2012c177000e96 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 16:30:43 -0500 Subject: ext4: Remove CONFIG_EXT4_FS_XATTR Ted has sent out an RFC about removing this feature. Eric and Jan confirmed that both RedHat and SUSE enable this feature in all their products. David also said that "As far as I know, it's enabled in all Android kernels that use ext4." So it seems OK for us. And what's more, as inline data depends on xattr for its implementation, and to be frank, I haven't run any tests against inline data enabled while xattr is disabled. So I think we should add inline data and remove this config option in the same release. [ The savings if you disable CONFIG_EXT4_FS_XATTR is only 27k, which isn't much in the grand scheme of things. Since no one seems to be testing this configuration except for some automated compile farms, on balance we are better off removing this config option, so that it is effectively always enabled. 
-- tytso ] Cc: David Brown Cc: Eric Sandeen Reviewed-by: Jan Kara Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 9 +- fs/Kconfig | 4 +- fs/ext4/Kconfig | 15 --- fs/ext4/Makefile | 4 +- fs/ext4/ext4.h | 2 - fs/ext4/file.c | 2 - fs/ext4/namei.c | 4 - fs/ext4/super.c | 9 -- fs/ext4/symlink.c | 4 - fs/ext4/xattr.h | 235 ------------------------------------- 10 files changed, 7 insertions(+), 281 deletions(-) diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 104322bf378..34ea4f1fa6e 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -200,12 +200,9 @@ inode_readahead_blks=n This tuning parameter controls the maximum table readahead algorithm will pre-read into the buffer cache. The default value is 32 blocks. -nouser_xattr Disables Extended User Attributes. If you have extended - attribute support enabled in the kernel configuration - (CONFIG_EXT4_FS_XATTR), extended attribute support - is enabled by default on mount. See the attr(5) manual - page and http://acl.bestbits.at/ for more information - about extended attributes. +nouser_xattr Disables Extended User Attributes. See the + attr(5) manual page and http://acl.bestbits.at/ + for more information about extended attributes. noacl This option disables POSIX Access Control List support. 
If ACL support is enabled in the kernel diff --git a/fs/Kconfig b/fs/Kconfig index f95ae3a027f..eaff24a1950 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -28,8 +28,8 @@ config FS_MBCACHE tristate default y if EXT2_FS=y && EXT2_FS_XATTR default y if EXT3_FS=y && EXT3_FS_XATTR - default y if EXT4_FS=y && EXT4_FS_XATTR - default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR + default y if EXT4_FS=y + default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS source "fs/reiserfs/Kconfig" source "fs/jfs/Kconfig" diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index c22f17021b6..0a475c88185 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -39,22 +39,8 @@ config EXT4_USE_FOR_EXT23 compiled kernel size by using one file system driver for ext2, ext3, and ext4 file systems. -config EXT4_FS_XATTR - bool "Ext4 extended attributes" - depends on EXT4_FS - default y - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - for details). - - If unsure, say N. - - You need this for POSIX ACL support on ext4. - config EXT4_FS_POSIX_ACL bool "Ext4 POSIX Access Control Lists" - depends on EXT4_FS_XATTR select FS_POSIX_ACL help POSIX Access Control Lists (ACLs) support permissions for users and @@ -67,7 +53,6 @@ config EXT4_FS_POSIX_ACL config EXT4_FS_SECURITY bool "Ext4 Security Labels" - depends on EXT4_FS_XATTR help Security labels support alternative access control models implemented by security modules like SELinux. 
This option diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 3d96d569853..0310fec2ee3 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -7,8 +7,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ - mmp.o indirect.o extents_status.o + mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ + xattr_trusted.o inline.o -ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o inline.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b90e2720b82..e20dc38858d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -848,7 +848,6 @@ struct ext4_inode_info { #endif unsigned long i_flags; -#ifdef CONFIG_EXT4_FS_XATTR /* * Extended attributes can be read independently of the main file * data. Taking i_mutex even when reading would cause contention @@ -857,7 +856,6 @@ struct ext4_inode_info { * EAs. 
*/ struct rw_semaphore xattr_sem; -#endif struct list_head i_orphan; /* unlinked but open inodes */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2f5759eb9f8..b64a60bf105 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -656,12 +656,10 @@ const struct file_operations ext4_file_operations = { const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, -#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif .get_acl = ext4_get_acl, .fiemap = ext4_fiemap, }; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b37c2183983..cac44828233 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3228,23 +3228,19 @@ const struct inode_operations ext4_dir_inode_operations = { .mknod = ext4_mknod, .rename = ext4_rename, .setattr = ext4_setattr, -#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif .get_acl = ext4_get_acl, .fiemap = ext4_fiemap, }; const struct inode_operations ext4_special_inode_operations = { .setattr = ext4_setattr, -#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif .get_acl = ext4_get_acl, }; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e1e216f8e9b..7d53adff8bd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -997,9 +997,7 @@ static void init_once(void *foo) struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; INIT_LIST_HEAD(&ei->i_orphan); -#ifdef CONFIG_EXT4_FS_XATTR init_rwsem(&ei->xattr_sem); -#endif init_rwsem(&ei->i_data_sem); inode_init_once(&ei->vfs_inode); } @@ -1449,13 +1447,8 @@ static const struct mount_opts { {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ}, {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ}, {Opt_data_writeback, 
EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ}, -#ifdef CONFIG_EXT4_FS_XATTR {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, -#else - {Opt_user_xattr, 0, MOPT_NOSUPPORT}, - {Opt_nouser_xattr, 0, MOPT_NOSUPPORT}, -#endif #ifdef CONFIG_EXT4_FS_POSIX_ACL {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR}, @@ -3368,9 +3361,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sb, NO_UID32); /* xattr user namespace & acls are now defaulted on */ -#ifdef CONFIG_EXT4_FS_XATTR set_opt(sb, XATTR_USER); -#endif #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index ed9354aff27..ff371193201 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -35,22 +35,18 @@ const struct inode_operations ext4_symlink_inode_operations = { .follow_link = page_follow_link_light, .put_link = page_put_link, .setattr = ext4_setattr, -#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif }; const struct inode_operations ext4_fast_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = ext4_follow_link, .setattr = ext4_setattr, -#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif }; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 4222388c772..7b5513ed3b3 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -92,8 +92,6 @@ struct ext4_xattr_ibody_find { struct ext4_iloc iloc; }; -# ifdef CONFIG_EXT4_FS_XATTR - extern const struct xattr_handler ext4_xattr_user_handler; extern const struct xattr_handler ext4_xattr_trusted_handler; extern const struct xattr_handler ext4_xattr_acl_access_handler; @@ -193,239 +191,6 @@ extern int 
ext4_try_to_evict_inline_data(handle_t *handle, extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); extern int ext4_convert_inline_data(struct inode *inode); -# else /* CONFIG_EXT4_FS_XATTR */ - -static inline int -ext4_xattr_get(struct inode *inode, int name_index, const char *name, - void *buffer, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline int -ext4_xattr_set(struct inode *inode, int name_index, const char *name, - const void *value, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline int -ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, - const char *name, const void *value, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline void -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) -{ -} - -static inline void -ext4_xattr_put_super(struct super_block *sb) -{ -} - -static __init inline int -ext4_init_xattr(void) -{ - return 0; -} - -static inline void -ext4_exit_xattr(void) -{ -} - -static inline int -ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, - struct ext4_inode *raw_inode, handle_t *handle) -{ - return -EOPNOTSUPP; -} - -#define ext4_xattr_handlers NULL - -static inline int -ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) -{ - return -EOPNOTSUPP; -} - -static inline int -ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, - struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) -{ - return -EOPNOTSUPP; -} - -static inline int -ext4_xattr_ibody_get(struct inode *inode, int name_index, - const char *name, - void *buffer, size_t buffer_size) -{ - return -EOPNOTSUPP; -} - -static inline int ext4_find_inline_data_nolock(struct inode *inode) -{ - return 0; -} - -static inline int ext4_has_inline_data(struct inode *inode) -{ - return 0; -} - -static inline int ext4_get_inline_size(struct inode *inode) -{ - return 0; -} - -static inline int 
ext4_get_max_inline_size(struct inode *inode) -{ - return 0; -} - -static inline void ext4_write_inline_data(struct inode *inode, - struct ext4_iloc *iloc, - void *buffer, loff_t pos, - unsigned int len) -{ - return; -} - -static inline int ext4_init_inline_data(handle_t *handle, - struct inode *inode, - unsigned int len) -{ - return 0; -} - -static inline int ext4_destroy_inline_data(handle_t *handle, - struct inode *inode) -{ - return 0; -} - -static inline int ext4_readpage_inline(struct inode *inode, struct page *page) -{ - return 0; -} - -static inline int ext4_try_to_write_inline_data(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - unsigned flags, - struct page **pagep) -{ - return 0; -} - -static inline int ext4_write_inline_data_end(struct inode *inode, - loff_t pos, unsigned len, - unsigned copied, - struct page *page) -{ - return 0; -} - -static inline struct buffer_head * -ext4_journalled_write_inline_data(struct inode *inode, - unsigned len, - struct page *page) -{ - return NULL; -} - -static inline int -ext4_da_write_inline_data_begin(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - unsigned flags, - struct page **pagep, - void **fsdata) -{ - return 0; -} - -static inline int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, - unsigned len, unsigned copied, - struct page *page) -{ - return 0; -} - -static inline int ext4_try_add_inline_entry(handle_t *handle, - struct dentry *dentry, - struct inode *inode) -{ - return 0; -} - -static inline int ext4_try_create_inline_dir(handle_t *handle, - struct inode *parent, - struct inode *inode) -{ - return 0; -} -static inline int ext4_read_inline_dir(struct file *filp, - void *dirent, filldir_t filldir, - int *has_inline_data) -{ - return 0; -} - -static inline struct buffer_head * -ext4_find_inline_entry(struct inode *dir, - const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, - int *has_inline_data) -{ - return 
NULL; -} -static inline int ext4_delete_inline_entry(handle_t *handle, - struct inode *dir, - struct ext4_dir_entry_2 *de_del, - struct buffer_head *bh, - int *has_inline_data) -{ - return 0; -} - -static inline int empty_inline_dir(struct inode *dir, int *has_inline_data) -{ - return 0; -} - -static inline struct buffer_head * -ext4_get_first_inline_block(struct inode *inode, - struct ext4_dir_entry_2 **parent_de, - int *retval) -{ - return NULL; -} - -static inline int ext4_inline_data_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, - int *has_inline) -{ - return 0; -} - -static inline void ext4_inline_data_truncate(struct inode *inode, - int *has_inline) -{ - return; -} - -static inline int ext4_convert_inline_data(struct inode *inode) -{ - return 0; -} -# endif /* CONFIG_EXT4_FS_XATTR */ #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, -- cgit v1.2.3 From 9a4c8019471386c6fb039ae9e30f5216b6b55a9e Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Mon, 10 Dec 2012 16:30:45 -0500 Subject: ext4: ensure Inode flags consistency are checked at build time Flags being used by atomic operations in inode flags (e.g. ext4_test_inode_flag(), should be consistent with that actually stored in inodes, i.e.: EXT4_XXX_FL. It ensures that this consistency is checked at build-time, not at run-time. Currently, the flags consistency are being checked at run-time, but, there is no real reason to not do a build-time check instead of a run-time check. The code is comparing macro defined values with enum type variables, where both are constants, so, there is no problem in comparing constants at build-time. enum variables are treated as constants by the C compiler, according to the C99 specs (see www.open-std.org/jtc1/sc22/wg14/www/docs/n1124.pdf sec. 
6.2.5, item 16), so, there is no real problem in comparing an enumeration type at build time Signed-off-by: Carlos Maiolino Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 29 +++++++++++++---------------- fs/ext4/super.c | 1 + 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e20dc38858d..b79d613091d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -463,25 +463,22 @@ enum { EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ }; -#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) -#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \ - printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \ - EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); } - -/* - * Since it's pretty easy to mix up bit numbers and hex values, and we - * can't do a compile-time test for ENUM values, we use a run-time - * test to make sure that EXT4_XXX_FL is consistent with respect to - * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop - * out so it won't cost any extra space in the compiled kernel image. - * But it's important that these values are the same, since we are - * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL - * must be consistent with the values of FS_XXX_FL defined in - * include/linux/fs.h and the on-disk values found in ext2, ext3, and - * ext4 filesystems, and of course the values defined in e2fsprogs. +/* + * Since it's pretty easy to mix up bit numbers and hex values, we use a + * build-time check to make sure that EXT4_XXX_FL is consistent with respect to + * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost + * any extra space in the compiled kernel image, otherwise, the build will fail. 
+ * It's important that these values are the same, since we are using + * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent + * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk + * values found in ext2, ext3 and ext4 filesystems, and of course the values + * defined in e2fsprogs. * * It's not paranoia if the Murphy's Law really *is* out to get you. :-) */ +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) + static inline void ext4_check_flag_values(void) { CHECK_FLAG_VALUE(SECRM); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7d53adff8bd..3cdb0a2fc64 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5278,6 +5278,7 @@ static int __init ext4_init_fs(void) ext4_li_info = NULL; mutex_init(&ext4_li_mtx); + /* Build-time check for flags consistency */ ext4_check_flag_values(); for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { -- cgit v1.2.3 From bd9926e80330d43f15b710c2935fa41b792d56fd Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 11 Dec 2012 03:31:49 -0500 Subject: ext4: zero out inline data using memset() instead of empty_zero_page Not all architectures (in particular, sparc64) have empty_zero_page. So instead of copying from empty_zero_page, use memset to clear the inline data by signalling to ext4_xattr_set_entry() via a magic pointer value, EXT4_ZERO_XATTR_VALUE, which is defined by casting -1 to a pointer. This fixes a build failure on sparc64, and the memset() should be more efficient than using memcpy() anyway. 
Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/inline.c | 2 +- fs/ext4/xattr.c | 22 ++++++++++++++++------ fs/ext4/xattr.h | 1 + 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 53b2f65091d..387c47c6cda 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -268,7 +268,7 @@ static int ext4_create_inline_data(handle_t *handle, goto out; if (len > EXT4_MIN_INLINE_DATA_SIZE) { - value = (void *)empty_zero_page; + value = EXT4_ZERO_XATTR_VALUE; len -= EXT4_MIN_INLINE_DATA_SIZE; } else { value = ""; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 2251769a3c5..3a91ebc2b66 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -628,9 +628,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) size. Just replace. */ s->here->e_value_size = cpu_to_le32(i->value_len); - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); /* Clear pad bytes. */ - memcpy(val, i->value, i->value_len); + if (i->value == EXT4_ZERO_XATTR_VALUE) { + memset(val, 0, size); + } else { + /* Clear pad bytes first. */ + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); + memcpy(val, i->value, i->value_len); + } return 0; } @@ -669,9 +674,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) size_t size = EXT4_XATTR_SIZE(i->value_len); void *val = s->base + min_offs - size; s->here->e_value_offs = cpu_to_le16(min_offs - size); - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); /* Clear the pad bytes. */ - memcpy(val, i->value, i->value_len); + if (i->value == EXT4_ZERO_XATTR_VALUE) { + memset(val, 0, size); + } else { + /* Clear the pad bytes first. 
*/ + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); + memcpy(val, i->value, i->value_len); + } } } return 0; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 7b5513ed3b3..69eda787a96 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -71,6 +71,7 @@ struct ext4_xattr_entry { #define BFIRST(bh) ENTRY(BHDR(bh)+1) #define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) +#define EXT4_ZERO_XATTR_VALUE ((void *)-1) struct ext4_xattr_info { int name_index; -- cgit v1.2.3