From f6fb99cadcd44660c68e13f6eab28333653621e6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 30 Jun 2012 19:14:57 -0400 Subject: ext4: pass a char * to ext4_count_free() instead of a buffer_head ptr Make it possible for ext4_count_free to operate on buffers and not just data in buffer_heads. Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/ext4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cfc4e01b3c8..293fa1ced21 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1852,7 +1852,7 @@ struct mmpd_data { # define NORET_AND noreturn, /* bitmap.c */ -extern unsigned int ext4_count_free(struct buffer_head *, unsigned); +extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, struct ext4_group_desc *gdp, struct buffer_head *bh, int sz); -- cgit v1.2.3 From 952fc18ef9ec707ebdc16c0786ec360295e5ff15 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 9 Jul 2012 16:27:05 -0400 Subject: ext4: fix overhead calculation used by ext4_statfs() Commit f975d6bcc7a introduced bug which caused ext4_statfs() to miscalculate the number of file system overhead blocks. This causes the f_blocks field in the statfs structure to be larger than it should be. This would in turn cause the "df" output to show the number of data blocks in the file system and the number of data blocks used to be larger than they should be. Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/ext4.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 293fa1ced21..01434f25917 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1161,8 +1161,7 @@ struct ext4_sb_info { unsigned long s_desc_per_block; /* Number of group descriptors per block */ ext4_group_t s_groups_count; /* Number of groups in the fs */ ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ - unsigned long s_overhead_last; /* Last calculated overhead */ - unsigned long s_blocks_last; /* Last seen block count */ + unsigned long s_overhead; /* # of fs overhead clusters */ unsigned int s_cluster_ratio; /* Number of blocks per cluster */ unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ @@ -2037,6 +2036,7 @@ extern int ext4_group_extend(struct super_block *sb, extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ +extern int ext4_calculate_overhead(struct super_block *sb); extern int ext4_superblock_csum_verify(struct super_block *sb, struct ext4_super_block *es); extern void ext4_superblock_csum_set(struct super_block *sb, -- cgit v1.2.3 From 729f52c6be51013c9268e5fc85acbc1091286fdb Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Mon, 9 Jul 2012 16:29:29 -0400 Subject: ext4: add a new nolock flag in ext4_map_blocks EXT4_GET_BLOCKS_NO_LOCK flag is added to indicate that we don't need to acquire i_data_sem lock in ext4_map_blocks. Meanwhile, it changes ext4_get_block() to not start a new journal because when we do a overwrite dio, there is no any metadata that needs to be modified. We define a new function called ext4_get_block_write_nolock, which is used in dio overwrite nolock. In this function, it doesn't try to acquire i_data_sem lock and doesn't start a new journal as it does a lookup. CC: Tao Ma CC: Eric Sandeen CC: Robin Dong Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 01434f25917..4a49f8225d0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -571,6 +571,8 @@ enum { #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 /* Request will not result in inode size update (user for fallocate) */ #define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 + /* Do not take i_data_sem locking in ext4_map_blocks */ +#define EXT4_GET_BLOCKS_NO_LOCK 0x0100 /* * Flags used by ext4_free_blocks -- cgit v1.2.3 From 7c319d328505b7781b65238ae9f53293b5ee0ca8 Mon Sep 17 00:00:00 2001 From: Aditya Kali Date: Sun, 22 Jul 2012 20:21:31 -0400 Subject: ext4: make quota as first class supported feature This patch adds support for quotas as a first class feature in ext4; which is to say, the quota files are stored in hidden inodes as file system metadata, instead of as separate files visible in the file system directory hierarchy. It is based on the proposal at: https://ext4.wiki.kernel.org/index.php/Design_For_1st_Class_Quota_in_Ext4 This patch introduces a new feature - EXT4_FEATURE_RO_COMPAT_QUOTA which, when turned on, enables quota accounting at mount time iteself. Also, the quota inodes are stored in two additional superblock fields. Some changes introduced by this patch that should be pointed out are: 1) Two new ext4-superblock fields - s_usr_quota_inum and s_grp_quota_inum for storing the quota inodes in use. 2) Default quota inodes are: inode#3 for tracking userquota and inode#4 for tracking group quota. The superblock fields can be set to use other inodes as well. 3) If the QUOTA feature and corresponding quota inodes are set in superblock, the quota usage tracking is turned on at mount time. On 'quotaon' ioctl, the quota limits enforcement is turned on. 'quotaoff' ioctl turns off only the limits enforcement in this case. 4) When QUOTA feature is in use, the quota mount options 'quota', 'usrquota', 'grpquota' are ignored by the kernel. 5) mke2fs or tune2fs can be used to set the QUOTA feature and initialize quota inodes. The default reserved inodes will not be visible to user as regular files. 6) The quota-tools will need to be modified to support hidden quota files on ext4. E2fsprogs will also include support for creating and fixing quota files. 7) Support is only for the new V2 quota file format. Tested-by: Jan Kara Reviewed-by: Jan Kara Reviewed-by: Johann Lombardi Signed-off-by: Aditya Kali Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4a49f8225d0..1610e808ebe 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1315,6 +1315,8 @@ static inline struct timespec ext4_current_time(struct inode *inode) static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || + ino == EXT4_USR_QUOTA_INO || + ino == EXT4_GRP_QUOTA_INO || ino == EXT4_JOURNAL_INO || ino == EXT4_RESIZE_INO || (ino >= EXT4_FIRST_INO(sb) && @@ -1497,7 +1499,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ + EXT4_FEATURE_RO_COMPAT_QUOTA) /* * Default values for user and/or group using reserved blocks -- cgit v1.2.3 From 3108b54bcedde5d952c90460df5bc21efc1e134f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 22 Jul 2012 20:25:31 -0400 Subject: ext4: remove dynamic array size in ext4_chksum() The ext4_checksum() inline function was using a dynamic array size, which is not legal C. (It is a gcc extension). Remove it. Cc: "Darrick J. Wong" Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1610e808ebe..e8e8afa402f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1667,10 +1667,12 @@ static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, { struct { struct shash_desc shash; - char ctx[crypto_shash_descsize(sbi->s_chksum_driver)]; + char ctx[4]; } desc; int err; + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx)); + desc.shash.tfm = sbi->s_chksum_driver; desc.shash.flags = 0; *(u32 *)desc.ctx = crc; -- cgit v1.2.3 From 044ce47fec90ec0f25605e87a5d72cca14568bc3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 22 Jul 2012 20:31:31 -0400 Subject: ext4: convert last user of ext4_mark_super_dirty() to ext4_handle_dirty_super() The last user of ext4_mark_super_dirty() in ext4_file_open() is so rare it can well be modifying the superblock properly by journalling the change. Change it and get rid of ext4_mark_super_dirty() as it's not needed anymore. Artem: small amendments. Artem: tested using xfstests for both journalled and non-journalled ext4. Signed-off-by: Jan Kara Signed-off-by: Artem Bityutskiy Signed-off-by: "Theodore Ts'o" Tested-by: Artem Bityutskiy --- fs/ext4/ext4.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e8e8afa402f..c3411d4ce2d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2328,15 +2328,6 @@ static inline void ext4_unlock_group(struct super_block *sb, spin_unlock(ext4_group_lock_ptr(sb, group)); } -static inline void ext4_mark_super_dirty(struct super_block *sb) -{ - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - - ext4_superblock_csum_set(sb, es); - if (EXT4_SB(sb)->s_journal == NULL) - sb->s_dirt =1; -} - /* * Block validity checking */ -- cgit v1.2.3 From df981d03eeff7971ac7e6ff37000bfa702327ef1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 17 Aug 2012 09:48:17 -0400 Subject: ext4: add max_dir_size_kb mount option Very large directories can cause significant performance problems, or perhaps even invoke the OOM killer, if the process is running in a highly constrained memory environment (whether it is VM's with a small amount of memory or in a small memory cgroup). So it is useful, in cloud server/data center environments, to be able to set a filesystem-wide cap on the maximum size of a directory, to ensure that directories never get larger than a sane size. We do this via a new mount option, max_dir_size_kb. If there is an attempt to grow the directory larger than max_dir_size_kb, the system call will return ENOSPC instead. Google-Bug-Id: 6863013 Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c3411d4ce2d..7c0841ecde6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1243,6 +1243,7 @@ struct ext4_sb_info { unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; unsigned int s_max_writeback_mb_bump; + unsigned int s_max_dir_size_kb; /* where last allocation was done - for stream allocation */ unsigned long s_mb_last_group; unsigned long s_mb_last_start; -- cgit v1.2.3 From 67a5da564f97f31c4054d358e00b34d7ee570da5 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Fri, 17 Aug 2012 09:54:17 -0400 Subject: ext4: make the zero-out chunk size tunable Currently in ext4 the length of zero-out chunk is set to 7 file system blocks. But if an inode has uninitailized extents from using fallocate to preallocate space, and the workload issues many random writes, this can cause a fragmented extent tree that will unnecessarily grow the extent tree. So create a new sysfs tunable, extent_max_zeroout_kb, which controls the maximum size where blocks will be zeroed out instead of creating a new uninitialized extent. The default of this has been sent to 32kb. CC: Zach Brown CC: Andreas Dilger Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 7c0841ecde6..0df5ee102b6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1271,6 +1271,9 @@ struct ext4_sb_info { unsigned long s_sectors_written_start; u64 s_kbytes_written; + /* the size of zero-out chunk */ + unsigned int s_extent_max_zeroout_kb; + unsigned int s_log_groups_per_flex; struct flex_groups *s_flex_groups; -- cgit v1.2.3 From 117fff10d7f140e12dd43df20d3f9fda80577460 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 5 Sep 2012 01:29:50 -0400 Subject: ext4: grow the s_flex_groups array as needed when resizing Previously, we allocated the s_flex_groups array to the maximum size that the file system could be resized. There was two problems with this approach. First, it wasted memory in the common case where the file system was not resized. Secondly, once we start allowing online resizing using the meta_bg scheme, there is no maximum size that the file system can be resized. So instead, we need to grow the s_flex_groups at inline resize time. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0df5ee102b6..464cff711ed 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1276,6 +1276,7 @@ struct ext4_sb_info { unsigned int s_log_groups_per_flex; struct flex_groups *s_flex_groups; + ext4_group_t s_flex_groups_allocated; /* workqueue for dio unwritten */ struct workqueue_struct *dio_unwritten_wq; @@ -2055,6 +2056,8 @@ extern void ext4_superblock_csum_set(struct super_block *sb, extern void *ext4_kvmalloc(size_t size, gfp_t flags); extern void *ext4_kvzalloc(size_t size, gfp_t flags); extern void ext4_kvfree(void *ptr); +extern int ext4_alloc_flex_bg_array(struct super_block *sb, + ext4_group_t ngroup); extern __printf(4, 5) void __ext4_error(struct super_block *, const char *, unsigned int, const char *, ...); -- cgit v1.2.3 From 28623c2f5b0dca3c3ea34fd6108940661352e276 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 5 Sep 2012 01:31:50 -0400 Subject: ext4: grow the s_group_info array as needed Previously we allocated the s_group_info array with enough space for any future possible growth of the file system via online resize. This is unfortunate because it wastes memory, and it doesn't work for the meta_bg scheme, since there is no limit based on the number of reserved gdt blocks. So add the code to grow the s_group_info array as needed. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 464cff711ed..8b6902c4d7b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1233,6 +1233,7 @@ struct ext4_sb_info { spinlock_t s_md_lock; unsigned short *s_mb_offsets; unsigned int *s_mb_maxs; + unsigned int s_group_info_size; /* tunables */ unsigned long s_stripe; @@ -1971,6 +1972,8 @@ extern void ext4_exit_mballoc(void); extern void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, unsigned long count, int flags); +extern int ext4_mb_alloc_groupinfo(struct super_block *sb, + ext4_group_t ngroups); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, -- cgit v1.2.3 From f45ee3a1ea438af96e4fd2c0b16d195e67ef235f Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Fri, 28 Sep 2012 23:21:09 -0400 Subject: ext4: ext4_inode_info diet Generic inode has unused i_private pointer which may be used as cur_aio_dio storage. TODO: If cur_aio_dio will be passed as an argument to get_block_t this allow to have concurent AIO_DIO requests. Reviewed-by: Zheng Liu Reviewed-by: Jan Kara Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8b6902c4d7b..3e8e185e5e2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -912,8 +912,6 @@ struct ext4_inode_info { struct list_head i_completed_io_list; spinlock_t i_completed_io_lock; atomic_t i_ioend_count; /* Number of outstanding io_end structs */ - /* current io_end structure for async DIO write*/ - ext4_io_end_t *cur_aio_dio; atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */ spinlock_t i_block_reservation_lock; @@ -1338,6 +1336,16 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode, } } +static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode) +{ + return inode->i_private; +} + +static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io) +{ + inode->i_private = io; +} + /* * Inode dynamic state flags */ -- cgit v1.2.3 From e27f41e1b789e60e7d8cc9c81fd93ca49ef31f13 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Fri, 28 Sep 2012 23:24:52 -0400 Subject: ext4: give i_aiodio_unwritten a more appropriate name AIO/DIO prefix is wrong because it account unwritten extents which also may be scheduled from buffered write endio Reviewed-by: Jan Kara Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3e8e185e5e2..6ec9856065d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -912,7 +912,7 @@ struct ext4_inode_info { struct list_head i_completed_io_list; spinlock_t i_completed_io_lock; atomic_t i_ioend_count; /* Number of outstanding io_end structs */ - atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */ + atomic_t i_unwritten; /* Nr. of inflight conversions pending */ spinlock_t i_block_reservation_lock; @@ -1332,7 +1332,7 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode, { if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { io_end->flag |= EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + atomic_inc(&EXT4_I(inode)->i_unwritten); } } -- cgit v1.2.3 From 28a535f9a0df060569dcc786e5bc2e1de43d7dc7 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sat, 29 Sep 2012 00:14:55 -0400 Subject: ext4: completed_io locking cleanup Current unwritten extent conversion state-machine is very fuzzy. - For unknown reason it performs conversion under i_mutex. What for? My diagnosis: We already protect extent tree with i_data_sem, truncate and punch_hole should wait for DIO, so the only data we have to protect is end_io->flags modification, but only flush_completed_IO and end_io_work modified this flags and we can serialize them via i_completed_io_lock. Currently all these games with mutex_trylock result in the following deadlock truncate: kworker: ext4_setattr ext4_end_io_work mutex_lock(i_mutex) inode_dio_wait(inode) ->BLOCK DEADLOCK<- mutex_trylock() inode_dio_done() #TEST_CASE1_BEGIN MNT=/mnt_scrach unlink $MNT/file fallocate -l $((1024*1024*1024)) $MNT/file aio-stress -I 100000 -O -s 100m -n -t 1 -c 10 -o 2 -o 3 $MNT/file sleep 2 truncate -s 0 $MNT/file #TEST_CASE1_END Or use 286's xfstests https://github.com/dmonakhov/xfstests/blob/devel/286 This patch makes state machine simple and clean: (1) xxx_end_io schedule final extent conversion simply by calling ext4_add_complete_io(), which append it to ei->i_completed_io_list NOTE1: because of (2A) work should be queued only if ->i_completed_io_list was empty, otherwise the work is scheduled already. (2) ext4_flush_completed_IO is responsible for handling all pending end_io from ei->i_completed_io_list Flushing sequence consists of following stages: A) LOCKED: Atomically drain completed_io_list to local_list B) Perform extents conversion C) LOCKED: move converted io's to to_free list for final deletion This logic depends on context which we was called from. D) Final end_io context destruction NOTE1: i_mutex is no longer required because end_io->flags modification is protected by ei->ext4_complete_io_lock Full list of changes: - Move all completion end_io related routines to page-io.c in order to improve logic locality - Move open coded logic from various xx_end_xx routines to ext4_add_complete_io() - remove EXT4_IO_END_FSYNC - Improve SMP scalability by removing useless i_mutex which does not protect io->flags anymore. - Reduce lock contention on i_completed_io_lock by optimizing list walk. - Rename ext4_end_io_nolock to end4_end_io and make it static - Check flush completion status to ext4_ext_punch_hole(). Because it is not good idea to punch blocks from corrupted inode. Changes since V3 (in request to Jan's comments): Fall back to active flush_completed_IO() approach in order to prevent performance issues with nolocked DIO reads. Changes since V2: Fix use-after-free caused by race truncate vs end_io_work Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6ec9856065d..edb04957942 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -186,7 +186,6 @@ struct mpage_da_data { #define EXT4_IO_END_ERROR 0x0002 #define EXT4_IO_END_QUEUED 0x0004 #define EXT4_IO_END_DIRECT 0x0008 -#define EXT4_IO_END_IN_FSYNC 0x0010 struct ext4_io_page { struct page *p_page; @@ -2418,11 +2417,11 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, /* page-io.c */ extern int __init ext4_init_pageio(void); +extern void ext4_add_complete_io(ext4_io_end_t *io_end); extern void ext4_exit_pageio(void); extern void ext4_ioend_wait(struct inode *); extern void ext4_free_io_end(ext4_io_end_t *io); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); -extern int ext4_end_io_nolock(ext4_io_end_t *io); extern void ext4_io_submit(struct ext4_io_submit *io); extern int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, -- cgit v1.2.3 From 17335dcc471199717839b2fa3492ca36f70f1168 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sat, 29 Sep 2012 00:41:21 -0400 Subject: ext4: serialize dio nonlocked reads with defrag workers Inode's block defrag and ext4_change_inode_journal_flag() may affect nonlocked DIO reads result, so proper synchronization required. - Add missed inode_dio_wait() calls where appropriate - Check inode state under extra i_dio_count reference. Reviewed-by: Jan Kara Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index edb04957942..1be2b4472a8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1358,6 +1358,8 @@ enum { EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ EXT4_STATE_NEWENTRY, /* File just added to dir */ EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ + EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read + nolocking */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -2469,6 +2471,21 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } +/* + * Disable DIO read nolock optimization, so new dioreaders will be forced + * to grab i_mutex + */ +static inline void ext4_inode_block_unlocked_dio(struct inode *inode) +{ + ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); + smp_mb(); +} +static inline void ext4_inode_resume_unlocked_dio(struct inode *inode) +{ + smp_mb(); + ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); +} + #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) /* For ioend & aio unwritten conversion wait queues */ -- cgit v1.2.3 From c278531d39f3158bfee93dc67da0b77e09776de2 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Fri, 5 Oct 2012 11:31:55 -0400 Subject: ext4: fix ext4_flush_completed_IO wait semantics BUG #1) All places where we call ext4_flush_completed_IO are broken because buffered io and DIO/AIO goes through three stages 1) submitted io, 2) completed io (in i_completed_io_list) conversion pended 3) finished io (conversion done) And by calling ext4_flush_completed_IO we will flush only requests which were in (2) stage, which is wrong because: 1) punch_hole and truncate _must_ wait for all outstanding unwritten io regardless to it's state. 2) fsync and nolock_dio_read should also wait because there is a time window between end_page_writeback() and ext4_add_complete_io() As result integrity fsync is broken in case of buffered write to fallocated region: fsync blkdev_completion ->filemap_write_and_wait_range ->ext4_end_bio ->end_page_writeback <-- filemap_write_and_wait_range return ->ext4_flush_completed_IO sees empty i_completed_io_list but pended conversion still exist ->ext4_add_complete_io BUG #2) Race window becomes wider due to the 'ext4: completed_io locking cleanup V4' patch series This patch make following changes: 1) ext4_flush_completed_io() now first try to flush completed io and when wait for any outstanding unwritten io via ext4_unwritten_wait() 2) Rename function to more appropriate name. 3) Assert that all callers of ext4_flush_unwritten_io should hold i_mutex to prevent endless wait Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/ext4.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1be2b4472a8..3ab2539b7b2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1947,7 +1947,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p); /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); -extern int ext4_flush_completed_IO(struct inode *); +extern int ext4_flush_unwritten_io(struct inode *); /* hash.c */ extern int ext4fs_dirhash(const char *name, int len, struct @@ -2371,6 +2371,7 @@ extern const struct file_operations ext4_dir_operations; extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); +extern void ext4_unwritten_wait(struct inode *inode); /* namei.c */ extern const struct inode_operations ext4_dir_inode_operations; -- cgit v1.2.3 From 06db49e68ae70cf16819b85a14057acb2820776a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 10 Oct 2012 01:06:58 -0400 Subject: ext4: fix metadata checksum calculation for the superblock The function ext4_handle_dirty_super() was calculating the superblock on the wrong block data. As a result, when the superblock is modified while it is mounted (most commonly, when inodes are added or removed from the orphan list), the superblock checksum would be wrong. We didn't notice because the superblock *was* being correctly calculated in ext4_commit_super(), and this would get called when the file system was unmounted. So the problem only became obvious if the system crashed while the file system was mounted. Fix this by removing the poorly designed function signature for ext4_superblock_csum_set(); if it only took a single argument, the pointer to a struct superblock, the ambiguity which caused this mistake would have been impossible. Reported-by: George Spelvin Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/ext4.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3ab2539b7b2..78971cfd9c7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2063,8 +2063,7 @@ extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); extern int ext4_calculate_overhead(struct super_block *sb); extern int ext4_superblock_csum_verify(struct super_block *sb, struct ext4_super_block *es); -extern void ext4_superblock_csum_set(struct super_block *sb, - struct ext4_super_block *es); +extern void ext4_superblock_csum_set(struct super_block *sb); extern void *ext4_kvmalloc(size_t size, gfp_t flags); extern void *ext4_kvzalloc(size_t size, gfp_t flags); extern void ext4_kvfree(void *ptr); -- cgit v1.2.3 From 79f1ba49569e5aec919b653c55b03274c2331701 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 22 Oct 2012 00:34:32 -0400 Subject: ext4: Checksum the block bitmap properly with bigalloc enabled In mke2fs, we only checksum the whole bitmap block and it is right. While in the kernel, we use EXT4_BLOCKS_PER_GROUP to indicate the size of the checksumed bitmap which is wrong when we enable bigalloc. The right size should be EXT4_CLUSTERS_PER_GROUP and this patch fixes it. Also as every caller of ext4_block_bitmap_csum_set and ext4_block_bitmap_csum_verify pass in EXT4_BLOCKS_PER_GROUP(sb)/8, we'd better removes this parameter and sets it in the function itself. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" Reviewed-by: Lukas Czerner Cc: stable@vger.kernel.org --- fs/ext4/ext4.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 78971cfd9c7..3c20de1d59d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1882,10 +1882,10 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, struct buffer_head *bh, int sz); void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, struct ext4_group_desc *gdp, - struct buffer_head *bh, int sz); + struct buffer_head *bh); int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, struct ext4_group_desc *gdp, - struct buffer_head *bh, int sz); + struct buffer_head *bh); /* balloc.c */ extern void ext4_validate_block_bitmap(struct super_block *sb, -- cgit v1.2.3 From c0677e6d0f9d991adff972b8d06cb83de1f8ee8e Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 15:18:54 -0500 Subject: ext4: add data structures for the extent status tree This patch adds two structures that supports extent status tree, extent_status and ext4_es_tree. Currently extent_status is used to track a delay extent for an inode, which record the start block and the length of the delay extent. ext4_es_tree is used to store all extent_status for an inode in memory. Signed-off-by: Yongqiang Yang Signed-off-by: Allison Henderson Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3c20de1d59d..bcc634b26d4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -811,6 +811,8 @@ struct ext4_ext_cache { __u32 ec_len; /* must be 32bit to return holes */ }; +#include "extents_status.h" + /* * fourth extended file system inode data in memory */ @@ -888,6 +890,10 @@ struct ext4_inode_info { struct list_head i_prealloc_list; spinlock_t i_prealloc_lock; + /* extents status tree */ + struct ext4_es_tree i_es_tree; + rwlock_t i_es_lock; + /* ialloc */ ext4_group_t i_last_alloc_group; -- cgit v1.2.3 From 7d1b1fbc95ebf41fee246dde437a77921f3bfec5 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 21:57:35 -0500 Subject: ext4: reimplement ext4_find_delay_alloc_range on extent status tree Signed-off-by: Yongqiang Yang Signed-off-by: Allison Henderson Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bcc634b26d4..246e38f3915 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2451,14 +2451,10 @@ enum ext4_state_bits { * never, ever appear in a buffer_head's state * flag. See EXT4_MAP_FROM_CLUSTER to see where * this is used. */ - BH_Da_Mapped, /* Delayed allocated block that now has a mapping. This - * flag is set when ext4_map_blocks is called on a - * delayed allocated block to get its real mapping. */ }; BUFFER_FNS(Uninit, uninit) TAS_BUFFER_FNS(Uninit, uninit) -BUFFER_FNS(Da_Mapped, da_mapped) /* * Add new method to test wether block and inode bitmaps are properly -- cgit v1.2.3 From 48fc7f7e787dd65ffe88521bce31f4062ba273eb Mon Sep 17 00:00:00 2001 From: Adam Buchbinder Date: Wed, 19 Sep 2012 21:48:00 -0400 Subject: Fix misspellings of "whether" in comments. "Whether" is misspelled in various comments across the tree; this fixes them. No code changes. Signed-off-by: Adam Buchbinder Signed-off-by: Jiri Kosina --- fs/ext4/ext4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3c20de1d59d..df163da388c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2455,7 +2455,7 @@ TAS_BUFFER_FNS(Uninit, uninit) BUFFER_FNS(Da_Mapped, da_mapped) /* - * Add new method to test wether block and inode bitmaps are properly + * Add new method to test whether block and inode bitmaps are properly * initialized. With uninit_bg reading the block from disk is not enough * to mark the bitmap uptodate. We need to also zero-out the bitmap */ -- cgit v1.2.3 From 4a092d737955301da22b9d5e07f5036da821a932 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 28 Nov 2012 13:03:30 -0500 Subject: ext4: rationalize ext4_extents.h inclusion Previously, ext4_extents.h was being included at the end of ext4.h, which was bad for a number of reasons: (a) it was not being included in the expected place, and (b) it caused the header to be included multiple times. There were #ifdef's to prevent this from causing any problems, but it still was unnecessary. By moving the function declarations that were in ext4_extents.h to ext4.h, which is standard practice for where the function declarations for the rest of ext4.h can be found, we can remove ext4_extents.h from being included in ext4.h at all, and then we can only include ext4_extents.h where it is needed in ext4's source files. It should be possible to move a few more things into ext4.h, and further reduce the number of source files that need to #include ext4_extents.h, but that's a cleanup for another day. Reported-by: Sachin Kamat Reported-by: Wei Yongjun Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 246e38f3915..2e9ffa9100b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -57,6 +57,16 @@ #define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif +/* + * Turn on EXT_DEBUG to get lots of info about extents operations. + */ +#define EXT_DEBUG__ +#ifdef EXT_DEBUG +#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + #define EXT4_ERROR_INODE(inode, fmt, a...) \ ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) @@ -2399,6 +2409,9 @@ extern int ext4_check_blockref(const char *, unsigned int, struct inode *, __le32 *, unsigned int); /* extents.c */ +struct ext4_ext_path; +struct ext4_extent; + extern int ext4_ext_tree_init(handle_t *handle, struct inode *); extern int ext4_ext_writepage_trans_blocks(struct inode *, int); extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, @@ -2416,8 +2429,27 @@ extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, ssize_t len); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); +extern int ext4_ext_calc_metadata_amount(struct inode *inode, + ext4_lblk_t lblocks); +extern int ext4_extent_tree_init(handle_t *, struct inode *); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +extern int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, + struct ext4_ext_path *, + struct ext4_extent *, int); +extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path *); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); +extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); + + /* move_extent.c */ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, @@ -2505,6 +2537,4 @@ extern void ext4_resize_end(struct super_block *sb); #endif /* __KERNEL__ */ -#include "ext4_extents.h" - #endif /* _EXT4_H */ -- cgit v1.2.3 From 67cf5b09a46f72e048501b84996f2f77bc42e947 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:04:46 -0500 Subject: ext4: add the basic function for inline data support Implement inline data with xattr. Now we use "system.data" to store xattr, and the xattr will be extended if the i_size is increased while we don't release the space during truncate. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2e9ffa9100b..c827e47d556 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -402,6 +402,7 @@ struct flex_groups { #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ #define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ @@ -458,6 +459,7 @@ enum { EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ + EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ }; @@ -504,6 +506,7 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(EXTENTS); CHECK_FLAG_VALUE(EA_INODE); CHECK_FLAG_VALUE(EOFBLOCKS); + CHECK_FLAG_VALUE(INLINE_DATA); CHECK_FLAG_VALUE(RESERVED); } @@ -918,6 +921,10 @@ struct ext4_inode_info { /* on-disk additional length */ __u16 i_extra_isize; + /* Indicate the inline data space. */ + u16 i_inline_off; + u16 i_inline_size; + #ifdef CONFIG_QUOTA /* quota space reservation, managed internally by quota code */ qsize_t i_reserved_quota; @@ -1376,6 +1383,7 @@ enum { EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read nolocking */ + EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -1497,7 +1505,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ #define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ -#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x8000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ -- cgit v1.2.3 From f19d5870cbf72d4cb2a8e1f749dff97af99b071e Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:51 -0500 Subject: ext4: add normal write support for inline data For a normal write case (not journalled write, not delayed allocation), we write to the inline if the file is small and convert it to an extent based file when the write is larger than the max inline size. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c827e47d556..9f4efc6c37b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2018,8 +2018,19 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int, int *); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int, int *); +int ext4_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); +int ext4_walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)); +int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh); extern struct inode *ext4_iget(struct super_block *, unsigned long); extern int ext4_write_inode(struct inode *, struct writeback_control *); -- cgit v1.2.3 From 9c3569b50f12e47cc5e907b5e37e4a45c0c10b43 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:57 -0500 Subject: ext4: add delalloc support for inline data For delayed allocation mode, we write to inline data if the file is small enough. And in case of we write to some offset larger than the inline size, the 1st page is dirtied, so that ext4_da_writepages can handle the conversion. When the 1st page is initialized with blocks, the inline part is removed. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9f4efc6c37b..268636af7f5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2022,6 +2022,8 @@ int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); +int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create); int ext4_walk_page_buffers(handle_t *handle, struct buffer_head *head, unsigned from, @@ -2031,6 +2033,8 @@ int ext4_walk_page_buffers(handle_t *handle, struct buffer_head *bh)); int do_journal_get_write_access(handle_t *handle, struct buffer_head *bh); +#define FALL_BACK_TO_NONDELALLOC 1 +#define CONVERT_INLINE_DATA 2 extern struct inode *ext4_iget(struct super_block *, unsigned long); extern int ext4_write_inode(struct inode *, struct writeback_control *); -- cgit v1.2.3 From a774f9c20e08643fc0e6c48b0419ad7657ed0c04 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:57 -0500 Subject: ext4: make ext4_init_dot_dotdot for inline dir usage Currently, the initialization of dot and dotdot are encapsulated in ext4_mkdir and also bond with dir_block. So create a new function named ext4_init_new_dir and the initialization is moved to ext4_init_dot_dotdot. Now it will called either in the normal non-inline case(rec_len of ".." will cover the whole block) or when we converting an inline dir to a block(rec len of ".." will be the real length). The start of the next entry is also returned for inline dir usage. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 268636af7f5..cf840146ce8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2415,6 +2415,10 @@ extern void ext4_unwritten_wait(struct inode *inode); extern const struct inode_operations ext4_dir_inode_operations; extern const struct inode_operations ext4_special_inode_operations; extern struct dentry *ext4_get_parent(struct dentry *child); +extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len); /* symlink.c */ extern const struct inode_operations ext4_symlink_inode_operations; -- cgit v1.2.3 From 226ba972b0863783ad377f741f6ff0538f31ab00 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:58 -0500 Subject: ext4: refactor __ext4_check_dir_entry() to accept start and size The __ext4_check_dir_entry() function() is used to check whether the de is over the block boundary. Now with inline data, it could be within the block boundary while exceeds the inode size. So check this function to check the overflow more precisely. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cf840146ce8..59cbf498fd5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1960,10 +1960,11 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, struct file *, struct ext4_dir_entry_2 *, - struct buffer_head *, unsigned int); -#define ext4_check_dir_entry(dir, filp, de, bh, offset) \ + struct buffer_head *, char *, int, + unsigned int); +#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ - (de), (bh), (offset))) + (de), (bh), (buf), (size), (offset))) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent); -- cgit v1.2.3 From 978fef914a2e6b8ad5672d0a39f9201b7aa7c396 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:58 -0500 Subject: ext4: create __ext4_insert_dentry for dir entry insertion The old add_dirent_to_buf handles all the work related to the work of adding dir entry to a dir block. Now we have inline data, so create 2 new function __ext4_find_dest_de and __ext4_insert_dentry that do the real work and let add_dirent_to_buf call them. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 59cbf498fd5..8e9e94cf1bc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1969,6 +1969,21 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent); extern void ext4_htree_free_dir_info(struct dir_private_info *p); +extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + const char *name, int namelen, + struct ext4_dir_entry_2 **dest_de); +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + const char *name, int namelen); +static inline void ext4_update_dx_flag(struct inode *inode) +{ + if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX)) + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); +} /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); -- cgit v1.2.3 From 3c47d54170b6a678875566b1b8d6dcf57904e49b Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:59 -0500 Subject: ext4: let add_dir_entry handle inline data properly This patch let add_dir_entry handle the inline data case. So the dir is initialized as inline dir first and then we can try to add some files to it, when the inline space can't hold all the entries, a dir block will be created and the dir entry will be moved to it. Also for an inlined dir, "." and ".." are removed and we only use 4 bytes to store the parent inode number. These 2 entries will be added when we convert an inline dir to a block-based one. [ Folded in patch from Dan Carpenter to remove an unused variable. ] Signed-off-by: Tao Ma Signed-off-by: Dan Carpenter Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8e9e94cf1bc..689ce1d696b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1616,6 +1616,11 @@ struct ext4_dir_entry_tail { __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ }; +#define EXT4_DIRENT_TAIL(block, blocksize) \ + ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ + ((blocksize) - \ + sizeof(struct ext4_dir_entry_tail)))) + /* * Ext4 directory file types. Only the low 3 bits are used. The * other bits are reserved for now. @@ -2435,6 +2440,11 @@ extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, struct ext4_dir_entry_2 *de, int blocksize, int csum_size, unsigned int parent_ino, int dotdot_real_len); +extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t, + unsigned int blocksize); +extern int ext4_handle_dirty_dirent_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh); /* symlink.c */ extern const struct inode_operations ext4_symlink_inode_operations; -- cgit v1.2.3 From 65d165d9366dbf783d0102177006d47c8859ba31 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:59 -0500 Subject: ext4: let ext4_readdir handle inline data For "." and "..", we just call filldir by ourselves instead of iterating the real dir entry. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 689ce1d696b..e3a74658c63 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1989,6 +1989,18 @@ static inline void ext4_update_dx_flag(struct inode *inode) EXT4_FEATURE_COMPAT_DIR_INDEX)) ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); } +static unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static inline unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || + (filetype >= EXT4_FT_MAX)) + return DT_UNKNOWN; + + return ext4_filetype_table[filetype]; +} /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); -- cgit v1.2.3 From 7335cd3b41b1e704608ca46159641ca9cb598121 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:05:59 -0500 Subject: ext4: create a new function search_dir search_dirblock is used to search a dir block, but the code is almost the same for searching an inline dir. So create a new fuction search_dir and let search_dirblock call it. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e3a74658c63..a971b65bf5c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2122,6 +2122,13 @@ extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); +extern int search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir); /* resize.c */ extern int ext4_group_add(struct super_block *sb, -- cgit v1.2.3 From 05019a9e7f025133f20c67677c9c8551eca3c6dc Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:06:00 -0500 Subject: ext4: make ext4_delete_entry generic Currently ext4_delete_entry() is used only for dir entry removing from a dir block. So let us create a new function ext4_generic_delete_entry and this function takes a entry_buf and a buf_size so that it can be used for inline data. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a971b65bf5c..6cfe546282d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2129,6 +2129,13 @@ extern int search_dir(struct buffer_head *bh, const struct qstr *d_name, unsigned int offset, struct ext4_dir_entry_2 **res_dir); +extern int ext4_generic_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + void *entry_buf, + int buf_size, + int csum_size); /* resize.c */ extern int ext4_group_add(struct super_block *sb, -- cgit v1.2.3 From f08225d176a5736363beea653b9b3fb9400c1255 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 14:06:03 -0500 Subject: ext4: enable ext4 inline support Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6cfe546282d..b90e2720b82 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1529,7 +1529,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ - EXT4_FEATURE_INCOMPAT_MMP) + EXT4_FEATURE_INCOMPAT_MMP | \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ -- cgit v1.2.3 From 939da1084458246d2e29dd921c2012c177000e96 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 16:30:43 -0500 Subject: ext4: Remove CONFIG_EXT4_FS_XATTR Ted has sent out a RFC about removing this feature. Eric and Jan confirmed that both RedHat and SUSE enable this feature in all their product. David also said that "As far as I know, it's enabled in all Android kernels that use ext4." So it seems OK for us. And what's more, as inline data depends its implementation on xattr, and to be frank, I don't run any test again inline data enabled while xattr disabled. So I think we should add inline data and remove this config option in the same release. [ The savings if you disable CONFIG_EXT4_FS_XATTR is only 27k, which isn't much in the grand scheme of things. Since no one seems to be testing this configuration except for some automated compile farms, on balance we are better removing this config option, and so that it is effectively always enabled. -- tytso ] Cc: David Brown Cc: Eric Sandeen Reviewed-by: Jan Kara Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b90e2720b82..e20dc38858d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -848,7 +848,6 @@ struct ext4_inode_info { #endif unsigned long i_flags; -#ifdef CONFIG_EXT4_FS_XATTR /* * Extended attributes can be read independently of the main file * data. Taking i_mutex even when reading would cause contention @@ -857,7 +856,6 @@ struct ext4_inode_info { * EAs. */ struct rw_semaphore xattr_sem; -#endif struct list_head i_orphan; /* unlinked but open inodes */ -- cgit v1.2.3 From 9a4c8019471386c6fb039ae9e30f5216b6b55a9e Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Mon, 10 Dec 2012 16:30:45 -0500 Subject: ext4: ensure Inode flags consistency are checked at build time Flags being used by atomic operations in inode flags (e.g. ext4_test_inode_flag(), should be consistent with that actually stored in inodes, i.e.: EXT4_XXX_FL. It ensures that this consistency is checked at build-time, not at run-time. Currently, the flags consistency are being checked at run-time, but, there is no real reason to not do a build-time check instead of a run-time check. The code is comparing macro defined values with enum type variables, where both are constants, so, there is no problem in comparing constants at build-time. enum variables are treated as constants by the C compiler, according to the C99 specs (see www.open-std.org/jtc1/sc22/wg14/www/docs/n1124.pdf sec. 6.2.5, item 16), so, there is no real problem in comparing an enumeration type at build time Signed-off-by: Carlos Maiolino Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e20dc38858d..b79d613091d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -463,25 +463,22 @@ enum { EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ }; -#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) -#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \ - printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \ - EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); } - -/* - * Since it's pretty easy to mix up bit numbers and hex values, and we - * can't do a compile-time test for ENUM values, we use a run-time - * test to make sure that EXT4_XXX_FL is consistent with respect to - * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop - * out so it won't cost any extra space in the compiled kernel image. - * But it's important that these values are the same, since we are - * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL - * must be consistent with the values of FS_XXX_FL defined in - * include/linux/fs.h and the on-disk values found in ext2, ext3, and - * ext4 filesystems, and of course the values defined in e2fsprogs. +/* + * Since it's pretty easy to mix up bit numbers and hex values, we use a + * build-time check to make sure that EXT4_XXX_FL is consistent with respect to + * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost + * any extra space in the compiled kernel image, otherwise, the build will fail. + * It's important that these values are the same, since we are using + * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent + * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk + * values found in ext2, ext3 and ext4 filesystems, and of course the values + * defined in e2fsprogs. * * It's not paranoia if the Murphy's Law really *is* out to get you. :-) */ +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) + static inline void ext4_check_flag_values(void) { CHECK_FLAG_VALUE(SECRM); -- cgit v1.2.3