about | summary | refs | log | tree | commit | diff | stats
path: root/fs/ext4
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4')
-rw-r--r-- fs/ext4/Kconfig 17
-rw-r--r-- fs/ext4/Makefile 4
-rw-r--r-- fs/ext4/acl.c 37
-rw-r--r-- fs/ext4/balloc.c 71
-rw-r--r-- fs/ext4/bitmap.c 24
-rw-r--r-- fs/ext4/dir.c 118
-rw-r--r-- fs/ext4/ext4.h 249
-rw-r--r-- fs/ext4/ext4_extents.h 40
-rw-r--r-- fs/ext4/ext4_jbd2.c 12
-rw-r--r-- fs/ext4/ext4_jbd2.h 32
-rw-r--r-- fs/ext4/extents.c 864
-rw-r--r-- fs/ext4/extents_status.c 500
-rw-r--r-- fs/ext4/extents_status.h 45
-rw-r--r-- fs/ext4/file.c 462
-rw-r--r-- fs/ext4/fsync.c 111
-rw-r--r-- fs/ext4/ialloc.c 43
-rw-r--r-- fs/ext4/indirect.c 23
-rw-r--r-- fs/ext4/inline.c 1884
-rw-r--r-- fs/ext4/inode.c 836
-rw-r--r-- fs/ext4/ioctl.c 41
-rw-r--r-- fs/ext4/mballoc.c 227
-rw-r--r-- fs/ext4/mballoc.h 5
-rw-r--r-- fs/ext4/migrate.c 1
-rw-r--r-- fs/ext4/mmp.c 6
-rw-r--r-- fs/ext4/move_extent.c 521
-rw-r--r-- fs/ext4/namei.c 613
-rw-r--r-- fs/ext4/page-io.c 179
-rw-r--r-- fs/ext4/resize.c 459
-rw-r--r-- fs/ext4/super.c 553
-rw-r--r-- fs/ext4/symlink.c 4
-rw-r--r-- fs/ext4/xattr.c 121
-rw-r--r-- fs/ext4/xattr.h 158
32 files changed, 6098 insertions, 2162 deletions
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index c22f17021b6..987358740cb 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -39,22 +39,9 @@ config EXT4_USE_FOR_EXT23
compiled kernel size by using one file system driver for
ext2, ext3, and ext4 file systems.
-config EXT4_FS_XATTR
- bool "Ext4 extended attributes"
- depends on EXT4_FS
- default y
- help
- Extended attributes are name:value pairs associated with inodes by
- the kernel or by users (see the attr(5) manual page, or visit
- <http://acl.bestbits.at/> for details).
-
- If unsure, say N.
-
- You need this for POSIX ACL support on ext4.
-
config EXT4_FS_POSIX_ACL
bool "Ext4 POSIX Access Control Lists"
- depends on EXT4_FS_XATTR
+ depends on EXT4_FS
select FS_POSIX_ACL
help
POSIX Access Control Lists (ACLs) support permissions for users and
@@ -67,7 +54,7 @@ config EXT4_FS_POSIX_ACL
config EXT4_FS_SECURITY
bool "Ext4 Security Labels"
- depends on EXT4_FS_XATTR
+ depends on EXT4_FS
help
Security labels support alternative access control models
implemented by security modules like SELinux. This option
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 56fd8f86593..0310fec2ee3 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -7,8 +7,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
- mmp.o indirect.o
+ mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
+ xattr_trusted.o inline.o
-ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index a5c29bb3b83..e6e0d988439 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -55,16 +55,23 @@ ext4_acl_from_disk(const void *value, size_t size)
case ACL_OTHER:
value = (char *)value +
sizeof(ext4_acl_entry_short);
- acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
break;
case ACL_USER:
+ value = (char *)value + sizeof(ext4_acl_entry);
+ if ((char *)value > end)
+ goto fail;
+ acl->a_entries[n].e_uid =
+ make_kuid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ break;
case ACL_GROUP:
value = (char *)value + sizeof(ext4_acl_entry);
if ((char *)value > end)
goto fail;
- acl->a_entries[n].e_id =
- le32_to_cpu(entry->e_id);
+ acl->a_entries[n].e_gid =
+ make_kgid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
break;
default:
@@ -98,13 +105,19 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION);
e = (char *)ext_acl + sizeof(ext4_acl_header);
for (n = 0; n < acl->a_count; n++) {
+ const struct posix_acl_entry *acl_e = &acl->a_entries[n];
ext4_acl_entry *entry = (ext4_acl_entry *)e;
- entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
- entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
- switch (acl->a_entries[n].e_tag) {
+ entry->e_tag = cpu_to_le16(acl_e->e_tag);
+ entry->e_perm = cpu_to_le16(acl_e->e_perm);
+ switch (acl_e->e_tag) {
case ACL_USER:
+ entry->e_id = cpu_to_le32(
+ from_kuid(&init_user_ns, acl_e->e_uid));
+ e += sizeof(ext4_acl_entry);
+ break;
case ACL_GROUP:
- entry->e_id = cpu_to_le32(acl->a_entries[n].e_id);
+ entry->e_id = cpu_to_le32(
+ from_kgid(&init_user_ns, acl_e->e_gid));
e += sizeof(ext4_acl_entry);
break;
@@ -374,7 +387,7 @@ ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
return PTR_ERR(acl);
if (acl == NULL)
return -ENODATA;
- error = posix_acl_to_xattr(acl, buffer, size);
+ error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
posix_acl_release(acl);
return error;
@@ -397,7 +410,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
return -EPERM;
if (value) {
- acl = posix_acl_from_xattr(value, size);
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
if (IS_ERR(acl))
return PTR_ERR(acl);
else if (acl) {
@@ -410,8 +423,10 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
retry:
handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
+ goto release_and_out;
+ }
error = ext4_set_acl(handle, inode, type, acl);
ext4_journal_stop(handle);
if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index cee7812cc3c..cf1821784a1 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -174,8 +174,7 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
ext4_free_inodes_set(sb, gdp, 0);
ext4_itable_unused_set(sb, gdp, 0);
memset(bh->b_data, 0xff, sb->s_blocksize);
- ext4_block_bitmap_csum_set(sb, block_group, gdp, bh,
- EXT4_BLOCKS_PER_GROUP(sb) / 8);
+ ext4_block_bitmap_csum_set(sb, block_group, gdp, bh);
return;
}
memset(bh->b_data, 0, sb->s_blocksize);
@@ -212,8 +211,7 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
*/
ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group),
sb->s_blocksize * 8, bh->b_data);
- ext4_block_bitmap_csum_set(sb, block_group, gdp, bh,
- EXT4_BLOCKS_PER_GROUP(sb) / 8);
+ ext4_block_bitmap_csum_set(sb, block_group, gdp, bh);
ext4_group_desc_csum_set(sb, block_group, gdp);
}
@@ -280,14 +278,18 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
return desc;
}
-static int ext4_valid_block_bitmap(struct super_block *sb,
- struct ext4_group_desc *desc,
- unsigned int block_group,
- struct buffer_head *bh)
+/*
+ * Return the block number which was discovered to be invalid, or 0 if
+ * the block bitmap is valid.
+ */
+static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
+ struct ext4_group_desc *desc,
+ unsigned int block_group,
+ struct buffer_head *bh)
{
ext4_grpblk_t offset;
ext4_grpblk_t next_zero_bit;
- ext4_fsblk_t bitmap_blk;
+ ext4_fsblk_t blk;
ext4_fsblk_t group_first_block;
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
@@ -297,37 +299,33 @@ static int ext4_valid_block_bitmap(struct super_block *sb,
* or it has to also read the block group where the bitmaps
* are located to verify they are set.
*/
- return 1;
+ return 0;
}
group_first_block = ext4_group_first_block_no(sb, block_group);
/* check whether block bitmap block number is set */
- bitmap_blk = ext4_block_bitmap(sb, desc);
- offset = bitmap_blk - group_first_block;
+ blk = ext4_block_bitmap(sb, desc);
+ offset = blk - group_first_block;
if (!ext4_test_bit(offset, bh->b_data))
/* bad block bitmap */
- goto err_out;
+ return blk;
/* check whether the inode bitmap block number is set */
- bitmap_blk = ext4_inode_bitmap(sb, desc);
- offset = bitmap_blk - group_first_block;
+ blk = ext4_inode_bitmap(sb, desc);
+ offset = blk - group_first_block;
if (!ext4_test_bit(offset, bh->b_data))
/* bad block bitmap */
- goto err_out;
+ return blk;
/* check whether the inode table block number is set */
- bitmap_blk = ext4_inode_table(sb, desc);
- offset = bitmap_blk - group_first_block;
+ blk = ext4_inode_table(sb, desc);
+ offset = blk - group_first_block;
next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
offset + EXT4_SB(sb)->s_itb_per_group,
offset);
- if (next_zero_bit >= offset + EXT4_SB(sb)->s_itb_per_group)
- /* good bitmap for inode tables */
- return 1;
-
-err_out:
- ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu",
- block_group, bitmap_blk);
+ if (next_zero_bit < offset + EXT4_SB(sb)->s_itb_per_group)
+ /* bad bitmap for inode tables */
+ return blk;
return 0;
}
@@ -336,14 +334,26 @@ void ext4_validate_block_bitmap(struct super_block *sb,
unsigned int block_group,
struct buffer_head *bh)
{
+ ext4_fsblk_t blk;
+
if (buffer_verified(bh))
return;
ext4_lock_group(sb, block_group);
- if (ext4_valid_block_bitmap(sb, desc, block_group, bh) &&
- ext4_block_bitmap_csum_verify(sb, block_group, desc, bh,
- EXT4_BLOCKS_PER_GROUP(sb) / 8))
- set_buffer_verified(bh);
+ blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
+ if (unlikely(blk != 0)) {
+ ext4_unlock_group(sb, block_group);
+ ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
+ block_group, blk);
+ return;
+ }
+ if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
+ desc, bh))) {
+ ext4_unlock_group(sb, block_group);
+ ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
+ return;
+ }
+ set_buffer_verified(bh);
ext4_unlock_group(sb, block_group);
}
@@ -609,7 +619,8 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
if (bitmap_bh == NULL)
continue;
- x = ext4_count_free(bitmap_bh, sb->s_blocksize);
+ x = ext4_count_free(bitmap_bh->b_data,
+ EXT4_BLOCKS_PER_GROUP(sb) / 8);
printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
i, ext4_free_group_clusters(sb, gdp), x);
bitmap_count += x;
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index b319721da26..3285aa5a706 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -11,24 +11,11 @@
#include <linux/jbd2.h>
#include "ext4.h"
-#ifdef EXT4FS_DEBUG
-
-static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
-
-unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars)
+unsigned int ext4_count_free(char *bitmap, unsigned int numchars)
{
- unsigned int i, sum = 0;
-
- if (!map)
- return 0;
- for (i = 0; i < numchars; i++)
- sum += nibblemap[map->b_data[i] & 0xf] +
- nibblemap[(map->b_data[i] >> 4) & 0xf];
- return sum;
+ return numchars * BITS_PER_BYTE - memweight(bitmap, numchars);
}
-#endif /* EXT4FS_DEBUG */
-
int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
struct ext4_group_desc *gdp,
struct buffer_head *bh, int sz)
@@ -71,11 +58,12 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
struct ext4_group_desc *gdp,
- struct buffer_head *bh, int sz)
+ struct buffer_head *bh)
{
__u32 hi;
__u32 provided, calculated;
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
@@ -92,14 +80,14 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
if (provided == calculated)
return 1;
- ext4_error(sb, "Bad block bitmap checksum: block_group = %u", group);
return 0;
}
void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
struct ext4_group_desc *gdp,
- struct buffer_head *bh, int sz)
+ struct buffer_head *bh)
{
+ int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
__u32 csum;
struct ext4_sb_info *sbi = EXT4_SB(sb);
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index aa39e600d15..80a28b29727 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -27,23 +27,11 @@
#include <linux/slab.h>
#include <linux/rbtree.h>
#include "ext4.h"
-
-static unsigned char ext4_filetype_table[] = {
- DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-};
+#include "xattr.h"
static int ext4_dx_readdir(struct file *filp,
void *dirent, filldir_t filldir);
-static unsigned char get_dtype(struct super_block *sb, int filetype)
-{
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
- (filetype >= EXT4_FT_MAX))
- return DT_UNKNOWN;
-
- return (ext4_filetype_table[filetype]);
-}
-
/**
* Check if the given dir-inode refers to an htree-indexed directory
* (or a directory which chould potentially get coverted to use htree
@@ -68,11 +56,14 @@ static int is_dx_dir(struct inode *inode)
* Return 0 if the directory entry is OK, and 1 if there is a problem
*
* Note: this is the opposite of what ext2 and ext3 historically returned...
+ *
+ * bh passed here can be an inode block or a dir data block, depending
+ * on the inode inline data flag.
*/
int __ext4_check_dir_entry(const char *function, unsigned int line,
struct inode *dir, struct file *filp,
struct ext4_dir_entry_2 *de,
- struct buffer_head *bh,
+ struct buffer_head *bh, char *buf, int size,
unsigned int offset)
{
const char *error_msg = NULL;
@@ -85,9 +76,8 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
error_msg = "rec_len % 4 != 0";
else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
error_msg = "rec_len is too small for name_len";
- else if (unlikely(((char *) de - bh->b_data) + rlen >
- dir->i_sb->s_blocksize))
- error_msg = "directory entry across blocks";
+ else if (unlikely(((char *) de - buf) + rlen > size))
+ error_msg = "directory entry across range";
else if (unlikely(le32_to_cpu(de->inode) >
le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
error_msg = "inode out of bounds";
@@ -98,14 +88,14 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
ext4_error_file(filp, function, line, bh->b_blocknr,
"bad entry in directory: %s - offset=%u(%u), "
"inode=%u, rec_len=%d, name_len=%d",
- error_msg, (unsigned) (offset % bh->b_size),
+ error_msg, (unsigned) (offset % size),
offset, le32_to_cpu(de->inode),
rlen, de->name_len);
else
ext4_error_inode(dir, function, line, bh->b_blocknr,
"bad entry in directory: %s - offset=%u(%u), "
"inode=%u, rec_len=%d, name_len=%d",
- error_msg, (unsigned) (offset % bh->b_size),
+ error_msg, (unsigned) (offset % size),
offset, le32_to_cpu(de->inode),
rlen, de->name_len);
@@ -125,6 +115,14 @@ static int ext4_readdir(struct file *filp,
int ret = 0;
int dir_has_error = 0;
+ if (ext4_has_inline_data(inode)) {
+ int has_inline_data = 1;
+ ret = ext4_read_inline_dir(filp, dirent, filldir,
+ &has_inline_data);
+ if (has_inline_data)
+ return ret;
+ }
+
if (is_dx_dir(inode)) {
err = ext4_dx_readdir(filp, dirent, filldir);
if (err != ERR_BAD_DX_DIR) {
@@ -221,8 +219,9 @@ revalidate:
while (!error && filp->f_pos < inode->i_size
&& offset < sb->s_blocksize) {
de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
- if (ext4_check_dir_entry(inode, filp, de,
- bh, offset)) {
+ if (ext4_check_dir_entry(inode, filp, de, bh,
+ bh->b_data, bh->b_size,
+ offset)) {
/*
* On error, skip the f_pos to the next block
*/
@@ -324,74 +323,27 @@ static inline loff_t ext4_get_htree_eof(struct file *filp)
/*
- * ext4_dir_llseek() based on generic_file_llseek() to handle both
- * non-htree and htree directories, where the "offset" is in terms
- * of the filename hash value instead of the byte offset.
+ * ext4_dir_llseek() calls generic_file_llseek_size to handle htree
+ * directories, where the "offset" is in terms of the filename hash
+ * value instead of the byte offset.
*
- * NOTE: offsets obtained *before* ext4_set_inode_flag(dir, EXT4_INODE_INDEX)
- * will be invalid once the directory was converted into a dx directory
+ * Because we may return a 64-bit hash that is well beyond offset limits,
+ * we need to pass the max hash as the maximum allowable offset in
+ * the htree directory case.
+ *
+ * For non-htree, ext4_llseek already chooses the proper max offset.
*/
-loff_t ext4_dir_llseek(struct file *file, loff_t offset, int origin)
+loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
- loff_t ret = -EINVAL;
int dx_dir = is_dx_dir(inode);
+ loff_t htree_max = ext4_get_htree_eof(file);
- mutex_lock(&inode->i_mutex);
-
- /* NOTE: relative offsets with dx directories might not work
- * as expected, as it is difficult to figure out the
- * correct offset between dx hashes */
-
- switch (origin) {
- case SEEK_END:
- if (unlikely(offset > 0))
- goto out_err; /* not supported for directories */
-
- /* so only negative offsets are left, does that have a
- * meaning for directories at all? */
- if (dx_dir)
- offset += ext4_get_htree_eof(file);
- else
- offset += inode->i_size;
- break;
- case SEEK_CUR:
- /*
- * Here we special-case the lseek(fd, 0, SEEK_CUR)
- * position-querying operation. Avoid rewriting the "same"
- * f_pos value back to the file because a concurrent read(),
- * write() or lseek() might have altered it
- */
- if (offset == 0) {
- offset = file->f_pos;
- goto out_ok;
- }
-
- offset += file->f_pos;
- break;
- }
-
- if (unlikely(offset < 0))
- goto out_err;
-
- if (!dx_dir) {
- if (offset > inode->i_sb->s_maxbytes)
- goto out_err;
- } else if (offset > ext4_get_htree_eof(file))
- goto out_err;
-
- /* Special lock needed here? */
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- }
-
-out_ok:
- ret = offset;
-out_err:
- mutex_unlock(&inode->i_mutex);
-
- return ret;
+ if (likely(dx_dir))
+ return generic_file_llseek_size(file, offset, whence,
+ htree_max, htree_max);
+ else
+ return ext4_llseek(file, offset, whence);
}
/*
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index cfc4e01b3c8..8462eb3c33a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -57,6 +57,16 @@
#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
+/*
+ * Turn on EXT_DEBUG to get lots of info about extents operations.
+ */
+#define EXT_DEBUG__
+#ifdef EXT_DEBUG
+#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__)
+#else
+#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
+#endif
+
#define EXT4_ERROR_INODE(inode, fmt, a...) \
ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)
@@ -186,7 +196,6 @@ struct mpage_da_data {
#define EXT4_IO_END_ERROR 0x0002
#define EXT4_IO_END_QUEUED 0x0004
#define EXT4_IO_END_DIRECT 0x0008
-#define EXT4_IO_END_IN_FSYNC 0x0010
struct ext4_io_page {
struct page *p_page;
@@ -393,6 +402,7 @@ struct flex_groups {
#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
+#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */
@@ -449,28 +459,26 @@ enum {
EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
+ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */
EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
};
-#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
-#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
- printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
- EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
-
-/*
- * Since it's pretty easy to mix up bit numbers and hex values, and we
- * can't do a compile-time test for ENUM values, we use a run-time
- * test to make sure that EXT4_XXX_FL is consistent with respect to
- * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop
- * out so it won't cost any extra space in the compiled kernel image.
- * But it's important that these values are the same, since we are
- * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL
- * must be consistent with the values of FS_XXX_FL defined in
- * include/linux/fs.h and the on-disk values found in ext2, ext3, and
- * ext4 filesystems, and of course the values defined in e2fsprogs.
+/*
+ * Since it's pretty easy to mix up bit numbers and hex values, we use a
+ * build-time check to make sure that EXT4_XXX_FL is consistent with respect to
+ * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost
+ * any extra space in the compiled kernel image, otherwise, the build will fail.
+ * It's important that these values are the same, since we are using
+ * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent
+ * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk
+ * values found in ext2, ext3 and ext4 filesystems, and of course the values
+ * defined in e2fsprogs.
*
* It's not paranoia if the Murphy's Law really *is* out to get you. :-)
*/
+#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
+#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG))
+
static inline void ext4_check_flag_values(void)
{
CHECK_FLAG_VALUE(SECRM);
@@ -495,6 +503,7 @@ static inline void ext4_check_flag_values(void)
CHECK_FLAG_VALUE(EXTENTS);
CHECK_FLAG_VALUE(EA_INODE);
CHECK_FLAG_VALUE(EOFBLOCKS);
+ CHECK_FLAG_VALUE(INLINE_DATA);
CHECK_FLAG_VALUE(RESERVED);
}
@@ -571,6 +580,8 @@ enum {
#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040
/* Request will not result in inode size update (user for fallocate) */
#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080
+ /* Do not take i_data_sem locking in ext4_map_blocks */
+#define EXT4_GET_BLOCKS_NO_LOCK 0x0100
/*
* Flags used by ext4_free_blocks
@@ -810,6 +821,8 @@ struct ext4_ext_cache {
__u32 ec_len; /* must be 32bit to return holes */
};
+#include "extents_status.h"
+
/*
* fourth extended file system inode data in memory
*/
@@ -832,7 +845,6 @@ struct ext4_inode_info {
#endif
unsigned long i_flags;
-#ifdef CONFIG_EXT4_FS_XATTR
/*
* Extended attributes can be read independently of the main file
* data. Taking i_mutex even when reading would cause contention
@@ -841,7 +853,6 @@ struct ext4_inode_info {
* EAs.
*/
struct rw_semaphore xattr_sem;
-#endif
struct list_head i_orphan; /* unlinked but open inodes */
@@ -887,6 +898,10 @@ struct ext4_inode_info {
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;
+ /* extents status tree */
+ struct ext4_es_tree i_es_tree;
+ rwlock_t i_es_lock;
+
/* ialloc */
ext4_group_t i_last_alloc_group;
@@ -901,6 +916,10 @@ struct ext4_inode_info {
/* on-disk additional length */
__u16 i_extra_isize;
+ /* Indicate the inline data space. */
+ u16 i_inline_off;
+ u16 i_inline_size;
+
#ifdef CONFIG_QUOTA
/* quota space reservation, managed internally by quota code */
qsize_t i_reserved_quota;
@@ -910,9 +929,7 @@ struct ext4_inode_info {
struct list_head i_completed_io_list;
spinlock_t i_completed_io_lock;
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
- /* current io_end structure for async DIO write*/
- ext4_io_end_t *cur_aio_dio;
- atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
+ atomic_t i_unwritten; /* Nr. of inflight conversions pending */
spinlock_t i_block_reservation_lock;
@@ -1161,8 +1178,7 @@ struct ext4_sb_info {
unsigned long s_desc_per_block; /* Number of group descriptors per block */
ext4_group_t s_groups_count; /* Number of groups in the fs */
ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
- unsigned long s_overhead_last; /* Last calculated overhead */
- unsigned long s_blocks_last; /* Last seen block count */
+ unsigned long s_overhead; /* # of fs overhead clusters */
unsigned int s_cluster_ratio; /* Number of blocks per cluster */
unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */
loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
@@ -1232,6 +1248,7 @@ struct ext4_sb_info {
spinlock_t s_md_lock;
unsigned short *s_mb_offsets;
unsigned int *s_mb_maxs;
+ unsigned int s_group_info_size;
/* tunables */
unsigned long s_stripe;
@@ -1242,6 +1259,7 @@ struct ext4_sb_info {
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
unsigned int s_max_writeback_mb_bump;
+ unsigned int s_max_dir_size_kb;
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
unsigned long s_mb_last_start;
@@ -1269,8 +1287,12 @@ struct ext4_sb_info {
unsigned long s_sectors_written_start;
u64 s_kbytes_written;
+ /* the size of zero-out chunk */
+ unsigned int s_extent_max_zeroout_kb;
+
unsigned int s_log_groups_per_flex;
struct flex_groups *s_flex_groups;
+ ext4_group_t s_flex_groups_allocated;
/* workqueue for dio unwritten */
struct workqueue_struct *dio_unwritten_wq;
@@ -1314,6 +1336,8 @@ static inline struct timespec ext4_current_time(struct inode *inode)
static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
{
return ino == EXT4_ROOT_INO ||
+ ino == EXT4_USR_QUOTA_INO ||
+ ino == EXT4_GRP_QUOTA_INO ||
ino == EXT4_JOURNAL_INO ||
ino == EXT4_RESIZE_INO ||
(ino >= EXT4_FIRST_INO(sb) &&
@@ -1325,10 +1349,20 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
{
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
io_end->flag |= EXT4_IO_END_UNWRITTEN;
- atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+ atomic_inc(&EXT4_I(inode)->i_unwritten);
}
}
+static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
+{
+ return inode->i_private;
+}
+
+static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
+{
+ inode->i_private = io;
+}
+
/*
* Inode dynamic state flags
*/
@@ -1342,6 +1376,9 @@ enum {
EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
EXT4_STATE_NEWENTRY, /* File just added to dir */
EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */
+ EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read
+ nolocking */
+ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1463,7 +1500,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */
#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */
#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */
-#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x8000 /* data in inode */
+#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */
#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1487,7 +1524,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
EXT4_FEATURE_INCOMPAT_EXTENTS| \
EXT4_FEATURE_INCOMPAT_64BIT| \
EXT4_FEATURE_INCOMPAT_FLEX_BG| \
- EXT4_FEATURE_INCOMPAT_MMP)
+ EXT4_FEATURE_INCOMPAT_MMP | \
+ EXT4_FEATURE_INCOMPAT_INLINE_DATA)
#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1496,7 +1534,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
+ EXT4_FEATURE_RO_COMPAT_QUOTA)
/*
* Default values for user and/or group using reserved blocks
@@ -1573,6 +1612,11 @@ struct ext4_dir_entry_tail {
__le32 det_checksum; /* crc32c(uuid+inum+dirblock) */
};
+#define EXT4_DIRENT_TAIL(block, blocksize) \
+ ((struct ext4_dir_entry_tail *)(((void *)(block)) + \
+ ((blocksize) - \
+ sizeof(struct ext4_dir_entry_tail))))
+
/*
* Ext4 directory file types. Only the low 3 bits are used. The
* other bits are reserved for now.
@@ -1663,10 +1707,12 @@ static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
{
struct {
struct shash_desc shash;
- char ctx[crypto_shash_descsize(sbi->s_chksum_driver)];
+ char ctx[4];
} desc;
int err;
+ BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx));
+
desc.shash.tfm = sbi->s_chksum_driver;
desc.shash.flags = 0;
*(u32 *)desc.ctx = crc;
@@ -1852,7 +1898,7 @@ struct mmpd_data {
# define NORET_AND noreturn,
/* bitmap.c */
-extern unsigned int ext4_count_free(struct buffer_head *, unsigned);
+extern unsigned int ext4_count_free(char *bitmap, unsigned numchars);
void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
struct ext4_group_desc *gdp,
struct buffer_head *bh, int sz);
@@ -1861,10 +1907,10 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
struct buffer_head *bh, int sz);
void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
struct ext4_group_desc *gdp,
- struct buffer_head *bh, int sz);
+ struct buffer_head *bh);
int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
struct ext4_group_desc *gdp,
- struct buffer_head *bh, int sz);
+ struct buffer_head *bh);
/* balloc.c */
extern void ext4_validate_block_bitmap(struct super_block *sb,
@@ -1915,18 +1961,46 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
struct file *,
struct ext4_dir_entry_2 *,
- struct buffer_head *, unsigned int);
-#define ext4_check_dir_entry(dir, filp, de, bh, offset) \
+ struct buffer_head *, char *, int,
+ unsigned int);
+#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \
unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
- (de), (bh), (offset)))
+ (de), (bh), (buf), (size), (offset)))
extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
__u32 minor_hash,
struct ext4_dir_entry_2 *dirent);
extern void ext4_htree_free_dir_info(struct dir_private_info *p);
+extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
+ struct buffer_head *bh,
+ void *buf, int buf_size,
+ const char *name, int namelen,
+ struct ext4_dir_entry_2 **dest_de);
+void ext4_insert_dentry(struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int buf_size,
+ const char *name, int namelen);
+static inline void ext4_update_dx_flag(struct inode *inode)
+{
+ if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_COMPAT_DIR_INDEX))
+ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
+}
+static unsigned char ext4_filetype_table[] = {
+ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+static inline unsigned char get_dtype(struct super_block *sb, int filetype)
+{
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
+ (filetype >= EXT4_FT_MAX))
+ return DT_UNKNOWN;
+
+ return ext4_filetype_table[filetype];
+}
/* fsync.c */
extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
-extern int ext4_flush_completed_IO(struct inode *);
+extern int ext4_flush_unwritten_io(struct inode *);
/* hash.c */
extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1960,6 +2034,8 @@ extern void ext4_exit_mballoc(void);
extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t block,
unsigned long count, int flags);
+extern int ext4_mb_alloc_groupinfo(struct super_block *sb,
+ ext4_group_t ngroups);
extern int ext4_mb_add_groupinfo(struct super_block *sb,
ext4_group_t i, struct ext4_group_desc *desc);
extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
@@ -1971,8 +2047,23 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *,
ext4_lblk_t, int, int *);
struct buffer_head *ext4_bread(handle_t *, struct inode *,
ext4_lblk_t, int, int *);
+int ext4_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
+int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create);
+int ext4_walk_page_buffers(handle_t *handle,
+ struct buffer_head *head,
+ unsigned from,
+ unsigned to,
+ int *partial,
+ int (*fn)(handle_t *handle,
+ struct buffer_head *bh));
+int do_journal_get_write_access(handle_t *handle,
+ struct buffer_head *bh);
+#define FALL_BACK_TO_NONDELALLOC 1
+#define CONVERT_INLINE_DATA 2
extern struct inode *ext4_iget(struct super_block *, unsigned long);
extern int ext4_write_inode(struct inode *, struct writeback_control *);
@@ -2027,6 +2118,20 @@ extern int ext4_orphan_add(handle_t *, struct inode *);
extern int ext4_orphan_del(handle_t *, struct inode *);
extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
__u32 start_minor_hash, __u32 *next_hash);
+extern int search_dir(struct buffer_head *bh,
+ char *search_buf,
+ int buf_size,
+ struct inode *dir,
+ const struct qstr *d_name,
+ unsigned int offset,
+ struct ext4_dir_entry_2 **res_dir);
+extern int ext4_generic_delete_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh,
+ void *entry_buf,
+ int buf_size,
+ int csum_size);
/* resize.c */
extern int ext4_group_add(struct super_block *sb,
@@ -2037,13 +2142,15 @@ extern int ext4_group_extend(struct super_block *sb,
extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
/* super.c */
+extern int ext4_calculate_overhead(struct super_block *sb);
extern int ext4_superblock_csum_verify(struct super_block *sb,
struct ext4_super_block *es);
-extern void ext4_superblock_csum_set(struct super_block *sb,
- struct ext4_super_block *es);
+extern void ext4_superblock_csum_set(struct super_block *sb);
extern void *ext4_kvmalloc(size_t size, gfp_t flags);
extern void *ext4_kvzalloc(size_t size, gfp_t flags);
extern void ext4_kvfree(void *ptr);
+extern int ext4_alloc_flex_bg_array(struct super_block *sb,
+ ext4_group_t ngroup);
extern __printf(4, 5)
void __ext4_error(struct super_block *, const char *, unsigned int,
const char *, ...);
@@ -2321,15 +2428,6 @@ static inline void ext4_unlock_group(struct super_block *sb,
spin_unlock(ext4_group_lock_ptr(sb, group));
}
-static inline void ext4_mark_super_dirty(struct super_block *sb)
-{
- struct ext4_super_block *es = EXT4_SB(sb)->s_es;
-
- ext4_superblock_csum_set(sb, es);
- if (EXT4_SB(sb)->s_journal == NULL)
- sb->s_dirt =1;
-}
-
/*
* Block validity checking
*/
@@ -2354,11 +2452,21 @@ extern const struct file_operations ext4_dir_operations;
extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;
extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
+extern void ext4_unwritten_wait(struct inode *inode);
/* namei.c */
extern const struct inode_operations ext4_dir_inode_operations;
extern const struct inode_operations ext4_special_inode_operations;
extern struct dentry *ext4_get_parent(struct dentry *child);
+extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int blocksize, int csum_size,
+ unsigned int parent_ino, int dotdot_real_len);
+extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
+ unsigned int blocksize);
+extern int ext4_handle_dirty_dirent_node(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *bh);
/* symlink.c */
extern const struct inode_operations ext4_symlink_inode_operations;
@@ -2376,6 +2484,9 @@ extern int ext4_check_blockref(const char *, unsigned int,
struct inode *, __le32 *, unsigned int);
/* extents.c */
+struct ext4_ext_path;
+struct ext4_extent;
+
extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
@@ -2393,8 +2504,27 @@ extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
ssize_t len);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
+extern int ext4_ext_calc_metadata_amount(struct inode *inode,
+ ext4_lblk_t lblocks);
+extern int ext4_extent_tree_init(handle_t *, struct inode *);
+extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
+ int num,
+ struct ext4_ext_path *path);
+extern int ext4_can_extents_be_merged(struct inode *inode,
+ struct ext4_extent *ex1,
+ struct ext4_extent *ex2);
+extern int ext4_ext_insert_extent(handle_t *, struct inode *,
+ struct ext4_ext_path *,
+ struct ext4_extent *, int);
+extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
+ struct ext4_ext_path *);
+extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern int ext4_ext_check_inode(struct inode *inode);
+extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
+
+
/* move_extent.c */
extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
__u64 start_orig, __u64 start_donor,
@@ -2402,11 +2532,11 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
/* page-io.c */
extern int __init ext4_init_pageio(void);
+extern void ext4_add_complete_io(ext4_io_end_t *io_end);
extern void ext4_exit_pageio(void);
extern void ext4_ioend_wait(struct inode *);
extern void ext4_free_io_end(ext4_io_end_t *io);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
-extern int ext4_end_io_nolock(ext4_io_end_t *io);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
@@ -2428,17 +2558,13 @@ enum ext4_state_bits {
* never, ever appear in a buffer_head's state
* flag. See EXT4_MAP_FROM_CLUSTER to see where
* this is used. */
- BH_Da_Mapped, /* Delayed allocated block that now has a mapping. This
- * flag is set when ext4_map_blocks is called on a
- * delayed allocated block to get its real mapping. */
};
BUFFER_FNS(Uninit, uninit)
TAS_BUFFER_FNS(Uninit, uninit)
-BUFFER_FNS(Da_Mapped, da_mapped)
/*
- * Add new method to test wether block and inode bitmaps are properly
+ * Add new method to test whether block and inode bitmaps are properly
* initialized. With uninit_bg reading the block from disk is not enough
* to mark the bitmap uptodate. We need to also zero-out the bitmap
*/
@@ -2454,6 +2580,21 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
}
+/*
+ * Disable DIO read nolock optimization, so new dioreaders will be forced
+ * to grab i_mutex
+ */
+static inline void ext4_inode_block_unlocked_dio(struct inode *inode)
+{
+ ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+ smp_mb();
+}
+static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
+{
+ smp_mb();
+ ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+}
+
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
/* For ioend & aio unwritten conversion wait queues */
@@ -2471,6 +2612,4 @@ extern void ext4_resize_end(struct super_block *sb);
#endif /* __KERNEL__ */
-#include "ext4_extents.h"
-
#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index cb1b2c91996..487fda12bc0 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -43,16 +43,6 @@
#define CHECK_BINSEARCH__
/*
- * Turn on EXT_DEBUG to get lots of info about extents operations.
- */
-#define EXT_DEBUG__
-#ifdef EXT_DEBUG
-#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__)
-#else
-#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
-#endif
-
-/*
* If EXT_STATS is defined then stats numbers are collected.
* These number will be displayed at umount time.
*/
@@ -144,20 +134,6 @@ struct ext4_ext_path {
*/
/*
- * to be called by ext4_ext_walk_space()
- * negative retcode - error
- * positive retcode - signal for ext4_ext_walk_space(), see below
- * callback must return valid extent (passed or newly created)
- */
-typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t,
- struct ext4_ext_cache *,
- struct ext4_extent *, void *);
-
-#define EXT_CONTINUE 0
-#define EXT_BREAK 1
-#define EXT_REPEAT 2
-
-/*
* Maximum number of logical blocks in a file; ext4_extent's ee_block is
* __le32.
*/
@@ -300,21 +276,5 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
0xffff);
}
-extern int ext4_ext_calc_metadata_amount(struct inode *inode,
- ext4_lblk_t lblocks);
-extern int ext4_extent_tree_init(handle_t *, struct inode *);
-extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
- int num,
- struct ext4_ext_path *path);
-extern int ext4_can_extents_be_merged(struct inode *inode,
- struct ext4_extent *ex1,
- struct ext4_extent *ex2);
-extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
-extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
- struct ext4_ext_path *);
-extern void ext4_ext_drop_refs(struct ext4_ext_path *);
-extern int ext4_ext_check_inode(struct inode *inode);
-extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
- int search_hint_reverse);
#endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 90f7c2e84db..b4323ba846b 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -138,24 +138,18 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
}
int __ext4_handle_dirty_super(const char *where, unsigned int line,
- handle_t *handle, struct super_block *sb,
- int now)
+ handle_t *handle, struct super_block *sb)
{
struct buffer_head *bh = EXT4_SB(sb)->s_sbh;
int err = 0;
+ ext4_superblock_csum_set(sb);
if (ext4_handle_valid(handle)) {
- ext4_superblock_csum_set(sb,
- (struct ext4_super_block *)bh->b_data);
err = jbd2_journal_dirty_metadata(handle, bh);
if (err)
ext4_journal_abort_handle(where, line, __func__,
bh, handle, err);
- } else if (now) {
- ext4_superblock_csum_set(sb,
- (struct ext4_super_block *)bh->b_data);
- mark_buffer_dirty(bh);
} else
- sb->s_dirt = 1;
+ mark_buffer_dirty(bh);
return err;
}
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index f440e8f1841..7177f9b21cb 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -87,14 +87,20 @@
#ifdef CONFIG_QUOTA
/* Amount of blocks needed for quota update - we know that the structure was
* allocated so we need to update only data block */
-#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 1 : 0)
+#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
+ EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
+ 1 : 0)
/* Amount of blocks needed for quota insert/delete - we do some block writes
* but inode, sb and group updates are done only once */
-#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
- (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0)
-
-#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
- (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0)
+#define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
+ EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
+ (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
+ +3+DQUOT_INIT_REWRITE) : 0)
+
+#define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
+ EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
+ (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
+ +3+DQUOT_DEL_REWRITE) : 0)
#else
#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0
#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
@@ -213,8 +219,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
struct buffer_head *bh);
int __ext4_handle_dirty_super(const char *where, unsigned int line,
- handle_t *handle, struct super_block *sb,
- int now);
+ handle_t *handle, struct super_block *sb);
#define ext4_journal_get_write_access(handle, bh) \
__ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
@@ -226,10 +231,8 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
#define ext4_handle_dirty_metadata(handle, inode, bh) \
__ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
(bh))
-#define ext4_handle_dirty_super_now(handle, sb) \
- __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb), 1)
#define ext4_handle_dirty_super(handle, sb) \
- __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb), 0)
+ __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
@@ -251,13 +254,6 @@ static inline void ext4_handle_sync(handle_t *handle)
handle->h_sync = 1;
}
-static inline void ext4_handle_release_buffer(handle_t *handle,
- struct buffer_head *bh)
-{
- if (ext4_handle_valid(handle))
- jbd2_journal_release_buffer(handle, bh);
-}
-
static inline int ext4_handle_is_aborted(handle_t *handle)
{
if (ext4_handle_valid(handle))
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 91341ec6e06..5ae1674ec12 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -41,6 +41,8 @@
#include <asm/uaccess.h>
#include <linux/fiemap.h>
#include "ext4_jbd2.h"
+#include "ext4_extents.h"
+#include "xattr.h"
#include <trace/events/ext4.h>
@@ -52,6 +54,9 @@
#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
+#define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */
+#define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */
+
static __le32 ext4_extent_block_csum(struct inode *inode,
struct ext4_extent_header *eh)
{
@@ -106,6 +111,9 @@ static int ext4_split_extent_at(handle_t *handle,
int split_flag,
int flags);
+static int ext4_find_delayed_extent(struct inode *inode,
+ struct ext4_ext_cache *newex);
+
static int ext4_ext_truncate_extend_restart(handle_t *handle,
struct inode *inode,
int needed)
@@ -1177,7 +1185,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
- neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1);
+ le16_add_cpu(&neh->eh_depth, 1);
ext4_mark_inode_dirty(handle, inode);
out:
brelse(bh);
@@ -1656,16 +1664,60 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
}
/*
+ * This function does a very simple check to see if we can collapse
+ * an extent tree with a single extent tree leaf block into the inode.
+ */
+static void ext4_ext_try_to_merge_up(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ size_t s;
+ unsigned max_root = ext4_ext_space_root(inode, 0);
+ ext4_fsblk_t blk;
+
+ if ((path[0].p_depth != 1) ||
+ (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
+ (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
+ return;
+
+ /*
+ * We need to modify the block allocation bitmap and the block
+ * group descriptor to release the extent tree block. If we
+ * can't get the journal credits, give up.
+ */
+ if (ext4_journal_extend(handle, 2))
+ return;
+
+ /*
+ * Copy the extent data up to the inode
+ */
+ blk = ext4_idx_pblock(path[0].p_idx);
+ s = le16_to_cpu(path[1].p_hdr->eh_entries) *
+ sizeof(struct ext4_extent_idx);
+ s += sizeof(struct ext4_extent_header);
+
+ memcpy(path[0].p_hdr, path[1].p_hdr, s);
+ path[0].p_depth = 0;
+ path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
+ (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
+ path[0].p_hdr->eh_max = cpu_to_le16(max_root);
+
+ brelse(path[1].p_bh);
+ ext4_free_blocks(handle, inode, NULL, blk, 1,
+ EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
+}
+
+/*
* This function tries to merge the @ex extent to neighbours in the tree.
* return 1 if merge left else 0.
*/
-static int ext4_ext_try_to_merge(struct inode *inode,
+static void ext4_ext_try_to_merge(handle_t *handle,
+ struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *ex) {
struct ext4_extent_header *eh;
unsigned int depth;
int merge_done = 0;
- int ret = 0;
depth = ext_depth(inode);
BUG_ON(path[depth].p_hdr == NULL);
@@ -1675,9 +1727,9 @@ static int ext4_ext_try_to_merge(struct inode *inode,
merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
if (!merge_done)
- ret = ext4_ext_try_to_merge_right(inode, path, ex);
+ (void) ext4_ext_try_to_merge_right(inode, path, ex);
- return ret;
+ ext4_ext_try_to_merge_up(handle, inode, path);
}
/*
@@ -1891,18 +1943,17 @@ has_space:
nearex->ee_len = newext->ee_len;
merge:
- /* try to merge extents to the right */
+ /* try to merge extents */
if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
- ext4_ext_try_to_merge(inode, path, nearex);
+ ext4_ext_try_to_merge(handle, inode, path, nearex);
- /* try to merge extents to the left */
/* time to correct all indexes above */
err = ext4_ext_correct_indexes(handle, inode, path);
if (err)
goto cleanup;
- err = ext4_ext_dirty(handle, inode, path + depth);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
cleanup:
if (npath) {
@@ -1913,27 +1964,33 @@ cleanup:
return err;
}
-static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
- ext4_lblk_t num, ext_prepare_callback func,
- void *cbdata)
+static int ext4_fill_fiemap_extents(struct inode *inode,
+ ext4_lblk_t block, ext4_lblk_t num,
+ struct fiemap_extent_info *fieinfo)
{
struct ext4_ext_path *path = NULL;
- struct ext4_ext_cache cbex;
+ struct ext4_ext_cache newex;
struct ext4_extent *ex;
- ext4_lblk_t next, start = 0, end = 0;
+ ext4_lblk_t next, next_del, start = 0, end = 0;
ext4_lblk_t last = block + num;
- int depth, exists, err = 0;
-
- BUG_ON(func == NULL);
- BUG_ON(inode == NULL);
+ int exists, depth = 0, err = 0;
+ unsigned int flags = 0;
+ unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
while (block < last && block != EXT_MAX_BLOCKS) {
num = last - block;
/* find extent for this block */
down_read(&EXT4_I(inode)->i_data_sem);
+
+ if (path && ext_depth(inode) != depth) {
+ /* depth was changed. we have to realloc path */
+ kfree(path);
+ path = NULL;
+ }
+
path = ext4_ext_find_extent(inode, block, path);
- up_read(&EXT4_I(inode)->i_data_sem);
if (IS_ERR(path)) {
+ up_read(&EXT4_I(inode)->i_data_sem);
err = PTR_ERR(path);
path = NULL;
break;
@@ -1941,13 +1998,16 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
depth = ext_depth(inode);
if (unlikely(path[depth].p_hdr == NULL)) {
+ up_read(&EXT4_I(inode)->i_data_sem);
EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
err = -EIO;
break;
}
ex = path[depth].p_ext;
next = ext4_ext_next_allocated_block(path);
+ ext4_ext_drop_refs(path);
+ flags = 0;
exists = 0;
if (!ex) {
/* there is no extent yet, so try to allocate
@@ -1984,40 +2044,64 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
BUG_ON(end <= start);
if (!exists) {
- cbex.ec_block = start;
- cbex.ec_len = end - start;
- cbex.ec_start = 0;
+ newex.ec_block = start;
+ newex.ec_len = end - start;
+ newex.ec_start = 0;
} else {
- cbex.ec_block = le32_to_cpu(ex->ee_block);
- cbex.ec_len = ext4_ext_get_actual_len(ex);
- cbex.ec_start = ext4_ext_pblock(ex);
+ newex.ec_block = le32_to_cpu(ex->ee_block);
+ newex.ec_len = ext4_ext_get_actual_len(ex);
+ newex.ec_start = ext4_ext_pblock(ex);
+ if (ext4_ext_is_uninitialized(ex))
+ flags |= FIEMAP_EXTENT_UNWRITTEN;
}
- if (unlikely(cbex.ec_len == 0)) {
- EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
- err = -EIO;
- break;
+ /*
+ * Find delayed extent and update newex accordingly. We call
+ * it even in !exists case to find out whether newex is the
+ * last existing extent or not.
+ */
+ next_del = ext4_find_delayed_extent(inode, &newex);
+ if (!exists && next_del) {
+ exists = 1;
+ flags |= FIEMAP_EXTENT_DELALLOC;
}
- err = func(inode, next, &cbex, ex, cbdata);
- ext4_ext_drop_refs(path);
+ up_read(&EXT4_I(inode)->i_data_sem);
- if (err < 0)
+ if (unlikely(newex.ec_len == 0)) {
+ EXT4_ERROR_INODE(inode, "newex.ec_len == 0");
+ err = -EIO;
break;
+ }
- if (err == EXT_REPEAT)
- continue;
- else if (err == EXT_BREAK) {
- err = 0;
- break;
+ /* This is possible iff next == next_del == EXT_MAX_BLOCKS */
+ if (next == next_del) {
+ flags |= FIEMAP_EXTENT_LAST;
+ if (unlikely(next_del != EXT_MAX_BLOCKS ||
+ next != EXT_MAX_BLOCKS)) {
+ EXT4_ERROR_INODE(inode,
+ "next extent == %u, next "
+ "delalloc extent = %u",
+ next, next_del);
+ err = -EIO;
+ break;
+ }
}
- if (ext_depth(inode) != depth) {
- /* depth was changed. we have to realloc path */
- kfree(path);
- path = NULL;
+ if (exists) {
+ err = fiemap_fill_next_extent(fieinfo,
+ (__u64)newex.ec_block << blksize_bits,
+ (__u64)newex.ec_start << blksize_bits,
+ (__u64)newex.ec_len << blksize_bits,
+ flags);
+ if (err < 0)
+ break;
+ if (err == 1) {
+ err = 0;
+ break;
+ }
}
- block = cbex.ec_block + cbex.ec_len;
+ block = newex.ec_block + newex.ec_len;
}
if (path) {
@@ -2093,13 +2177,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
}
/*
- * ext4_ext_check_cache()
+ * ext4_ext_in_cache()
* Checks to see if the given block is in the cache.
* If it is, the cached extent is stored in the given
- * cache extent pointer. If the cached extent is a hole,
- * this routine should be used instead of
- * ext4_ext_in_cache if the calling function needs to
- * know the size of the hole.
+ * cache extent pointer.
*
* @inode: The files inode
* @block: The block to look for in the cache
@@ -2108,10 +2189,11 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
*
* Return 0 if cache is invalid; 1 if the cache is valid
*/
-static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
- struct ext4_ext_cache *ex){
+static int
+ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
+ struct ext4_extent *ex)
+{
struct ext4_ext_cache *cex;
- struct ext4_sb_info *sbi;
int ret = 0;
/*
@@ -2119,14 +2201,15 @@ static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
*/
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
cex = &EXT4_I(inode)->i_cached_extent;
- sbi = EXT4_SB(inode->i_sb);
/* has cache valid data? */
if (cex->ec_len == 0)
goto errout;
if (in_range(block, cex->ec_block, cex->ec_len)) {
- memcpy(ex, cex, sizeof(struct ext4_ext_cache));
+ ex->ee_block = cpu_to_le32(cex->ec_block);
+ ext4_ext_store_pblock(ex, cex->ec_start);
+ ex->ee_len = cpu_to_le16(cex->ec_len);
ext_debug("%u cached by %u:%u:%llu\n",
block,
cex->ec_block, cex->ec_len, cex->ec_start);
@@ -2139,48 +2222,18 @@ errout:
}
/*
- * ext4_ext_in_cache()
- * Checks to see if the given block is in the cache.
- * If it is, the cached extent is stored in the given
- * extent pointer.
- *
- * @inode: The files inode
- * @block: The block to look for in the cache
- * @ex: Pointer where the cached extent will be stored
- * if it contains block
- *
- * Return 0 if cache is invalid; 1 if the cache is valid
- */
-static int
-ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
- struct ext4_extent *ex)
-{
- struct ext4_ext_cache cex;
- int ret = 0;
-
- if (ext4_ext_check_cache(inode, block, &cex)) {
- ex->ee_block = cpu_to_le32(cex.ec_block);
- ext4_ext_store_pblock(ex, cex.ec_start);
- ex->ee_len = cpu_to_le16(cex.ec_len);
- ret = 1;
- }
-
- return ret;
-}
-
-
-/*
* ext4_ext_rm_idx:
* removes index from the index block.
*/
static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path)
+ struct ext4_ext_path *path, int depth)
{
int err;
ext4_fsblk_t leaf;
/* free index block */
- path--;
+ depth--;
+ path = path + depth;
leaf = ext4_idx_pblock(path->p_idx);
if (unlikely(path->p_hdr->eh_entries == 0)) {
EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
@@ -2205,6 +2258,19 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
ext4_free_blocks(handle, inode, NULL, leaf, 1,
EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
+
+ while (--depth >= 0) {
+ if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
+ break;
+ path--;
+ err = ext4_ext_get_access(handle, inode, path);
+ if (err)
+ break;
+ path->p_idx->ei_block = (path+1)->p_idx->ei_block;
+ err = ext4_ext_dirty(handle, inode, path);
+ if (err)
+ break;
+ }
return err;
}
@@ -2257,7 +2323,13 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
{
int index;
- int depth = ext_depth(inode);
+ int depth;
+
+ /* When converting inline data, only one block is needed here. */
+ if (ext4_has_inline_data(inode))
+ return 1;
+
+ depth = ext_depth(inode);
if (chunk)
index = depth * 2;
@@ -2275,10 +2347,13 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned short ee_len = ext4_ext_get_actual_len(ex);
ext4_fsblk_t pblk;
- int flags = EXT4_FREE_BLOCKS_FORGET;
+ int flags = 0;
if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- flags |= EXT4_FREE_BLOCKS_METADATA;
+ flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+ else if (ext4_should_journal_data(inode))
+ flags |= EXT4_FREE_BLOCKS_FORGET;
+
/*
* For bigalloc file systems, we never free a partial cluster
* at the beginning of the extent. Instead, we make a note
@@ -2538,7 +2613,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
/* if this leaf is free, then we should
* remove it from index block above */
if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
- err = ext4_ext_rm_idx(handle, inode, path + depth);
+ err = ext4_ext_rm_idx(handle, inode, path, depth);
out:
return err;
@@ -2570,10 +2645,10 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
{
struct super_block *sb = inode->i_sb;
int depth = ext_depth(inode);
- struct ext4_ext_path *path;
+ struct ext4_ext_path *path = NULL;
ext4_fsblk_t partial_cluster = 0;
handle_t *handle;
- int i, err;
+ int i = 0, err = 0;
ext_debug("truncate since %u to %u\n", start, end);
@@ -2605,9 +2680,17 @@ again:
return PTR_ERR(path);
}
depth = ext_depth(inode);
+ /* Leaf node may not exist only if inode has no blocks at all */
ex = path[depth].p_ext;
- if (!ex)
- goto cont;
+ if (!ex) {
+ if (depth) {
+ EXT4_ERROR_INODE(inode,
+ "path[%d].p_hdr == NULL",
+ depth);
+ err = -EIO;
+ }
+ goto out;
+ }
ee_block = le32_to_cpu(ex->ee_block);
@@ -2637,29 +2720,34 @@ again:
if (err < 0)
goto out;
}
- ext4_ext_drop_refs(path);
- kfree(path);
}
-cont:
-
/*
* We start scanning from right side, freeing all the blocks
* after i_size and walking into the tree depth-wise.
*/
depth = ext_depth(inode);
- path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
- if (path == NULL) {
- ext4_journal_stop(handle);
- return -ENOMEM;
- }
- path[0].p_depth = depth;
- path[0].p_hdr = ext_inode_hdr(inode);
+ if (path) {
+ int k = i = depth;
+ while (--k > 0)
+ path[k].p_block =
+ le16_to_cpu(path[k].p_hdr->eh_entries)+1;
+ } else {
+ path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
+ GFP_NOFS);
+ if (path == NULL) {
+ ext4_journal_stop(handle);
+ return -ENOMEM;
+ }
+ path[0].p_depth = depth;
+ path[0].p_hdr = ext_inode_hdr(inode);
+ i = 0;
- if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
- err = -EIO;
- goto out;
+ if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
+ err = -EIO;
+ goto out;
+ }
}
- i = err = 0;
+ err = 0;
while (i >= 0 && err == 0) {
if (i == depth) {
@@ -2728,7 +2816,7 @@ cont:
/* index is empty, remove it;
* handle must be already prepared by the
* truncatei_leaf() */
- err = ext4_ext_rm_idx(handle, inode, path + i);
+ err = ext4_ext_rm_idx(handle, inode, path, i);
}
/* root level has p_bh == NULL, brelse() eats this */
brelse(path[i].p_bh);
@@ -2773,8 +2861,10 @@ cont:
out:
ext4_ext_drop_refs(path);
kfree(path);
- if (err == -EAGAIN)
+ if (err == -EAGAIN) {
+ path = NULL;
goto again;
+ }
ext4_journal_stop(handle);
return err;
@@ -2883,6 +2973,9 @@ static int ext4_split_extent_at(handle_t *handle,
unsigned int ee_len, depth;
int err = 0;
+ BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
+ (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
+
ext_debug("ext4_split_extents_at: inode %lu, logical"
"block %llu\n", inode->i_ino, (unsigned long long)split);
@@ -2912,9 +3005,9 @@ static int ext4_split_extent_at(handle_t *handle,
ext4_ext_mark_initialized(ex);
if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
- ext4_ext_try_to_merge(inode, path, ex);
+ ext4_ext_try_to_merge(handle, inode, path, ex);
- err = ext4_ext_dirty(handle, inode, path + depth);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
goto out;
}
@@ -2941,13 +3034,20 @@ static int ext4_split_extent_at(handle_t *handle,
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
- err = ext4_ext_zeroout(inode, &orig_ex);
+ if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
+ if (split_flag & EXT4_EXT_DATA_VALID1)
+ err = ext4_ext_zeroout(inode, ex2);
+ else
+ err = ext4_ext_zeroout(inode, ex);
+ } else
+ err = ext4_ext_zeroout(inode, &orig_ex);
+
if (err)
goto fix_extent_len;
/* update the extent length and mark as initialized */
ex->ee_len = cpu_to_le16(ee_len);
- ext4_ext_try_to_merge(inode, path, ex);
- err = ext4_ext_dirty(handle, inode, path + depth);
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
goto out;
} else if (err)
goto fix_extent_len;
@@ -2994,12 +3094,13 @@ static int ext4_split_extent(handle_t *handle,
uninitialized = ext4_ext_is_uninitialized(ex);
if (map->m_lblk + map->m_len < ee_block + ee_len) {
- split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
- EXT4_EXT_MAY_ZEROOUT : 0;
+ split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
if (uninitialized)
split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
EXT4_EXT_MARK_UNINIT2;
+ if (split_flag & EXT4_EXT_DATA_VALID2)
+ split_flag1 |= EXT4_EXT_DATA_VALID1;
err = ext4_split_extent_at(handle, inode, path,
map->m_lblk + map->m_len, split_flag1, flags1);
if (err)
@@ -3012,8 +3113,8 @@ static int ext4_split_extent(handle_t *handle,
return PTR_ERR(path);
if (map->m_lblk >= ee_block) {
- split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
- EXT4_EXT_MAY_ZEROOUT : 0;
+ split_flag1 = split_flag & (EXT4_EXT_MAY_ZEROOUT |
+ EXT4_EXT_DATA_VALID2);
if (uninitialized)
split_flag1 |= EXT4_EXT_MARK_UNINIT1;
if (split_flag & EXT4_EXT_MARK_UNINIT2)
@@ -3029,7 +3130,6 @@ out:
return err ? err : map->m_len;
}
-#define EXT4_EXT_ZERO_LEN 7
/*
* This function is called by ext4_ext_map_blocks() if someone tries to write
* to an uninitialized extent. It may result in splitting the uninitialized
@@ -3055,13 +3155,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
struct ext4_map_blocks *map,
struct ext4_ext_path *path)
{
+ struct ext4_sb_info *sbi;
struct ext4_extent_header *eh;
struct ext4_map_blocks split_map;
struct ext4_extent zero_ex;
struct ext4_extent *ex;
ext4_lblk_t ee_block, eof_block;
unsigned int ee_len, depth;
- int allocated;
+ int allocated, max_zeroout = 0;
int err = 0;
int split_flag = 0;
@@ -3069,6 +3170,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
"block %llu, max_blocks %u\n", inode->i_ino,
(unsigned long long)map->m_lblk, map->m_len);
+ sbi = EXT4_SB(inode->i_sb);
eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
inode->i_sb->s_blocksize_bits;
if (eof_block < map->m_lblk + map->m_len)
@@ -3168,9 +3270,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
*/
split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
- /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
- if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
- (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ if (EXT4_EXT_MAY_ZEROOUT & split_flag)
+ max_zeroout = sbi->s_extent_max_zeroout_kb >>
+ inode->i_sb->s_blocksize_bits;
+
+ /* If extent is less than s_extent_max_zeroout_kb, zeroout directly */
+ if (max_zeroout && (ee_len <= max_zeroout)) {
err = ext4_ext_zeroout(inode, ex);
if (err)
goto out;
@@ -3179,8 +3284,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
if (err)
goto out;
ext4_ext_mark_initialized(ex);
- ext4_ext_try_to_merge(inode, path, ex);
- err = ext4_ext_dirty(handle, inode, path + depth);
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
goto out;
}
@@ -3194,9 +3299,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
split_map.m_lblk = map->m_lblk;
split_map.m_len = map->m_len;
- if (allocated > map->m_len) {
- if (allocated <= EXT4_EXT_ZERO_LEN &&
- (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ if (max_zeroout && (allocated > map->m_len)) {
+ if (allocated <= max_zeroout) {
/* case 3 */
zero_ex.ee_block =
cpu_to_le32(map->m_lblk);
@@ -3208,9 +3312,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
goto out;
split_map.m_lblk = map->m_lblk;
split_map.m_len = allocated;
- } else if ((map->m_lblk - ee_block + map->m_len <
- EXT4_EXT_ZERO_LEN) &&
- (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
/* case 2 */
if (map->m_lblk != ee_block) {
zero_ex.ee_block = ex->ee_block;
@@ -3230,7 +3332,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
}
allocated = ext4_split_extent(handle, inode, path,
- &split_map, split_flag, 0);
+ &split_map, split_flag, 0);
if (allocated < 0)
err = allocated;
@@ -3244,7 +3346,7 @@ out:
* to an uninitialized extent.
*
* Writing to an uninitialized extent may result in splitting the uninitialized
- * extent into multiple /initialized uninitialized extents (up to three)
+ * extent into multiple initialized/uninitialized extents (up to three)
* There are three possibilities:
* a> There is no split required: Entire extent should be uninitialized
* b> Splits in two extents: Write is happening at either end of the extent
@@ -3291,26 +3393,47 @@ static int ext4_split_unwritten_extents(handle_t *handle,
split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
split_flag |= EXT4_EXT_MARK_UNINIT2;
-
+ if (flags & EXT4_GET_BLOCKS_CONVERT)
+ split_flag |= EXT4_EXT_DATA_VALID2;
flags |= EXT4_GET_BLOCKS_PRE_IO;
return ext4_split_extent(handle, inode, path, map, split_flag, flags);
}
static int ext4_convert_unwritten_extents_endio(handle_t *handle,
- struct inode *inode,
- struct ext4_ext_path *path)
+ struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path *path)
{
struct ext4_extent *ex;
+ ext4_lblk_t ee_block;
+ unsigned int ee_len;
int depth;
int err = 0;
depth = ext_depth(inode);
ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
"block %llu, max_blocks %u\n", inode->i_ino,
- (unsigned long long)le32_to_cpu(ex->ee_block),
- ext4_ext_get_actual_len(ex));
+ (unsigned long long)ee_block, ee_len);
+
+ /* If extent is larger than requested then split is required */
+ if (ee_block != map->m_lblk || ee_len > map->m_len) {
+ err = ext4_split_unwritten_extents(handle, inode, map, path,
+ EXT4_GET_BLOCKS_CONVERT);
+ if (err < 0)
+ goto out;
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ goto out;
+ }
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ }
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
@@ -3321,10 +3444,10 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
/* note: ext4_ext_correct_indexes() isn't needed here because
* borders are not changed
*/
- ext4_ext_try_to_merge(inode, path, ex);
+ ext4_ext_try_to_merge(handle, inode, path, ex);
/* Mark modified extent as dirty */
- err = ext4_ext_dirty(handle, inode, path + depth);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
out:
ext4_ext_show_leaf(inode, path);
return err;
@@ -3394,115 +3517,34 @@ out:
/**
* ext4_find_delalloc_range: find delayed allocated block in the given range.
*
- * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns
- * whether there are any buffers marked for delayed allocation. It returns '1'
- * on the first delalloc'ed buffer head found. If no buffer head in the given
- * range is marked for delalloc, it returns 0.
- * lblk_start should always be <= lblk_end.
- * search_hint_reverse is to indicate that searching in reverse from lblk_end to
- * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed
- * block sooner). This is useful when blocks are truncated sequentially from
- * lblk_start towards lblk_end.
+ * Return 1 if there is a delalloc block in the range, otherwise 0.
*/
static int ext4_find_delalloc_range(struct inode *inode,
ext4_lblk_t lblk_start,
- ext4_lblk_t lblk_end,
- int search_hint_reverse)
+ ext4_lblk_t lblk_end)
{
- struct address_space *mapping = inode->i_mapping;
- struct buffer_head *head, *bh = NULL;
- struct page *page;
- ext4_lblk_t i, pg_lblk;
- pgoff_t index;
+ struct extent_status es;
- if (!test_opt(inode->i_sb, DELALLOC))
- return 0;
-
- /* reverse search wont work if fs block size is less than page size */
- if (inode->i_blkbits < PAGE_CACHE_SHIFT)
- search_hint_reverse = 0;
-
- if (search_hint_reverse)
- i = lblk_end;
+ es.start = lblk_start;
+ ext4_es_find_extent(inode, &es);
+ if (es.len == 0)
+ return 0; /* there is no delay extent in this tree */
+ else if (es.start <= lblk_start && lblk_start < es.start + es.len)
+ return 1;
+ else if (lblk_start <= es.start && es.start <= lblk_end)
+ return 1;
else
- i = lblk_start;
-
- index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
- while ((i >= lblk_start) && (i <= lblk_end)) {
- page = find_get_page(mapping, index);
- if (!page)
- goto nextpage;
-
- if (!page_has_buffers(page))
- goto nextpage;
-
- head = page_buffers(page);
- if (!head)
- goto nextpage;
-
- bh = head;
- pg_lblk = index << (PAGE_CACHE_SHIFT -
- inode->i_blkbits);
- do {
- if (unlikely(pg_lblk < lblk_start)) {
- /*
- * This is possible when fs block size is less
- * than page size and our cluster starts/ends in
- * middle of the page. So we need to skip the
- * initial few blocks till we reach the 'lblk'
- */
- pg_lblk++;
- continue;
- }
-
- /* Check if the buffer is delayed allocated and that it
- * is not yet mapped. (when da-buffers are mapped during
- * their writeout, their da_mapped bit is set.)
- */
- if (buffer_delay(bh) && !buffer_da_mapped(bh)) {
- page_cache_release(page);
- trace_ext4_find_delalloc_range(inode,
- lblk_start, lblk_end,
- search_hint_reverse,
- 1, i);
- return 1;
- }
- if (search_hint_reverse)
- i--;
- else
- i++;
- } while ((i >= lblk_start) && (i <= lblk_end) &&
- ((bh = bh->b_this_page) != head));
-nextpage:
- if (page)
- page_cache_release(page);
- /*
- * Move to next page. 'i' will be the first lblk in the next
- * page.
- */
- if (search_hint_reverse)
- index--;
- else
- index++;
- i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- }
-
- trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end,
- search_hint_reverse, 0, 0);
- return 0;
+ return 0;
}
-int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
- int search_hint_reverse)
+int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_lblk_t lblk_start, lblk_end;
lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
- return ext4_find_delalloc_range(inode, lblk_start, lblk_end,
- search_hint_reverse);
+ return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
}
/**
@@ -3563,7 +3605,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
lblk_to = lblk_from + c_offset - 1;
- if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
+ if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
allocated_clusters--;
}
@@ -3573,7 +3615,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
lblk_from = lblk_start + num_blks;
lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
- if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
+ if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
allocated_clusters--;
}
@@ -3588,7 +3630,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
{
int ret = 0;
int err = 0;
- ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+ ext4_io_end_t *io = ext4_inode_aio(inode);
ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
"block %llu, max_blocks %u, flags %x, allocated %u\n",
@@ -3596,13 +3638,15 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
flags, allocated);
ext4_ext_show_leaf(inode, path);
- trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated,
- newblock);
+ trace_ext4_ext_handle_uninitialized_extents(inode, map, flags,
+ allocated, newblock);
/* get_block() before submit the IO, split the extent */
if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
ret = ext4_split_unwritten_extents(handle, inode, map,
path, flags);
+ if (ret <= 0)
+ goto out;
/*
* Flag the inode(non aio case) or end_io struct (aio case)
* that this IO needs to conversion to written when IO is
@@ -3618,7 +3662,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
}
/* IO end_io complete, convert the filled extent to written */
if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
- ret = ext4_convert_unwritten_extents_endio(handle, inode,
+ ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
path);
if (ret >= 0) {
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -3842,12 +3886,13 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_extent newex, *ex, *ex2;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_fsblk_t newblock = 0;
- int free_on_err = 0, err = 0, depth, ret;
+ int free_on_err = 0, err = 0, depth;
unsigned int allocated = 0, offset = 0;
unsigned int allocated_clusters = 0;
struct ext4_allocation_request ar;
- ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+ ext4_io_end_t *io = ext4_inode_aio(inode);
ext4_lblk_t cluster_offset;
+ int set_unwritten = 0;
ext_debug("blocks %u/%u requested for inode %lu\n",
map->m_lblk, map->m_len, inode->i_ino);
@@ -3857,7 +3902,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
if (!newex.ee_start_lo && !newex.ee_start_hi) {
if ((sbi->s_cluster_ratio > 1) &&
- ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
+ ext4_find_delalloc_cluster(inode, map->m_lblk))
map->m_flags |= EXT4_MAP_FROM_CLUSTER;
if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -3937,15 +3982,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ee_len, ee_start);
goto out;
}
- ret = ext4_ext_handle_uninitialized_extents(
+ allocated = ext4_ext_handle_uninitialized_extents(
handle, inode, map, path, flags,
allocated, newblock);
- return ret;
+ goto out3;
}
}
if ((sbi->s_cluster_ratio > 1) &&
- ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
+ ext4_find_delalloc_cluster(inode, map->m_lblk))
map->m_flags |= EXT4_MAP_FROM_CLUSTER;
/*
@@ -4070,13 +4115,8 @@ got_allocated_blocks:
* For non asycn direct IO case, flag the inode state
* that we need to perform conversion when IO is done.
*/
- if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
- if (io)
- ext4_set_io_unwritten_flag(inode, io);
- else
- ext4_set_inode_state(inode,
- EXT4_STATE_DIO_UNWRITTEN);
- }
+ if ((flags & EXT4_GET_BLOCKS_PRE_IO))
+ set_unwritten = 1;
if (ext4_should_dioread_nolock(inode))
map->m_flags |= EXT4_MAP_UNINIT;
}
@@ -4088,6 +4128,15 @@ got_allocated_blocks:
if (!err)
err = ext4_ext_insert_extent(handle, inode, path,
&newex, flags);
+
+ if (!err && set_unwritten) {
+ if (io)
+ ext4_set_io_unwritten_flag(inode, io);
+ else
+ ext4_set_inode_state(inode,
+ EXT4_STATE_DIO_UNWRITTEN);
+ }
+
if (err && free_on_err) {
int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
@@ -4210,8 +4259,8 @@ out2:
kfree(path);
}
- trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
- newblock, map->m_len, err ? err : allocated);
+out3:
+ trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated);
return err ? err : allocated;
}
@@ -4229,7 +4278,7 @@ void ext4_ext_truncate(struct inode *inode)
* finish any pending end_io work so we won't run the risk of
* converting any truncated blocks to initialized later
*/
- ext4_flush_completed_IO(inode);
+ ext4_flush_unwritten_io(inode);
/*
* probably first extent we're gonna free will be last in block
@@ -4270,6 +4319,8 @@ void ext4_ext_truncate(struct inode *inode)
last_block = (inode->i_size + sb->s_blocksize - 1)
>> EXT4_BLOCK_SIZE_BITS(sb);
+ err = ext4_es_remove_extent(inode, last_block,
+ EXT_MAX_BLOCKS - last_block);
err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
/* In a multi-transaction truncate, we only make the final
@@ -4360,6 +4411,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (mode & FALLOC_FL_PUNCH_HOLE)
return ext4_punch_hole(file, offset, len);
+ ret = ext4_convert_inline_data(inode);
+ if (ret)
+ return ret;
+
trace_ext4_fallocate_enter(inode, offset, len, mode);
map.m_lblk = offset >> blkbits;
/*
@@ -4389,6 +4444,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
*/
if (len <= EXT_UNINIT_MAX_LEN << blkbits)
flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
+
+ /* Flush pending unwritten I/O first to avoid racing with it */
+ ext4_flush_unwritten_io(inode);
retry:
while (ret >= 0 && ret < max_blocks) {
map.m_lblk = map.m_lblk + ret;
@@ -4420,6 +4478,8 @@ retry:
ext4_falloc_update_inode(inode, mode, new_size,
(map.m_flags & EXT4_MAP_NEW));
ext4_mark_inode_dirty(handle, inode);
+ if ((file->f_flags & O_SYNC) && ret >= max_blocks)
+ ext4_handle_sync(handle);
ret2 = ext4_journal_stop(handle);
if (ret2)
break;
@@ -4493,206 +4553,43 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
}
/*
- * Callback function called for each extent to gather FIEMAP information.
+ * If newex is not an existing extent (newex->ec_start equals zero), find
+ * the delayed extent at the start of newex, update newex accordingly, and
+ * return the start of the next delayed extent.
+ *
+ * If newex is an existing extent (newex->ec_start is not equal to zero),
+ * return the start of the next delayed extent, or EXT_MAX_BLOCKS if no
+ * delayed extent is found. Leave newex unmodified.
*/
-static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next,
- struct ext4_ext_cache *newex, struct ext4_extent *ex,
- void *data)
+static int ext4_find_delayed_extent(struct inode *inode,
+ struct ext4_ext_cache *newex)
{
- __u64 logical;
- __u64 physical;
- __u64 length;
- __u32 flags = 0;
- int ret = 0;
- struct fiemap_extent_info *fieinfo = data;
- unsigned char blksize_bits;
+ struct extent_status es;
+ ext4_lblk_t next_del;
- blksize_bits = inode->i_sb->s_blocksize_bits;
- logical = (__u64)newex->ec_block << blksize_bits;
+ es.start = newex->ec_block;
+ next_del = ext4_es_find_extent(inode, &es);
if (newex->ec_start == 0) {
/*
* No extent in extent-tree contains block @newex->ec_start,
* then the block may stay in 1)a hole or 2)delayed-extent.
- *
- * Holes or delayed-extents are processed as follows.
- * 1. lookup dirty pages with specified range in pagecache.
- * If no page is got, then there is no delayed-extent and
- * return with EXT_CONTINUE.
- * 2. find the 1st mapped buffer,
- * 3. check if the mapped buffer is both in the request range
- * and a delayed buffer. If not, there is no delayed-extent,
- * then return.
- * 4. a delayed-extent is found, the extent will be collected.
*/
- ext4_lblk_t end = 0;
- pgoff_t last_offset;
- pgoff_t offset;
- pgoff_t index;
- pgoff_t start_index = 0;
- struct page **pages = NULL;
- struct buffer_head *bh = NULL;
- struct buffer_head *head = NULL;
- unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
-
- pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (pages == NULL)
- return -ENOMEM;
-
- offset = logical >> PAGE_SHIFT;
-repeat:
- last_offset = offset;
- head = NULL;
- ret = find_get_pages_tag(inode->i_mapping, &offset,
- PAGECACHE_TAG_DIRTY, nr_pages, pages);
-
- if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
- /* First time, try to find a mapped buffer. */
- if (ret == 0) {
-out:
- for (index = 0; index < ret; index++)
- page_cache_release(pages[index]);
- /* just a hole. */
- kfree(pages);
- return EXT_CONTINUE;
- }
- index = 0;
-
-next_page:
- /* Try to find the 1st mapped buffer. */
- end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
- blksize_bits;
- if (!page_has_buffers(pages[index]))
- goto out;
- head = page_buffers(pages[index]);
- if (!head)
- goto out;
-
- index++;
- bh = head;
- do {
- if (end >= newex->ec_block +
- newex->ec_len)
- /* The buffer is out of
- * the request range.
- */
- goto out;
-
- if (buffer_mapped(bh) &&
- end >= newex->ec_block) {
- start_index = index - 1;
- /* get the 1st mapped buffer. */
- goto found_mapped_buffer;
- }
-
- bh = bh->b_this_page;
- end++;
- } while (bh != head);
-
- /* No mapped buffer in the range found in this page,
- * We need to look up next page.
- */
- if (index >= ret) {
- /* There is no page left, but we need to limit
- * newex->ec_len.
- */
- newex->ec_len = end - newex->ec_block;
- goto out;
- }
- goto next_page;
- } else {
- /*Find contiguous delayed buffers. */
- if (ret > 0 && pages[0]->index == last_offset)
- head = page_buffers(pages[0]);
- bh = head;
- index = 1;
- start_index = 0;
- }
-
-found_mapped_buffer:
- if (bh != NULL && buffer_delay(bh)) {
- /* 1st or contiguous delayed buffer found. */
- if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
- /*
- * 1st delayed buffer found, record
- * the start of extent.
- */
- flags |= FIEMAP_EXTENT_DELALLOC;
- newex->ec_block = end;
- logical = (__u64)end << blksize_bits;
- }
- /* Find contiguous delayed buffers. */
- do {
- if (!buffer_delay(bh))
- goto found_delayed_extent;
- bh = bh->b_this_page;
- end++;
- } while (bh != head);
-
- for (; index < ret; index++) {
- if (!page_has_buffers(pages[index])) {
- bh = NULL;
- break;
- }
- head = page_buffers(pages[index]);
- if (!head) {
- bh = NULL;
- break;
- }
-
- if (pages[index]->index !=
- pages[start_index]->index + index
- - start_index) {
- /* Blocks are not contiguous. */
- bh = NULL;
- break;
- }
- bh = head;
- do {
- if (!buffer_delay(bh))
- /* Delayed-extent ends. */
- goto found_delayed_extent;
- bh = bh->b_this_page;
- end++;
- } while (bh != head);
- }
- } else if (!(flags & FIEMAP_EXTENT_DELALLOC))
- /* a hole found. */
- goto out;
+ if (es.len == 0)
+ /* A hole found. */
+ return 0;
-found_delayed_extent:
- newex->ec_len = min(end - newex->ec_block,
- (ext4_lblk_t)EXT_INIT_MAX_LEN);
- if (ret == nr_pages && bh != NULL &&
- newex->ec_len < EXT_INIT_MAX_LEN &&
- buffer_delay(bh)) {
- /* Have not collected an extent and continue. */
- for (index = 0; index < ret; index++)
- page_cache_release(pages[index]);
- goto repeat;
+ if (es.start > newex->ec_block) {
+ /* A hole found. */
+ newex->ec_len = min(es.start - newex->ec_block,
+ newex->ec_len);
+ return 0;
}
- for (index = 0; index < ret; index++)
- page_cache_release(pages[index]);
- kfree(pages);
+ newex->ec_len = es.start + es.len - newex->ec_block;
}
- physical = (__u64)newex->ec_start << blksize_bits;
- length = (__u64)newex->ec_len << blksize_bits;
-
- if (ex && ext4_ext_is_uninitialized(ex))
- flags |= FIEMAP_EXTENT_UNWRITTEN;
-
- if (next == EXT_MAX_BLOCKS)
- flags |= FIEMAP_EXTENT_LAST;
-
- ret = fiemap_fill_next_extent(fieinfo, logical, physical,
- length, flags);
- if (ret < 0)
- return ret;
- if (ret == 1)
- return EXT_BREAK;
- return EXT_CONTINUE;
+ return next_del;
}
/* fiemap flags we can handle specified here */
#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
@@ -4755,9 +4652,32 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
loff_t first_page_offset, last_page_offset;
int credits, err = 0;
+ /*
+ * Write out all dirty pages to avoid race conditions
+ * Then release them.
+ */
+ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ err = filemap_write_and_wait_range(mapping,
+ offset, offset + length - 1);
+
+ if (err)
+ return err;
+ }
+
+ mutex_lock(&inode->i_mutex);
+ /* Cannot punch a hole in an append-only or immutable file */
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+ err = -EPERM;
+ goto out_mutex;
+ }
+ if (IS_SWAPFILE(inode)) {
+ err = -ETXTBSY;
+ goto out_mutex;
+ }
+
/* No need to punch hole beyond i_size */
if (offset >= inode->i_size)
- return 0;
+ goto out_mutex;
/*
* If the hole extends beyond i_size, set the hole
@@ -4775,35 +4695,26 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
first_page_offset = first_page << PAGE_CACHE_SHIFT;
last_page_offset = last_page << PAGE_CACHE_SHIFT;
- /*
- * Write out all dirty pages to avoid race conditions
- * Then release them.
- */
- if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- err = filemap_write_and_wait_range(mapping,
- offset, offset + length - 1);
-
- if (err)
- return err;
- }
-
/* Now release the pages */
if (last_page_offset > first_page_offset) {
truncate_pagecache_range(inode, first_page_offset,
last_page_offset - 1);
}
- /* finish any pending end_io work */
- ext4_flush_completed_IO(inode);
+ /* Wait for existing dio workers; newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ err = ext4_flush_unwritten_io(inode);
+ if (err)
+ goto out_dio;
+ inode_dio_wait(inode);
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto out_dio;
+ }
- err = ext4_orphan_add(handle, inode);
- if (err)
- goto out;
/*
* Now we need to zero out the non-page-aligned data in the
@@ -4878,6 +4789,8 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
ext4_ext_invalidate_cache(inode);
ext4_discard_preallocations(inode);
+ err = ext4_es_remove_extent(inode, first_block,
+ stop_block - first_block);
err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
ext4_ext_invalidate_cache(inode);
@@ -4889,18 +4802,31 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
up_write(&EXT4_I(inode)->i_data_sem);
out:
- ext4_orphan_del(handle, inode);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
+out_dio:
+ ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
return err;
}
+
int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len)
{
ext4_lblk_t start_blk;
int error = 0;
+ if (ext4_has_inline_data(inode)) {
+ int has_inline = 1;
+
+ error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline);
+
+ if (has_inline)
+ return error;
+ }
+
/* fallback to generic here if not in extents fmt */
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return generic_block_fiemap(inode, fieinfo, start, len,
@@ -4922,11 +4848,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
/*
- * Walk the extent tree gathering extent information.
- * ext4_ext_fiemap_cb will push extents back to user.
+ * Walk the extent tree gathering extent information
+ * and pushing extents back to the user.
*/
- error = ext4_ext_walk_space(inode, start_blk, len_blks,
- ext4_ext_fiemap_cb, fieinfo);
+ error = ext4_fill_fiemap_extents(inode, start_blk,
+ len_blks, fieinfo);
}
return error;
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
new file mode 100644
index 00000000000..564d981a2fc
--- /dev/null
+++ b/fs/ext4/extents_status.c
@@ -0,0 +1,500 @@
+/*
+ * fs/ext4/extents_status.c
+ *
+ * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
+ * Modified by
+ * Allison Henderson <achender@linux.vnet.ibm.com>
+ * Hugh Dickins <hughd@google.com>
+ * Zheng Liu <wenqing.lz@taobao.com>
+ *
+ * Ext4 extents status tree core functions.
+ */
+#include <linux/rbtree.h>
+#include "ext4.h"
+#include "extents_status.h"
+#include "ext4_extents.h"
+
+#include <trace/events/ext4.h>
+
+/*
+ * According to a previous discussion at the Ext4 Developer Workshop, we
+ * will introduce a new structure called the io tree to track all extent
+ * status in order to solve some problems that we have met
+ * (e.g. the reservation space warning), and to provide extent-level
+ * locking. The delay extent tree is the first step towards this goal. It
+ * was originally built by Yongqiang Yang. At that time it was called the
+ * delay extent tree, and its only goal was to track delay extents in
+ * memory to simplify the implementation of fiemap and bigalloc, and to
+ * introduce lseek SEEK_DATA/SEEK_HOLE support. That is why it is still
+ * called the delay extent tree in the following comment. To better
+ * reflect what it does, it has been renamed to extent status tree.
+ *
+ * Currently the first step has been done. All delay extents are
+ * tracked in the tree. The tree is updated when a delay allocation
+ * is issued, and when a delay extent is written out or invalidated.
+ * Therefore the implementations of fiemap and bigalloc are
+ * simplified, and SEEK_DATA/SEEK_HOLE are introduced.
+ *
+ * The following comment describes the implementation of the extent
+ * status tree and future work.
+ */
+
+/*
+ * extents status tree implementation for ext4.
+ *
+ *
+ * ==========================================================================
+ * Extents status encompass delayed extents and extent locks
+ *
+ * 1. Why a delayed extent implementation?
+ *
+ * Without delayed extents, ext4 identifies a delayed extent by looking
+ * up the page cache. This has several deficiencies: complicated, buggy,
+ * and inefficient code.
+ *
+ * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need
+ * to know whether a block or a range of blocks belongs to a delayed
+ * extent.
+ *
+ * Let us look at how they work without the delayed extents implementation.
+ * -- FIEMAP
+ * FIEMAP looks up the page cache to distinguish delayed allocations from holes.
+ *
+ * -- SEEK_HOLE/DATA
+ * SEEK_HOLE/DATA has the same problem as FIEMAP.
+ *
+ * -- bigalloc
+ * bigalloc looks up page cache to figure out if a block is
+ * already under delayed allocation or not to determine whether
+ * quota reserving is needed for the cluster.
+ *
+ * -- punch hole
+ * punch hole looks up page cache to identify a delayed extent.
+ *
+ * -- writeout
+ * Writeout looks up the whole page cache to see if a buffer is
+ * mapped. If there are not very many delayed buffers, then it is
+ * time consuming.
+ *
+ * With the delayed extents implementation, FIEMAP, SEEK_HOLE/DATA,
+ * bigalloc and writeout can figure out if a block or a range of
+ * blocks is under delayed allocation (belongs to a delayed extent) or
+ * not by searching the delayed extent tree.
+ *
+ *
+ * ==========================================================================
+ * 2. ext4 delayed extents implementation
+ *
+ * -- delayed extent
+ * A delayed extent is a range of blocks which are logically
+ * contiguous and under delayed allocation. Unlike an extent in
+ * ext4, a delayed extent is an in-memory struct; there is
+ * no corresponding on-disk data. There is no limit on the length of
+ * a delayed extent, so it can contain as many blocks as are
+ * logically contiguous.
+ *
+ * -- delayed extent tree
+ * Every inode has a delayed extent tree, and all blocks under
+ * delayed allocation are added to the tree as delayed extents.
+ * Delayed extents in the tree are ordered by logical block number.
+ *
+ * -- operations on a delayed extent tree
+ * There are three operations on a delayed extent tree: finding the
+ * next delayed extent, adding a space (a range of blocks) and
+ * removing a space.
+ *
+ * -- race on a delayed extent tree
+ * The delayed extent tree is protected by EXT4_I(inode)->i_es_lock.
+ *
+ *
+ * ==========================================================================
+ * 3. performance analysis
+ * -- overhead
+ * 1. There is a cache extent for write access, so if writes are
+ * not very random, adding space operations take O(1) time.
+ *
+ * -- gain
+ * 2. Code is much simpler, more readable, more maintainable and
+ * more efficient.
+ *
+ *
+ * ==========================================================================
+ * 4. TODO list
+ * -- Track all extent status
+ *
+ * -- Improve get block process
+ *
+ * -- Extent-level locking
+ */
+
+static struct kmem_cache *ext4_es_cachep;
+
+int __init ext4_init_es(void)
+{
+ ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
+ if (ext4_es_cachep == NULL)
+ return -ENOMEM;
+ return 0;
+}
+
+void ext4_exit_es(void)
+{
+ if (ext4_es_cachep)
+ kmem_cache_destroy(ext4_es_cachep);
+}
+
+void ext4_es_init_tree(struct ext4_es_tree *tree)
+{
+ tree->root = RB_ROOT;
+ tree->cache_es = NULL;
+}
+
+#ifdef ES_DEBUG__
+static void ext4_es_print_tree(struct inode *inode)
+{
+ struct ext4_es_tree *tree;
+ struct rb_node *node;
+
+ printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
+ tree = &EXT4_I(inode)->i_es_tree;
+ node = rb_first(&tree->root);
+ while (node) {
+ struct extent_status *es;
+ es = rb_entry(node, struct extent_status, rb_node);
+ printk(KERN_DEBUG " [%u/%u)", es->start, es->len);
+ node = rb_next(node);
+ }
+ printk(KERN_DEBUG "\n");
+}
+#else
+#define ext4_es_print_tree(inode)
+#endif
+
+static inline ext4_lblk_t extent_status_end(struct extent_status *es)
+{
+ BUG_ON(es->start + es->len < es->start);
+ return es->start + es->len - 1;
+}
+
+/*
+ * Search through the tree for a delayed extent with a given offset. If
+ * it can't be found, try to find the next extent.
+ */
+static struct extent_status *__es_tree_search(struct rb_root *root,
+ ext4_lblk_t offset)
+{
+ struct rb_node *node = root->rb_node;
+ struct extent_status *es = NULL;
+
+ while (node) {
+ es = rb_entry(node, struct extent_status, rb_node);
+ if (offset < es->start)
+ node = node->rb_left;
+ else if (offset > extent_status_end(es))
+ node = node->rb_right;
+ else
+ return es;
+ }
+
+ if (es && offset < es->start)
+ return es;
+
+ if (es && offset > extent_status_end(es)) {
+ node = rb_next(&es->rb_node);
+ return node ? rb_entry(node, struct extent_status, rb_node) :
+ NULL;
+ }
+
+ return NULL;
+}
+
+/*
+ * ext4_es_find_extent: find the 1st delayed extent covering @es->start
+ * if it exists, otherwise, the next extent after @es->start.
+ *
+ * @inode: the inode which owns delayed extents
+ * @es: delayed extent that we found
+ *
+ * Returns the first block of the next extent after es, otherwise
+ * EXT_MAX_BLOCKS if no delay extent is found.
+ * Delayed extent is returned via @es.
+ */
+ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es)
+{
+ struct ext4_es_tree *tree = NULL;
+ struct extent_status *es1 = NULL;
+ struct rb_node *node;
+ ext4_lblk_t ret = EXT_MAX_BLOCKS;
+
+ trace_ext4_es_find_extent_enter(inode, es->start);
+
+ read_lock(&EXT4_I(inode)->i_es_lock);
+ tree = &EXT4_I(inode)->i_es_tree;
+
+ /* first, look for the delay extent in the cache */
+ if (tree->cache_es) {
+ es1 = tree->cache_es;
+ if (in_range(es->start, es1->start, es1->len)) {
+ es_debug("%u cached by [%u/%u)\n",
+ es->start, es1->start, es1->len);
+ goto out;
+ }
+ }
+
+ es->len = 0;
+ es1 = __es_tree_search(&tree->root, es->start);
+
+out:
+ if (es1) {
+ tree->cache_es = es1;
+ es->start = es1->start;
+ es->len = es1->len;
+ node = rb_next(&es1->rb_node);
+ if (node) {
+ es1 = rb_entry(node, struct extent_status, rb_node);
+ ret = es1->start;
+ }
+ }
+
+ read_unlock(&EXT4_I(inode)->i_es_lock);
+
+ trace_ext4_es_find_extent_exit(inode, es, ret);
+ return ret;
+}
+
+static struct extent_status *
+ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
+{
+ struct extent_status *es;
+ es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
+ if (es == NULL)
+ return NULL;
+ es->start = start;
+ es->len = len;
+ return es;
+}
+
+static void ext4_es_free_extent(struct extent_status *es)
+{
+ kmem_cache_free(ext4_es_cachep, es);
+}
+
+static struct extent_status *
+ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
+{
+ struct extent_status *es1;
+ struct rb_node *node;
+
+ node = rb_prev(&es->rb_node);
+ if (!node)
+ return es;
+
+ es1 = rb_entry(node, struct extent_status, rb_node);
+ if (es->start == extent_status_end(es1) + 1) {
+ es1->len += es->len;
+ rb_erase(&es->rb_node, &tree->root);
+ ext4_es_free_extent(es);
+ es = es1;
+ }
+
+ return es;
+}
+
+static struct extent_status *
+ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
+{
+ struct extent_status *es1;
+ struct rb_node *node;
+
+ node = rb_next(&es->rb_node);
+ if (!node)
+ return es;
+
+ es1 = rb_entry(node, struct extent_status, rb_node);
+ if (es1->start == extent_status_end(es) + 1) {
+ es->len += es1->len;
+ rb_erase(node, &tree->root);
+ ext4_es_free_extent(es1);
+ }
+
+ return es;
+}
+
+static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset,
+ ext4_lblk_t len)
+{
+ struct rb_node **p = &tree->root.rb_node;
+ struct rb_node *parent = NULL;
+ struct extent_status *es;
+ ext4_lblk_t end = offset + len - 1;
+
+ BUG_ON(end < offset);
+ es = tree->cache_es;
+ if (es && offset == (extent_status_end(es) + 1)) {
+ es_debug("cached by [%u/%u)\n", es->start, es->len);
+ es->len += len;
+ es = ext4_es_try_to_merge_right(tree, es);
+ goto out;
+ } else if (es && es->start == end + 1) {
+ es_debug("cached by [%u/%u)\n", es->start, es->len);
+ es->start = offset;
+ es->len += len;
+ es = ext4_es_try_to_merge_left(tree, es);
+ goto out;
+ } else if (es && es->start <= offset &&
+ end <= extent_status_end(es)) {
+ es_debug("cached by [%u/%u)\n", es->start, es->len);
+ goto out;
+ }
+
+ while (*p) {
+ parent = *p;
+ es = rb_entry(parent, struct extent_status, rb_node);
+
+ if (offset < es->start) {
+ if (es->start == end + 1) {
+ es->start = offset;
+ es->len += len;
+ es = ext4_es_try_to_merge_left(tree, es);
+ goto out;
+ }
+ p = &(*p)->rb_left;
+ } else if (offset > extent_status_end(es)) {
+ if (offset == extent_status_end(es) + 1) {
+ es->len += len;
+ es = ext4_es_try_to_merge_right(tree, es);
+ goto out;
+ }
+ p = &(*p)->rb_right;
+ } else {
+ if (extent_status_end(es) <= end)
+ es->len = offset - es->start + len;
+ goto out;
+ }
+ }
+
+ es = ext4_es_alloc_extent(offset, len);
+ if (!es)
+ return -ENOMEM;
+ rb_link_node(&es->rb_node, parent, p);
+ rb_insert_color(&es->rb_node, &tree->root);
+
+out:
+ tree->cache_es = es;
+ return 0;
+}
+
+/*
+ * ext4_es_insert_extent() adds a space to a delayed extent tree.
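+ * Takes inode->i_es_lock for writing internally; callers must not hold it.
+ *
+ * ext4_es_insert_extent is called by ext4_da_write_begin; code that already
+ * holds the lock (ext4_es_remove_extent) uses __es_insert_extent instead.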
+ *
+ * Return 0 on success, error code on failure.
+ */
+int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset,
+ ext4_lblk_t len)
+{
+ struct ext4_es_tree *tree;
+ int err = 0;
+
+ trace_ext4_es_insert_extent(inode, offset, len);
+ es_debug("add [%u/%u) to extent status tree of inode %lu\n",
+ offset, len, inode->i_ino);
+
+ write_lock(&EXT4_I(inode)->i_es_lock);
+ tree = &EXT4_I(inode)->i_es_tree;
+ err = __es_insert_extent(tree, offset, len);
+ write_unlock(&EXT4_I(inode)->i_es_lock);
+
+ ext4_es_print_tree(inode);
+
+ return err;
+}
+
+/*
+ * ext4_es_remove_extent() removes a space from a delayed extent tree.
+ * Takes inode->i_es_lock for writing internally; callers must not hold it.
+ *
+ * Return 0 on success, error code on failure.
+ */
+int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
+ ext4_lblk_t len)
+{
+ struct rb_node *node;
+ struct ext4_es_tree *tree;
+ struct extent_status *es;
+ struct extent_status orig_es;
+ ext4_lblk_t len1, len2, end;
+ int err = 0;
+
+ trace_ext4_es_remove_extent(inode, offset, len);
+ es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
+ offset, len, inode->i_ino);
+
+ end = offset + len - 1;
+ BUG_ON(end < offset);
+ write_lock(&EXT4_I(inode)->i_es_lock);
+ tree = &EXT4_I(inode)->i_es_tree;
+ es = __es_tree_search(&tree->root, offset);
+ if (!es)
+ goto out;
+ if (es->start > end)
+ goto out;
+
+ /* Simply invalidate cache_es. */
+ tree->cache_es = NULL;
+
+ orig_es.start = es->start;
+ orig_es.len = es->len;
+ len1 = offset > es->start ? offset - es->start : 0;
+ len2 = extent_status_end(es) > end ?
+ extent_status_end(es) - end : 0;
+ if (len1 > 0)
+ es->len = len1;
+ if (len2 > 0) {
+ if (len1 > 0) {
+ err = __es_insert_extent(tree, end + 1, len2);
+ if (err) {
+ es->start = orig_es.start;
+ es->len = orig_es.len;
+ goto out;
+ }
+ } else {
+ es->start = end + 1;
+ es->len = len2;
+ }
+ goto out;
+ }
+
+ if (len1 > 0) {
+ node = rb_next(&es->rb_node);
+ if (node)
+ es = rb_entry(node, struct extent_status, rb_node);
+ else
+ es = NULL;
+ }
+
+ while (es && extent_status_end(es) <= end) {
+ node = rb_next(&es->rb_node);
+ rb_erase(&es->rb_node, &tree->root);
+ ext4_es_free_extent(es);
+ if (!node) {
+ es = NULL;
+ break;
+ }
+ es = rb_entry(node, struct extent_status, rb_node);
+ }
+
+ if (es && es->start < end + 1) {
+ len1 = extent_status_end(es) - end;
+ es->start = end + 1;
+ es->len = len1;
+ }
+
+out:
+ write_unlock(&EXT4_I(inode)->i_es_lock);
+ ext4_es_print_tree(inode);
+ return err;
+}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
new file mode 100644
index 00000000000..077f82db092
--- /dev/null
+++ b/fs/ext4/extents_status.h
@@ -0,0 +1,45 @@
+/*
+ * fs/ext4/extents_status.h
+ *
+ * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
+ * Modified by
+ * Allison Henderson <achender@linux.vnet.ibm.com>
+ * Zheng Liu <wenqing.lz@taobao.com>
+ *
+ */
+
+#ifndef _EXT4_EXTENTS_STATUS_H
+#define _EXT4_EXTENTS_STATUS_H
+
+/*
+ * Turn on ES_DEBUG__ to get lots of info about extent status operations.
+ */
+#ifdef ES_DEBUG__
+#define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__)
+#else
+#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
+#endif
+
+struct extent_status {
+ struct rb_node rb_node;
+ ext4_lblk_t start; /* first block extent covers */
+ ext4_lblk_t len; /* length of extent in block */
+};
+
+struct ext4_es_tree {
+ struct rb_root root;
+ struct extent_status *cache_es; /* recently accessed extent */
+};
+
+extern int __init ext4_init_es(void);
+extern void ext4_exit_es(void);
+extern void ext4_es_init_tree(struct ext4_es_tree *tree);
+
+extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t len);
+extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t len);
+extern ext4_lblk_t ext4_es_find_extent(struct inode *inode,
+ struct extent_status *es);
+
+#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8c7642a0005..405565a6227 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -24,6 +24,7 @@
#include <linux/mount.h>
#include <linux/path.h>
#include <linux/quotaops.h>
+#include <linux/pagevec.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -55,11 +56,11 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
return 0;
}
-static void ext4_aiodio_wait(struct inode *inode)
+void ext4_unwritten_wait(struct inode *inode)
{
wait_queue_head_t *wq = ext4_ioend_wq(inode);
- wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0));
+ wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
}
/*
@@ -90,11 +91,83 @@ ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
}
static ssize_t
+ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct blk_plug plug;
+ int unaligned_aio = 0;
+ ssize_t ret;
+ int overwrite = 0;
+ size_t length = iov_length(iov, nr_segs);
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+ !is_sync_kiocb(iocb))
+ unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
+
+ /* Unaligned direct AIO must be serialized; see comment above */
+ if (unaligned_aio) {
+ mutex_lock(ext4_aio_mutex(inode));
+ ext4_unwritten_wait(inode);
+ }
+
+ BUG_ON(iocb->ki_pos != pos);
+
+ mutex_lock(&inode->i_mutex);
+ blk_start_plug(&plug);
+
+ iocb->private = &overwrite;
+
+ /* check whether we do a DIO overwrite or not */
+ if (ext4_should_dioread_nolock(inode) && !unaligned_aio &&
+ !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
+ struct ext4_map_blocks map;
+ unsigned int blkbits = inode->i_blkbits;
+ int err, len;
+
+ map.m_lblk = pos >> blkbits;
+ map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
+ - map.m_lblk;
+ len = map.m_len;
+
+ err = ext4_map_blocks(NULL, inode, &map, 0);
+ /*
+ * 'err==len' means that all of the blocks have been preallocated,
+ * whether they are initialized or not. To exclude
+ * uninitialized extents, we need to check m_flags. There are
+ * two conditions that indicate initialized extents:
+ * 1) If we hit the extent cache, EXT4_MAP_MAPPED flag is returned;
+ * 2) If we do a real lookup, no flags are returned.
+ * So we should check these two conditions.
+ */
+ if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
+ overwrite = 1;
+ }
+
+ ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+ mutex_unlock(&inode->i_mutex);
+
+ if (ret > 0 || ret == -EIOCBQUEUED) {
+ ssize_t err;
+
+ err = generic_write_sync(file, pos, ret);
+ if (err < 0 && ret > 0)
+ ret = err;
+ }
+ blk_finish_plug(&plug);
+
+ if (unaligned_aio)
+ mutex_unlock(ext4_aio_mutex(inode));
+
+ return ret;
+}
+
+static ssize_t
ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
- int unaligned_aio = 0;
ssize_t ret;
/*
@@ -114,29 +187,12 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
sbi->s_bitmap_maxbytes - pos);
}
- } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) &&
- !is_sync_kiocb(iocb))) {
- unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
- }
-
- /* Unaligned direct AIO must be serialized; see comment above */
- if (unaligned_aio) {
- static unsigned long unaligned_warn_time;
-
- /* Warn about this once per day */
- if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
- ext4_msg(inode->i_sb, KERN_WARNING,
- "Unaligned AIO/DIO on inode %ld by %s; "
- "performance will be poor.",
- inode->i_ino, current->comm);
- mutex_lock(ext4_aio_mutex(inode));
- ext4_aiodio_wait(inode);
}
- ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-
- if (unaligned_aio)
- mutex_unlock(ext4_aio_mutex(inode));
+ if (unlikely(iocb->ki_filp->f_flags & O_DIRECT))
+ ret = ext4_file_dio_write(iocb, iov, nr_segs, pos);
+ else
+ ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
return ret;
}
@@ -144,6 +200,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
static const struct vm_operations_struct ext4_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = ext4_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
@@ -154,7 +211,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
return -ENOEXEC;
file_accessed(file);
vma->vm_ops = &ext4_file_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
@@ -181,9 +237,21 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
path.dentry = mnt->mnt_root;
cp = d_path(&path, buf, sizeof(buf));
if (!IS_ERR(cp)) {
+ handle_t *handle;
+ int err;
+
+ handle = ext4_journal_start_sb(sb, 1);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ if (err) {
+ ext4_journal_stop(handle);
+ return err;
+ }
strlcpy(sbi->s_es->s_last_mounted, cp,
sizeof(sbi->s_es->s_last_mounted));
- ext4_mark_super_dirty(sb);
+ ext4_handle_dirty_super(handle, sb);
+ ext4_journal_stop(handle);
}
}
/*
@@ -211,11 +279,329 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
}
/*
- * ext4_llseek() copied from generic_file_llseek() to handle both
- * block-mapped and extent-mapped maxbytes values. This should
- * otherwise be identical with generic_file_llseek().
+ * Here we use ext4_map_blocks() to get a block mapping for an extent-based
+ * file rather than ext4_ext_walk_space(), because that lets us implement
+ * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped files in the same
+ * function. Once the extent status tree has been fully implemented, it will
+ * track all extent status for a file and we can use it directly to
+ * retrieve the offset for SEEK_DATA/SEEK_HOLE.
*/
-loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
+
+/*
+ * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we need to look up
+ * the page cache to check whether there is any data in the range
+ * [startoff, endoff]: if this range contains an unwritten extent, we
+ * treat that extent as data or as a hole according to whether the
+ * page cache has data or not.
+ */
+static int ext4_find_unwritten_pgoff(struct inode *inode,
+ int whence,
+ struct ext4_map_blocks *map,
+ loff_t *offset)
+{
+ struct pagevec pvec;
+ unsigned int blkbits;
+ pgoff_t index;
+ pgoff_t end;
+ loff_t endoff;
+ loff_t startoff;
+ loff_t lastoff;
+ int found = 0;
+
+ blkbits = inode->i_sb->s_blocksize_bits;
+ startoff = *offset;
+ lastoff = startoff;
+ endoff = (map->m_lblk + map->m_len) << blkbits;
+
+ index = startoff >> PAGE_CACHE_SHIFT;
+ end = endoff >> PAGE_CACHE_SHIFT;
+
+ pagevec_init(&pvec, 0);
+ do {
+ int i, num;
+ unsigned long nr_pages;
+
+ num = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
+ nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
+ (pgoff_t)num);
+ if (nr_pages == 0) {
+ if (whence == SEEK_DATA)
+ break;
+
+ BUG_ON(whence != SEEK_HOLE);
+ /*
+ * If this is the first time to go into the loop and
+ * offset is not beyond the end offset, it will be a
+ * hole at this offset
+ */
+ if (lastoff == startoff || lastoff < endoff)
+ found = 1;
+ break;
+ }
+
+ /*
+ * If this is the first time to go into the loop and
+ * offset is smaller than the first page offset, it will be a
+ * hole at this offset.
+ */
+ if (lastoff == startoff && whence == SEEK_HOLE &&
+ lastoff < page_offset(pvec.pages[0])) {
+ found = 1;
+ break;
+ }
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ struct buffer_head *bh, *head;
+
+ /*
+ * If the current offset is not beyond the end of given
+ * range, it will be a hole.
+ */
+ if (lastoff < endoff && whence == SEEK_HOLE &&
+ page->index > end) {
+ found = 1;
+ *offset = lastoff;
+ goto out;
+ }
+
+ lock_page(page);
+
+ if (unlikely(page->mapping != inode->i_mapping)) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (!page_has_buffers(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (page_has_buffers(page)) {
+ lastoff = page_offset(page);
+ bh = head = page_buffers(page);
+ do {
+ if (buffer_uptodate(bh) ||
+ buffer_unwritten(bh)) {
+ if (whence == SEEK_DATA)
+ found = 1;
+ } else {
+ if (whence == SEEK_HOLE)
+ found = 1;
+ }
+ if (found) {
+ *offset = max_t(loff_t,
+ startoff, lastoff);
+ unlock_page(page);
+ goto out;
+ }
+ lastoff += bh->b_size;
+ bh = bh->b_this_page;
+ } while (bh != head);
+ }
+
+ lastoff = page_offset(page) + PAGE_SIZE;
+ unlock_page(page);
+ }
+
+ /*
+ * We got fewer pages than we asked for, so there must be a
+ * hole in this range.
+ */
+ if (nr_pages < num && whence == SEEK_HOLE) {
+ found = 1;
+ *offset = lastoff;
+ break;
+ }
+
+ index = pvec.pages[i - 1]->index + 1;
+ pagevec_release(&pvec);
+ } while (index <= end);
+
+out:
+ pagevec_release(&pvec);
+ return found;
+}
+
+/*
+ * ext4_seek_data() retrieves the offset for SEEK_DATA.
+ */
+static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct ext4_map_blocks map;
+ struct extent_status es;
+ ext4_lblk_t start, last, end;
+ loff_t dataoff, isize;
+ int blkbits;
+ int ret = 0;
+
+ mutex_lock(&inode->i_mutex);
+
+ isize = i_size_read(inode);
+ if (offset >= isize) {
+ mutex_unlock(&inode->i_mutex);
+ return -ENXIO;
+ }
+
+ blkbits = inode->i_sb->s_blocksize_bits;
+ start = offset >> blkbits;
+ last = start;
+ end = isize >> blkbits;
+ dataoff = offset;
+
+ do {
+ map.m_lblk = last;
+ map.m_len = end - last + 1;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
+ if (last != start)
+ dataoff = last << blkbits;
+ break;
+ }
+
+ /*
+ * If there is a delayed extent at this offset,
+ * it counts as data.
+ */
+ es.start = last;
+ (void)ext4_es_find_extent(inode, &es);
+ if (last >= es.start &&
+ last < es.start + es.len) {
+ if (last != start)
+ dataoff = last << blkbits;
+ break;
+ }
+
+ /*
+ * If there is an unwritten extent at this offset,
+ * it is treated as data or as a hole depending on whether
+ * the page cache has data for it or not.
+ */
+ if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+ int unwritten;
+ unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
+ &map, &dataoff);
+ if (unwritten)
+ break;
+ }
+
+ last++;
+ dataoff = last << blkbits;
+ } while (last <= end);
+
+ mutex_unlock(&inode->i_mutex);
+
+ if (dataoff > isize)
+ return -ENXIO;
+
+ if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
+ return -EINVAL;
+ if (dataoff > maxsize)
+ return -EINVAL;
+
+ if (dataoff != file->f_pos) {
+ file->f_pos = dataoff;
+ file->f_version = 0;
+ }
+
+ return dataoff;
+}
+
+/*
+ * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
+ */
+static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct ext4_map_blocks map;
+ struct extent_status es;
+ ext4_lblk_t start, last, end;
+ loff_t holeoff, isize;
+ int blkbits;
+ int ret = 0;
+
+ mutex_lock(&inode->i_mutex);
+
+ isize = i_size_read(inode);
+ if (offset >= isize) {
+ mutex_unlock(&inode->i_mutex);
+ return -ENXIO;
+ }
+
+ blkbits = inode->i_sb->s_blocksize_bits;
+ start = offset >> blkbits;
+ last = start;
+ end = isize >> blkbits;
+ holeoff = offset;
+
+ do {
+ map.m_lblk = last;
+ map.m_len = end - last + 1;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
+ last += ret;
+ holeoff = last << blkbits;
+ continue;
+ }
+
+ /*
+ * If there is a delayed extent at this offset,
+ * we skip over this extent.
+ */
+ es.start = last;
+ (void)ext4_es_find_extent(inode, &es);
+ if (last >= es.start &&
+ last < es.start + es.len) {
+ last = es.start + es.len;
+ holeoff = last << blkbits;
+ continue;
+ }
+
+ /*
+ * If there is an unwritten extent at this offset,
+ * it is treated as data or as a hole depending on whether
+ * the page cache has data for it or not.
+ */
+ if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+ int unwritten;
+ unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
+ &map, &holeoff);
+ if (!unwritten) {
+ last += ret;
+ holeoff = last << blkbits;
+ continue;
+ }
+ }
+
+ /* find a hole */
+ break;
+ } while (last <= end);
+
+ mutex_unlock(&inode->i_mutex);
+
+ if (holeoff > isize)
+ holeoff = isize;
+
+ if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
+ return -EINVAL;
+ if (holeoff > maxsize)
+ return -EINVAL;
+
+ if (holeoff != file->f_pos) {
+ file->f_pos = holeoff;
+ file->f_version = 0;
+ }
+
+ return holeoff;
+}
+
+/*
+ * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
+ * by calling generic_file_llseek_size() with the appropriate maxbytes
+ * value for each.
+ */
+loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
loff_t maxbytes;
@@ -225,7 +611,19 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
else
maxbytes = inode->i_sb->s_maxbytes;
- return generic_file_llseek_size(file, offset, origin, maxbytes);
+ switch (whence) {
+ case SEEK_SET:
+ case SEEK_CUR:
+ case SEEK_END:
+ return generic_file_llseek_size(file, offset, whence,
+ maxbytes, i_size_read(inode));
+ case SEEK_DATA:
+ return ext4_seek_data(file, offset, maxbytes);
+ case SEEK_HOLE:
+ return ext4_seek_hole(file, offset, maxbytes);
+ }
+
+ return -EINVAL;
}
const struct file_operations ext4_file_operations = {
@@ -250,12 +648,10 @@ const struct file_operations ext4_file_operations = {
const struct inode_operations ext4_file_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_getattr,
-#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
-#endif
.get_acl = ext4_get_acl,
.fiemap = ext4_fiemap,
};
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index bb6c7d81131..3278e64e57b 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -34,87 +34,6 @@
#include <trace/events/ext4.h>
-static void dump_completed_IO(struct inode * inode)
-{
-#ifdef EXT4FS_DEBUG
- struct list_head *cur, *before, *after;
- ext4_io_end_t *io, *io0, *io1;
- unsigned long flags;
-
- if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
- ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
- return;
- }
-
- ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
- spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
- list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
- cur = &io->list;
- before = cur->prev;
- io0 = container_of(before, ext4_io_end_t, list);
- after = cur->next;
- io1 = container_of(after, ext4_io_end_t, list);
-
- ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
- io, inode->i_ino, io0, io1);
- }
- spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-#endif
-}
-
-/*
- * This function is called from ext4_sync_file().
- *
- * When IO is completed, the work to convert unwritten extents to
- * written is queued on workqueue but may not get immediately
- * scheduled. When fsync is called, we need to ensure the
- * conversion is complete before fsync returns.
- * The inode keeps track of a list of pending/completed IO that
- * might needs to do the conversion. This function walks through
- * the list and convert the related unwritten extents for completed IO
- * to written.
- * The function return the number of pending IOs on success.
- */
-int ext4_flush_completed_IO(struct inode *inode)
-{
- ext4_io_end_t *io;
- struct ext4_inode_info *ei = EXT4_I(inode);
- unsigned long flags;
- int ret = 0;
- int ret2 = 0;
-
- dump_completed_IO(inode);
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- while (!list_empty(&ei->i_completed_io_list)){
- io = list_entry(ei->i_completed_io_list.next,
- ext4_io_end_t, list);
- list_del_init(&io->list);
- io->flag |= EXT4_IO_END_IN_FSYNC;
- /*
- * Calling ext4_end_io_nolock() to convert completed
- * IO to written.
- *
- * When ext4_sync_file() is called, run_queue() may already
- * about to flush the work corresponding to this io structure.
- * It will be upset if it founds the io structure related
- * to the work-to-be schedule is freed.
- *
- * Thus we need to keep the io structure still valid here after
- * conversion finished. The io structure has a flag to
- * avoid double converting from both fsync and background work
- * queue work.
- */
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- ret = ext4_end_io_nolock(io);
- if (ret < 0)
- ret2 = ret;
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- io->flag &= ~EXT4_IO_END_IN_FSYNC;
- }
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- return (ret2 < 0) ? ret2 : 0;
-}
-
/*
* If we're not journaling and this is a just-created file, we have to
* sync our parent directory (if it was freshly created) since
@@ -125,7 +44,6 @@ int ext4_flush_completed_IO(struct inode *inode)
*/
static int ext4_sync_parent(struct inode *inode)
{
- struct writeback_control wbc;
struct dentry *dentry = NULL;
struct inode *next;
int ret = 0;
@@ -135,14 +53,7 @@ static int ext4_sync_parent(struct inode *inode)
inode = igrab(inode);
while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
- dentry = NULL;
- spin_lock(&inode->i_lock);
- if (!list_empty(&inode->i_dentry)) {
- dentry = list_first_entry(&inode->i_dentry,
- struct dentry, d_alias);
- dget(dentry);
- }
- spin_unlock(&inode->i_lock);
+ dentry = d_find_any_alias(inode);
if (!dentry)
break;
next = igrab(dentry->d_parent->d_inode);
@@ -154,10 +65,7 @@ static int ext4_sync_parent(struct inode *inode)
ret = sync_mapping_buffers(inode->i_mapping);
if (ret)
break;
- memset(&wbc, 0, sizeof(wbc));
- wbc.sync_mode = WB_SYNC_ALL;
- wbc.nr_to_write = 0; /* only write out the inode */
- ret = sync_inode(inode, &wbc);
+ ret = sync_inode_metadata(inode, 1);
if (ret)
break;
}
@@ -201,8 +109,6 @@ static int __sync_inode(struct inode *inode, int datasync)
*
* What we do is just kick off a commit and wait on it. This will snapshot the
* inode to disk.
- *
- * i_mutex lock is held when entering and exiting this function
*/
int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
@@ -210,7 +116,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct inode *inode = file->f_mapping->host;
struct ext4_inode_info *ei = EXT4_I(inode);
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- int ret;
+ int ret, err;
tid_t commit_tid;
bool needs_barrier = false;
@@ -226,13 +132,13 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (inode->i_sb->s_flags & MS_RDONLY)
goto out;
- ret = ext4_flush_completed_IO(inode);
+ ret = ext4_flush_unwritten_io(inode);
if (ret < 0)
goto out;
if (!journal) {
ret = __sync_inode(inode, datasync);
- if (!ret && !list_empty(&inode->i_dentry))
+ if (!ret && !hlist_empty(&inode->i_dentry))
ret = ext4_sync_parent(inode);
goto out;
}
@@ -262,8 +168,11 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
needs_barrier = true;
jbd2_log_start_commit(journal, commit_tid);
ret = jbd2_log_wait_commit(journal, commit_tid);
- if (needs_barrier)
- blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ if (needs_barrier) {
+ err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ if (!ret)
+ ret = err;
+ }
out:
mutex_unlock(&inode->i_mutex);
trace_ext4_sync_file_exit(inode, ret);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index d48e8b14928..3f32c801244 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -315,7 +315,6 @@ out:
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
if (!fatal)
fatal = err;
- ext4_mark_super_dirty(sb);
} else
ext4_error(sb, "bit already cleared for inode %lu", ino);
@@ -698,6 +697,15 @@ got_group:
if (!gdp)
goto fail;
+ /*
+ * Check free inodes count before loading bitmap.
+ */
+ if (ext4_free_inodes_count(sb, gdp) == 0) {
+ if (++group == ngroups)
+ group = 0;
+ continue;
+ }
+
brelse(inode_bitmap_bh);
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
if (!inode_bitmap_bh)
@@ -717,6 +725,10 @@ repeat_in_this_group:
"inode=%lu", ino + 1);
continue;
}
+ BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
+ if (err)
+ goto fail;
ext4_lock_group(sb, group);
ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
ext4_unlock_group(sb, group);
@@ -730,6 +742,11 @@ repeat_in_this_group:
goto out;
got:
+ BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
+ if (err)
+ goto fail;
+
/* We may have to initialize the block bitmap if it isn't already */
if (ext4_has_group_desc_csum(sb) &&
gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
@@ -745,7 +762,6 @@ got:
BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);
- brelse(block_bitmap_bh);
/* recheck and clear flag under lock if we still need to */
ext4_lock_group(sb, group);
@@ -754,22 +770,16 @@ got:
ext4_free_group_clusters_set(sb, gdp,
ext4_free_clusters_after_init(sb, group, gdp));
ext4_block_bitmap_csum_set(sb, group, gdp,
- block_bitmap_bh,
- EXT4_BLOCKS_PER_GROUP(sb) /
- 8);
+ block_bitmap_bh);
ext4_group_desc_csum_set(sb, group, gdp);
}
ext4_unlock_group(sb, group);
+ brelse(block_bitmap_bh);
if (err)
goto fail;
}
- BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
- if (err)
- goto fail;
-
BUFFER_TRACE(group_desc_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, group_desc_bh);
if (err)
@@ -817,11 +827,6 @@ got:
}
ext4_unlock_group(sb, group);
- BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
- if (err)
- goto fail;
-
BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
if (err)
@@ -830,7 +835,6 @@ got:
percpu_counter_dec(&sbi->s_freeinodes_counter);
if (S_ISDIR(mode))
percpu_counter_inc(&sbi->s_dirs_counter);
- ext4_mark_super_dirty(sb);
if (sbi->s_log_groups_per_flex) {
flex_group = ext4_flex_group(sbi, group);
@@ -898,6 +902,10 @@ got:
ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
+ ei->i_inline_off = 0;
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA))
+ ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+
ret = inode;
dquot_initialize(inode);
err = dquot_alloc_inode(inode);
@@ -1054,7 +1062,8 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
if (!bitmap_bh)
continue;
- x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
+ x = ext4_count_free(bitmap_bh->b_data,
+ EXT4_INODES_PER_GROUP(sb) / 8);
printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
(unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
bitmap_count += x;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 830e1b2bf14..20862f96e8a 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -22,6 +22,7 @@
#include "ext4_jbd2.h"
#include "truncate.h"
+#include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */
#include <trace/events/ext4.h>
@@ -755,8 +756,7 @@ cleanup:
partial--;
}
out:
- trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
- map->m_pblk, map->m_len, err);
+ trace_ext4_ind_map_blocks_exit(inode, map, err);
return err;
}
@@ -807,16 +807,30 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
retry:
if (rw == READ && ext4_should_dioread_nolock(inode)) {
- if (unlikely(!list_empty(&ei->i_completed_io_list))) {
+ if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) {
mutex_lock(&inode->i_mutex);
- ext4_flush_completed_IO(inode);
+ ext4_flush_unwritten_io(inode);
mutex_unlock(&inode->i_mutex);
}
+ /*
+ * Nolock dioread optimization may be dynamically disabled
+ * via ext4_inode_block_unlocked_dio(). Check inode's state
+ * while holding extra i_dio_count ref.
+ */
+ atomic_inc(&inode->i_dio_count);
+ smp_mb();
+ if (unlikely(ext4_test_inode_state(inode,
+ EXT4_STATE_DIOREAD_LOCK))) {
+ inode_dio_done(inode);
+ goto locked;
+ }
ret = __blockdev_direct_IO(rw, iocb, inode,
inode->i_sb->s_bdev, iov,
offset, nr_segs,
ext4_get_block, NULL, NULL, 0);
+ inode_dio_done(inode);
} else {
+locked:
ret = blockdev_direct_IO(rw, iocb, inode, iov,
offset, nr_segs, ext4_get_block);
@@ -1398,6 +1412,7 @@ void ext4_ind_truncate(struct inode *inode)
down_write(&ei->i_data_sem);
ext4_discard_preallocations(inode);
+ ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);
/*
* The orphan list entry will now protect us from any crash which
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
new file mode 100644
index 00000000000..387c47c6cda
--- /dev/null
+++ b/fs/ext4/inline.c
@@ -0,0 +1,1884 @@
+/*
+ * Copyright (c) 2012 Taobao.
+ * Written by Tao Ma <boyu.mt@taobao.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+#include "ext4_jbd2.h"
+#include "ext4.h"
+#include "xattr.h"
+#include "truncate.h"
+#include <linux/fiemap.h>
+
+#define EXT4_XATTR_SYSTEM_DATA "data"
+#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS))
+#define EXT4_INLINE_DOTDOT_SIZE 4
+
+/*
+ * Size in bytes of the inline data area recorded for this inode, or 0
+ * when no inline-data xattr entry offset (i_inline_off) is set.
+ */
+int ext4_get_inline_size(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	return ei->i_inline_off ? ei->i_inline_size : 0;
+}
+
+/*
+ * Compute how many bytes the value of the "system.data" xattr could
+ * occupy inside the inode body described by @iloc.
+ *
+ * When the inode has no in-inode xattr state, the answer is derived
+ * purely from the inode geometry.  Otherwise the existing entries are
+ * walked to find the lowest value offset; if an inline-data entry already
+ * exists, its current value size is counted as available too.
+ */
+static int get_max_inline_xattr_value_size(struct inode *inode,
+					   struct ext4_iloc *iloc)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_xattr_entry *entry;
+	struct ext4_inode *raw_inode;
+	int avail, min_offs;
+
+	min_offs = EXT4_SB(inode->i_sb)->s_inode_size -
+			EXT4_GOOD_OLD_INODE_SIZE -
+			ei->i_extra_isize -
+			sizeof(struct ext4_xattr_ibody_header);
+
+	/*
+	 * We need to subtract another sizeof(__u32) since an in-inode xattr
+	 * needs an empty 4 bytes to indicate the gap between the xattr entry
+	 * and the name/value pair.
+	 */
+	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
+		return EXT4_XATTR_SIZE(min_offs -
+			EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) -
+			EXT4_XATTR_ROUND - sizeof(__u32));
+
+	raw_inode = ext4_raw_inode(iloc);
+	header = IHDR(inode, raw_inode);
+
+	/* Lower min_offs to the smallest in-inode value offset in use. */
+	entry = IFIRST(header);
+	while (!IS_LAST_ENTRY(entry)) {
+		if (!entry->e_value_block && entry->e_value_size) {
+			size_t offs = le16_to_cpu(entry->e_value_offs);
+
+			if (offs < min_offs)
+				min_offs = offs;
+		}
+		entry = EXT4_XATTR_NEXT(entry);
+	}
+
+	/* Gap between the end of the entry table and the lowest value. */
+	avail = min_offs -
+		((void *)entry - (void *)IFIRST(header)) - sizeof(__u32);
+
+	if (ei->i_inline_off) {
+		/* The existing inline-data value is counted as usable too. */
+		entry = (struct ext4_xattr_entry *)
+			((void *)raw_inode + ei->i_inline_off);
+		avail += le32_to_cpu(entry->e_value_size);
+		return avail;
+	}
+
+	/* A new entry would have to be inserted; account for it. */
+	avail -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA));
+	if (avail <= EXT4_XATTR_ROUND)
+		return 0;
+
+	return EXT4_XATTR_SIZE(avail - EXT4_XATTR_ROUND);
+}
+
+/*
+ * Largest amount of data that can currently be kept inside the inode.
+ * If no space can be found for the xattr entry, the i_block area is not
+ * offered either, because there would be nothing to flag the inline data.
+ * Returns 0 when inline data is impossible or the inode location cannot
+ * be read.
+ */
+int ext4_get_max_inline_size(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_iloc iloc;
+	int xattr_room;
+
+	if (ei->i_extra_isize == 0)
+		return 0;
+
+	if (ext4_get_inode_loc(inode, &iloc)) {
+		ext4_error_inode(inode, __func__, __LINE__, 0,
+				 "can't get inode location %lu",
+				 inode->i_ino);
+		return 0;
+	}
+
+	down_read(&ei->xattr_sem);
+	xattr_room = get_max_inline_xattr_value_size(inode, &iloc);
+	up_read(&ei->xattr_sem);
+
+	brelse(iloc.bh);
+
+	if (xattr_room == 0)
+		return 0;
+
+	return xattr_room + EXT4_MIN_INLINE_DATA_SIZE;
+}
+
+/*
+ * Non-zero when the inode has the INLINE_DATA flag set and an
+ * inline-data xattr offset has been recorded for it.
+ */
+int ext4_has_inline_data(struct inode *inode)
+{
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
+		return 0;
+
+	return EXT4_I(inode)->i_inline_off != 0;
+}
+
+/*
+ * Look up the "system.data" xattr in the inode body and, when present,
+ * cache its offset and the resulting inline size in the in-memory inode.
+ *
+ * This function does not take xattr_sem, which is OK because it is
+ * currently only used in a code path coming from ext4_iget, before
+ * the new inode has been unlocked.
+ */
+int ext4_find_inline_data_nolock(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+	};
+	struct ext4_xattr_info i = {
+		.name_index = EXT4_XATTR_INDEX_SYSTEM,
+		.name = EXT4_XATTR_SYSTEM_DATA,
+	};
+	int error;
+
+	/* i_extra_isize == 0: nothing to look up. */
+	if (ei->i_extra_isize == 0)
+		return 0;
+
+	error = ext4_get_inode_loc(inode, &is.iloc);
+	if (error)
+		return error;
+
+	error = ext4_xattr_ibody_find(inode, &i, &is);
+	if (!error && !is.s.not_found) {
+		void *raw = (void *)ext4_raw_inode(&is.iloc);
+
+		ei->i_inline_off = (u16)((void *)is.s.here - raw);
+		ei->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
+				le32_to_cpu(is.s.here->e_value_size);
+		ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+	}
+
+	brelse(is.iloc.bh);
+	return error;
+}
+
+/*
+ * Copy up to @len bytes of inline data into @buffer.  The first
+ * EXT4_MIN_INLINE_DATA_SIZE bytes come from i_block, the remainder from
+ * the value of the inline-data xattr (clamped to that value's size).
+ * Returns the number of bytes copied.
+ */
+static int ext4_read_inline_data(struct inode *inode, void *buffer,
+				 unsigned int len,
+				 struct ext4_iloc *iloc)
+{
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_xattr_entry *entry;
+	struct ext4_inode *raw_inode;
+	int copied;
+
+	if (!len)
+		return 0;
+
+	BUG_ON(len > EXT4_I(inode)->i_inline_size);
+
+	/* Part one: whatever fits in i_block. */
+	if (len < EXT4_MIN_INLINE_DATA_SIZE)
+		copied = len;
+	else
+		copied = EXT4_MIN_INLINE_DATA_SIZE;
+
+	raw_inode = ext4_raw_inode(iloc);
+	memcpy(buffer, (void *)(raw_inode->i_block), copied);
+
+	len -= copied;
+	if (!len)
+		return copied;
+	buffer += copied;
+
+	/* Part two: the tail stored as the xattr value. */
+	header = IHDR(inode, raw_inode);
+	entry = (struct ext4_xattr_entry *)((void *)raw_inode +
+					    EXT4_I(inode)->i_inline_off);
+	len = min_t(unsigned int, len,
+		    (unsigned int)le32_to_cpu(entry->e_value_size));
+
+	memcpy(buffer,
+	       (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len);
+	copied += len;
+
+	return copied;
+}
+
+/*
+ * Write @len bytes, taken from @buffer + @pos, to offset @pos of the
+ * inline data area.  Bytes below EXT4_MIN_INLINE_DATA_SIZE land in
+ * i_block; anything beyond goes into the value of the inline-data xattr.
+ * The inode must already have inline data and the range must fit.
+ */
+void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
+			    void *buffer, loff_t pos, unsigned int len)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_xattr_entry *entry;
+	struct ext4_inode *raw_inode;
+	int chunk = 0;
+
+	BUG_ON(!ei->i_inline_off);
+	BUG_ON(pos + len > ei->i_inline_size);
+
+	raw_inode = ext4_raw_inode(iloc);
+	buffer += pos;
+
+	if (pos < EXT4_MIN_INLINE_DATA_SIZE) {
+		/* Clamp the first copy to the end of i_block. */
+		if (pos + len > EXT4_MIN_INLINE_DATA_SIZE)
+			chunk = EXT4_MIN_INLINE_DATA_SIZE - pos;
+		else
+			chunk = len;
+		memcpy((void *)raw_inode->i_block + pos, buffer, chunk);
+
+		len -= chunk;
+		buffer += chunk;
+		pos += chunk;
+	}
+
+	if (!len)
+		return;
+
+	/* The rest is addressed relative to the start of the xattr value. */
+	pos -= EXT4_MIN_INLINE_DATA_SIZE;
+	header = IHDR(inode, raw_inode);
+	entry = (struct ext4_xattr_entry *)((void *)raw_inode +
+					    ei->i_inline_off);
+
+	memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos,
+	       buffer, len);
+}
+
+/*
+ * Insert the "system.data" xattr entry for @inode with room for @len
+ * bytes of inline data.  The first EXT4_MIN_INLINE_DATA_SIZE bytes are
+ * accounted to i_block; only the remainder becomes the xattr value
+ * length.  On success i_block is zeroed, the cached inline offset/size
+ * are set, the EXTENTS flag is cleared and INLINE_DATA is set.
+ *
+ * Returns 0 or a negative errno.  When the xattr insertion reports
+ * -ENOSPC, EXT4_STATE_MAY_INLINE_DATA is cleared.
+ */
+static int ext4_create_inline_data(handle_t *handle,
+				   struct inode *inode, unsigned len)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+	};
+	struct ext4_xattr_info i = {
+		.name_index = EXT4_XATTR_INDEX_SYSTEM,
+		.name = EXT4_XATTR_SYSTEM_DATA,
+	};
+	struct ext4_inode *raw_inode;
+	int error;
+
+	error = ext4_get_inode_loc(inode, &is.iloc);
+	if (error)
+		return error;
+
+	error = ext4_journal_get_write_access(handle, is.iloc.bh);
+	if (error)
+		goto out;
+
+	/* Describe the xattr entry to insert. */
+	if (len > EXT4_MIN_INLINE_DATA_SIZE) {
+		len -= EXT4_MIN_INLINE_DATA_SIZE;
+		i.value = EXT4_ZERO_XATTR_VALUE;
+	} else {
+		len = 0;
+		i.value = "";
+	}
+	i.value_len = len;
+
+	error = ext4_xattr_ibody_find(inode, &i, &is);
+	if (error)
+		goto out;
+
+	BUG_ON(!is.s.not_found);
+
+	error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+	if (error == -ENOSPC)
+		ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+	if (error)
+		goto out;
+
+	raw_inode = ext4_raw_inode(&is.iloc);
+	memset((void *)raw_inode->i_block, 0, EXT4_MIN_INLINE_DATA_SIZE);
+
+	ei->i_inline_off = (u16)((void *)is.s.here - (void *)raw_inode);
+	ei->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE;
+	ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+	ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA);
+	get_bh(is.iloc.bh);
+	error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+
+out:
+	brelse(is.iloc.bh);
+	return error;
+}
+
+/*
+ * Grow the existing "system.data" xattr of @inode so that the inline
+ * area can hold @len bytes in total (the i_block part included).
+ *
+ * Nothing is done when the current inline size already covers @len.
+ * Otherwise the current xattr value is read into a zero-filled buffer of
+ * the new size and written back with the larger length, after which the
+ * cached inline offset and size are refreshed.
+ *
+ * Returns 0 on success or a negative errno; -ENOMEM if the temporary
+ * value buffer cannot be allocated.
+ */
+static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
+				   unsigned int len)
+{
+	int error;
+	void *value = NULL;
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+	};
+	struct ext4_xattr_info i = {
+		.name_index = EXT4_XATTR_INDEX_SYSTEM,
+		.name = EXT4_XATTR_SYSTEM_DATA,
+	};
+
+	/* If the old space is ok, write the data directly. */
+	if (len <= EXT4_I(inode)->i_inline_size)
+		return 0;
+
+	error = ext4_get_inode_loc(inode, &is.iloc);
+	if (error)
+		return error;
+
+	error = ext4_xattr_ibody_find(inode, &i, &is);
+	if (error)
+		goto out;
+
+	BUG_ON(is.s.not_found);
+
+	len -= EXT4_MIN_INLINE_DATA_SIZE;
+	value = kzalloc(len, GFP_NOFS);
+	if (!value) {
+		/*
+		 * error is still 0 from the successful lookup above; report
+		 * the allocation failure instead of pretending success.
+		 */
+		error = -ENOMEM;
+		goto out;
+	}
+
+	/* Only -ENODATA aborts here; other results fall through. */
+	error = ext4_xattr_ibody_get(inode, i.name_index, i.name,
+				     value, len);
+	if (error == -ENODATA)
+		goto out;
+
+	error = ext4_journal_get_write_access(handle, is.iloc.bh);
+	if (error)
+		goto out;
+
+	/* Update the xattr entry. */
+	i.value = value;
+	i.value_len = len;
+
+	error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+	if (error)
+		goto out;
+
+	EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
+				      (void *)ext4_raw_inode(&is.iloc));
+	EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
+				le32_to_cpu(is.s.here->e_value_size);
+	ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+	get_bh(is.iloc.bh);
+	error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+
+out:
+	kfree(value);
+	brelse(is.iloc.bh);
+	return error;
+}
+
+/*
+ * Make sure the inode can hold @len bytes of inline data, growing the
+ * existing inline area or creating one, with xattr_sem held for writing.
+ * Returns -ENOSPC when EXT4_STATE_MAY_INLINE_DATA is clear or @len is
+ * larger than the maximum inline size; otherwise the result of the
+ * create/update helper.
+ */
+int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
+			     unsigned int len)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int ret, size;
+
+	if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
+		return -ENOSPC;
+
+	size = ext4_get_max_inline_size(inode);
+	if (size < len)
+		return -ENOSPC;
+
+	down_write(&ei->xattr_sem);
+	if (!ei->i_inline_off)
+		ret = ext4_create_inline_data(handle, inode, len);
+	else
+		ret = ext4_update_inline_data(handle, inode, len);
+	up_write(&ei->xattr_sem);
+
+	return ret;
+}
+
+/*
+ * Tear down the inline data of @inode.  No locking is done in this
+ * function itself; the callers in this file hold xattr_sem for writing.
+ *
+ * The "system.data" xattr is set with a NULL value and zero length,
+ * i_block is zeroed and the INLINE_DATA flag is cleared.  On file
+ * systems with the extents feature, directories, regular files and
+ * symlinks get the EXTENTS flag set and their extent tree initialised.
+ * Finally the cached i_inline_off/i_inline_size are reset and
+ * EXT4_STATE_MAY_INLINE_DATA is cleared.
+ *
+ * Returns 0 on success (also when there is no inline data, or when a
+ * step reports -ENODATA) or a negative errno.
+ */
+static int ext4_destroy_inline_data_nolock(handle_t *handle,
+ struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_xattr_ibody_find is = {
+ .s = { .not_found = 0, },
+ };
+ struct ext4_xattr_info i = {
+ .name_index = EXT4_XATTR_INDEX_SYSTEM,
+ .name = EXT4_XATTR_SYSTEM_DATA,
+ .value = NULL,
+ .value_len = 0,
+ };
+ int error;
+
+ /* Nothing to destroy if no inline-data entry is recorded. */
+ if (!ei->i_inline_off)
+ return 0;
+
+ error = ext4_get_inode_loc(inode, &is.iloc);
+ if (error)
+ return error;
+
+ error = ext4_xattr_ibody_find(inode, &i, &is);
+ if (error)
+ goto out;
+
+ error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ if (error)
+ goto out;
+
+ /* NULL value / zero length: presumably removes the entry - confirm. */
+ error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+ if (error)
+ goto out;
+
+ memset((void *)ext4_raw_inode(&is.iloc)->i_block,
+ 0, EXT4_MIN_INLINE_DATA_SIZE);
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ if (S_ISDIR(inode->i_mode) ||
+ S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) {
+ ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
+ ext4_ext_tree_init(handle, inode);
+ }
+ }
+ ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA);
+
+ /* Extra reference so the brelse() at out: still has one to drop. */
+ get_bh(is.iloc.bh);
+ error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+
+ EXT4_I(inode)->i_inline_off = 0;
+ EXT4_I(inode)->i_inline_size = 0;
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+out:
+ brelse(is.iloc.bh);
+ /* -ENODATA is deliberately reported as success. */
+ if (error == -ENODATA)
+ error = 0;
+ return error;
+}
+
+/*
+ * Fill the locked page 0 of @inode from its inline data, zero the rest
+ * of the page and mark it uptodate.
+ *
+ * Returns the value of ext4_read_inline_data() (bytes copied), 0 when
+ * the inode has no inline offset recorded, or the error from
+ * ext4_get_inode_loc().
+ */
+static int ext4_read_inline_page(struct inode *inode, struct page *page)
+{
+	struct ext4_iloc iloc;
+	void *kaddr;
+	size_t len;
+	int ret;
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(!ext4_has_inline_data(inode));
+	BUG_ON(page->index);
+
+	if (!EXT4_I(inode)->i_inline_off) {
+		ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.",
+			     inode->i_ino);
+		return 0;
+	}
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret)
+		return ret;
+
+	/* Never copy more than the file size says is valid. */
+	len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode));
+
+	kaddr = kmap_atomic(page);
+	ret = ext4_read_inline_data(inode, kaddr, len, &iloc);
+	flush_dcache_page(page);
+	kunmap_atomic(kaddr);
+
+	zero_user_segment(page, len, PAGE_CACHE_SIZE);
+	SetPageUptodate(page);
+	brelse(iloc.bh);
+
+	return ret;
+}
+
+/*
+ * readpage helper for inodes with inline data.  Returns -EAGAIN (page
+ * left locked) when the inode turns out not to have inline data;
+ * otherwise unlocks the page and returns 0 or a negative errno.
+ */
+int ext4_readpage_inline(struct inode *inode, struct page *page)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int ret = 0;
+
+	down_read(&ei->xattr_sem);
+	if (!ext4_has_inline_data(inode)) {
+		up_read(&ei->xattr_sem);
+		return -EAGAIN;
+	}
+
+	/*
+	 * Inline data only ever lives in the first page; any other page is
+	 * simply zero-filled and marked uptodate.
+	 */
+	if (page->index == 0) {
+		ret = ext4_read_inline_page(inode, page);
+	} else if (!PageUptodate(page)) {
+		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+		SetPageUptodate(page);
+	}
+
+	up_read(&ei->xattr_sem);
+	unlock_page(page);
+
+	if (ret < 0)
+		return ret;
+	return 0;
+}
+
+/*
+ * Move the inline data of @inode into page 0 of @mapping and destroy the
+ * inline representation so that block mapping can take over.
+ *
+ * A journal handle is started, page 0 is grabbed and (if needed) filled
+ * from the inline data, the inline data is destroyed under xattr_sem and
+ * buffers for [0, inline size) are prepared with __block_write_begin().
+ * If that fails, the page is dropped and the failed write is truncated
+ * via the orphan list; -ENOSPC is retried while
+ * ext4_should_retry_alloc() allows.
+ *
+ * Returns 0 on success (also when the inode has no inline data any more)
+ * or a negative errno.
+ */
+static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
+					      struct inode *inode,
+					      unsigned flags)
+{
+	int ret, needed_blocks;
+	handle_t *handle = NULL;
+	int retries = 0, sem_held = 0;
+	struct page *page = NULL;
+	unsigned from, to;
+	struct ext4_iloc iloc;
+
+	if (!ext4_has_inline_data(inode)) {
+		/*
+		 * clear the flag so that no new write
+		 * will trap here again.
+		 */
+		ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+		return 0;
+	}
+
+	needed_blocks = ext4_writepage_trans_blocks(inode);
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret)
+		return ret;
+
+retry:
+	handle = ext4_journal_start(inode, needed_blocks);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		goto out;
+	}
+
+	/* We cannot recurse into the filesystem as the transaction is already
+	 * started */
+	flags |= AOP_FLAG_NOFS;
+
+	page = grab_cache_page_write_begin(mapping, 0, flags);
+	if (!page) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	down_write(&EXT4_I(inode)->xattr_sem);
+	sem_held = 1;
+	/* If some one has already done this for us, just exit. */
+	if (!ext4_has_inline_data(inode)) {
+		ret = 0;
+		goto out;
+	}
+
+	from = 0;
+	to = ext4_get_inline_size(inode);
+	if (!PageUptodate(page)) {
+		ret = ext4_read_inline_page(inode, page);
+		if (ret < 0)
+			goto out;
+	}
+
+	ret = ext4_destroy_inline_data_nolock(handle, inode);
+	if (ret)
+		goto out;
+
+	if (ext4_should_dioread_nolock(inode))
+		ret = __block_write_begin(page, from, to, ext4_get_block_write);
+	else
+		ret = __block_write_begin(page, from, to, ext4_get_block);
+
+	if (!ret && ext4_should_journal_data(inode)) {
+		ret = ext4_walk_page_buffers(handle, page_buffers(page),
+					     from, to, NULL,
+					     do_journal_get_write_access);
+	}
+
+	if (ret) {
+		unlock_page(page);
+		page_cache_release(page);
+		/*
+		 * The page reference is gone: forget the pointer so that
+		 * neither block_commit_write() nor the exit path below
+		 * touches, unlocks or releases it a second time.
+		 */
+		page = NULL;
+		ext4_orphan_add(handle, inode);
+		up_write(&EXT4_I(inode)->xattr_sem);
+		sem_held = 0;
+		ext4_journal_stop(handle);
+		handle = NULL;
+		ext4_truncate_failed_write(inode);
+		/*
+		 * If truncate failed early the inode might
+		 * still be on the orphan list; we need to
+		 * make sure the inode is removed from the
+		 * orphan list in that case.
+		 */
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+	}
+
+	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
+
+	/* Only commit when the page is still held, i.e. on success. */
+	if (page)
+		block_commit_write(page, from, to);
+out:
+	if (page) {
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	if (sem_held)
+		up_write(&EXT4_I(inode)->xattr_sem);
+	if (handle)
+		ext4_journal_stop(handle);
+	brelse(iloc.bh);
+	return ret;
+}
+
+/*
+ * Try to write data in the inode.
+ * If the inode has inline data, check whether the new write can also be
+ * kept in the inode.  If not, fall back to
+ * ext4_convert_inline_data_to_extent(), which moves the data to the page
+ * so that the later code can create an extent for it.
+ *
+ * Returns 1 when the write can go inline: *pagep then holds the locked,
+ * uptodate page 0 and the journal handle started here is left running.
+ * Returns 0 when the caller should continue with the normal write path,
+ * or a negative errno.  *pagep is only written on the "1" return.
+ */
+int ext4_try_to_write_inline_data(struct address_space *mapping,
+				  struct inode *inode,
+				  loff_t pos, unsigned len,
+				  unsigned flags,
+				  struct page **pagep)
+{
+	int ret;
+	handle_t *handle;
+	struct page *page = NULL;
+	struct ext4_iloc iloc;
+
+	if (pos + len > ext4_get_max_inline_size(inode))
+		goto convert;
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret)
+		return ret;
+
+	/*
+	 * The possible write could happen in the inode,
+	 * so try to reserve the space in inode first.
+	 */
+	handle = ext4_journal_start(inode, 1);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		goto out;
+	}
+
+	ret = ext4_prepare_inline_data(handle, inode, pos + len);
+	if (ret && ret != -ENOSPC)
+		goto out;
+
+	/* We don't have space in inline inode, so convert it to extent. */
+	if (ret == -ENOSPC) {
+		ext4_journal_stop(handle);
+		brelse(iloc.bh);
+		goto convert;
+	}
+
+	flags |= AOP_FLAG_NOFS;
+
+	page = grab_cache_page_write_begin(mapping, 0, flags);
+	if (!page) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	down_read(&EXT4_I(inode)->xattr_sem);
+	if (!ext4_has_inline_data(inode)) {
+		ret = 0;
+		goto out_up_read;
+	}
+
+	if (!PageUptodate(page)) {
+		ret = ext4_read_inline_page(inode, page);
+		if (ret < 0)
+			goto out_up_read;
+	}
+
+	/*
+	 * Success: ownership of the locked page and of the running handle
+	 * passes to the caller, so make the cleanup below skip both.
+	 */
+	*pagep = page;
+	page = NULL;
+	handle = NULL;
+	ret = 1;
+out_up_read:
+	up_read(&EXT4_I(inode)->xattr_sem);
+out:
+	/* Any page still held here was not handed over: drop it. */
+	if (page) {
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	if (handle)
+		ext4_journal_stop(handle);
+	brelse(iloc.bh);
+	return ret;
+convert:
+	return ext4_convert_inline_data_to_extent(mapping,
+						  inode, flags);
+}
+
+/*
+ * Copy the first page's contents for [pos, pos + len) back into the
+ * inline data area after a write.  Returns @copied, or 0 when a short
+ * copy hit a page that is not uptodate or the inode location cannot be
+ * read.  The page is marked uptodate and its dirty bit is cleared.
+ */
+int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
+			       unsigned copied, struct page *page)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_iloc iloc;
+	void *kaddr;
+	int ret;
+
+	/* A short copy into a non-uptodate page cannot be used at all. */
+	if (unlikely(copied < len) && !PageUptodate(page))
+		return 0;
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret) {
+		ext4_std_error(inode->i_sb, ret);
+		return 0;
+	}
+
+	down_write(&ei->xattr_sem);
+	BUG_ON(!ext4_has_inline_data(inode));
+
+	kaddr = kmap_atomic(page);
+	ext4_write_inline_data(inode, &iloc, kaddr, pos, len);
+	kunmap_atomic(kaddr);
+	SetPageUptodate(page);
+	/* clear page dirty so that writepages wouldn't work for us. */
+	ClearPageDirty(page);
+
+	up_write(&ei->xattr_sem);
+	brelse(iloc.bh);
+
+	return copied;
+}
+
+/*
+ * Copy the first @len bytes of @page into the inline data area under
+ * xattr_sem.  Returns the inode's buffer head (reference obtained via
+ * ext4_get_inode_loc() and not dropped here), or NULL if the inode
+ * location cannot be read.
+ */
+struct buffer_head *
+ext4_journalled_write_inline_data(struct inode *inode,
+				  unsigned len,
+				  struct page *page)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_iloc iloc;
+	void *kaddr;
+	int err;
+
+	err = ext4_get_inode_loc(inode, &iloc);
+	if (err) {
+		ext4_std_error(inode->i_sb, err);
+		return NULL;
+	}
+
+	down_write(&ei->xattr_sem);
+	kaddr = kmap_atomic(page);
+	ext4_write_inline_data(inode, &iloc, kaddr, 0, len);
+	kunmap_atomic(kaddr);
+	up_write(&ei->xattr_sem);
+
+	return iloc.bh;
+}
+
+/*
+ * Try to make the page cache and handle ready for the inline data case.
+ * We can call this function in 2 cases:
+ * 1. The inode is created and the first write exceeds inline size. We can
+ *    clear the inode state safely.
+ * 2. The inode has inline data, then we need to read the data, make it
+ *    uptodate and dirty so that ext4_da_writepages can handle it. We don't
+ *    need to start the journal since the file's metadata isn't changed now.
+ *
+ * Returns 0 on success or a negative errno.  On success with inline data
+ * present, *fsdata is set to CONVERT_INLINE_DATA.
+ */
+static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
+						 struct inode *inode,
+						 unsigned flags,
+						 void **fsdata)
+{
+	int ret = 0, inline_size;
+	struct page *page;
+
+	page = grab_cache_page_write_begin(mapping, 0, flags);
+	if (!page)
+		return -ENOMEM;
+
+	down_read(&EXT4_I(inode)->xattr_sem);
+	if (!ext4_has_inline_data(inode)) {
+		ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+		goto out;
+	}
+
+	inline_size = ext4_get_inline_size(inode);
+
+	if (!PageUptodate(page)) {
+		ret = ext4_read_inline_page(inode, page);
+		if (ret < 0)
+			goto out;
+	}
+
+	ret = __block_write_begin(page, 0, inline_size,
+				  ext4_da_get_block_prep);
+	if (ret) {
+		/*
+		 * Drop xattr_sem and the page before truncating: the
+		 * truncate path may need to lock this very page, which
+		 * would block forever while we still hold its lock.
+		 */
+		up_read(&EXT4_I(inode)->xattr_sem);
+		unlock_page(page);
+		page_cache_release(page);
+		ext4_truncate_failed_write(inode);
+		return ret;
+	}
+
+	SetPageDirty(page);
+	SetPageUptodate(page);
+	ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+	*fsdata = (void *)CONVERT_INLINE_DATA;
+
+out:
+	up_read(&EXT4_I(inode)->xattr_sem);
+	unlock_page(page);
+	page_cache_release(page);
+	return ret;
+}
+
+/*
+ * Prepare the write for the inline data.
+ * If the data can be written into the inode, we just read
+ * the page and make it uptodate, and start the journal.
+ * Otherwise read the page, make it dirty so that it can be
+ * handled in writepages (the i_disksize update is left to the
+ * normal ext4_da_write_end).
+ *
+ * Returns 1 with *pagep set to the locked page 0 when the write can go
+ * inline (the journal handle started here is not stopped on that path);
+ * 0 or a negative errno otherwise, with the handle stopped.
+ */
+int ext4_da_write_inline_data_begin(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned flags,
+ struct page **pagep,
+ void **fsdata)
+{
+ int ret, inline_size;
+ handle_t *handle;
+ struct page *page;
+ struct ext4_iloc iloc;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+ handle = ext4_journal_start(inode, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ goto out;
+ }
+
+ inline_size = ext4_get_max_inline_size(inode);
+
+ /* Default to "does not fit" unless the prepare step says otherwise. */
+ ret = -ENOSPC;
+ if (inline_size >= pos + len) {
+ ret = ext4_prepare_inline_data(handle, inode, pos + len);
+ if (ret && ret != -ENOSPC)
+ goto out;
+ }
+
+ /*
+ * No room inline: prepare page 0 for the delayed-allocation path.
+ * NOTE(review): flags is passed on here before AOP_FLAG_NOFS is
+ * added below, although the handle is already started - confirm.
+ */
+ if (ret == -ENOSPC) {
+ ret = ext4_da_convert_inline_data_to_extent(mapping,
+ inode,
+ flags,
+ fsdata);
+ goto out;
+ }
+
+ /*
+ * We cannot recurse into the filesystem as the transaction
+ * is already started.
+ */
+ flags |= AOP_FLAG_NOFS;
+
+ page = grab_cache_page_write_begin(mapping, 0, flags);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ ret = 0;
+ goto out_release_page;
+ }
+
+ if (!PageUptodate(page)) {
+ ret = ext4_read_inline_page(inode, page);
+ if (ret < 0)
+ goto out_release_page;
+ }
+
+ up_read(&EXT4_I(inode)->xattr_sem);
+ *pagep = page;
+ /* Success: the handle stays running; only the local pointer is dropped. */
+ handle = NULL;
+ brelse(iloc.bh);
+ return 1;
+out_release_page:
+ up_read(&EXT4_I(inode)->xattr_sem);
+ unlock_page(page);
+ page_cache_release(page);
+out:
+ if (handle)
+ ext4_journal_stop(handle);
+ brelse(iloc.bh);
+ return ret;
+}
+
+/*
+ * Finish a delayed-allocation write that went into the inline data:
+ * copy the page contents back into the inode, extend i_size if the
+ * write grew the file, release the page and finally mark the inode
+ * dirty.  Returns the number of bytes accepted.
+ */
+int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
+				  unsigned len, unsigned copied,
+				  struct page *page)
+{
+	int size_grew = 0;
+
+	copied = ext4_write_inline_data_end(inode, pos, len, copied, page);
+
+	/*
+	 * i_size cannot change under us because we hold i_mutex, so a
+	 * plain read is enough.  The update must happen before the page
+	 * is unlocked: page writeout could otherwise come in and zero
+	 * beyond i_size.
+	 */
+	if (pos + copied > inode->i_size) {
+		i_size_write(inode, pos + copied);
+		size_grew = 1;
+	}
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	/*
+	 * The inode is dirtied only after the page lock is dropped: doing
+	 * it under the lock would lengthen the page lock hold time and
+	 * force a lock ordering of page lock and transaction start for
+	 * journaling filesystems.
+	 */
+	if (size_grew)
+		mark_inode_dirty(inode);
+
+	return copied;
+}
+
+#ifdef INLINE_DIR_DEBUG
+/*
+ * Debug helper: trace every directory entry found in the inline area
+ * [inline_start, inline_start + inline_size) of @dir and BUG() on the
+ * first entry that fails ext4_check_dir_entry().  Compiled out (empty
+ * macro) unless INLINE_DIR_DEBUG is defined.
+ */
+void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
+			  void *inline_start, int inline_size)
+{
+	int offset;
+	unsigned short de_len;
+	struct ext4_dir_entry_2 *de = inline_start;
+	void *dlimit = inline_start + inline_size;
+
+	trace_printk("inode %lu\n", dir->i_ino);
+	offset = 0;
+	while ((void *)de < dlimit) {
+		de_len = ext4_rec_len_from_disk(de->rec_len, inline_size);
+		/*
+		 * "%.*s" takes name_len as the precision; the name is not
+		 * NUL-terminated, so the length must bound the output.
+		 */
+		trace_printk("de: off %u rlen %u name %.*s nlen %u ino %u\n",
+			     offset, de_len, de->name_len, de->name,
+			     de->name_len, le32_to_cpu(de->inode));
+		if (ext4_check_dir_entry(dir, NULL, de, bh,
+					 inline_start, inline_size, offset))
+			BUG();
+
+		offset += de_len;
+		de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
+	}
+}
+#else
+#define ext4_show_inline_dir(dir, bh, inline_start, inline_size)
+#endif
+
+/*
+ * Add a new entry to the inline directory area
+ * [inline_start, inline_start + inline_size) of the parent of @dentry.
+ *
+ * Returns 1 once the entry has been inserted.  A non-zero result from
+ * ext4_find_dest_de() (per the original note: -ENOSPC when no space is
+ * available, -EIO, or -EEXIST when the entry already exists) or from
+ * ext4_journal_get_write_access() is returned unchanged.
+ */
+static int ext4_add_dirent_to_inline(handle_t *handle,
+				     struct dentry *dentry,
+				     struct inode *inode,
+				     struct ext4_iloc *iloc,
+				     void *inline_start, int inline_size)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+	const char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	struct ext4_dir_entry_2 *de;
+	unsigned short reclen;
+	int rc;
+
+	reclen = EXT4_DIR_REC_LEN(namelen);
+
+	/* Locate the slot that will receive the new entry. */
+	rc = ext4_find_dest_de(dir, inode, iloc->bh,
+			       inline_start, inline_size,
+			       name, namelen, &de);
+	if (rc)
+		return rc;
+
+	rc = ext4_journal_get_write_access(handle, iloc->bh);
+	if (rc)
+		return rc;
+
+	ext4_insert_dentry(inode, de, inline_size, name, namelen);
+	ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
+
+	/*
+	 * XXX shouldn't update any times until successful
+	 * completion of syscall, but too many callers depend
+	 * on this.
+	 *
+	 * XXX similarly, too many callers depend on
+	 * ext4_new_inode() setting the times, but error
+	 * recovery deletes the inode, so the worst that can
+	 * happen is that the times are slightly out of date
+	 * and/or different from the directory change time.
+	 */
+	dir->i_ctime = ext4_current_time(dir);
+	dir->i_mtime = dir->i_ctime;
+	ext4_update_dx_flag(dir);
+	dir->i_version++;
+	ext4_mark_inode_dirty(handle, dir);
+
+	return 1;
+}
+
+/*
+ * Address of the inline-data xattr value inside the raw inode described
+ * by @iloc.  The inode must have an inline-data entry recorded.
+ */
+static void *ext4_get_inline_xattr_pos(struct inode *inode,
+				       struct ext4_iloc *iloc)
+{
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_xattr_entry *entry;
+	struct ext4_inode *raw_inode;
+
+	BUG_ON(!EXT4_I(inode)->i_inline_off);
+
+	raw_inode = ext4_raw_inode(iloc);
+	header = IHDR(inode, raw_inode);
+	entry = (struct ext4_xattr_entry *)((void *)raw_inode +
+					    EXT4_I(inode)->i_inline_off);
+
+	return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs);
+}
+
+/*
+ * Stretch the last directory entry in @de_buf so that the entries cover
+ * @new_size bytes instead of @old_size.  With @old_size == 0 the buffer
+ * is new, and a single empty entry spanning @new_size is written.
+ */
+static void ext4_update_final_de(void *de_buf, int old_size, int new_size)
+{
+	struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)de_buf;
+	struct ext4_dir_entry_2 *last;
+	void *cursor = de_buf;
+	void *limit;
+	int de_len;
+
+	if (!old_size) {
+		/* this is just created, so create an empty entry. */
+		de->inode = 0;
+		de->rec_len = ext4_rec_len_to_disk(new_size, new_size);
+		return;
+	}
+
+	/* Walk to the entry whose record reaches the old limit. */
+	limit = de_buf + old_size;
+	do {
+		last = de;
+		de_len = ext4_rec_len_from_disk(de->rec_len, old_size);
+		cursor += de_len;
+		de = (struct ext4_dir_entry_2 *)cursor;
+	} while (cursor < limit);
+
+	last->rec_len = ext4_rec_len_to_disk(de_len + new_size - old_size,
+					     new_size);
+}
+
+/*
+ * Grow the xattr part of an inline directory to the maximum the inode
+ * body allows, and extend the final directory entry over the new space.
+ * Returns -ENOSPC when the gain would not exceed EXT4_DIR_REC_LEN(1),
+ * otherwise 0 or the error from ext4_update_inline_data().
+ */
+static int ext4_update_inline_dir(handle_t *handle, struct inode *dir,
+				  struct ext4_iloc *iloc)
+{
+	struct ext4_inode_info *ei = EXT4_I(dir);
+	int old_size = ei->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE;
+	int new_size = get_max_inline_xattr_value_size(dir, iloc);
+	int ret;
+
+	if (new_size - old_size <= EXT4_DIR_REC_LEN(1))
+		return -ENOSPC;
+
+	ret = ext4_update_inline_data(handle, dir,
+				      new_size + EXT4_MIN_INLINE_DATA_SIZE);
+	if (ret)
+		return ret;
+
+	/* Use the size actually recorded after the update. */
+	ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size,
+			     ei->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE);
+	ei->i_disksize = ei->i_inline_size;
+	dir->i_size = ei->i_disksize;
+
+	return 0;
+}
+
+/*
+ * Undo helper used when converting inline data to a block fails:
+ * re-create an inline data area of @inline_size bytes, copy the saved
+ * contents in @buf back into it and set EXT4_STATE_MAY_INLINE_DATA again.
+ * NOTE(review): the return value of ext4_create_inline_data() is not
+ * checked here.
+ */
+static void ext4_restore_inline_data(handle_t *handle, struct inode *inode,
+ struct ext4_iloc *iloc,
+ void *buf, int inline_size)
+{
+ ext4_create_inline_data(handle, inode, inline_size);
+ ext4_write_inline_data(inode, iloc, buf, 0, inline_size);
+ ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+}
+
+/*
+ * Build a regular directory block in @dir_block from the saved inline
+ * directory contents in @buf (@inline_size bytes).
+ *
+ * "." and ".." entries are created first; the parent inode number is
+ * taken from the start of @buf.  The remaining inline entries are copied
+ * in after them, the last entry is stretched to the end of the block
+ * (minus the checksum tail when metadata checksums are enabled), the
+ * inode sizes are set to one block and the buffer is dirtied through the
+ * journal.  Returns 0 or the error from ext4_handle_dirty_dirent_node().
+ */
+static int ext4_finish_convert_inline_dir(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *dir_block,
+ void *buf,
+ int inline_size)
+{
+ int err, csum_size = 0, header_size = 0;
+ struct ext4_dir_entry_2 *de;
+ struct ext4_dir_entry_tail *t;
+ void *target = dir_block->b_data;
+
+ /*
+ * First create "." and ".." and then copy the dir information
+ * back to the block.
+ * NOTE(review): csum_size is still 0 at this call; it is only
+ * assigned further down.
+ */
+ de = (struct ext4_dir_entry_2 *)target;
+ de = ext4_init_dot_dotdot(inode, de,
+ inode->i_sb->s_blocksize, csum_size,
+ le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1);
+ header_size = (void *)de - target;
+
+ /* Skip the first EXT4_INLINE_DOTDOT_SIZE bytes of the inline area. */
+ memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE,
+ inline_size - EXT4_INLINE_DOTDOT_SIZE);
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
+
+ /* NOTE(review): i_size is stored twice with the same value here. */
+ inode->i_size = inode->i_sb->s_blocksize;
+ i_size_write(inode, inode->i_sb->s_blocksize);
+ EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+ ext4_update_final_de(dir_block->b_data,
+ inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size,
+ inode->i_sb->s_blocksize - csum_size);
+
+ if (csum_size) {
+ t = EXT4_DIRENT_TAIL(dir_block->b_data,
+ inode->i_sb->s_blocksize);
+ initialize_dirent_tail(t, inode->i_sb->s_blocksize);
+ }
+ set_buffer_uptodate(dir_block);
+ err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
+ if (err)
+ goto out;
+ set_buffer_verified(dir_block);
+out:
+ return err;
+}
+
+/*
+ * Convert the inline data of @inode into its first data block.  No
+ * locking is done in this function itself; the caller in this file holds
+ * xattr_sem for writing.
+ *
+ * The inline contents are saved in a temporary buffer, the inline data
+ * is destroyed, logical block 0 is allocated and filled either with the
+ * raw contents (non-directories) or with a rebuilt directory block.  If
+ * anything fails after the inline data has been destroyed, the saved
+ * contents are written back as inline data via
+ * ext4_restore_inline_data().
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ext4_convert_inline_data_nolock(handle_t *handle,
+ struct inode *inode,
+ struct ext4_iloc *iloc)
+{
+ int error;
+ void *buf = NULL;
+ struct buffer_head *data_bh = NULL;
+ struct ext4_map_blocks map;
+ int inline_size;
+
+ /* Keep a copy so the data survives destroying the inline area. */
+ inline_size = ext4_get_inline_size(inode);
+ buf = kmalloc(inline_size, GFP_NOFS);
+ if (!buf) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ error = ext4_read_inline_data(inode, buf, inline_size, iloc);
+ if (error < 0)
+ goto out;
+
+ error = ext4_destroy_inline_data_nolock(handle, inode);
+ if (error)
+ goto out;
+
+ /* From here on, failures must restore the inline data. */
+ map.m_lblk = 0;
+ map.m_len = 1;
+ map.m_flags = 0;
+ error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE);
+ if (error < 0)
+ goto out_restore;
+ if (!(map.m_flags & EXT4_MAP_MAPPED)) {
+ error = -EIO;
+ goto out_restore;
+ }
+
+ data_bh = sb_getblk(inode->i_sb, map.m_pblk);
+ if (!data_bh) {
+ error = -EIO;
+ goto out_restore;
+ }
+
+ lock_buffer(data_bh);
+ error = ext4_journal_get_create_access(handle, data_bh);
+ if (error) {
+ unlock_buffer(data_bh);
+ error = -EIO;
+ goto out_restore;
+ }
+ memset(data_bh->b_data, 0, inode->i_sb->s_blocksize);
+
+ if (!S_ISDIR(inode->i_mode)) {
+ memcpy(data_bh->b_data, buf, inline_size);
+ set_buffer_uptodate(data_bh);
+ error = ext4_handle_dirty_metadata(handle,
+ inode, data_bh);
+ } else {
+ error = ext4_finish_convert_inline_dir(handle, inode, data_bh,
+ buf, inline_size);
+ }
+
+ unlock_buffer(data_bh);
+out_restore:
+ if (error)
+ ext4_restore_inline_data(handle, inode, iloc, buf, inline_size);
+
+out:
+ brelse(data_bh);
+ kfree(buf);
+ return error;
+}
+
+/*
+ * Try to add the new entry to the inline data of the parent directory.
+ * The i_block area is tried first, then the xattr area (growing it once
+ * if it is still empty).  If both are full, the inline directory is
+ * converted to a directory block.
+ *
+ * The result of ext4_add_dirent_to_inline() is passed through when it is
+ * not -ENOSPC (it returns 1 after inserting an entry); otherwise the
+ * result of the growth or conversion step is returned.  Returns 0 when
+ * the directory has no inline data.
+ */
+int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
+{
+ int ret, inline_size;
+ void *inline_start;
+ struct ext4_iloc iloc;
+ struct inode *dir = dentry->d_parent->d_inode;
+
+ ret = ext4_get_inode_loc(dir, &iloc);
+ if (ret)
+ return ret;
+
+ down_write(&EXT4_I(dir)->xattr_sem);
+ if (!ext4_has_inline_data(dir))
+ goto out;
+
+ /* First attempt: the i_block area past the first 4 bytes. */
+ inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
+ EXT4_INLINE_DOTDOT_SIZE;
+ inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
+
+ ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
+ inline_start, inline_size);
+ if (ret != -ENOSPC)
+ goto out;
+
+ /* check whether it can be inserted to inline xattr space. */
+ inline_size = EXT4_I(dir)->i_inline_size -
+ EXT4_MIN_INLINE_DATA_SIZE;
+ if (!inline_size) {
+ /* Try to use the xattr space.*/
+ ret = ext4_update_inline_dir(handle, dir, &iloc);
+ if (ret && ret != -ENOSPC)
+ goto out;
+
+ inline_size = EXT4_I(dir)->i_inline_size -
+ EXT4_MIN_INLINE_DATA_SIZE;
+ }
+
+ if (inline_size) {
+ inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
+
+ ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
+ inline_start, inline_size);
+
+ if (ret != -ENOSPC)
+ goto out;
+ }
+
+ /*
+ * The inline space is filled up, so create a new block for it.
+ * As the extent tree will be created, we have to save the inline
+ * dir first.
+ */
+ ret = ext4_convert_inline_data_nolock(handle, dir, &iloc);
+
+out:
+ /* The directory inode is marked dirty on every exit path. */
+ ext4_mark_inode_dirty(handle, dir);
+ up_write(&EXT4_I(dir)->xattr_sem);
+ brelse(iloc.bh);
+ return ret;
+}
+
+/*
+ * readdir() for a directory whose entries are stored inline in the inode.
+ *
+ * The inline data is copied into a private buffer under xattr_sem and the
+ * entries are handed to @filldir from that copy. "." and ".." are not walked
+ * as ordinary entries: the parent inode number is read from the start of the
+ * buffer and both names are emitted by hand when f_pos is 0.
+ *
+ * *has_inline_data is cleared when the inode carries no inline data, in which
+ * case nothing is emitted.
+ *
+ * Returns a negative errno if the inode location or the inline data cannot be
+ * read or the buffer cannot be allocated; otherwise the value left in ret by
+ * the last helper that set it.
+ */
+int ext4_read_inline_dir(struct file *filp,
+			 void *dirent, filldir_t filldir,
+			 int *has_inline_data)
+{
+	int error = 0;
+	unsigned int offset, parent_ino;
+	int i, stored;
+	struct ext4_dir_entry_2 *de;
+	struct super_block *sb;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	int ret, inline_size = 0;
+	struct ext4_iloc iloc;
+	void *dir_buf = NULL;
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret)
+		return ret;
+
+	down_read(&EXT4_I(inode)->xattr_sem);
+	if (!ext4_has_inline_data(inode)) {
+		up_read(&EXT4_I(inode)->xattr_sem);
+		*has_inline_data = 0;
+		goto out;
+	}
+
+	inline_size = ext4_get_inline_size(inode);
+	dir_buf = kmalloc(inline_size, GFP_NOFS);
+	if (!dir_buf) {
+		ret = -ENOMEM;
+		up_read(&EXT4_I(inode)->xattr_sem);
+		goto out;
+	}
+
+	ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
+	up_read(&EXT4_I(inode)->xattr_sem);
+	if (ret < 0)
+		goto out;
+
+	sb = inode->i_sb;
+	stored = 0;
+	parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
+
+	/*
+	 * Resume from the position left by the previous call. Without this,
+	 * offset is read uninitialised by the revalidation scan below, and by
+	 * the entry lookup whenever f_pos is non-zero.
+	 */
+	offset = filp->f_pos;
+
+	while (!error && !stored && filp->f_pos < inode->i_size) {
+revalidate:
+		/*
+		 * If the version has changed since the last call to
+		 * readdir(2), then we might be pointing to an invalid
+		 * dirent right now. Scan from the start of the inline
+		 * dir to make sure.
+		 */
+		if (filp->f_version != inode->i_version) {
+			for (i = 0;
+			     i < inode->i_size && i < offset;) {
+				if (!i) {
+					/* skip "." and ".." if needed. */
+					i += EXT4_INLINE_DOTDOT_SIZE;
+					continue;
+				}
+				de = (struct ext4_dir_entry_2 *)
+					(dir_buf + i);
+				/* It's too expensive to do a full
+				 * dirent test each time round this
+				 * loop, but we do have to test at
+				 * least that it is non-zero. A
+				 * failure will be detected in the
+				 * dirent test below. */
+				if (ext4_rec_len_from_disk(de->rec_len,
+					inline_size) < EXT4_DIR_REC_LEN(1))
+					break;
+				i += ext4_rec_len_from_disk(de->rec_len,
+							    inline_size);
+			}
+			offset = i;
+			filp->f_pos = offset;
+			filp->f_version = inode->i_version;
+		}
+
+		while (!error && filp->f_pos < inode->i_size) {
+			if (filp->f_pos == 0) {
+				/* Emit "." and ".." by hand. */
+				error = filldir(dirent, ".", 1, 0, inode->i_ino,
+						DT_DIR);
+				if (error)
+					break;
+				stored++;
+
+				error = filldir(dirent, "..", 2, 0, parent_ino,
+						DT_DIR);
+				if (error)
+					break;
+				stored++;
+
+				filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE;
+				continue;
+			}
+
+			de = (struct ext4_dir_entry_2 *)(dir_buf + offset);
+			if (ext4_check_dir_entry(inode, filp, de,
+						 iloc.bh, dir_buf,
+						 inline_size, offset)) {
+				ret = stored;
+				goto out;
+			}
+			offset += ext4_rec_len_from_disk(de->rec_len,
+							 inline_size);
+			if (le32_to_cpu(de->inode)) {
+				/* We might block in the next section
+				 * if the data destination is
+				 * currently swapped out. So, use a
+				 * version stamp to detect whether or
+				 * not the directory has been modified
+				 * during the copy operation.
+				 */
+				u64 version = filp->f_version;
+
+				error = filldir(dirent, de->name,
+						de->name_len,
+						filp->f_pos,
+						le32_to_cpu(de->inode),
+						get_dtype(sb, de->file_type));
+				if (error)
+					break;
+				if (version != filp->f_version)
+					goto revalidate;
+				stored++;
+			}
+			filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+							      inline_size);
+		}
+		offset = 0;
+	}
+out:
+	kfree(dir_buf);
+	brelse(iloc.bh);
+	return ret;
+}
+
+/*
+ * Return the buffer head that holds @inode's on-disk inode and point
+ * *parent_de at the start of its i_block area. On failure *retval carries
+ * the error from ext4_get_inode_loc() and NULL is returned; on success
+ * *retval is 0.
+ */
+struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
+					struct ext4_dir_entry_2 **parent_de,
+					int *retval)
+{
+	struct ext4_iloc iloc;
+	int err;
+
+	err = ext4_get_inode_loc(inode, &iloc);
+	*retval = err;
+	if (err)
+		return NULL;
+
+	*parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
+	return iloc.bh;
+}
+
+/*
+ * Set up a freshly created directory @inode as an inline directory.
+ *
+ * Returns 0 on success, otherwise the error from ext4_get_inode_loc() or
+ * ext4_prepare_inline_data(). On ENOSPC the caller is expected to fall back
+ * to the normal on-disk directory layout.
+ */
+int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent,
+			       struct inode *inode)
+{
+	const int inline_size = EXT4_MIN_INLINE_DATA_SIZE;
+	struct ext4_dir_entry_2 *dotdot, *filler;
+	struct ext4_iloc iloc;
+	int ret;
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret)
+		return ret;
+
+	ret = ext4_prepare_inline_data(handle, inode, inline_size);
+	if (!ret) {
+		/*
+		 * Only the inode number of ".." is recorded; the rest of
+		 * the area is covered by a single unused dentry.
+		 */
+		dotdot = (struct ext4_dir_entry_2 *)
+				ext4_raw_inode(&iloc)->i_block;
+		dotdot->inode = cpu_to_le32(parent->i_ino);
+
+		filler = (struct ext4_dir_entry_2 *)
+				((void *)dotdot + EXT4_INLINE_DOTDOT_SIZE);
+		filler->inode = 0;
+		filler->rec_len = ext4_rec_len_to_disk(
+					inline_size - EXT4_INLINE_DOTDOT_SIZE,
+					inline_size);
+
+		set_nlink(inode, 2);
+		inode->i_size = EXT4_I(inode)->i_disksize = inline_size;
+	}
+
+	brelse(iloc.bh);
+	return ret;
+}
+
+/*
+ * Look up @d_name in the inline data of @dir.
+ *
+ * The i_block area is searched first, then the inline xattr area if there is
+ * one. When search_dir() reports a match, the inode's buffer head is returned
+ * with its reference kept; in every other case the reference is dropped and
+ * NULL is returned. *has_inline_data is cleared if @dir has no inline data.
+ */
+struct buffer_head *ext4_find_inline_entry(struct inode *dir,
+					const struct qstr *d_name,
+					struct ext4_dir_entry_2 **res_dir,
+					int *has_inline_data)
+{
+	struct ext4_iloc iloc;
+	void *area;
+	int area_len;
+	int rc;
+	int found = 0;
+
+	if (ext4_get_inode_loc(dir, &iloc))
+		return NULL;
+
+	down_read(&EXT4_I(dir)->xattr_sem);
+	if (!ext4_has_inline_data(dir)) {
+		*has_inline_data = 0;
+		goto unlock;
+	}
+
+	/* Entries stored in i_block, after the ".." slot. */
+	area = (void *)ext4_raw_inode(&iloc)->i_block +
+			EXT4_INLINE_DOTDOT_SIZE;
+	area_len = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
+	rc = search_dir(iloc.bh, area, area_len, dir, d_name, 0, res_dir);
+	if (rc == 1) {
+		found = 1;
+		goto unlock;
+	}
+	if (rc < 0)
+		goto unlock;
+
+	/* Nothing stored beyond i_block. */
+	if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE)
+		goto unlock;
+
+	/* Entries stored in the inline xattr area. */
+	area = ext4_get_inline_xattr_pos(dir, &iloc);
+	area_len = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE;
+	rc = search_dir(iloc.bh, area, area_len, dir, d_name, 0, res_dir);
+	if (rc == 1)
+		found = 1;
+
+unlock:
+	if (!found) {
+		brelse(iloc.bh);
+		iloc.bh = NULL;
+	}
+	up_read(&EXT4_I(dir)->xattr_sem);
+	return iloc.bh;
+}
+
+/*
+ * Delete the entry @de_del from the inline data of @dir.
+ *
+ * @bh is the buffer the caller found @de_del in; write access is requested
+ * on it before the entry is removed. *has_inline_data is cleared if @dir has
+ * no inline data. Returns 0 or the first error met; ext4_std_error() is
+ * invoked for every result other than -ENOENT.
+ */
+int ext4_delete_inline_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh,
+ int *has_inline_data)
+{
+ int err, inline_size;
+ struct ext4_iloc iloc;
+ void *inline_start;
+
+ err = ext4_get_inode_loc(dir, &iloc);
+ if (err)
+ return err;
+
+ down_write(&EXT4_I(dir)->xattr_sem);
+ if (!ext4_has_inline_data(dir)) {
+ *has_inline_data = 0;
+ goto out;
+ }
+
+ /*
+ * Work out which inline area holds de_del from its distance to the
+ * start of i_block: either i_block itself (minus the ".." slot) or
+ * the inline xattr area.
+ */
+ if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) <
+ EXT4_MIN_INLINE_DATA_SIZE) {
+ inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
+ EXT4_INLINE_DOTDOT_SIZE;
+ inline_size = EXT4_MIN_INLINE_DATA_SIZE -
+ EXT4_INLINE_DOTDOT_SIZE;
+ } else {
+ inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
+ inline_size = ext4_get_inline_size(dir) -
+ EXT4_MIN_INLINE_DATA_SIZE;
+ }
+
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err)
+ goto out;
+
+ err = ext4_generic_delete_entry(handle, dir, de_del, bh,
+ inline_start, inline_size, 0);
+ if (err)
+ goto out;
+
+ /* NOTE(review): the trace text names ext4_handle_dirty_metadata, but
+ * the call that follows is ext4_mark_inode_dirty. */
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_mark_inode_dirty(handle, dir);
+ if (unlikely(err))
+ goto out;
+
+ ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size);
+out:
+ up_write(&EXT4_I(dir)->xattr_sem);
+ brelse(iloc.bh);
+ if (err != -ENOENT)
+ ext4_std_error(dir->i_sb, err);
+ return err;
+}
+
+/*
+ * Return the inline dentry located @offset bytes into the inline data.
+ *
+ * Offsets below EXT4_MIN_INLINE_DATA_SIZE resolve into i_block, anything
+ * else into the inline xattr area. *inline_size receives the size of the
+ * area that was picked and, if @inline_start is non-NULL, *inline_start its
+ * first byte.
+ */
+static inline struct ext4_dir_entry_2 *
+ext4_get_inline_entry(struct inode *inode,
+		      struct ext4_iloc *iloc,
+		      unsigned int offset,
+		      void **inline_start,
+		      int *inline_size)
+{
+	void *base;
+
+	BUG_ON(offset > ext4_get_inline_size(inode));
+
+	if (offset >= EXT4_MIN_INLINE_DATA_SIZE) {
+		base = ext4_get_inline_xattr_pos(inode, iloc);
+		*inline_size = ext4_get_inline_size(inode) -
+				EXT4_MIN_INLINE_DATA_SIZE;
+		/* Make the offset relative to the xattr area. */
+		offset -= EXT4_MIN_INLINE_DATA_SIZE;
+	} else {
+		base = (void *)ext4_raw_inode(iloc)->i_block;
+		*inline_size = EXT4_MIN_INLINE_DATA_SIZE;
+	}
+
+	if (inline_start)
+		*inline_start = base;
+
+	return (struct ext4_dir_entry_2 *)(base + offset);
+}
+
+/*
+ * Check whether the inline directory @dir holds no entries besides "..".
+ *
+ * Returns 0 as soon as an entry with a non-zero inode number is found after
+ * the ".." slot, and 1 otherwise. 1 is also returned when the inode location
+ * cannot be read, when ".." is missing and when a corrupted entry is met.
+ * *has_inline_data is cleared if @dir has no inline data.
+ */
+int empty_inline_dir(struct inode *dir, int *has_inline_data)
+{
+	int err, inline_size;
+	struct ext4_iloc iloc;
+	void *inline_pos;
+	unsigned int offset;
+	struct ext4_dir_entry_2 *de;
+	int ret = 1;
+
+	err = ext4_get_inode_loc(dir, &iloc);
+	if (err) {
+		EXT4_ERROR_INODE(dir, "error %d getting inode %lu block",
+				 err, dir->i_ino);
+		return 1;
+	}
+
+	down_read(&EXT4_I(dir)->xattr_sem);
+	if (!ext4_has_inline_data(dir)) {
+		*has_inline_data = 0;
+		goto out;
+	}
+
+	/* The start of i_block holds the inode number of "..". */
+	de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
+	if (!le32_to_cpu(de->inode)) {
+		ext4_warning(dir->i_sb,
+			     "bad inline directory (dir #%lu) - no `..'",
+			     dir->i_ino);
+		ret = 1;
+		goto out;
+	}
+
+	offset = EXT4_INLINE_DOTDOT_SIZE;
+	while (offset < dir->i_size) {
+		de = ext4_get_inline_entry(dir, &iloc, offset,
+					   &inline_pos, &inline_size);
+		if (ext4_check_dir_entry(dir, NULL, de,
+					 iloc.bh, inline_pos,
+					 inline_size, offset)) {
+			/*
+			 * The adjacent literals are concatenated, so the
+			 * separator after name_len has to be spelled out.
+			 */
+			ext4_warning(dir->i_sb,
+				     "bad inline directory (dir #%lu) - "
+				     "inode %u, rec_len %u, name_len %d, "
+				     "inline size %d\n",
+				     dir->i_ino, le32_to_cpu(de->inode),
+				     le16_to_cpu(de->rec_len), de->name_len,
+				     inline_size);
+			ret = 1;
+			goto out;
+		}
+		if (le32_to_cpu(de->inode)) {
+			/* A live entry: the directory is not empty. */
+			ret = 0;
+			goto out;
+		}
+		offset += ext4_rec_len_from_disk(de->rec_len, inline_size);
+	}
+
+out:
+	up_read(&EXT4_I(dir)->xattr_sem);
+	brelse(iloc.bh);
+	return ret;
+}
+
+/*
+ * Locked wrapper: run ext4_destroy_inline_data_nolock() with the inode's
+ * xattr_sem held for writing and pass its result back.
+ */
+int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int err;
+
+	down_write(&ei->xattr_sem);
+	err = ext4_destroy_inline_data_nolock(handle, inode);
+	up_write(&ei->xattr_sem);
+
+	return err;
+}
+
+/*
+ * Report the inline data of @inode as a single fiemap extent.
+ *
+ * The extent is flagged FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST, its
+ * physical address is the byte position of i_block inside the buffer that
+ * holds the on-disk inode, and its length is i_size. *has_inline is cleared
+ * if the inode has no inline data.
+ *
+ * Returns a negative errno on failure and 0 otherwise.
+ */
+int ext4_inline_data_fiemap(struct inode *inode,
+			    struct fiemap_extent_info *fieinfo,
+			    int *has_inline)
+{
+	__u64 physical = 0;
+	__u64 length;
+	__u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST;
+	int error = 0;
+	struct ext4_iloc iloc;
+
+	down_read(&EXT4_I(inode)->xattr_sem);
+	if (!ext4_has_inline_data(inode)) {
+		*has_inline = 0;
+		goto out;
+	}
+
+	error = ext4_get_inode_loc(inode, &iloc);
+	if (error)
+		goto out;
+
+	/*
+	 * Widen before shifting so the byte address is computed in 64 bits
+	 * whatever the width of b_blocknr is.
+	 */
+	physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
+	physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
+	physical += offsetof(struct ext4_inode, i_block);
+	length = i_size_read(inode);
+
+	if (physical)
+		error = fiemap_fill_next_extent(fieinfo, 0, physical,
+						length, flags);
+	brelse(iloc.bh);
+out:
+	up_read(&EXT4_I(inode)->xattr_sem);
+	/* Positive results of fiemap_fill_next_extent() are not errors. */
+	return (error < 0 ? error : 0);
+}
+
+/*
+ * Called during xattr set. If the xattr entry found at i_inline_off, together
+ * with its value, takes up at least 'needed' bytes, convert the inode with
+ * ext4_convert_inline_data_nolock() so the data moves out of the inode;
+ * otherwise return -ENOSPC.
+ *
+ * We use jbd2 instead of page cache to move data to the 1st block
+ * so that the whole transaction can be committed as a whole and
+ * the data isn't lost because of the delayed page cache write.
+ */
+int ext4_try_to_evict_inline_data(handle_t *handle,
+				  struct inode *inode,
+				  int needed)
+{
+	int error;
+	struct ext4_xattr_entry *entry;
+	struct ext4_inode *raw_inode;
+	struct ext4_iloc iloc;
+
+	error = ext4_get_inode_loc(inode, &iloc);
+	if (error)
+		return error;
+
+	raw_inode = ext4_raw_inode(&iloc);
+	entry = (struct ext4_xattr_entry *)((void *)raw_inode +
+					    EXT4_I(inode)->i_inline_off);
+	/* Evicting the data would not free enough room: give up. */
+	if (EXT4_XATTR_LEN(entry->e_name_len) +
+	    EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)) < needed) {
+		error = -ENOSPC;
+		goto out;
+	}
+
+	error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
+out:
+	brelse(iloc.bh);
+	return error;
+}
+
+/*
+ * Truncate the inline data of @inode down to inode->i_size.
+ *
+ * Runs in its own journal handle with the inode on the orphan list. The part
+ * of the data kept in the inline xattr is shortened through
+ * ext4_xattr_ibody_inline_set(), the unused tail of i_block is zeroed and
+ * i_inline_size is updated. *has_inline is cleared if the inode has no
+ * inline data.
+ *
+ * The function is void: a failure of any step simply skips the remaining
+ * shrink work and is not reported to the caller.
+ */
+void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
+{
+	handle_t *handle;
+	int inline_size, value_len, needed_blocks;
+	size_t i_size;
+	void *value = NULL;
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+	};
+	struct ext4_xattr_info i = {
+		.name_index = EXT4_XATTR_INDEX_SYSTEM,
+		.name = EXT4_XATTR_SYSTEM_DATA,
+	};
+
+	needed_blocks = ext4_writepage_trans_blocks(inode);
+	handle = ext4_journal_start(inode, needed_blocks);
+	if (IS_ERR(handle))
+		return;
+
+	down_write(&EXT4_I(inode)->xattr_sem);
+	if (!ext4_has_inline_data(inode)) {
+		/* Do not leave with xattr_sem still held for writing. */
+		up_write(&EXT4_I(inode)->xattr_sem);
+		*has_inline = 0;
+		ext4_journal_stop(handle);
+		return;
+	}
+
+	if (ext4_orphan_add(handle, inode))
+		goto out;
+
+	if (ext4_get_inode_loc(inode, &is.iloc))
+		goto out;
+
+	down_write(&EXT4_I(inode)->i_data_sem);
+	i_size = inode->i_size;
+	inline_size = ext4_get_inline_size(inode);
+	EXT4_I(inode)->i_disksize = i_size;
+
+	if (i_size < inline_size) {
+		/* Clear the content in the xattr space. */
+		if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) {
+			if (ext4_xattr_ibody_find(inode, &i, &is))
+				goto out_error;
+
+			BUG_ON(is.s.not_found);
+
+			value_len = le32_to_cpu(is.s.here->e_value_size);
+			value = kmalloc(value_len, GFP_NOFS);
+			if (!value)
+				goto out_error;
+
+			if (ext4_xattr_ibody_get(inode, i.name_index, i.name,
+						 value, value_len))
+				goto out_error;
+
+			/* Store the old value back, cut to the new length. */
+			i.value = value;
+			i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ?
+					i_size - EXT4_MIN_INLINE_DATA_SIZE : 0;
+			if (ext4_xattr_ibody_inline_set(handle, inode, &i, &is))
+				goto out_error;
+		}
+
+		/* Clear the content within i_blocks. */
+		if (i_size < EXT4_MIN_INLINE_DATA_SIZE) {
+			/*
+			 * i_size is a byte count. The other users of i_block
+			 * in this file cast it to void * before adding a byte
+			 * offset; do the same so the offset is not scaled by
+			 * the element size of i_block.
+			 */
+			void *p = (void *)ext4_raw_inode(&is.iloc)->i_block;
+
+			memset(p + i_size, 0,
+			       EXT4_MIN_INLINE_DATA_SIZE - i_size);
+		}
+
+		EXT4_I(inode)->i_inline_size = i_size <
+					EXT4_MIN_INLINE_DATA_SIZE ?
+					EXT4_MIN_INLINE_DATA_SIZE : i_size;
+	}
+
+out_error:
+	up_write(&EXT4_I(inode)->i_data_sem);
+out:
+	/* is.iloc.bh is still NULL here if ext4_orphan_add() failed. */
+	brelse(is.iloc.bh);
+	up_write(&EXT4_I(inode)->xattr_sem);
+	kfree(value);
+	if (inode->i_nlink)
+		ext4_orphan_del(handle, inode);
+
+	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+	ext4_mark_inode_dirty(handle, inode);
+	if (IS_SYNC(inode))
+		ext4_handle_sync(handle);
+
+	ext4_journal_stop(handle);
+	return;
+}
+
+/*
+ * Convert @inode away from inline data in a journal handle of its own.
+ *
+ * If the inode has no inline data on entry, only the
+ * EXT4_STATE_MAY_INLINE_DATA state is cleared. The check is repeated once
+ * xattr_sem is held for writing, because the answer may have changed in the
+ * meantime. Returns 0 or a negative errno.
+ */
+int ext4_convert_inline_data(struct inode *inode)
+{
+	struct ext4_iloc iloc;
+	handle_t *handle;
+	int credits;
+	int err;
+
+	if (!ext4_has_inline_data(inode)) {
+		ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+		return 0;
+	}
+
+	credits = ext4_writepage_trans_blocks(inode);
+
+	iloc.bh = NULL;
+	err = ext4_get_inode_loc(inode, &iloc);
+	if (err)
+		return err;
+
+	handle = ext4_journal_start(inode, credits);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto put_bh;
+	}
+
+	down_write(&EXT4_I(inode)->xattr_sem);
+	if (ext4_has_inline_data(inode))
+		err = ext4_convert_inline_data_nolock(handle, inode, &iloc);
+	up_write(&EXT4_I(inode)->xattr_sem);
+
+	ext4_journal_stop(handle);
+put_bh:
+	brelse(iloc.bh);
+	return err;
+}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 02bc8cbe728..cbfe13bf5b2 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -233,6 +233,11 @@ void ext4_evict_inode(struct inode *inode)
if (is_bad_inode(inode))
goto no_delete;
+ /*
+ * Protect us against freezing - iput() caller didn't have to have any
+ * protection against it
+ */
+ sb_start_intwrite(inode->i_sb);
handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);
if (IS_ERR(handle)) {
ext4_std_error(inode->i_sb, PTR_ERR(handle));
@@ -242,6 +247,7 @@ void ext4_evict_inode(struct inode *inode)
* cleaned up.
*/
ext4_orphan_del(NULL, inode);
+ sb_end_intwrite(inode->i_sb);
goto no_delete;
}
@@ -273,6 +279,7 @@ void ext4_evict_inode(struct inode *inode)
stop_handle:
ext4_journal_stop(handle);
ext4_orphan_del(NULL, inode);
+ sb_end_intwrite(inode->i_sb);
goto no_delete;
}
}
@@ -301,6 +308,7 @@ void ext4_evict_inode(struct inode *inode)
else
ext4_free_inode(handle, inode);
ext4_journal_stop(handle);
+ sb_end_intwrite(inode->i_sb);
return;
no_delete:
ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
@@ -346,6 +354,15 @@ void ext4_da_update_reserve_space(struct inode *inode,
used = ei->i_reserved_data_blocks;
}
+ if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) {
+ ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, allocated %d "
+ "with only %d reserved metadata blocks\n", __func__,
+ inode->i_ino, ei->i_allocated_meta_blocks,
+ ei->i_reserved_meta_blocks);
+ WARN_ON(1);
+ ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks;
+ }
+
/* Update per-inode reservations */
ei->i_reserved_data_blocks -= used;
ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
@@ -467,49 +484,6 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
}
/*
- * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map.
- */
-static void set_buffers_da_mapped(struct inode *inode,
- struct ext4_map_blocks *map)
-{
- struct address_space *mapping = inode->i_mapping;
- struct pagevec pvec;
- int i, nr_pages;
- pgoff_t index, end;
-
- index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
- end = (map->m_lblk + map->m_len - 1) >>
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
- pagevec_init(&pvec, 0);
- while (index <= end) {
- nr_pages = pagevec_lookup(&pvec, mapping, index,
- min(end - index + 1,
- (pgoff_t)PAGEVEC_SIZE));
- if (nr_pages == 0)
- break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- struct buffer_head *bh, *head;
-
- if (unlikely(page->mapping != mapping) ||
- !PageDirty(page))
- break;
-
- if (page_has_buffers(page)) {
- bh = head = page_buffers(page);
- do {
- set_buffer_da_mapped(bh);
- bh = bh->b_this_page;
- } while (bh != head);
- }
- index++;
- }
- pagevec_release(&pvec);
- }
-}
-
-/*
* The ext4_map_blocks() function tries to look up the requested blocks,
* and returns if the blocks are already mapped.
*
@@ -544,7 +518,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* Try to see if we can get the block without requesting a new
* file system block.
*/
- down_read((&EXT4_I(inode)->i_data_sem));
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ down_read((&EXT4_I(inode)->i_data_sem));
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
retval = ext4_ext_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -552,10 +527,20 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
retval = ext4_ind_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
}
- up_read((&EXT4_I(inode)->i_data_sem));
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ up_read((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
- int ret = check_block_validity(inode, map);
+ int ret;
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+ /* delayed alloc may be allocated by fallocate and
+ * converted to initialized by directIO.
+ * we need to handle delayed extent here.
+ */
+ down_write((&EXT4_I(inode)->i_data_sem));
+ goto delayed_mapped;
+ }
+ ret = check_block_validity(inode, map);
if (ret != 0)
return ret;
}
@@ -633,12 +618,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
- /* If we have successfully mapped the delayed allocated blocks,
- * set the BH_Da_Mapped bit on them. Its important to do this
- * under the protection of i_data_sem.
- */
- if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
- set_buffers_da_mapped(inode, map);
+ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+ int ret;
+delayed_mapped:
+ /* delayed allocation blocks have been allocated */
+ ret = ext4_es_remove_extent(inode, map->m_lblk,
+ map->m_len);
+ if (ret < 0)
+ retval = ret;
+ }
}
up_write((&EXT4_I(inode)->i_data_sem));
@@ -661,10 +649,13 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
int ret = 0, started = 0;
int dio_credits;
+ if (ext4_has_inline_data(inode))
+ return -ERANGE;
+
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
- if (flags && !handle) {
+ if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) {
/* Direct IO write... */
if (map.m_len > DIO_MAX_BLOCKS)
map.m_len = DIO_MAX_BLOCKS;
@@ -713,11 +704,13 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
err = ext4_map_blocks(handle, inode, &map,
create ? EXT4_GET_BLOCKS_CREATE : 0);
+ /* ensure we send some value back into *errp */
+ *errp = 0;
+
if (err < 0)
*errp = err;
if (err <= 0)
return NULL;
- *errp = 0;
bh = sb_getblk(inode->i_sb, map.m_pblk);
if (!bh) {
@@ -777,13 +770,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
return NULL;
}
-static int walk_page_buffers(handle_t *handle,
- struct buffer_head *head,
- unsigned from,
- unsigned to,
- int *partial,
- int (*fn)(handle_t *handle,
- struct buffer_head *bh))
+int ext4_walk_page_buffers(handle_t *handle,
+ struct buffer_head *head,
+ unsigned from,
+ unsigned to,
+ int *partial,
+ int (*fn)(handle_t *handle,
+ struct buffer_head *bh))
{
struct buffer_head *bh;
unsigned block_start, block_end;
@@ -833,8 +826,8 @@ static int walk_page_buffers(handle_t *handle,
* is elevated. We'll still have enough credits for the tiny quotafile
* write.
*/
-static int do_journal_get_write_access(handle_t *handle,
- struct buffer_head *bh)
+int do_journal_get_write_access(handle_t *handle,
+ struct buffer_head *bh)
{
int dirty = buffer_dirty(bh);
int ret;
@@ -857,7 +850,7 @@ static int do_journal_get_write_access(handle_t *handle,
return ret;
}
-static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
static int ext4_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
@@ -881,6 +874,17 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+ ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
+ flags, pagep);
+ if (ret < 0)
+ goto out;
+ if (ret == 1) {
+ ret = 0;
+ goto out;
+ }
+ }
+
retry:
handle = ext4_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) {
@@ -898,6 +902,7 @@ retry:
ret = -ENOMEM;
goto out;
}
+
*pagep = page;
if (ext4_should_dioread_nolock(inode))
@@ -906,8 +911,9 @@ retry:
ret = __block_write_begin(page, pos, len, ext4_get_block);
if (!ret && ext4_should_journal_data(inode)) {
- ret = walk_page_buffers(handle, page_buffers(page),
- from, to, NULL, do_journal_get_write_access);
+ ret = ext4_walk_page_buffers(handle, page_buffers(page),
+ from, to, NULL,
+ do_journal_get_write_access);
}
if (ret) {
@@ -962,7 +968,12 @@ static int ext4_generic_write_end(struct file *file,
struct inode *inode = mapping->host;
handle_t *handle = ext4_journal_current_handle();
- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+ if (ext4_has_inline_data(inode))
+ copied = ext4_write_inline_data_end(inode, pos, len,
+ copied, page);
+ else
+ copied = block_write_end(file, mapping, pos,
+ len, copied, page, fsdata);
/*
* No need to use i_size_read() here, the i_size
@@ -1113,16 +1124,21 @@ static int ext4_journalled_write_end(struct file *file,
BUG_ON(!ext4_handle_valid(handle));
- if (copied < len) {
- if (!PageUptodate(page))
- copied = 0;
- page_zero_new_buffers(page, from+copied, to);
- }
+ if (ext4_has_inline_data(inode))
+ copied = ext4_write_inline_data_end(inode, pos, len,
+ copied, page);
+ else {
+ if (copied < len) {
+ if (!PageUptodate(page))
+ copied = 0;
+ page_zero_new_buffers(page, from+copied, to);
+ }
- ret = walk_page_buffers(handle, page_buffers(page), from,
- to, &partial, write_end_fn);
- if (!partial)
- SetPageUptodate(page);
+ ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
+ to, &partial, write_end_fn);
+ if (!partial)
+ SetPageUptodate(page);
+ }
new_i_size = pos + copied;
if (new_i_size > inode->i_size)
i_size_write(inode, pos+copied);
@@ -1171,6 +1187,17 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned int md_needed;
int ret;
+ ext4_lblk_t save_last_lblock;
+ int save_len;
+
+ /*
+ * We will charge metadata quota at writeout time; this saves
+ * us from metadata over-estimation, though we may go over by
+ * a small amount in the end. Here we just reserve for data.
+ */
+ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
+ if (ret)
+ return ret;
/*
* recalculate the amount of metadata blocks to reserve
@@ -1179,32 +1206,31 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
*/
repeat:
spin_lock(&ei->i_block_reservation_lock);
+ /*
+ * ext4_calc_metadata_amount() has side effects, which we have
+ * to be prepared to undo if we fail to claim space.
+ */
+ save_len = ei->i_da_metadata_calc_len;
+ save_last_lblock = ei->i_da_metadata_calc_last_lblock;
md_needed = EXT4_NUM_B2C(sbi,
ext4_calc_metadata_amount(inode, lblock));
trace_ext4_da_reserve_space(inode, md_needed);
- spin_unlock(&ei->i_block_reservation_lock);
/*
- * We will charge metadata quota at writeout time; this saves
- * us from metadata over-estimation, though we may go over by
- * a small amount in the end. Here we just reserve for data.
- */
- ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
- if (ret)
- return ret;
- /*
* We do still charge estimated metadata to the sb though;
* we cannot afford to run out of free blocks.
*/
if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) {
- dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
+ ei->i_da_metadata_calc_len = save_len;
+ ei->i_da_metadata_calc_last_lblock = save_last_lblock;
+ spin_unlock(&ei->i_block_reservation_lock);
if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
yield();
goto repeat;
}
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
return -ENOSPC;
}
- spin_lock(&ei->i_block_reservation_lock);
ei->i_reserved_data_blocks++;
ei->i_reserved_meta_blocks += md_needed;
spin_unlock(&ei->i_block_reservation_lock);
@@ -1270,6 +1296,7 @@ static void ext4_da_page_release_reservation(struct page *page,
struct inode *inode = page->mapping->host;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int num_clusters;
+ ext4_fsblk_t lblk;
head = page_buffers(page);
bh = head;
@@ -1279,20 +1306,23 @@ static void ext4_da_page_release_reservation(struct page *page,
if ((offset <= curr_off) && (buffer_delay(bh))) {
to_release++;
clear_buffer_delay(bh);
- clear_buffer_da_mapped(bh);
}
curr_off = next_off;
} while ((bh = bh->b_this_page) != head);
+ if (to_release) {
+ lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ ext4_es_remove_extent(inode, lblk, to_release);
+ }
+
/* If we have released all the blocks belonging to a cluster, then we
* need to release the reserved space for that cluster. */
num_clusters = EXT4_NUM_B2C(sbi, to_release);
while (num_clusters > 0) {
- ext4_fsblk_t lblk;
lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
((num_clusters - 1) << sbi->s_cluster_bits);
if (sbi->s_cluster_ratio == 1 ||
- !ext4_find_delalloc_cluster(inode, lblk, 1))
+ !ext4_find_delalloc_cluster(inode, lblk))
ext4_da_release_space(inode, 1);
num_clusters--;
@@ -1398,8 +1428,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
clear_buffer_delay(bh);
bh->b_blocknr = pblock;
}
- if (buffer_da_mapped(bh))
- clear_buffer_da_mapped(bh);
if (buffer_unwritten(bh) ||
buffer_mapped(bh))
BUG_ON(bh->b_blocknr != pblock);
@@ -1469,9 +1497,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
struct pagevec pvec;
struct inode *inode = mpd->inode;
struct address_space *mapping = inode->i_mapping;
+ ext4_lblk_t start, last;
index = mpd->first_page;
end = mpd->next_page - 1;
+
+ start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ ext4_es_remove_extent(inode, start, last - start + 1);
+
+ pagevec_init(&pvec, 0);
while (index <= end) {
nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
if (nr_pages == 0)
@@ -1625,15 +1660,6 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
for (i = 0; i < map.m_len; i++)
unmap_underlying_metadata(bdev, map.m_pblk + i);
-
- if (ext4_should_order_data(mpd->inode)) {
- err = ext4_jbd2_file_inode(handle, mpd->inode);
- if (err) {
- /* Only if the journal is aborted */
- mpd->retval = err;
- goto submit_io;
- }
- }
}
/*
@@ -1764,7 +1790,19 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
* file system block.
*/
down_read((&EXT4_I(inode)->i_data_sem));
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ if (ext4_has_inline_data(inode)) {
+ /*
+ * We will soon create blocks for this page, and let
+ * us pretend as if the blocks aren't allocated yet.
+ * In case of clusters, we have to handle the work
+ * of mapping from cluster so that the reserved space
+ * is calculated properly.
+ */
+ if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) &&
+ ext4_find_delalloc_cluster(inode, map->m_lblk))
+ map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+ retval = 0;
+ } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
retval = ext4_ext_map_blocks(NULL, inode, map, 0);
else
retval = ext4_ind_map_blocks(NULL, inode, map, 0);
@@ -1783,6 +1821,10 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
goto out_unlock;
}
+ retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len);
+ if (retval)
+ goto out_unlock;
+
/* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
* and it should not appear on the bh->b_state.
*/
@@ -1811,8 +1853,8 @@ out_unlock:
* We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
* initialized properly.
*/
-static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create)
+int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create)
{
struct ext4_map_blocks map;
int ret = 0;
@@ -1886,15 +1928,29 @@ static int __ext4_journalled_writepage(struct page *page,
{
struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
- struct buffer_head *page_bufs;
+ struct buffer_head *page_bufs = NULL;
handle_t *handle = NULL;
- int ret = 0;
- int err;
+ int ret = 0, err = 0;
+ int inline_data = ext4_has_inline_data(inode);
+ struct buffer_head *inode_bh = NULL;
ClearPageChecked(page);
- page_bufs = page_buffers(page);
- BUG_ON(!page_bufs);
- walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
+
+ if (inline_data) {
+ BUG_ON(page->index != 0);
+ BUG_ON(len > ext4_get_max_inline_size(inode));
+ inode_bh = ext4_journalled_write_inline_data(inode, len, page);
+ if (inode_bh == NULL)
+ goto out;
+ } else {
+ page_bufs = page_buffers(page);
+ if (!page_bufs) {
+ BUG();
+ goto out;
+ }
+ ext4_walk_page_buffers(handle, page_bufs, 0, len,
+ NULL, bget_one);
+ }
/* As soon as we unlock the page, it can go away, but we have
* references to buffers so we are safe */
unlock_page(page);
@@ -1907,11 +1963,18 @@ static int __ext4_journalled_writepage(struct page *page,
BUG_ON(!ext4_handle_valid(handle));
- ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
- do_journal_get_write_access);
+ if (inline_data) {
+ ret = ext4_journal_get_write_access(handle, inode_bh);
+
+ err = ext4_handle_dirty_metadata(handle, inode, inode_bh);
- err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
- write_end_fn);
+ } else {
+ ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
+ do_journal_get_write_access);
+
+ err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
+ write_end_fn);
+ }
if (ret == 0)
ret = err;
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
@@ -1919,15 +1982,15 @@ static int __ext4_journalled_writepage(struct page *page,
if (!ret)
ret = err;
- walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
+ if (!ext4_has_inline_data(inode))
+ ext4_walk_page_buffers(handle, page_bufs, 0, len,
+ NULL, bput_one);
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
out:
+ brelse(inode_bh);
return ret;
}
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
-
/*
* Note that we don't need to start a transaction unless we're journaling data
* because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -1941,7 +2004,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
* This function can get called via...
* - ext4_da_writepages after taking page lock (have journal handle)
* - journal_submit_inode_data_buffers (no journal handle)
- * - shrink_page_list via pdflush (no journal handle)
+ * - shrink_page_list via kswapd/direct reclaim (no journal handle)
* - grab_page_cache when doing write_begin (have journal handle)
*
* We don't do any block allocation in this function. If we have page with
@@ -2001,8 +2064,8 @@ static int ext4_writepage(struct page *page,
commit_write = 1;
}
page_bufs = page_buffers(page);
- if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
- ext4_bh_delay_or_unwritten)) {
+ if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_delay_or_unwritten)) {
/*
* We don't want to do block allocation, so redirty
* the page and return. We may reach here when we do
@@ -2068,7 +2131,8 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
* mpage_da_map_and_submit to map a single contiguous memory region
* and then write them.
*/
-static int write_cache_pages_da(struct address_space *mapping,
+static int write_cache_pages_da(handle_t *handle,
+ struct address_space *mapping,
struct writeback_control *wbc,
struct mpage_da_data *mpd,
pgoff_t *done_index)
@@ -2147,6 +2211,17 @@ static int write_cache_pages_da(struct address_space *mapping,
wait_on_page_writeback(page);
BUG_ON(PageWriteback(page));
+ /*
+ * If we have inline data and arrive here, it means that
+ * we will soon create the block for the 1st page, so
+ * we'd better clear the inline data here.
+ */
+ if (ext4_has_inline_data(inode)) {
+ BUG_ON(ext4_test_inode_state(inode,
+ EXT4_STATE_MAY_INLINE_DATA));
+ ext4_destroy_inline_data(handle, inode);
+ }
+
if (mpd->next_page != page->index)
mpd->first_page = page->index;
mpd->next_page = page->index + 1;
@@ -2353,7 +2428,8 @@ retry:
* contiguous region of logical blocks that need
* blocks to be allocated by ext4 and submit them.
*/
- ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
+ ret = write_cache_pages_da(handle, mapping,
+ wbc, &mpd, &done_index);
/*
* If we have a contiguous extent of pages and we
* haven't done the I/O yet, map the blocks and submit
@@ -2417,7 +2493,6 @@ out_writepages:
return ret;
}
-#define FALL_BACK_TO_NONDELALLOC 1
static int ext4_nonda_switch(struct super_block *sb)
{
s64 free_blocks, dirty_blocks;
@@ -2434,6 +2509,16 @@ static int ext4_nonda_switch(struct super_block *sb)
free_blocks = EXT4_C2B(sbi,
percpu_counter_read_positive(&sbi->s_freeclusters_counter));
dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
+ /*
+ * Start pushing delalloc when 1/2 of free blocks are dirty.
+ */
+ if (dirty_blocks && (free_blocks < 2 * dirty_blocks) &&
+ !writeback_in_progress(sb->s_bdi) &&
+ down_read_trylock(&sb->s_umount)) {
+ writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
+ up_read(&sb->s_umount);
+ }
+
if (2 * free_blocks < 3 * dirty_blocks ||
free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
/*
@@ -2442,13 +2527,6 @@ static int ext4_nonda_switch(struct super_block *sb)
*/
return 1;
}
- /*
- * Even if we don't switch but are nearing capacity,
- * start pushing delalloc when 1/2 of free blocks are dirty.
- */
- if (free_blocks < 2 * dirty_blocks)
- writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE);
-
return 0;
}
@@ -2471,6 +2549,19 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
}
*fsdata = (void *)0;
trace_ext4_da_write_begin(inode, pos, len, flags);
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+ ret = ext4_da_write_inline_data_begin(mapping, inode,
+ pos, len, flags,
+ pagep, fsdata);
+ if (ret < 0)
+ goto out;
+ if (ret == 1) {
+ ret = 0;
+ goto out;
+ }
+ }
+
retry:
/*
* With delayed allocation, we don't log the i_disksize update
@@ -2572,22 +2663,13 @@ static int ext4_da_write_end(struct file *file,
* changes. So let's piggyback the i_disksize mark_inode_dirty
* into that.
*/
-
new_i_size = pos + copied;
if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
- if (ext4_da_should_update_i_disksize(page, end)) {
+ if (ext4_has_inline_data(inode) ||
+ ext4_da_should_update_i_disksize(page, end)) {
down_write(&EXT4_I(inode)->i_data_sem);
- if (new_i_size > EXT4_I(inode)->i_disksize) {
- /*
- * Updating i_disksize when extending file
- * without needing block allocation
- */
- if (ext4_should_order_data(inode))
- ret = ext4_jbd2_file_inode(handle,
- inode);
-
+ if (new_i_size > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = new_i_size;
- }
up_write(&EXT4_I(inode)->i_data_sem);
/* We need to mark inode dirty even if
* new_i_size is less that inode->i_size
@@ -2596,8 +2678,16 @@ static int ext4_da_write_end(struct file *file,
ext4_mark_inode_dirty(handle, inode);
}
}
- ret2 = generic_write_end(file, mapping, pos, len, copied,
+
+ if (write_mode != CONVERT_INLINE_DATA &&
+ ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
+ ext4_has_inline_data(inode))
+ ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
+ page);
+ else
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
+
copied = ret2;
if (ret2 < 0)
ret = ret2;
@@ -2690,6 +2780,12 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
journal_t *journal;
int err;
+ /*
+ * We can get here for an inline file via the FIBMAP ioctl
+ */
+ if (ext4_has_inline_data(inode))
+ return 0;
+
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
test_opt(inode->i_sb, DELALLOC)) {
/*
@@ -2735,14 +2831,30 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
static int ext4_readpage(struct file *file, struct page *page)
{
+ int ret = -EAGAIN;
+ struct inode *inode = page->mapping->host;
+
trace_ext4_readpage(page);
- return mpage_readpage(page, ext4_get_block);
+
+ if (ext4_has_inline_data(inode))
+ ret = ext4_readpage_inline(inode, page);
+
+ if (ret == -EAGAIN)
+ return mpage_readpage(page, ext4_get_block);
+
+ return ret;
}
static int
ext4_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
+ struct inode *inode = mapping->host;
+
+ /* If the file has inline data, no need to do readpages. */
+ if (ext4_has_inline_data(inode))
+ return 0;
+
return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
}
@@ -2768,8 +2880,6 @@ static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offs
static void ext4_invalidatepage(struct page *page, unsigned long offset)
{
- journal_t *journal = EXT4_JOURNAL(page->mapping->host);
-
trace_ext4_invalidatepage(page, offset);
/*
@@ -2777,16 +2887,34 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset)
*/
if (ext4_should_dioread_nolock(page->mapping->host))
ext4_invalidatepage_free_endio(page, offset);
+
+ /* No journalling happens on data buffers when this function is used */
+ WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
+
+ block_invalidatepage(page, offset);
+}
+
+static int __ext4_journalled_invalidatepage(struct page *page,
+ unsigned long offset)
+{
+ journal_t *journal = EXT4_JOURNAL(page->mapping->host);
+
+ trace_ext4_journalled_invalidatepage(page, offset);
+
/*
* If it's a full truncate we just forget about the pending dirtying
*/
if (offset == 0)
ClearPageChecked(page);
- if (journal)
- jbd2_journal_invalidatepage(journal, page, offset);
- else
- block_invalidatepage(page, offset);
+ return jbd2_journal_invalidatepage(journal, page, offset);
+}
+
+/* Wrapper for aops... */
+static void ext4_journalled_invalidatepage(struct page *page,
+ unsigned long offset)
+{
+ WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0);
}
static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -2809,7 +2937,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
* We allocate an uinitialized extent if blocks haven't been allocated.
* The extent will be converted to initialized after the IO is complete.
*/
-static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+int ext4_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
@@ -2818,15 +2946,21 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
EXT4_GET_BLOCKS_IO_CREATE_EXT);
}
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ return _ext4_get_block(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_NO_LOCK);
+}
+
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ssize_t size, void *private, int ret,
bool is_async)
{
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
ext4_io_end_t *io_end = iocb->private;
- struct workqueue_struct *wq;
- unsigned long flags;
- struct ext4_inode_info *ei;
/* if not async direct IO or dio with 0 bytes write, just return */
if (!io_end || !size)
@@ -2855,24 +2989,14 @@ out:
io_end->iocb = iocb;
io_end->result = ret;
}
- wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
-
- /* Add the io_end to per-inode completed aio dio list*/
- ei = EXT4_I(io_end->inode);
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &ei->i_completed_io_list);
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- /* queue the work to convert unwritten extents to written */
- queue_work(wq, &io_end->work);
+ ext4_add_complete_io(io_end);
}
static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
{
ext4_io_end_t *io_end = bh->b_private;
- struct workqueue_struct *wq;
struct inode *inode;
- unsigned long flags;
if (!test_clear_buffer_uninit(bh) || !io_end)
goto out;
@@ -2891,15 +3015,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
*/
inode = io_end->inode;
ext4_set_io_unwritten_flag(inode, io_end);
-
- /* Add the io_end to per-inode completed io list*/
- spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
- spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-
- wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
- /* queue the work to convert unwritten extents to written */
- queue_work(wq, &io_end->work);
+ ext4_add_complete_io(io_end);
out:
bh->b_private = NULL;
bh->b_end_io = NULL;
@@ -2942,10 +3058,10 @@ retry:
* fall back to buffered IO.
*
* For holes, we fallocate those blocks, mark them as uninitialized
- * If those blocks were preallocated, we mark sure they are splited, but
+ * If those blocks were preallocated, we make sure they are split, but
* still keep the range to write as uninitialized.
*
- * The unwrritten extents will be converted to written when DIO is completed.
+ * The unwritten extents will be converted to written when DIO is completed.
* For async direct IO, since the IO may still pending when return, we
* set up an end_io call back function, which will do the conversion
* when async direct IO completed.
@@ -2963,92 +3079,120 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
struct inode *inode = file->f_mapping->host;
ssize_t ret;
size_t count = iov_length(iov, nr_segs);
-
+ int overwrite = 0;
+ get_block_t *get_block_func = NULL;
+ int dio_flags = 0;
loff_t final_size = offset + count;
- if (rw == WRITE && final_size <= inode->i_size) {
- /*
- * We could direct write to holes and fallocate.
- *
- * Allocated blocks to fill the hole are marked as uninitialized
- * to prevent parallel buffered read to expose the stale data
- * before DIO complete the data IO.
- *
- * As to previously fallocated extents, ext4 get_block
- * will just simply mark the buffer mapped but still
- * keep the extents uninitialized.
- *
- * for non AIO case, we will convert those unwritten extents
- * to written after return back from blockdev_direct_IO.
- *
- * for async DIO, the conversion needs to be defered when
- * the IO is completed. The ext4 end_io callback function
- * will be called to take care of the conversion work.
- * Here for async case, we allocate an io_end structure to
- * hook to the iocb.
- */
- iocb->private = NULL;
- EXT4_I(inode)->cur_aio_dio = NULL;
- if (!is_sync_kiocb(iocb)) {
- ext4_io_end_t *io_end =
- ext4_init_io_end(inode, GFP_NOFS);
- if (!io_end)
- return -ENOMEM;
- io_end->flag |= EXT4_IO_END_DIRECT;
- iocb->private = io_end;
- /*
- * we save the io structure for current async
- * direct IO, so that later ext4_map_blocks()
- * could flag the io structure whether there
- * is a unwritten extents needs to be converted
- * when IO is completed.
- */
- EXT4_I(inode)->cur_aio_dio = iocb->private;
- }
- ret = __blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iov,
- offset, nr_segs,
- ext4_get_block_write,
- ext4_end_io_dio,
- NULL,
- DIO_LOCKING);
- if (iocb->private)
- EXT4_I(inode)->cur_aio_dio = NULL;
+ /* Use the old path for reads and writes beyond i_size. */
+ if (rw != WRITE || final_size > inode->i_size)
+ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+
+ BUG_ON(iocb->private == NULL);
+
+ /* If we do an overwrite dio, i_mutex locking can be released */
+ overwrite = *((int *)iocb->private);
+
+ if (overwrite) {
+ atomic_inc(&inode->i_dio_count);
+ down_read(&EXT4_I(inode)->i_data_sem);
+ mutex_unlock(&inode->i_mutex);
+ }
+
+ /*
+ * We could direct write to holes and fallocate.
+ *
+ * Allocated blocks to fill the hole are marked as
+ * uninitialized to prevent parallel buffered read to expose
+ * the stale data before DIO complete the data IO.
+ *
+ * As to previously fallocated extents, ext4 get_block will
+ * just simply mark the buffer mapped but still keep the
+ * extents uninitialized.
+ *
+ * For non AIO case, we will convert those unwritten extents
+ * to written after return back from blockdev_direct_IO.
+ *
+ * For async DIO, the conversion needs to be deferred when the
+ * IO is completed. The ext4 end_io callback function will be
+ * called to take care of the conversion work. Here for async
+ * case, we allocate an io_end structure to hook to the iocb.
+ */
+ iocb->private = NULL;
+ ext4_inode_aio_set(inode, NULL);
+ if (!is_sync_kiocb(iocb)) {
+ ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS);
+ if (!io_end) {
+ ret = -ENOMEM;
+ goto retake_lock;
+ }
+ io_end->flag |= EXT4_IO_END_DIRECT;
+ iocb->private = io_end;
/*
- * The io_end structure takes a reference to the inode,
- * that structure needs to be destroyed and the
- * reference to the inode need to be dropped, when IO is
- * complete, even with 0 byte write, or failed.
- *
- * In the successful AIO DIO case, the io_end structure will be
- * desctroyed and the reference to the inode will be dropped
- * after the end_io call back function is called.
- *
- * In the case there is 0 byte write, or error case, since
- * VFS direct IO won't invoke the end_io call back function,
- * we need to free the end_io structure here.
+ * we save the io structure for current async direct
+ * IO, so that later ext4_map_blocks() could flag the
+ * io structure if there are unwritten extents that
+ * need to be converted when IO is completed.
*/
- if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
- ext4_free_io_end(iocb->private);
- iocb->private = NULL;
- } else if (ret > 0 && ext4_test_inode_state(inode,
+ ext4_inode_aio_set(inode, io_end);
+ }
+
+ if (overwrite) {
+ get_block_func = ext4_get_block_write_nolock;
+ } else {
+ get_block_func = ext4_get_block_write;
+ dio_flags = DIO_LOCKING;
+ }
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iov,
+ offset, nr_segs,
+ get_block_func,
+ ext4_end_io_dio,
+ NULL,
+ dio_flags);
+
+ if (iocb->private)
+ ext4_inode_aio_set(inode, NULL);
+ /*
+ * The io_end structure takes a reference to the inode, that
+ * structure needs to be destroyed and the reference to the
+ * inode need to be dropped, when IO is complete, even with 0
+ * byte write, or failed.
+ *
+ * In the successful AIO DIO case, the io_end structure will
+ * be destroyed and the reference to the inode will be dropped
+ * after the end_io call back function is called.
+ *
+ * In the case there is 0 byte write, or error case, since VFS
+ * direct IO won't invoke the end_io call back function, we
+ * need to free the end_io structure here.
+ */
+ if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
+ ext4_free_io_end(iocb->private);
+ iocb->private = NULL;
+ } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
EXT4_STATE_DIO_UNWRITTEN)) {
- int err;
- /*
- * for non AIO case, since the IO is already
- * completed, we could do the conversion right here
- */
- err = ext4_convert_unwritten_extents(inode,
- offset, ret);
- if (err < 0)
- ret = err;
- ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
- }
- return ret;
+ int err;
+ /*
+ * for non AIO case, since the IO is already
+ * completed, we could do the conversion right here
+ */
+ err = ext4_convert_unwritten_extents(inode,
+ offset, ret);
+ if (err < 0)
+ ret = err;
+ ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+ }
+
+retake_lock:
+ /* take i_mutex locking again if we do an overwrite dio */
+ if (overwrite) {
+ inode_dio_done(inode);
+ up_read(&EXT4_I(inode)->i_data_sem);
+ mutex_lock(&inode->i_mutex);
}
- /* for write the the end of file case, we fall back to old way */
- return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+ return ret;
}
static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
@@ -3065,6 +3209,10 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
if (ext4_should_journal_data(inode))
return 0;
+ /* Let buffer I/O handle the inline data case. */
+ if (ext4_has_inline_data(inode))
+ return 0;
+
trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -3132,7 +3280,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.write_end = ext4_journalled_write_end,
.set_page_dirty = ext4_journalled_set_page_dirty,
.bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
+ .invalidatepage = ext4_journalled_invalidatepage,
.releasepage = ext4_releasepage,
.direct_IO = ext4_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
@@ -3227,7 +3375,7 @@ int ext4_discard_partial_page_buffers(handle_t *handle,
* handle: The journal handle
* inode: The files inode
* page: A locked page that contains the offset "from"
- * from: The starting byte offset (from the begining of the file)
+ * from: The starting byte offset (from the beginning of the file)
* to begin discarding
* len: The length of bytes to discard
* flags: Optional flags that may be used:
@@ -3235,11 +3383,11 @@ int ext4_discard_partial_page_buffers(handle_t *handle,
* EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
* Only zero the regions of the page whose buffer heads
* have already been unmapped. This flag is appropriate
- * for updateing the contents of a page whose blocks may
+ * for updating the contents of a page whose blocks may
* have already been released, and we only want to zero
* out the regions that correspond to those released blocks.
*
- * Returns zero on sucess or negative on failure.
+ * Returns zero on success or negative on failure.
*/
static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
struct inode *inode, struct page *page, loff_t from,
@@ -3400,7 +3548,7 @@ int ext4_can_truncate(struct inode *inode)
* @offset: The offset where the hole will begin
* @len: The length of the hole
*
- * Returns: 0 on sucess or negative on failure
+ * Returns: 0 on success or negative on failure
*/
int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
@@ -3462,6 +3610,14 @@ void ext4_truncate(struct inode *inode)
if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
+ if (ext4_has_inline_data(inode)) {
+ int has_inline = 1;
+
+ ext4_inline_data_truncate(inode, &has_inline);
+ if (has_inline)
+ return;
+ }
+
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
ext4_ext_truncate(inode);
else
@@ -3687,6 +3843,19 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
}
}
+static inline void ext4_iget_extra_inode(struct inode *inode,
+ struct ext4_inode *raw_inode,
+ struct ext4_inode_info *ei)
+{
+ __le32 *magic = (void *)raw_inode +
+ EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
+ if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
+ ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+ ext4_find_inline_data_nolock(inode);
+ } else
+ EXT4_I(inode)->i_inline_off = 0;
+}
+
struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
{
struct ext4_iloc iloc;
@@ -3757,6 +3926,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
+ ei->i_inline_off = 0;
ei->i_dir_start_lookup = 0;
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
/* We now have enough fields to check if the inode was active or not.
@@ -3829,11 +3999,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
} else {
- __le32 *magic = (void *)raw_inode +
- EXT4_GOOD_OLD_INODE_SIZE +
- ei->i_extra_isize;
- if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
- ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+ ext4_iget_extra_inode(inode, raw_inode, ei);
}
}
@@ -3856,17 +4022,19 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_file_acl);
ret = -EIO;
goto bad_inode;
- } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- (S_ISLNK(inode->i_mode) &&
- !ext4_inode_is_fast_symlink(inode)))
- /* Validate extent which is part of inode */
- ret = ext4_ext_check_inode(inode);
- } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- (S_ISLNK(inode->i_mode) &&
- !ext4_inode_is_fast_symlink(inode))) {
- /* Validate block references which are part of inode */
- ret = ext4_ind_check_inode(inode);
+ } else if (!ext4_has_inline_data(inode)) {
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ (S_ISLNK(inode->i_mode) &&
+ !ext4_inode_is_fast_symlink(inode))))
+ /* Validate extent which is part of inode */
+ ret = ext4_ext_check_inode(inode);
+ } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ (S_ISLNK(inode->i_mode) &&
+ !ext4_inode_is_fast_symlink(inode))) {
+ /* Validate block references which are part of inode */
+ ret = ext4_ind_check_inode(inode);
+ }
}
if (ret)
goto bad_inode;
@@ -3922,7 +4090,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
if (i_blocks <= ~0U) {
/*
- * i_blocks can be represnted in a 32 bit variable
+ * i_blocks can be represented in a 32 bit variable
* as multiple of 512 bytes
*/
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
@@ -3966,6 +4134,7 @@ static int ext4_do_update_inode(handle_t *handle,
struct ext4_inode_info *ei = EXT4_I(inode);
struct buffer_head *bh = iloc->bh;
int err = 0, rc, block;
+ int need_datasync = 0;
uid_t i_uid;
gid_t i_gid;
@@ -4016,7 +4185,10 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_file_acl_high =
cpu_to_le16(ei->i_file_acl >> 32);
raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
- ext4_isize_set(raw_inode, ei->i_disksize);
+ if (ei->i_disksize != ext4_isize(raw_inode)) {
+ ext4_isize_set(raw_inode, ei->i_disksize);
+ need_datasync = 1;
+ }
if (ei->i_disksize > 0x7fffffffULL) {
struct super_block *sb = inode->i_sb;
if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -4034,7 +4206,7 @@ static int ext4_do_update_inode(handle_t *handle,
EXT4_SET_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
ext4_handle_sync(handle);
- err = ext4_handle_dirty_super_now(handle, sb);
+ err = ext4_handle_dirty_super(handle, sb);
}
}
raw_inode->i_generation = cpu_to_le32(inode->i_generation);
@@ -4049,9 +4221,10 @@ static int ext4_do_update_inode(handle_t *handle,
cpu_to_le32(new_encode_dev(inode->i_rdev));
raw_inode->i_block[2] = 0;
}
- } else
+ } else if (!ext4_has_inline_data(inode)) {
for (block = 0; block < EXT4_N_BLOCKS; block++)
raw_inode->i_block[block] = ei->i_data[block];
+ }
raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
if (ei->i_extra_isize) {
@@ -4069,7 +4242,7 @@ static int ext4_do_update_inode(handle_t *handle,
err = rc;
ext4_clear_inode_state(inode, EXT4_STATE_NEW);
- ext4_update_inode_fsync_trans(handle, inode, 0);
+ ext4_update_inode_fsync_trans(handle, inode, need_datasync);
out_brelse:
brelse(bh);
ext4_std_error(inode->i_sb, err);
@@ -4083,7 +4256,7 @@ out_brelse:
*
* - Within generic_file_write() for O_SYNC files.
* Here, there will be no transaction running. We wait for any running
- * trasnaction to commit.
+ * transaction to commit.
*
* - Within sys_sync(), kupdate and such.
* We wait on commit, if tol to.
@@ -4148,6 +4321,47 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
}
/*
+ * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate
+ * buffers that are attached to a page straddling i_size and are undergoing
+ * commit. In that case we have to wait for commit to finish and try again.
+ */
+static void ext4_wait_for_tail_page_commit(struct inode *inode)
+{
+ struct page *page;
+ unsigned offset;
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+ tid_t commit_tid = 0;
+ int ret;
+
+ offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+ /*
+ * All buffers in the last page remain valid? Then there's nothing to
+ * do. We do the check mainly to optimize the common PAGE_CACHE_SIZE ==
+ * blocksize case
+ */
+ if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits))
+ return;
+ while (1) {
+ page = find_lock_page(inode->i_mapping,
+ inode->i_size >> PAGE_CACHE_SHIFT);
+ if (!page)
+ return;
+ ret = __ext4_journalled_invalidatepage(page, offset);
+ unlock_page(page);
+ page_cache_release(page);
+ if (ret != -EBUSY)
+ return;
+ commit_tid = 0;
+ read_lock(&journal->j_state_lock);
+ if (journal->j_committing_transaction)
+ commit_tid = journal->j_committing_transaction->t_tid;
+ read_unlock(&journal->j_state_lock);
+ if (commit_tid)
+ jbd2_log_wait_commit(journal, commit_tid);
+ }
+}
+
+/*
* ext4_setattr()
*
* Called from notify_change.
@@ -4212,7 +4426,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_SIZE) {
- inode_dio_wait(inode);
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -4261,8 +4474,29 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_SIZE) {
- if (attr->ia_size != i_size_read(inode))
- truncate_setsize(inode, attr->ia_size);
+ if (attr->ia_size != inode->i_size) {
+ loff_t oldsize = inode->i_size;
+
+ i_size_write(inode, attr->ia_size);
+ /*
+ * Blocks are going to be removed from the inode. Wait
+ * for dio in flight. Temporarily disable
+ * dioread_nolock to prevent livelock.
+ */
+ if (orphan) {
+ if (!ext4_should_journal_data(inode)) {
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+ ext4_inode_resume_unlocked_dio(inode);
+ } else
+ ext4_wait_for_tail_page_commit(inode);
+ }
+ /*
+ * Truncate pagecache after we've waited for commit
+ * in data=journal mode to make pages freeable.
+ */
+ truncate_pagecache(inode, oldsize, inode->i_size);
+ }
ext4_truncate(inode);
}
@@ -4327,7 +4561,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
* worse case, the indexs blocks spread over different block groups
*
* If datablocks are discontiguous, they are possible to spread over
- * different block groups too. If they are contiuguous, with flexbg,
+ * different block groups too. If they are contiguous, with flexbg,
* they could still across block group boundary.
*
* Also account for superblock, inode, quota and xattr blocks
@@ -4503,14 +4737,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
* inode out, but prune_icache isn't a user-visible syncing function.
* Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
* we start and wait on commits.
- *
- * Is this efficient/effective? Well, we're being nice to the system
- * by cleaning up our inodes proactively so they can be reaped
- * without I/O. But we are potentially leaving up to five seconds'
- * worth of inodes floating about which prune_icache wants us to
- * write out. One way to fix that would be to get prune_icache()
- * to do a write_super() to free up some memory. It has the desired
- * effect.
*/
int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
{
@@ -4649,6 +4875,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return err;
}
+ /* Wait for all existing dio workers */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
jbd2_journal_lock_updates(journal);
/*
@@ -4668,6 +4898,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
+ ext4_inode_resume_unlocked_dio(inode);
/* Finally we can mark the inode as dirty. */
@@ -4701,11 +4932,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
get_block_t *get_block;
int retries = 0;
- /*
- * This check is racy but catches the common case. We rely on
- * __block_page_mkwrite() to do a reliable check.
- */
- vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
/* Delalloc case is easy... */
if (test_opt(inode->i_sb, DELALLOC) &&
!ext4_should_journal_data(inode) &&
@@ -4736,8 +4964,9 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
* journal_start/journal_stop which can block and take a long time
*/
if (page_has_buffers(page)) {
- if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
- ext4_bh_unmapped)) {
+ if (!ext4_walk_page_buffers(NULL, page_buffers(page),
+ 0, len, NULL,
+ ext4_bh_unmapped)) {
/* Wait so that we don't change page under IO */
wait_on_page_writeback(page);
ret = VM_FAULT_LOCKED;
@@ -4758,7 +4987,7 @@ retry_alloc:
}
ret = __block_page_mkwrite(vma, vmf, get_block);
if (!ret && ext4_should_journal_data(inode)) {
- if (walk_page_buffers(handle, page_buffers(page), 0,
+ if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
unlock_page(page);
ret = VM_FAULT_SIGBUS;
@@ -4773,5 +5002,6 @@ retry_alloc:
out_ret:
ret = block_page_mkwrite_return(ret);
out:
+ sb_end_pagefault(inode->i_sb);
return ret;
}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 6ec6f9ee2fe..5747f52f7c7 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -233,7 +233,7 @@ group_extend_out:
case EXT4_IOC_MOVE_EXT: {
struct move_extent me;
- struct file *donor_filp;
+ struct fd donor;
int err;
if (!(filp->f_mode & FMODE_READ) ||
@@ -245,11 +245,11 @@ group_extend_out:
return -EFAULT;
me.moved_len = 0;
- donor_filp = fget(me.donor_fd);
- if (!donor_filp)
+ donor = fdget(me.donor_fd);
+ if (!donor.file)
return -EBADF;
- if (!(donor_filp->f_mode & FMODE_WRITE)) {
+ if (!(donor.file->f_mode & FMODE_WRITE)) {
err = -EBADF;
goto mext_out;
}
@@ -258,14 +258,15 @@ group_extend_out:
EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
ext4_msg(sb, KERN_ERR,
"Online defrag not supported with bigalloc");
- return -EOPNOTSUPP;
+ err = -EOPNOTSUPP;
+ goto mext_out;
}
err = mnt_want_write_file(filp);
if (err)
goto mext_out;
- err = ext4_move_extents(filp, donor_filp, me.orig_start,
+ err = ext4_move_extents(filp, donor.file, me.orig_start,
me.donor_start, me.len, &me.moved_len);
mnt_drop_write_file(filp);
@@ -273,7 +274,7 @@ group_extend_out:
&me, sizeof(me)))
err = -EFAULT;
mext_out:
- fput(donor_filp);
+ fdput(donor);
return err;
}
@@ -365,31 +366,16 @@ group_add_out:
return -EOPNOTSUPP;
}
- if (EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_META_BG)) {
- ext4_msg(sb, KERN_ERR,
- "Online resizing not (yet) supported with meta_bg");
- return -EOPNOTSUPP;
- }
-
if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
sizeof(__u64))) {
return -EFAULT;
}
- if (n_blocks_count > MAX_32_NUM &&
- !EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_64BIT)) {
- ext4_msg(sb, KERN_ERR,
- "File system only supports 32-bit block numbers");
- return -EOPNOTSUPP;
- }
-
err = ext4_resize_begin(sb);
if (err)
return err;
- err = mnt_want_write(filp->f_path.mnt);
+ err = mnt_want_write_file(filp);
if (err)
goto resizefs_out;
@@ -401,7 +387,7 @@ group_add_out:
}
if (err == 0)
err = err2;
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
resizefs_out:
ext4_resize_end(sb);
return err;
@@ -419,13 +405,6 @@ resizefs_out:
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
- ext4_msg(sb, KERN_ERR,
- "FITRIM not supported with bigalloc");
- return -EOPNOTSUPP;
- }
-
if (copy_from_user(&range, (struct fstrim_range __user *)arg,
sizeof(range)))
return -EFAULT;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 1cd6994fc44..1bf6fe785c4 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -24,6 +24,7 @@
#include "ext4_jbd2.h"
#include "mballoc.h"
#include <linux/debugfs.h>
+#include <linux/log2.h>
#include <linux/slab.h>
#include <trace/events/ext4.h>
@@ -969,7 +970,6 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
block++;
pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (!page)
return -EIO;
@@ -1339,17 +1339,17 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
mb_check_buddy(e4b);
}
-static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
+static int mb_find_extent(struct ext4_buddy *e4b, int block,
int needed, struct ext4_free_extent *ex)
{
int next = block;
- int max;
+ int max, order;
void *buddy;
assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
BUG_ON(ex == NULL);
- buddy = mb_find_buddy(e4b, order, &max);
+ buddy = mb_find_buddy(e4b, 0, &max);
BUG_ON(buddy == NULL);
BUG_ON(block >= max);
if (mb_test_bit(block, buddy)) {
@@ -1359,12 +1359,9 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
return 0;
}
- /* FIXME dorp order completely ? */
- if (likely(order == 0)) {
- /* find actual order */
- order = mb_find_order_for_block(e4b, block);
- block = block >> order;
- }
+ /* find actual order */
+ order = mb_find_order_for_block(e4b, block);
+ block = block >> order;
ex->fe_len = 1 << order;
ex->fe_start = block << order;
@@ -1376,7 +1373,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
ex->fe_start += next;
while (needed > ex->fe_len &&
- (buddy = mb_find_buddy(e4b, order, &max))) {
+ mb_find_buddy(e4b, order, &max)) {
if (block + 1 >= max)
break;
@@ -1550,7 +1547,7 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
/* recheck chunk's availability - we don't know
* when it was found (within this lock-unlock
* period or not) */
- max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex);
+ max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
if (max >= gex->fe_len) {
ext4_mb_use_best_found(ac, e4b);
return;
@@ -1642,7 +1639,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
return err;
ext4_lock_group(ac->ac_sb, group);
- max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex);
+ max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
if (max > 0) {
ac->ac_b_ex = ex;
@@ -1663,17 +1660,20 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
int max;
int err;
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
struct ext4_free_extent ex;
if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
return 0;
+ if (grp->bb_free == 0)
+ return 0;
err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
if (err)
return err;
ext4_lock_group(ac->ac_sb, group);
- max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start,
+ max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
ac->ac_g_ex.fe_len, &ex);
if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
@@ -1789,7 +1789,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
break;
}
- mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
+ mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
BUG_ON(ex.fe_len <= 0);
if (free < ex.fe_len) {
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
@@ -1841,7 +1841,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
if (!mb_test_bit(i, bitmap)) {
- max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
+ max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
if (max >= sbi->s_stripe) {
ac->ac_found++;
ac->ac_b_ex = ex;
@@ -1863,6 +1863,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
BUG_ON(cr < 0 || cr >= 4);
+ free = grp->bb_free;
+ if (free == 0)
+ return 0;
+ if (cr <= 2 && free < ac->ac_g_ex.fe_len)
+ return 0;
+
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
int ret = ext4_mb_init_group(ac->ac_sb, group);
@@ -1870,10 +1876,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
return 0;
}
- free = grp->bb_free;
fragments = grp->bb_fragments;
- if (free == 0)
- return 0;
if (fragments == 0)
return 0;
@@ -2077,8 +2080,9 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
struct super_block *sb = seq->private;
ext4_group_t group = (ext4_group_t) ((unsigned long) v);
int i;
- int err;
+ int err, buddy_loaded = 0;
struct ext4_buddy e4b;
+ struct ext4_group_info *grinfo;
struct sg {
struct ext4_group_info info;
ext4_grpblk_t counters[16];
@@ -2095,15 +2099,21 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
sizeof(struct ext4_group_info);
- err = ext4_mb_load_buddy(sb, group, &e4b);
- if (err) {
- seq_printf(seq, "#%-5u: I/O error\n", group);
- return 0;
+ grinfo = ext4_get_group_info(sb, group);
+ /* Load the group info in memory only if not already loaded. */
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
+ err = ext4_mb_load_buddy(sb, group, &e4b);
+ if (err) {
+ seq_printf(seq, "#%-5u: I/O error\n", group);
+ return 0;
+ }
+ buddy_loaded = 1;
}
- ext4_lock_group(sb, group);
+
memcpy(&sg, ext4_get_group_info(sb, group), i);
- ext4_unlock_group(sb, group);
- ext4_mb_unload_buddy(&e4b);
+
+ if (buddy_loaded)
+ ext4_mb_unload_buddy(&e4b);
seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
sg.info.bb_fragments, sg.info.bb_first_free);
@@ -2157,6 +2167,39 @@ static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
return cachep;
}
+/*
+ * Allocate the top-level s_group_info array for the specified number
+ * of groups
+ */
+int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ unsigned size;
+ struct ext4_group_info ***new_groupinfo;
+
+ size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
+ EXT4_DESC_PER_BLOCK_BITS(sb);
+ if (size <= sbi->s_group_info_size)
+ return 0;
+
+ size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
+ new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL);
+ if (!new_groupinfo) {
+ ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
+ return -ENOMEM;
+ }
+ if (sbi->s_group_info) {
+ memcpy(new_groupinfo, sbi->s_group_info,
+ sbi->s_group_info_size * sizeof(*sbi->s_group_info));
+ ext4_kvfree(sbi->s_group_info);
+ }
+ sbi->s_group_info = new_groupinfo;
+ sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
+ ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
+ sbi->s_group_info_size);
+ return 0;
+}
+
/* Create and initialize ext4_group_info data for the given group. */
int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
struct ext4_group_desc *desc)
@@ -2189,12 +2232,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
- meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
+ meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL);
if (meta_group_info[i] == NULL) {
ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
goto exit_group_info;
}
- memset(meta_group_info[i], 0, kmem_cache_size(cachep));
set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
&(meta_group_info[i]->bb_state));
@@ -2246,49 +2288,14 @@ static int ext4_mb_init_backend(struct super_block *sb)
ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t i;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_super_block *es = sbi->s_es;
- int num_meta_group_infos;
- int num_meta_group_infos_max;
- int array_size;
+ int err;
struct ext4_group_desc *desc;
struct kmem_cache *cachep;
- /* This is the number of blocks used by GDT */
- num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
- 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
-
- /*
- * This is the total number of blocks used by GDT including
- * the number of reserved blocks for GDT.
- * The s_group_info array is allocated with this value
- * to allow a clean online resize without a complex
- * manipulation of pointer.
- * The drawback is the unused memory when no resize
- * occurs but it's very low in terms of pages
- * (see comments below)
- * Need to handle this properly when META_BG resizing is allowed
- */
- num_meta_group_infos_max = num_meta_group_infos +
- le16_to_cpu(es->s_reserved_gdt_blocks);
+ err = ext4_mb_alloc_groupinfo(sb, ngroups);
+ if (err)
+ return err;
- /*
- * array_size is the size of s_group_info array. We round it
- * to the next power of two because this approximation is done
- * internally by kmalloc so we can have some more memory
- * for free here (e.g. may be used for META_BG resize).
- */
- array_size = 1;
- while (array_size < sizeof(*sbi->s_group_info) *
- num_meta_group_infos_max)
- array_size = array_size << 1;
- /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
- * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
- * So a two level scheme suffices for now. */
- sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL);
- if (sbi->s_group_info == NULL) {
- ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
- return -ENOMEM;
- }
sbi->s_buddy_cache = new_inode(sb);
if (sbi->s_buddy_cache == NULL) {
ext4_msg(sb, KERN_ERR, "can't get new inode");
@@ -2316,7 +2323,7 @@ err_freebuddy:
cachep = get_groupinfo_cache(sb->s_blocksize_bits);
while (i-- > 0)
kmem_cache_free(cachep, ext4_get_group_info(sb, i));
- i = num_meta_group_infos;
+ i = sbi->s_group_info_size;
while (i-- > 0)
kfree(sbi->s_group_info[i]);
iput(sbi->s_buddy_cache);
@@ -2600,9 +2607,17 @@ static void ext4_free_data_callback(struct super_block *sb,
mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
entry->efd_count, entry->efd_group, entry);
- if (test_opt(sb, DISCARD))
- ext4_issue_discard(sb, entry->efd_group,
- entry->efd_start_cluster, entry->efd_count);
+ if (test_opt(sb, DISCARD)) {
+ err = ext4_issue_discard(sb, entry->efd_group,
+ entry->efd_start_cluster,
+ entry->efd_count);
+ if (err && err != -EOPNOTSUPP)
+ ext4_msg(sb, KERN_WARNING, "discard request in"
+ " group:%d block:%d count:%d failed"
+ " with %d", entry->efd_group,
+ entry->efd_start_cluster,
+ entry->efd_count, err);
+ }
err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
/* we expect to find existing buddy because it's pinned */
@@ -2798,8 +2813,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
}
len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
ext4_free_group_clusters_set(sb, gdp, len);
- ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh,
- EXT4_BLOCKS_PER_GROUP(sb) / 8);
+ ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
@@ -2825,7 +2839,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
out_err:
- ext4_mark_super_dirty(sb);
brelse(bitmap_bh);
return err;
}
@@ -4003,7 +4016,6 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
ext4_get_group_no_and_offset(sb, goal, &group, &block);
/* set up allocation goals */
- memset(ac, 0, sizeof(struct ext4_allocation_context));
ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1);
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_sb = sb;
@@ -4286,7 +4298,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
}
}
- ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+ ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
if (!ac) {
ar->len = 0;
*errp = -ENOMEM;
@@ -4306,8 +4318,10 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
repeat:
/* allocate space in core */
*errp = ext4_mb_regular_allocator(ac);
- if (*errp)
+ if (*errp) {
+ ext4_discard_allocated_blocks(ac);
goto errout;
+ }
/* as we've just preallocated more space than
* user requested orinally, we store allocated
@@ -4329,10 +4343,10 @@ repeat:
ac->ac_b_ex.fe_len = 0;
ac->ac_status = AC_STATUS_CONTINUE;
goto repeat;
- } else if (*errp)
- errout:
+ } else if (*errp) {
ext4_discard_allocated_blocks(ac);
- else {
+ goto errout;
+ } else {
block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
ar->len = ac->ac_b_ex.fe_len;
}
@@ -4343,6 +4357,7 @@ repeat:
*errp = -ENOSPC;
}
+errout:
if (*errp) {
ac->ac_b_ex.fe_len = 0;
ar->len = 0;
@@ -4652,6 +4667,16 @@ do_more:
* with group lock held. generate_buddy look at
* them with group lock_held
*/
+ if (test_opt(sb, DISCARD)) {
+ err = ext4_issue_discard(sb, block_group, bit, count);
+ if (err && err != -EOPNOTSUPP)
+ ext4_msg(sb, KERN_WARNING, "discard request in"
+ " group:%d block:%d count:%lu failed"
+ " with %d", block_group, bit, count,
+ err);
+ }
+
+
ext4_lock_group(sb, block_group);
mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
mb_free_blocks(inode, &e4b, bit, count_clusters);
@@ -4659,8 +4684,7 @@ do_more:
ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
ext4_free_group_clusters_set(sb, gdp, ret);
- ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh,
- EXT4_BLOCKS_PER_GROUP(sb) / 8);
+ ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
ext4_group_desc_csum_set(sb, block_group, gdp);
ext4_unlock_group(sb, block_group);
percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
@@ -4694,7 +4718,6 @@ do_more:
put_bh(bitmap_bh);
goto do_more;
}
- ext4_mark_super_dirty(sb);
error_return:
brelse(bitmap_bh);
ext4_std_error(sb, err);
@@ -4705,7 +4728,7 @@ error_return:
* ext4_group_add_blocks() -- Add given blocks to an existing group
* @handle: handle to this transaction
* @sb: super block
- * @block: start physcial block to add to the block group
+ * @block: start physical block to add to the block group
* @count: number of blocks to free
*
* This marks the blocks as free in the bitmap and buddy.
@@ -4805,8 +4828,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
mb_free_blocks(NULL, &e4b, bit, count);
blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
ext4_free_group_clusters_set(sb, desc, blk_free_count);
- ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh,
- EXT4_BLOCKS_PER_GROUP(sb) / 8);
+ ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
ext4_group_desc_csum_set(sb, block_group, desc);
ext4_unlock_group(sb, block_group);
percpu_counter_add(&sbi->s_freeclusters_counter,
@@ -4848,10 +4870,11 @@ error_return:
* one will allocate those blocks, mark it as used in buddy bitmap. This must
* be called with under the group lock.
*/
-static void ext4_trim_extent(struct super_block *sb, int start, int count,
+static int ext4_trim_extent(struct super_block *sb, int start, int count,
ext4_group_t group, struct ext4_buddy *e4b)
{
struct ext4_free_extent ex;
+ int ret = 0;
trace_ext4_trim_extent(sb, group, start, count);
@@ -4867,9 +4890,10 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count,
*/
mb_mark_used(e4b, &ex);
ext4_unlock_group(sb, group);
- ext4_issue_discard(sb, group, start, count);
+ ret = ext4_issue_discard(sb, group, start, count);
ext4_lock_group(sb, group);
mb_free_blocks(NULL, e4b, start, ex.fe_len);
+ return ret;
}
/**
@@ -4898,7 +4922,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
void *bitmap;
ext4_grpblk_t next, count = 0, free_count = 0;
struct ext4_buddy e4b;
- int ret;
+ int ret = 0;
trace_ext4_trim_all_free(sb, group, start, max);
@@ -4925,8 +4949,11 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
next = mb_find_next_bit(bitmap, max + 1, start);
if ((next - start) >= minblocks) {
- ext4_trim_extent(sb, start,
- next - start, group, &e4b);
+ ret = ext4_trim_extent(sb, start,
+ next - start, group, &e4b);
+ if (ret && ret != -EOPNOTSUPP)
+ break;
+ ret = 0;
count += next - start;
}
free_count += next - start;
@@ -4947,8 +4974,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
break;
}
- if (!ret)
+ if (!ret) {
+ ret = count;
EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
+ }
out:
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
@@ -4956,7 +4985,7 @@ out:
ext4_debug("trimmed %d blocks in the group %d\n",
count, group);
- return count;
+ return ret;
}
/**
@@ -4984,10 +5013,12 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
start = range->start >> sb->s_blocksize_bits;
end = start + (range->len >> sb->s_blocksize_bits) - 1;
- minlen = range->minlen >> sb->s_blocksize_bits;
+ minlen = EXT4_NUM_B2C(EXT4_SB(sb),
+ range->minlen >> sb->s_blocksize_bits);
- if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) ||
- unlikely(start >= max_blks))
+ if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
+ start >= max_blks ||
+ range->len < sb->s_blocksize)
return -EINVAL;
if (end >= max_blks)
end = max_blks - 1;
@@ -5044,6 +5075,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
out:
- range->len = trimmed * sb->s_blocksize;
+ range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
return ret;
}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index c070618c21c..3ccd889ba95 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -65,11 +65,6 @@ extern u8 mb_enable_debug;
#define MB_DEFAULT_MIN_TO_SCAN 10
/*
- * How many groups mballoc will scan looking for the best chunk
- */
-#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5
-
-/*
* with 'ext4_mb_stats' allocator will collect stats that will be
* shown at umount. The collecting costs though!
*/
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index f1bb32ec016..db8226d595f 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -14,6 +14,7 @@
#include <linux/slab.h>
#include "ext4_jbd2.h"
+#include "ext4_extents.h"
/*
* The contiguous blocks details which can be
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index f99a1311e84..fe7c63f4717 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -44,6 +44,11 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
{
struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data);
+ /*
+ * We protect against freezing so that we don't create dirty buffers
+ * on frozen filesystem.
+ */
+ sb_start_write(sb);
ext4_mmp_csum_set(sb, mmp);
mark_buffer_dirty(bh);
lock_buffer(bh);
@@ -51,6 +56,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
get_bh(bh);
submit_bh(WRITE_SYNC, bh);
wait_on_buffer(bh);
+ sb_end_write(sb);
if (unlikely(!buffer_uptodate(bh)))
return 1;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c5826c623e7..d9cc5ee42f5 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include "ext4_jbd2.h"
#include "ext4.h"
+#include "ext4_extents.h"
/**
* get_ext_path - Find an extent path for designated logical block number.
@@ -141,55 +142,21 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
}
/**
- * mext_check_null_inode - NULL check for two inodes
- *
- * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
- */
-static int
-mext_check_null_inode(struct inode *inode1, struct inode *inode2,
- const char *function, unsigned int line)
-{
- int ret = 0;
-
- if (inode1 == NULL) {
- __ext4_error(inode2->i_sb, function, line,
- "Both inodes should not be NULL: "
- "inode1 NULL inode2 %lu", inode2->i_ino);
- ret = -EIO;
- } else if (inode2 == NULL) {
- __ext4_error(inode1->i_sb, function, line,
- "Both inodes should not be NULL: "
- "inode1 %lu inode2 NULL", inode1->i_ino);
- ret = -EIO;
- }
- return ret;
-}
-
-/**
* double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
*
- * @orig_inode: original inode structure
- * @donor_inode: donor inode structure
- * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
- * i_ino order.
+ * Acquire write lock of i_data_sem of the two inodes
*/
static void
-double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
+double_down_write_data_sem(struct inode *first, struct inode *second)
{
- struct inode *first = orig_inode, *second = donor_inode;
+ if (first < second) {
+ down_write(&EXT4_I(first)->i_data_sem);
+ down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
+ } else {
+ down_write(&EXT4_I(second)->i_data_sem);
+ down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING);
- /*
- * Use the inode number to provide the stable locking order instead
- * of its address, because the C language doesn't guarantee you can
- * compare pointers that don't come from the same array.
- */
- if (donor_inode->i_ino < orig_inode->i_ino) {
- first = donor_inode;
- second = orig_inode;
}
-
- down_write(&EXT4_I(first)->i_data_sem);
- down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
}
/**
@@ -604,9 +571,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
- tmp_dext->ee_block =
- cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
- tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
+ le32_add_cpu(&tmp_dext->ee_block, diff);
+ le16_add_cpu(&tmp_dext->ee_len, -diff);
if (max_count < ext4_ext_get_actual_len(tmp_dext))
tmp_dext->ee_len = cpu_to_le16(max_count);
@@ -629,6 +595,43 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
}
/**
+ * mext_check_coverage - Check that all extents in range has the same type
+ *
+ * @inode: inode in question
+ * @from: block offset of inode
+ * @count: block count to be checked
+ * @uninit: extents expected to be uninitialized
+ * @err: pointer to save error value
+ *
+ * Return 1 if all extents in range has expected type, and zero otherwise.
+ */
+static int
+mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
+ int uninit, int *err)
+{
+ struct ext4_ext_path *path = NULL;
+ struct ext4_extent *ext;
+ ext4_lblk_t last = from + count;
+ while (from < last) {
+ *err = get_ext_path(inode, from, &path);
+ if (*err)
+ return 0;
+ ext = path[ext_depth(inode)].p_ext;
+ if (!ext) {
+ ext4_ext_drop_refs(path);
+ return 0;
+ }
+ if (uninit != ext4_ext_is_uninitialized(ext)) {
+ ext4_ext_drop_refs(path);
+ return 0;
+ }
+ from += ext4_ext_get_actual_len(ext);
+ ext4_ext_drop_refs(path);
+ }
+ return 1;
+}
+
+/**
* mext_replace_branches - Replace original extents with new extents
*
* @handle: journal handle
@@ -663,9 +666,6 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
int replaced_count = 0;
int dext_alen;
- /* Protect extent trees against block allocations via delalloc */
- double_down_write_data_sem(orig_inode, donor_inode);
-
/* Get the original extent for the block "orig_off" */
*err = get_ext_path(orig_inode, orig_off, &orig_path);
if (*err)
@@ -764,12 +764,122 @@ out:
ext4_ext_invalidate_cache(orig_inode);
ext4_ext_invalidate_cache(donor_inode);
- double_up_write_data_sem(orig_inode, donor_inode);
-
return replaced_count;
}
/**
+ * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2
+ *
+ * @inode1: the inode structure
+ * @inode2: the inode structure
+ * @index: page index
+ * @page: result page vector
+ *
+ * Grab two locked pages for inode's by inode order
+ */
+static int
+mext_page_double_lock(struct inode *inode1, struct inode *inode2,
+ pgoff_t index, struct page *page[2])
+{
+ struct address_space *mapping[2];
+ unsigned fl = AOP_FLAG_NOFS;
+
+ BUG_ON(!inode1 || !inode2);
+ if (inode1 < inode2) {
+ mapping[0] = inode1->i_mapping;
+ mapping[1] = inode2->i_mapping;
+ } else {
+ mapping[0] = inode2->i_mapping;
+ mapping[1] = inode1->i_mapping;
+ }
+
+ page[0] = grab_cache_page_write_begin(mapping[0], index, fl);
+ if (!page[0])
+ return -ENOMEM;
+
+ page[1] = grab_cache_page_write_begin(mapping[1], index, fl);
+ if (!page[1]) {
+ unlock_page(page[0]);
+ page_cache_release(page[0]);
+ return -ENOMEM;
+ }
+
+ if (inode1 > inode2) {
+ struct page *tmp;
+ tmp = page[0];
+ page[0] = page[1];
+ page[1] = tmp;
+ }
+ return 0;
+}
+
+/* Force page buffers uptodate w/o dropping page's lock */
+static int
+mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
+{
+ struct inode *inode = page->mapping->host;
+ sector_t block;
+ struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+ unsigned int blocksize, block_start, block_end;
+ int i, err, nr = 0, partial = 0;
+ BUG_ON(!PageLocked(page));
+ BUG_ON(PageWriteback(page));
+
+ if (PageUptodate(page))
+ return 0;
+
+ blocksize = 1 << inode->i_blkbits;
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, blocksize, 0);
+
+ head = page_buffers(page);
+ block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ for (bh = head, block_start = 0; bh != head || !block_start;
+ block++, block_start = block_end, bh = bh->b_this_page) {
+ block_end = block_start + blocksize;
+ if (block_end <= from || block_start >= to) {
+ if (!buffer_uptodate(bh))
+ partial = 1;
+ continue;
+ }
+ if (buffer_uptodate(bh))
+ continue;
+ if (!buffer_mapped(bh)) {
+ int err = 0;
+ err = ext4_get_block(inode, block, bh, 0);
+ if (err) {
+ SetPageError(page);
+ return err;
+ }
+ if (!buffer_mapped(bh)) {
+ zero_user(page, block_start, blocksize);
+ if (!err)
+ set_buffer_uptodate(bh);
+ continue;
+ }
+ }
+ BUG_ON(nr >= MAX_BUF_PER_PAGE);
+ arr[nr++] = bh;
+ }
+ /* No io required */
+ if (!nr)
+ goto out;
+
+ for (i = 0; i < nr; i++) {
+ bh = arr[i];
+ if (!bh_uptodate_or_lock(bh)) {
+ err = bh_submit_read(bh);
+ if (err)
+ return err;
+ }
+ }
+out:
+ if (!partial)
+ SetPageUptodate(page);
+ return 0;
+}
+
+/**
* move_extent_per_page - Move extent data per page
*
* @o_filp: file structure of original file
@@ -791,26 +901,24 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
int block_len_in_page, int uninit, int *err)
{
struct inode *orig_inode = o_filp->f_dentry->d_inode;
- struct address_space *mapping = orig_inode->i_mapping;
- struct buffer_head *bh;
- struct page *page = NULL;
- const struct address_space_operations *a_ops = mapping->a_ops;
+ struct page *pagep[2] = {NULL, NULL};
handle_t *handle;
ext4_lblk_t orig_blk_offset;
long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
unsigned long blocksize = orig_inode->i_sb->s_blocksize;
unsigned int w_flags = 0;
unsigned int tmp_data_size, data_size, replaced_size;
- void *fsdata;
- int i, jblocks;
- int err2 = 0;
+ int err2, jblocks, retries = 0;
int replaced_count = 0;
+ int from = data_offset_in_page << orig_inode->i_blkbits;
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
/*
* It needs twice the amount of ordinary journal buffers because
* inode and donor_inode may change each different metadata blocks.
*/
+again:
+ *err = 0;
jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
handle = ext4_journal_start(orig_inode, jblocks);
if (IS_ERR(handle)) {
@@ -824,19 +932,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
orig_blk_offset = orig_page_offset * blocks_per_page +
data_offset_in_page;
- /*
- * If orig extent is uninitialized one,
- * it's not necessary force the page into memory
- * and then force it to be written out again.
- * Just swap data blocks between orig and donor.
- */
- if (uninit) {
- replaced_count = mext_replace_branches(handle, orig_inode,
- donor_inode, orig_blk_offset,
- block_len_in_page, err);
- goto out2;
- }
-
offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
/* Calculate data_size */
@@ -858,75 +953,120 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
replaced_size = data_size;
- *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags,
- &page, &fsdata);
+ *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset,
+ pagep);
if (unlikely(*err < 0))
- goto out;
-
- if (!PageUptodate(page)) {
- mapping->a_ops->readpage(o_filp, page);
- lock_page(page);
- }
-
+ goto stop_journal;
/*
- * try_to_release_page() doesn't call releasepage in writeback mode.
- * We should care about the order of writing to the same file
- * by multiple move extent processes.
- * It needs to call wait_on_page_writeback() to wait for the
- * writeback of the page.
+ * If orig extent was uninitialized it can become initialized
+ * at any time after i_data_sem was dropped, in order to
+ * serialize with delalloc we have recheck extent while we
+ * hold page's lock, if it is still the case data copy is not
+ * necessary, just swap data blocks between orig and donor.
*/
- wait_on_page_writeback(page);
+ if (uninit) {
+ double_down_write_data_sem(orig_inode, donor_inode);
+ /* If any of extents in range became initialized we have to
+ * fallback to data copying */
+ uninit = mext_check_coverage(orig_inode, orig_blk_offset,
+ block_len_in_page, 1, err);
+ if (*err)
+ goto drop_data_sem;
- /* Release old bh and drop refs */
- try_to_release_page(page, 0);
+ uninit &= mext_check_coverage(donor_inode, orig_blk_offset,
+ block_len_in_page, 1, err);
+ if (*err)
+ goto drop_data_sem;
+
+ if (!uninit) {
+ double_up_write_data_sem(orig_inode, donor_inode);
+ goto data_copy;
+ }
+ if ((page_has_private(pagep[0]) &&
+ !try_to_release_page(pagep[0], 0)) ||
+ (page_has_private(pagep[1]) &&
+ !try_to_release_page(pagep[1], 0))) {
+ *err = -EBUSY;
+ goto drop_data_sem;
+ }
+ replaced_count = mext_replace_branches(handle, orig_inode,
+ donor_inode, orig_blk_offset,
+ block_len_in_page, err);
+ drop_data_sem:
+ double_up_write_data_sem(orig_inode, donor_inode);
+ goto unlock_pages;
+ }
+data_copy:
+ *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size);
+ if (*err)
+ goto unlock_pages;
+
+ /* At this point all buffers in range are uptodate, old mapping layout
+ * is no longer required, try to drop it now. */
+ if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) ||
+ (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) {
+ *err = -EBUSY;
+ goto unlock_pages;
+ }
replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
- orig_blk_offset, block_len_in_page,
- &err2);
- if (err2) {
+ orig_blk_offset,
+ block_len_in_page, err);
+ if (*err) {
if (replaced_count) {
block_len_in_page = replaced_count;
replaced_size =
block_len_in_page << orig_inode->i_blkbits;
} else
- goto out;
+ goto unlock_pages;
}
+ /* Perform all necessary steps similar write_begin()/write_end()
+ * but keeping in mind that i_size will not change */
+ *err = __block_write_begin(pagep[0], from, from + replaced_size,
+ ext4_get_block);
+ if (!*err)
+ *err = block_commit_write(pagep[0], from, from + replaced_size);
- if (!page_has_buffers(page))
- create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
-
- bh = page_buffers(page);
- for (i = 0; i < data_offset_in_page; i++)
- bh = bh->b_this_page;
-
- for (i = 0; i < block_len_in_page; i++) {
- *err = ext4_get_block(orig_inode,
- (sector_t)(orig_blk_offset + i), bh, 0);
- if (*err < 0)
- goto out;
-
- if (bh->b_this_page != NULL)
- bh = bh->b_this_page;
- }
-
- *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
- page, fsdata);
- page = NULL;
-
-out:
- if (unlikely(page)) {
- if (PageLocked(page))
- unlock_page(page);
- page_cache_release(page);
- ext4_journal_stop(handle);
- }
-out2:
+ if (unlikely(*err < 0))
+ goto repair_branches;
+
+ /* Even in case of data=writeback it is reasonable to pin
+ * inode to transaction, to prevent unexpected data loss */
+ *err = ext4_jbd2_file_inode(handle, orig_inode);
+
+unlock_pages:
+ unlock_page(pagep[0]);
+ page_cache_release(pagep[0]);
+ unlock_page(pagep[1]);
+ page_cache_release(pagep[1]);
+stop_journal:
ext4_journal_stop(handle);
-
- if (err2)
- *err = err2;
-
+ /* Buffer was busy because probably is pinned to journal transaction,
+ * force transaction commit may help to free it. */
+ if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb,
+ &retries))
+ goto again;
return replaced_count;
+
+repair_branches:
+ /*
+ * This should never ever happen!
+ * Extents are swapped already, but we are not able to copy data.
+ * Try to swap extents to it's original places
+ */
+ double_down_write_data_sem(orig_inode, donor_inode);
+ replaced_count = mext_replace_branches(handle, donor_inode, orig_inode,
+ orig_blk_offset,
+ block_len_in_page, &err2);
+ double_up_write_data_sem(orig_inode, donor_inode);
+ if (replaced_count != block_len_in_page) {
+ EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
+ "Unable to copy data block,"
+ " data will be lost.");
+ *err = -EIO;
+ }
+ replaced_count = 0;
+ goto unlock_pages;
}
/**
@@ -969,14 +1109,6 @@ mext_check_arguments(struct inode *orig_inode,
return -EINVAL;
}
- /* Files should be in the same ext4 FS */
- if (orig_inode->i_sb != donor_inode->i_sb) {
- ext4_debug("ext4 move extent: The argument files "
- "should be in same FS [ino:orig %lu, donor %lu]\n",
- orig_inode->i_ino, donor_inode->i_ino);
- return -EINVAL;
- }
-
/* Ext4 move extent supports only extent based file */
if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
ext4_debug("ext4 move extent: orig file is not extents "
@@ -1002,7 +1134,6 @@ mext_check_arguments(struct inode *orig_inode,
}
if ((orig_start >= EXT_MAX_BLOCKS) ||
- (donor_start >= EXT_MAX_BLOCKS) ||
(*len > EXT_MAX_BLOCKS) ||
(orig_start + *len >= EXT_MAX_BLOCKS)) {
ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
@@ -1072,35 +1203,19 @@ mext_check_arguments(struct inode *orig_inode,
* @inode1: the inode structure
* @inode2: the inode structure
*
- * Lock two inodes' i_mutex by i_ino order.
- * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
+ * Lock two inodes' i_mutex
*/
-static int
+static void
mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
{
- int ret = 0;
-
- BUG_ON(inode1 == NULL && inode2 == NULL);
-
- ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
- if (ret < 0)
- goto out;
-
- if (inode1 == inode2) {
- mutex_lock(&inode1->i_mutex);
- goto out;
- }
-
- if (inode1->i_ino < inode2->i_ino) {
+ BUG_ON(inode1 == inode2);
+ if (inode1 < inode2) {
mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
} else {
mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
}
-
-out:
- return ret;
}
/**
@@ -1109,28 +1224,13 @@ out:
* @inode1: the inode that is released first
* @inode2: the inode that is released second
*
- * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
*/
-static int
+static void
mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
{
- int ret = 0;
-
- BUG_ON(inode1 == NULL && inode2 == NULL);
-
- ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
- if (ret < 0)
- goto out;
-
- if (inode1)
- mutex_unlock(&inode1->i_mutex);
-
- if (inode2 && inode2 != inode1)
- mutex_unlock(&inode2->i_mutex);
-
-out:
- return ret;
+ mutex_unlock(&inode1->i_mutex);
+ mutex_unlock(&inode2->i_mutex);
}
/**
@@ -1187,16 +1287,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
ext4_lblk_t rest_blocks;
pgoff_t orig_page_offset = 0, seq_end_page;
- int ret1, ret2, depth, last_extent = 0;
+ int ret, depth, last_extent = 0;
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
int data_offset_in_page;
int block_len_in_page;
int uninit;
- /* orig and donor should be different file */
- if (orig_inode->i_ino == donor_inode->i_ino) {
+ if (orig_inode->i_sb != donor_inode->i_sb) {
+ ext4_debug("ext4 move extent: The argument files "
+ "should be in same FS [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* orig and donor should be different inodes */
+ if (orig_inode == donor_inode) {
ext4_debug("ext4 move extent: The argument files should not "
- "be same file [ino:orig %lu, donor %lu]\n",
+ "be same inode [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
@@ -1208,18 +1315,27 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
-
+ /* TODO: Swapping blocks for inodes with full data
+ journaling enabled is a non-obvious task */
+ if (ext4_should_journal_data(orig_inode) ||
+ ext4_should_journal_data(donor_inode)) {
+ return -EINVAL;
+ }
/* Protect orig and donor inodes against a truncate */
- ret1 = mext_inode_double_lock(orig_inode, donor_inode);
- if (ret1 < 0)
- return ret1;
+ mext_inode_double_lock(orig_inode, donor_inode);
+
+ /* Wait for all existing dio workers */
+ ext4_inode_block_unlocked_dio(orig_inode);
+ ext4_inode_block_unlocked_dio(donor_inode);
+ inode_dio_wait(orig_inode);
+ inode_dio_wait(donor_inode);
/* Protect extent tree against block allocations via delalloc */
double_down_write_data_sem(orig_inode, donor_inode);
/* Check the filesystem environment whether move_extent can be done */
- ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
+ ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
donor_start, &len);
- if (ret1)
+ if (ret)
goto out;
file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
@@ -1227,13 +1343,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
if (file_end < block_end)
len -= block_end - file_end;
- ret1 = get_ext_path(orig_inode, block_start, &orig_path);
- if (ret1)
+ ret = get_ext_path(orig_inode, block_start, &orig_path);
+ if (ret)
goto out;
/* Get path structure to check the hole */
- ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
- if (ret1)
+ ret = get_ext_path(orig_inode, block_start, &holecheck_path);
+ if (ret)
goto out;
depth = ext_depth(orig_inode);
@@ -1252,13 +1368,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
last_extent = mext_next_extent(orig_inode,
holecheck_path, &ext_cur);
if (last_extent < 0) {
- ret1 = last_extent;
+ ret = last_extent;
goto out;
}
last_extent = mext_next_extent(orig_inode, orig_path,
&ext_dummy);
if (last_extent < 0) {
- ret1 = last_extent;
+ ret = last_extent;
goto out;
}
seq_start = le32_to_cpu(ext_cur->ee_block);
@@ -1272,7 +1388,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
if (le32_to_cpu(ext_cur->ee_block) > block_end) {
ext4_debug("ext4 move extent: The specified range of file "
"may be the hole\n");
- ret1 = -EINVAL;
+ ret = -EINVAL;
goto out;
}
@@ -1292,7 +1408,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
last_extent = mext_next_extent(orig_inode, holecheck_path,
&ext_cur);
if (last_extent < 0) {
- ret1 = last_extent;
+ ret = last_extent;
break;
}
add_blocks = ext4_ext_get_actual_len(ext_cur);
@@ -1349,18 +1465,18 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
orig_page_offset,
data_offset_in_page,
block_len_in_page, uninit,
- &ret1);
+ &ret);
/* Count how many blocks we have exchanged */
*moved_len += block_len_in_page;
- if (ret1 < 0)
+ if (ret < 0)
break;
if (*moved_len > len) {
EXT4_ERROR_INODE(orig_inode,
"We replaced blocks too much! "
"sum of replaced: %llu requested: %llu",
*moved_len, len);
- ret1 = -EIO;
+ ret = -EIO;
break;
}
@@ -1374,22 +1490,22 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
}
double_down_write_data_sem(orig_inode, donor_inode);
- if (ret1 < 0)
+ if (ret < 0)
break;
/* Decrease buffer counter */
if (holecheck_path)
ext4_ext_drop_refs(holecheck_path);
- ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
- if (ret1)
+ ret = get_ext_path(orig_inode, seq_start, &holecheck_path);
+ if (ret)
break;
depth = holecheck_path->p_depth;
/* Decrease buffer counter */
if (orig_path)
ext4_ext_drop_refs(orig_path);
- ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
- if (ret1)
+ ret = get_ext_path(orig_inode, seq_start, &orig_path);
+ if (ret)
break;
ext_cur = holecheck_path[depth].p_ext;
@@ -1412,12 +1528,9 @@ out:
kfree(holecheck_path);
}
double_up_write_data_sem(orig_inode, donor_inode);
- ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
-
- if (ret1)
- return ret1;
- else if (ret2)
- return ret2;
+ ext4_inode_resume_unlocked_dio(orig_inode);
+ ext4_inode_resume_unlocked_dio(donor_inode);
+ mext_inode_double_unlock(orig_inode, donor_inode);
- return 0;
+ return ret;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 5845cd97bf8..f9ed946a448 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -55,6 +55,13 @@ static struct buffer_head *ext4_append(handle_t *handle,
{
struct buffer_head *bh;
+ if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
+ ((inode->i_size >> 10) >=
+ EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) {
+ *err = -ENOSPC;
+ return NULL;
+ }
+
*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
bh = ext4_bread(handle, inode, *block, 1, err);
@@ -67,6 +74,12 @@ static struct buffer_head *ext4_append(handle_t *handle,
bh = NULL;
}
}
+ if (!bh && !(*err)) {
+ *err = -EIO;
+ ext4_error(inode->i_sb,
+ "Directory hole detected on inode %lu\n",
+ inode->i_ino);
+ }
return bh;
}
@@ -189,13 +202,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
struct inode *inode);
/* checksumming functions */
-#define EXT4_DIRENT_TAIL(block, blocksize) \
- ((struct ext4_dir_entry_tail *)(((void *)(block)) + \
- ((blocksize) - \
- sizeof(struct ext4_dir_entry_tail))))
-
-static void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
- unsigned int blocksize)
+void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
+ unsigned int blocksize)
{
memset(t, 0, sizeof(struct ext4_dir_entry_tail));
t->det_rec_len = ext4_rec_len_to_disk(
@@ -248,6 +256,12 @@ static __le32 ext4_dirent_csum(struct inode *inode,
return cpu_to_le32(csum);
}
+static void warn_no_space_for_csum(struct inode *inode)
+{
+ ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for "
+ "checksum. Please run e2fsck -D.", inode->i_ino);
+}
+
int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
{
struct ext4_dir_entry_tail *t;
@@ -258,8 +272,7 @@ int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
t = get_dirent_tail(inode, dirent);
if (!t) {
- EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir "
- "leaf for checksum. Please run e2fsck -D.");
+ warn_no_space_for_csum(inode);
return 0;
}
@@ -281,8 +294,7 @@ static void ext4_dirent_csum_set(struct inode *inode,
t = get_dirent_tail(inode, dirent);
if (!t) {
- EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir "
- "leaf for checksum. Please run e2fsck -D.");
+ warn_no_space_for_csum(inode);
return;
}
@@ -290,9 +302,9 @@ static void ext4_dirent_csum_set(struct inode *inode,
(void *)t - (void *)dirent);
}
-static inline int ext4_handle_dirty_dirent_node(handle_t *handle,
- struct inode *inode,
- struct buffer_head *bh)
+int ext4_handle_dirty_dirent_node(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *bh)
{
ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
return ext4_handle_dirty_metadata(handle, inode, bh);
@@ -364,8 +376,7 @@ static int ext4_dx_csum_verify(struct inode *inode,
count = le16_to_cpu(c->count);
if (count_offset + (limit * sizeof(struct dx_entry)) >
EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
- EXT4_ERROR_INODE(inode, "metadata_csum set but no space for "
- "tree checksum found. Run e2fsck -D.");
+ warn_no_space_for_csum(inode);
return 1;
}
t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
@@ -395,8 +406,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
count = le16_to_cpu(c->count);
if (count_offset + (limit * sizeof(struct dx_entry)) >
EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
- EXT4_ERROR_INODE(inode, "metadata_csum set but no space for "
- "tree checksum. Run e2fsck -D.");
+ warn_no_space_for_csum(inode);
return;
}
t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
@@ -594,8 +604,11 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
u32 hash;
frame->bh = NULL;
- if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
+ if (!(bh = ext4_bread(NULL, dir, 0, 0, err))) {
+ if (*err == 0)
+ *err = ERR_BAD_DX_DIR;
goto fail;
+ }
root = (struct dx_root *) bh->b_data;
if (root->info.hash_version != DX_HASH_TEA &&
root->info.hash_version != DX_HASH_HALF_MD4 &&
@@ -696,8 +709,11 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
frame->entries = entries;
frame->at = at;
if (!indirect--) return frame;
- if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
+ if (!(bh = ext4_bread(NULL, dir, dx_get_block(at), 0, err))) {
+ if (!(*err))
+ *err = ERR_BAD_DX_DIR;
goto fail2;
+ }
at = entries = ((struct dx_node *) bh->b_data)->entries;
if (!buffer_verified(bh) &&
@@ -706,7 +722,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
ext4_warning(dir->i_sb, "Node failed checksum");
brelse(bh);
*err = ERR_BAD_DX_DIR;
- goto fail;
+ goto fail2;
}
set_buffer_verified(bh);
@@ -807,8 +823,15 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
*/
while (num_frames--) {
if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
- 0, &err)))
+ 0, &err))) {
+ if (!err) {
+ ext4_error(dir->i_sb,
+ "Directory hole detected on inode %lu\n",
+ dir->i_ino);
+ return -EIO;
+ }
return err; /* Failure */
+ }
if (!buffer_verified(bh) &&
!ext4_dx_csum_verify(dir,
@@ -839,12 +862,19 @@ static int htree_dirblock_to_tree(struct file *dir_file,
{
struct buffer_head *bh;
struct ext4_dir_entry_2 *de, *top;
- int err, count = 0;
+ int err = 0, count = 0;
dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
(unsigned long)block));
- if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
+ if (!(bh = ext4_bread(NULL, dir, block, 0, &err))) {
+ if (!err) {
+ err = -EIO;
+ ext4_error(dir->i_sb,
+ "Directory hole detected on inode %lu\n",
+ dir->i_ino);
+ }
return err;
+ }
if (!buffer_verified(bh) &&
!ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
@@ -857,6 +887,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
EXT4_DIR_REC_LEN(0));
for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
+ bh->b_data, bh->b_size,
(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
+ ((char *)de - bh->b_data))) {
/* On error, skip the f_pos to the next block. */
@@ -974,6 +1005,15 @@ errout:
return (err);
}
+static inline int search_dirblock(struct buffer_head *bh,
+ struct inode *dir,
+ const struct qstr *d_name,
+ unsigned int offset,
+ struct ext4_dir_entry_2 **res_dir)
+{
+ return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir,
+ d_name, offset, res_dir);
+}
/*
* Directory block splitting, compacting
@@ -1048,13 +1088,6 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
dx_set_count(entries, count + 1);
}
-static void ext4_update_dx_flag(struct inode *inode)
-{
- if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_COMPAT_DIR_INDEX))
- ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
-}
-
/*
* NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure.
*
@@ -1074,11 +1107,13 @@ static inline int ext4_match (int len, const char * const name,
/*
* Returns 0 if not found, -1 on failure, and 1 on success
*/
-static inline int search_dirblock(struct buffer_head *bh,
- struct inode *dir,
- const struct qstr *d_name,
- unsigned int offset,
- struct ext4_dir_entry_2 ** res_dir)
+int search_dir(struct buffer_head *bh,
+ char *search_buf,
+ int buf_size,
+ struct inode *dir,
+ const struct qstr *d_name,
+ unsigned int offset,
+ struct ext4_dir_entry_2 **res_dir)
{
struct ext4_dir_entry_2 * de;
char * dlimit;
@@ -1086,8 +1121,8 @@ static inline int search_dirblock(struct buffer_head *bh,
const char *name = d_name->name;
int namelen = d_name->len;
- de = (struct ext4_dir_entry_2 *) bh->b_data;
- dlimit = bh->b_data + dir->i_sb->s_blocksize;
+ de = (struct ext4_dir_entry_2 *)search_buf;
+ dlimit = search_buf + buf_size;
while ((char *) de < dlimit) {
/* this code is executed quadratically often */
/* do minimal checking `by hand' */
@@ -1095,7 +1130,8 @@ static inline int search_dirblock(struct buffer_head *bh,
if ((char *) de + namelen <= dlimit &&
ext4_match (namelen, name, de)) {
/* found a match - just to be sure, do a full check */
- if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
+ if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
+ bh->b_size, offset))
return -1;
*res_dir = de;
return 1;
@@ -1111,6 +1147,21 @@ static inline int search_dirblock(struct buffer_head *bh,
return 0;
}
+static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
+ struct ext4_dir_entry *de)
+{
+ struct super_block *sb = dir->i_sb;
+
+ if (!is_dx(dir))
+ return 0;
+ if (block == 0)
+ return 1;
+ if (de->inode == 0 &&
+ ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) ==
+ sb->s_blocksize)
+ return 1;
+ return 0;
+}
/*
* ext4_find_entry()
@@ -1125,7 +1176,8 @@ static inline int search_dirblock(struct buffer_head *bh,
*/
static struct buffer_head * ext4_find_entry (struct inode *dir,
const struct qstr *d_name,
- struct ext4_dir_entry_2 ** res_dir)
+ struct ext4_dir_entry_2 **res_dir,
+ int *inlined)
{
struct super_block *sb;
struct buffer_head *bh_use[NAMEI_RA_SIZE];
@@ -1146,6 +1198,18 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
namelen = d_name->len;
if (namelen > EXT4_NAME_LEN)
return NULL;
+
+ if (ext4_has_inline_data(dir)) {
+ int has_inline_data = 1;
+ ret = ext4_find_inline_entry(dir, d_name, res_dir,
+ &has_inline_data);
+ if (has_inline_data) {
+ if (inlined)
+ *inlined = 1;
+ return ret;
+ }
+ }
+
if ((namelen <= 2) && (name[0] == '.') &&
(name[1] == '.' || name[1] == '\0')) {
/*
@@ -1211,6 +1275,8 @@ restart:
goto next;
}
if (!buffer_verified(bh) &&
+ !is_dx_internal_node(dir, block,
+ (struct ext4_dir_entry *)bh->b_data) &&
!ext4_dirent_csum_verify(dir,
(struct ext4_dir_entry *)bh->b_data)) {
EXT4_ERROR_INODE(dir, "checksumming directory "
@@ -1267,8 +1333,15 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
return NULL;
do {
block = dx_get_block(frame->at);
- if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
+ if (!(bh = ext4_bread(NULL, dir, block, 0, err))) {
+ if (!(*err)) {
+ *err = -EIO;
+ ext4_error(dir->i_sb,
+ "Directory hole detected on inode %lu\n",
+ dir->i_ino);
+ }
goto errout;
+ }
if (!buffer_verified(bh) &&
!ext4_dirent_csum_verify(dir,
@@ -1312,7 +1385,7 @@ errout:
return NULL;
}
-static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
struct inode *inode;
struct ext4_dir_entry_2 *de;
@@ -1321,7 +1394,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
if (dentry->d_name.len > EXT4_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
- bh = ext4_find_entry(dir, &dentry->d_name, &de);
+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
inode = NULL;
if (bh) {
__u32 ino = le32_to_cpu(de->inode);
@@ -1355,7 +1428,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
struct ext4_dir_entry_2 * de;
struct buffer_head *bh;
- bh = ext4_find_entry(child->d_inode, &dotdot, &de);
+ bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL);
if (!bh)
return ERR_PTR(-ENOENT);
ino = le32_to_cpu(de->inode);
@@ -1553,6 +1626,63 @@ errout:
return NULL;
}
+int ext4_find_dest_de(struct inode *dir, struct inode *inode,
+ struct buffer_head *bh,
+ void *buf, int buf_size,
+ const char *name, int namelen,
+ struct ext4_dir_entry_2 **dest_de)
+{
+ struct ext4_dir_entry_2 *de;
+ unsigned short reclen = EXT4_DIR_REC_LEN(namelen);
+ int nlen, rlen;
+ unsigned int offset = 0;
+ char *top;
+
+ de = (struct ext4_dir_entry_2 *)buf;
+ top = buf + buf_size - reclen;
+ while ((char *) de <= top) {
+ if (ext4_check_dir_entry(dir, NULL, de, bh,
+ buf, buf_size, offset))
+ return -EIO;
+ if (ext4_match(namelen, name, de))
+ return -EEXIST;
+ nlen = EXT4_DIR_REC_LEN(de->name_len);
+ rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
+ if ((de->inode ? rlen - nlen : rlen) >= reclen)
+ break;
+ de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
+ offset += rlen;
+ }
+ if ((char *) de > top)
+ return -ENOSPC;
+
+ *dest_de = de;
+ return 0;
+}
+
+void ext4_insert_dentry(struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int buf_size,
+ const char *name, int namelen)
+{
+
+ int nlen, rlen;
+
+ nlen = EXT4_DIR_REC_LEN(de->name_len);
+ rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
+ if (de->inode) {
+ struct ext4_dir_entry_2 *de1 =
+ (struct ext4_dir_entry_2 *)((char *)de + nlen);
+ de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size);
+ de->rec_len = ext4_rec_len_to_disk(nlen, buf_size);
+ de = de1;
+ }
+ de->file_type = EXT4_FT_UNKNOWN;
+ de->inode = cpu_to_le32(inode->i_ino);
+ ext4_set_de_type(inode->i_sb, de, inode->i_mode);
+ de->name_len = namelen;
+ memcpy(de->name, name, namelen);
+}
/*
* Add a new entry into a directory (leaf) block. If de is non-NULL,
* it points to a directory entry which is guaranteed to be large
@@ -1568,12 +1698,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
struct inode *dir = dentry->d_parent->d_inode;
const char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
- unsigned int offset = 0;
unsigned int blocksize = dir->i_sb->s_blocksize;
unsigned short reclen;
- int nlen, rlen, err;
- char *top;
int csum_size = 0;
+ int err;
if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
@@ -1581,22 +1709,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
reclen = EXT4_DIR_REC_LEN(namelen);
if (!de) {
- de = (struct ext4_dir_entry_2 *)bh->b_data;
- top = bh->b_data + (blocksize - csum_size) - reclen;
- while ((char *) de <= top) {
- if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
- return -EIO;
- if (ext4_match(namelen, name, de))
- return -EEXIST;
- nlen = EXT4_DIR_REC_LEN(de->name_len);
- rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
- if ((de->inode? rlen - nlen: rlen) >= reclen)
- break;
- de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
- offset += rlen;
- }
- if ((char *) de > top)
- return -ENOSPC;
+ err = ext4_find_dest_de(dir, inode,
+ bh, bh->b_data, blocksize - csum_size,
+ name, namelen, &de);
+ if (err)
+ return err;
}
BUFFER_TRACE(bh, "get_write_access");
err = ext4_journal_get_write_access(handle, bh);
@@ -1606,19 +1723,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
}
/* By now the buffer is marked for journaling */
- nlen = EXT4_DIR_REC_LEN(de->name_len);
- rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
- if (de->inode) {
- struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
- de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize);
- de->rec_len = ext4_rec_len_to_disk(nlen, blocksize);
- de = de1;
- }
- de->file_type = EXT4_FT_UNKNOWN;
- de->inode = cpu_to_le32(inode->i_ino);
- ext4_set_de_type(dir->i_sb, de, inode->i_mode);
- de->name_len = namelen;
- memcpy(de->name, name, namelen);
+ ext4_insert_dentry(inode, de, blocksize, name, namelen);
+
/*
* XXX shouldn't update any times until successful
* completion of syscall, but too many callers depend
@@ -1791,6 +1897,17 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
blocksize = sb->s_blocksize;
if (!dentry->d_name.len)
return -EINVAL;
+
+ if (ext4_has_inline_data(dir)) {
+ retval = ext4_try_add_inline_entry(handle, dentry, inode);
+ if (retval < 0)
+ return retval;
+ if (retval == 1) {
+ retval = 0;
+ return retval;
+ }
+ }
+
if (is_dx(dir)) {
retval = ext4_dx_add_entry(handle, dentry, inode);
if (!retval || (retval != ERR_BAD_DX_DIR))
@@ -1801,9 +1918,15 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
}
blocks = dir->i_size >> sb->s_blocksize_bits;
for (block = 0; block < blocks; block++) {
- bh = ext4_bread(handle, dir, block, 0, &retval);
- if(!bh)
+ if (!(bh = ext4_bread(handle, dir, block, 0, &retval))) {
+ if (!retval) {
+ retval = -EIO;
+ ext4_error(inode->i_sb,
+ "Directory hole detected on inode %lu\n",
+ inode->i_ino);
+ }
return retval;
+ }
if (!buffer_verified(bh) &&
!ext4_dirent_csum_verify(dir,
(struct ext4_dir_entry *)bh->b_data))
@@ -1860,8 +1983,15 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
entries = frame->entries;
at = frame->at;
- if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
+ if (!(bh = ext4_bread(handle, dir, dx_get_block(frame->at), 0, &err))) {
+ if (!err) {
+ err = -EIO;
+ ext4_error(dir->i_sb,
+ "Directory hole detected on inode %lu\n",
+ dir->i_ino);
+ }
goto cleanup;
+ }
if (!buffer_verified(bh) &&
!ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
@@ -1983,36 +2113,29 @@ cleanup:
}
/*
- * ext4_delete_entry deletes a directory entry by merging it with the
- * previous entry
+ * ext4_generic_delete_entry deletes a directory entry by merging it
+ * with the previous entry
*/
-static int ext4_delete_entry(handle_t *handle,
- struct inode *dir,
- struct ext4_dir_entry_2 *de_del,
- struct buffer_head *bh)
+int ext4_generic_delete_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh,
+ void *entry_buf,
+ int buf_size,
+ int csum_size)
{
struct ext4_dir_entry_2 *de, *pde;
unsigned int blocksize = dir->i_sb->s_blocksize;
- int csum_size = 0;
- int i, err;
-
- if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
- csum_size = sizeof(struct ext4_dir_entry_tail);
+ int i;
i = 0;
pde = NULL;
- de = (struct ext4_dir_entry_2 *) bh->b_data;
- while (i < bh->b_size - csum_size) {
- if (ext4_check_dir_entry(dir, NULL, de, bh, i))
+ de = (struct ext4_dir_entry_2 *)entry_buf;
+ while (i < buf_size - csum_size) {
+ if (ext4_check_dir_entry(dir, NULL, de, bh,
+ bh->b_data, bh->b_size, i))
return -EIO;
if (de == de_del) {
- BUFFER_TRACE(bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh);
- if (unlikely(err)) {
- ext4_std_error(dir->i_sb, err);
- return err;
- }
if (pde)
pde->rec_len = ext4_rec_len_to_disk(
ext4_rec_len_from_disk(pde->rec_len,
@@ -2023,12 +2146,6 @@ static int ext4_delete_entry(handle_t *handle,
else
de->inode = 0;
dir->i_version++;
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_dirent_node(handle, dir, bh);
- if (unlikely(err)) {
- ext4_std_error(dir->i_sb, err);
- return err;
- }
return 0;
}
i += ext4_rec_len_from_disk(de->rec_len, blocksize);
@@ -2038,6 +2155,48 @@ static int ext4_delete_entry(handle_t *handle,
return -ENOENT;
}
+static int ext4_delete_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh)
+{
+ int err, csum_size = 0;
+
+ if (ext4_has_inline_data(dir)) {
+ int has_inline_data = 1;
+ err = ext4_delete_inline_entry(handle, dir, de_del, bh,
+ &has_inline_data);
+ if (has_inline_data)
+ return err;
+ }
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
+
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, bh);
+ if (unlikely(err))
+ goto out;
+
+ err = ext4_generic_delete_entry(handle, dir, de_del,
+ bh, bh->b_data,
+ dir->i_sb->s_blocksize, csum_size);
+ if (err)
+ goto out;
+
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_dirent_node(handle, dir, bh);
+ if (unlikely(err))
+ goto out;
+
+ return 0;
+out:
+ if (err != -ENOENT)
+ ext4_std_error(dir->i_sb, err);
+ return err;
+}
+
/*
* DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2,
* since this indicates that nlinks count was previously 1.
@@ -2072,8 +2231,8 @@ static int ext4_add_nondir(handle_t *handle,
int err = ext4_add_entry(handle, dentry, inode);
if (!err) {
ext4_mark_inode_dirty(handle, inode);
- d_instantiate(dentry, inode);
unlock_new_inode(inode);
+ d_instantiate(dentry, inode);
return 0;
}
drop_nlink(inode);
@@ -2091,7 +2250,7 @@ static int ext4_add_nondir(handle_t *handle,
* with d_instantiate().
*/
static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ bool excl)
{
handle_t *handle;
struct inode *inode;
@@ -2149,9 +2308,7 @@ retry:
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
-#ifdef CONFIG_EXT4_FS_XATTR
inode->i_op = &ext4_special_inode_operations;
-#endif
err = ext4_add_nondir(handle, dentry, inode);
}
ext4_journal_stop(handle);
@@ -2160,21 +2317,94 @@ retry:
return err;
}
-static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int blocksize, int csum_size,
+ unsigned int parent_ino, int dotdot_real_len)
+{
+ de->inode = cpu_to_le32(inode->i_ino);
+ de->name_len = 1;
+ de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
+ blocksize);
+ strcpy(de->name, ".");
+ ext4_set_de_type(inode->i_sb, de, S_IFDIR);
+
+ de = ext4_next_entry(de, blocksize);
+ de->inode = cpu_to_le32(parent_ino);
+ de->name_len = 2;
+ if (!dotdot_real_len)
+ de->rec_len = ext4_rec_len_to_disk(blocksize -
+ (csum_size + EXT4_DIR_REC_LEN(1)),
+ blocksize);
+ else
+ de->rec_len = ext4_rec_len_to_disk(
+ EXT4_DIR_REC_LEN(de->name_len), blocksize);
+ strcpy(de->name, "..");
+ ext4_set_de_type(inode->i_sb, de, S_IFDIR);
+
+ return ext4_next_entry(de, blocksize);
+}
+
+static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
+ struct inode *inode)
{
- handle_t *handle;
- struct inode *inode;
struct buffer_head *dir_block = NULL;
struct ext4_dir_entry_2 *de;
struct ext4_dir_entry_tail *t;
unsigned int blocksize = dir->i_sb->s_blocksize;
int csum_size = 0;
- int err, retries = 0;
+ int err;
if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
csum_size = sizeof(struct ext4_dir_entry_tail);
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+ err = ext4_try_create_inline_dir(handle, dir, inode);
+ if (err < 0 && err != -ENOSPC)
+ goto out;
+ if (!err)
+ goto out;
+ }
+
+ inode->i_size = EXT4_I(inode)->i_disksize = blocksize;
+ if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) {
+ if (!err) {
+ err = -EIO;
+ ext4_error(inode->i_sb,
+ "Directory hole detected on inode %lu\n",
+ inode->i_ino);
+ }
+ goto out;
+ }
+ BUFFER_TRACE(dir_block, "get_write_access");
+ err = ext4_journal_get_write_access(handle, dir_block);
+ if (err)
+ goto out;
+ de = (struct ext4_dir_entry_2 *)dir_block->b_data;
+ ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0);
+ set_nlink(inode, 2);
+ if (csum_size) {
+ t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
+ initialize_dirent_tail(t, blocksize);
+ }
+
+ BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
+ if (err)
+ goto out;
+ set_buffer_verified(dir_block);
+out:
+ brelse(dir_block);
+ return err;
+}
+
+static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ handle_t *handle;
+ struct inode *inode;
+ int err, retries = 0;
+
if (EXT4_DIR_LINK_MAX(dir))
return -EMLINK;
@@ -2198,41 +2428,9 @@ retry:
inode->i_op = &ext4_dir_inode_operations;
inode->i_fop = &ext4_dir_operations;
- inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
- dir_block = ext4_bread(handle, inode, 0, 1, &err);
- if (!dir_block)
- goto out_clear_inode;
- BUFFER_TRACE(dir_block, "get_write_access");
- err = ext4_journal_get_write_access(handle, dir_block);
- if (err)
- goto out_clear_inode;
- de = (struct ext4_dir_entry_2 *) dir_block->b_data;
- de->inode = cpu_to_le32(inode->i_ino);
- de->name_len = 1;
- de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
- blocksize);
- strcpy(de->name, ".");
- ext4_set_de_type(dir->i_sb, de, S_IFDIR);
- de = ext4_next_entry(de, blocksize);
- de->inode = cpu_to_le32(dir->i_ino);
- de->rec_len = ext4_rec_len_to_disk(blocksize -
- (csum_size + EXT4_DIR_REC_LEN(1)),
- blocksize);
- de->name_len = 2;
- strcpy(de->name, "..");
- ext4_set_de_type(dir->i_sb, de, S_IFDIR);
- set_nlink(inode, 2);
-
- if (csum_size) {
- t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
- initialize_dirent_tail(t, blocksize);
- }
-
- BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
+ err = ext4_init_new_dir(handle, dir, inode);
if (err)
goto out_clear_inode;
- set_buffer_verified(dir_block);
err = ext4_mark_inode_dirty(handle, inode);
if (!err)
err = ext4_add_entry(handle, dentry, inode);
@@ -2249,10 +2447,9 @@ out_clear_inode:
err = ext4_mark_inode_dirty(handle, dir);
if (err)
goto out_clear_inode;
- d_instantiate(dentry, inode);
unlock_new_inode(inode);
+ d_instantiate(dentry, inode);
out_stop:
- brelse(dir_block);
ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
@@ -2270,6 +2467,14 @@ static int empty_dir(struct inode *inode)
struct super_block *sb;
int err = 0;
+ if (ext4_has_inline_data(inode)) {
+ int has_inline_data = 1;
+
+ err = empty_inline_dir(inode, &has_inline_data);
+ if (has_inline_data)
+ return err;
+ }
+
sb = inode->i_sb;
if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
!(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
@@ -2318,6 +2523,11 @@ static int empty_dir(struct inode *inode)
EXT4_ERROR_INODE(inode,
"error %d reading directory "
"lblock %u", err, lblock);
+ else
+ ext4_warning(inode->i_sb,
+ "bad directory (dir #%lu) - no data block",
+ inode->i_ino);
+
offset += sb->s_blocksize;
continue;
}
@@ -2331,7 +2541,8 @@ static int empty_dir(struct inode *inode)
set_buffer_verified(bh);
de = (struct ext4_dir_entry_2 *) bh->b_data;
}
- if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) {
+ if (ext4_check_dir_entry(inode, NULL, de, bh,
+ bh->b_data, bh->b_size, offset)) {
de = (struct ext4_dir_entry_2 *)(bh->b_data +
sb->s_blocksize);
offset = (offset | (sb->s_blocksize - 1)) + 1;
@@ -2362,7 +2573,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
struct ext4_iloc iloc;
int err = 0, rc;
- if (!ext4_handle_valid(handle))
+ if (!EXT4_SB(sb)->s_journal)
return 0;
mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
@@ -2397,7 +2608,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
/* Insert this inode at the head of the on-disk orphan list... */
NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
- err = ext4_handle_dirty_super_now(handle, sb);
+ err = ext4_handle_dirty_super(handle, sb);
rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
if (!err)
err = rc;
@@ -2436,8 +2647,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
struct ext4_iloc iloc;
int err = 0;
- /* ext4_handle_valid() assumes a valid handle_t pointer */
- if (handle && !ext4_handle_valid(handle))
+ if ((!EXT4_SB(inode->i_sb)->s_journal) &&
+ !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS))
return 0;
mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
@@ -2456,7 +2667,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
* transaction handle with which to update the orphan list on
* disk, but we still need to remove the inode from the linked
* list in memory. */
- if (sbi->s_journal && !handle)
+ if (!handle)
goto out;
err = ext4_reserve_inode_write(handle, inode, &iloc);
@@ -2470,7 +2681,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
if (err)
goto out_brelse;
sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
- err = ext4_handle_dirty_super_now(handle, inode->i_sb);
+ err = ext4_handle_dirty_super(handle, inode->i_sb);
} else {
struct ext4_iloc iloc2;
struct inode *i_prev =
@@ -2518,7 +2729,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
return PTR_ERR(handle);
retval = -ENOENT;
- bh = ext4_find_entry(dir, &dentry->d_name, &de);
+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
if (!bh)
goto end_rmdir;
@@ -2583,7 +2794,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
ext4_handle_sync(handle);
retval = -ENOENT;
- bh = ext4_find_entry(dir, &dentry->d_name, &de);
+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
if (!bh)
goto end_unlink;
@@ -2765,8 +2976,39 @@ retry:
return err;
}
-#define PARENT_INO(buffer, size) \
- (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode)
+
+/*
+ * Find the buffer head that contains the parent ("..") entry of the
+ * directory @inode.  It is the inode block if the directory is inlined,
+ * or the first data block if it is a normal directory.
+ *
+ * Normal directory: block 0 is read with ext4_bread() and *@parent_de is
+ * set to the entry following the first one in that block.  If the read
+ * yields no buffer and no error code, *@retval becomes -EIO and the hole
+ * is reported through ext4_error().
+ *
+ * Inline directory: *@inlined is set to 1 and the result of
+ * ext4_get_first_inline_block() is returned unchanged.  *@inlined is not
+ * written in the normal-directory case, so callers must pre-initialise it.
+ */
+static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
+					struct inode *inode,
+					int *retval,
+					struct ext4_dir_entry_2 **parent_de,
+					int *inlined)
+{
+	struct buffer_head *bh;
+
+	if (ext4_has_inline_data(inode)) {
+		*inlined = 1;
+		return ext4_get_first_inline_block(inode, parent_de, retval);
+	}
+
+	bh = ext4_bread(handle, inode, 0, 0, retval);
+	if (bh) {
+		*parent_de = ext4_next_entry(
+				(struct ext4_dir_entry_2 *)bh->b_data,
+				inode->i_sb->s_blocksize);
+		return bh;
+	}
+
+	/* No buffer and no error code: block 0 of the directory is a hole. */
+	if (*retval == 0) {
+		*retval = -EIO;
+		ext4_error(inode->i_sb,
+			   "Directory hole detected on inode %lu\n",
+			   inode->i_ino);
+	}
+	return NULL;
+}
/*
* Anybody can rename anything with this: the permission checks are left to the
@@ -2780,6 +3022,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
struct buffer_head *old_bh, *new_bh, *dir_bh;
struct ext4_dir_entry_2 *old_de, *new_de;
int retval, force_da_alloc = 0;
+ int inlined = 0, new_inlined = 0;
+ struct ext4_dir_entry_2 *parent_de;
dquot_initialize(old_dir);
dquot_initialize(new_dir);
@@ -2799,7 +3043,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
ext4_handle_sync(handle);
- old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
+ old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL);
/*
* Check for inode number is _not_ due to possible IO errors.
* We might rmdir the source, keep it as pwd of some process
@@ -2812,7 +3056,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
goto end_rename;
new_inode = new_dentry->d_inode;
- new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
+ new_bh = ext4_find_entry(new_dir, &new_dentry->d_name,
+ &new_de, &new_inlined);
if (new_bh) {
if (!new_inode) {
brelse(new_bh);
@@ -2826,16 +3071,17 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
goto end_rename;
}
retval = -EIO;
- dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
+ dir_bh = ext4_get_first_dir_block(handle, old_inode,
+ &retval, &parent_de,
+ &inlined);
if (!dir_bh)
goto end_rename;
- if (!buffer_verified(dir_bh) &&
+ if (!inlined && !buffer_verified(dir_bh) &&
!ext4_dirent_csum_verify(old_inode,
(struct ext4_dir_entry *)dir_bh->b_data))
goto end_rename;
set_buffer_verified(dir_bh);
- if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
- old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
+ if (le32_to_cpu(parent_de->inode) != old_dir->i_ino)
goto end_rename;
retval = -EMLINK;
if (!new_inode && new_dir != old_dir &&
@@ -2864,10 +3110,13 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
ext4_current_time(new_dir);
ext4_mark_inode_dirty(handle, new_dir);
BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
- retval = ext4_handle_dirty_dirent_node(handle, new_dir, new_bh);
- if (unlikely(retval)) {
- ext4_std_error(new_dir->i_sb, retval);
- goto end_rename;
+ if (!new_inlined) {
+ retval = ext4_handle_dirty_dirent_node(handle,
+ new_dir, new_bh);
+ if (unlikely(retval)) {
+ ext4_std_error(new_dir->i_sb, retval);
+ goto end_rename;
+ }
}
brelse(new_bh);
new_bh = NULL;
@@ -2895,7 +3144,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
struct buffer_head *old_bh2;
struct ext4_dir_entry_2 *old_de2;
- old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
+ old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name,
+ &old_de2, NULL);
if (old_bh2) {
retval = ext4_delete_entry(handle, old_dir,
old_de2, old_bh2);
@@ -2915,11 +3165,20 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
ext4_update_dx_flag(old_dir);
if (dir_bh) {
- PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
- cpu_to_le32(new_dir->i_ino);
+ parent_de->inode = cpu_to_le32(new_dir->i_ino);
BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
- retval = ext4_handle_dirty_dirent_node(handle, old_inode,
- dir_bh);
+ if (!inlined) {
+ if (is_dx(old_inode)) {
+ retval = ext4_handle_dirty_dx_node(handle,
+ old_inode,
+ dir_bh);
+ } else {
+ retval = ext4_handle_dirty_dirent_node(handle,
+ old_inode, dir_bh);
+ }
+ } else {
+ retval = ext4_mark_inode_dirty(handle, old_inode);
+ }
if (retval) {
ext4_std_error(old_dir->i_sb, retval);
goto end_rename;
@@ -2969,23 +3228,19 @@ const struct inode_operations ext4_dir_inode_operations = {
.mknod = ext4_mknod,
.rename = ext4_rename,
.setattr = ext4_setattr,
-#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
-#endif
.get_acl = ext4_get_acl,
.fiemap = ext4_fiemap,
};
const struct inode_operations ext4_special_inode_operations = {
.setattr = ext4_setattr,
-#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
-#endif
.get_acl = ext4_get_acl,
};
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index dcdeef169a6..0016fbca2a4 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -27,7 +27,6 @@
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
-#include "ext4_extents.h"
static struct kmem_cache *io_page_cachep, *io_end_cachep;
@@ -71,6 +70,9 @@ void ext4_free_io_end(ext4_io_end_t *io)
int i;
BUG_ON(!io);
+ BUG_ON(!list_empty(&io->list));
+ BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
+
if (io->page)
put_page(io->page);
for (i = 0; i < io->num_io_pages; i++)
@@ -81,13 +83,8 @@ void ext4_free_io_end(ext4_io_end_t *io)
kmem_cache_free(io_end_cachep, io);
}
-/*
- * check a range of space and convert unwritten extents to written.
- *
- * Called with inode->i_mutex; we depend on this when we manipulate
- * io->flag, since we could otherwise race with ext4_flush_completed_IO()
- */
-int ext4_end_io_nolock(ext4_io_end_t *io)
+/* check a range of space and convert unwritten extents to written. */
+static int ext4_end_io(ext4_io_end_t *io)
{
struct inode *inode = io->inode;
loff_t offset = io->offset;
@@ -106,63 +103,136 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
"(inode %lu, offset %llu, size %zd, error %d)",
inode->i_ino, offset, size, ret);
}
-
if (io->iocb)
aio_complete(io->iocb, io->result, 0);
if (io->flag & EXT4_IO_END_DIRECT)
inode_dio_done(inode);
/* Wake up anyone waiting on unwritten extent conversion */
- if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten))
- wake_up_all(ext4_ioend_wq(io->inode));
+ if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
+ wake_up_all(ext4_ioend_wq(inode));
return ret;
}
-/*
- * work on completed aio dio IO, to convert unwritten extents to extents
- */
-static void ext4_end_io_work(struct work_struct *work)
+/*
+ * Debug helper: print every io_end queued on @inode's i_completed_io_list
+ * together with its list neighbours.  The body is compiled only when
+ * EXT4FS_DEBUG is defined; otherwise the function is empty.
+ *
+ * The list is walked without taking any lock here; the caller in
+ * ext4_do_flush_completed_IO() invokes it with i_completed_io_lock held.
+ */
+static void dump_completed_IO(struct inode *inode)
+{
+#ifdef EXT4FS_DEBUG
+	struct list_head *head = &EXT4_I(inode)->i_completed_io_list;
+	ext4_io_end_t *io, *io0, *io1;
+
+	if (list_empty(head)) {
+		ext4_debug("inode %lu completed_io list is empty\n",
+			   inode->i_ino);
+		return;
+	}
+
+	ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino);
+	list_for_each_entry(io, head, list) {
+		/*
+		 * Neighbours are derived from the raw list pointers, so for
+		 * the first/last element the value printed is computed from
+		 * the list head and does not name a real io_end.
+		 */
+		io0 = container_of(io->list.prev, ext4_io_end_t, list);
+		io1 = container_of(io->list.next, ext4_io_end_t, list);
+
+		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+			   io, inode->i_ino, io0, io1);
+	}
+#endif
+}
+
+/* Add the io_end to per-inode completed end_io list. */
+void ext4_add_complete_io(ext4_io_end_t *io_end)
{
- ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
- struct inode *inode = io->inode;
- struct ext4_inode_info *ei = EXT4_I(inode);
- unsigned long flags;
+ struct ext4_inode_info *ei = EXT4_I(io_end->inode);
+ struct workqueue_struct *wq;
+ unsigned long flags;
+
+ BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+ wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- if (io->flag & EXT4_IO_END_IN_FSYNC)
- goto requeue;
- if (list_empty(&io->list)) {
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- goto free;
+ if (list_empty(&ei->i_completed_io_list)) {
+ io_end->flag |= EXT4_IO_END_QUEUED;
+ queue_work(wq, &io_end->work);
}
+ list_add_tail(&io_end->list, &ei->i_completed_io_list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+}
- if (!mutex_trylock(&inode->i_mutex)) {
- bool was_queued;
-requeue:
- was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
- io->flag |= EXT4_IO_END_QUEUED;
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- /*
- * Requeue the work instead of waiting so that the work
- * items queued after this can be processed.
- */
- queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work);
- /*
- * To prevent the ext4-dio-unwritten thread from keeping
- * requeueing end_io requests and occupying cpu for too long,
- * yield the cpu if it sees an end_io request that has already
- * been requeued.
- */
- if (was_queued)
- yield();
- return;
+static int ext4_do_flush_completed_IO(struct inode *inode,
+ ext4_io_end_t *work_io)
+{
+ ext4_io_end_t *io;
+ struct list_head unwritten, complete, to_free;
+ unsigned long flags;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int err, ret = 0;
+
+ INIT_LIST_HEAD(&complete);
+ INIT_LIST_HEAD(&to_free);
+
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ dump_completed_IO(inode);
+ list_replace_init(&ei->i_completed_io_list, &unwritten);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+ while (!list_empty(&unwritten)) {
+ io = list_entry(unwritten.next, ext4_io_end_t, list);
+ BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
+ list_del_init(&io->list);
+
+ err = ext4_end_io(io);
+ if (unlikely(!ret && err))
+ ret = err;
+
+ list_add_tail(&io->list, &complete);
+ }
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ while (!list_empty(&complete)) {
+ io = list_entry(complete.next, ext4_io_end_t, list);
+ io->flag &= ~EXT4_IO_END_UNWRITTEN;
+ /* end_io context cannot be destroyed now because it is still
+ * used by the queued worker. The worker thread will destroy it later */
+ if (io->flag & EXT4_IO_END_QUEUED)
+ list_del_init(&io->list);
+ else
+ list_move(&io->list, &to_free);
+ }
+ /* If we are called from worker context, it is time to clear the queued
+ * flag, and destroy its end_io if it was converted already */
+ if (work_io) {
+ work_io->flag &= ~EXT4_IO_END_QUEUED;
+ if (!(work_io->flag & EXT4_IO_END_UNWRITTEN))
+ list_add_tail(&work_io->list, &to_free);
}
- list_del_init(&io->list);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- (void) ext4_end_io_nolock(io);
- mutex_unlock(&inode->i_mutex);
-free:
- ext4_free_io_end(io);
+
+ while (!list_empty(&to_free)) {
+ io = list_entry(to_free.next, ext4_io_end_t, list);
+ list_del_init(&io->list);
+ ext4_free_io_end(io);
+ }
+ return ret;
+}
+
+/*
+ * Workqueue handler for completed aio dio IO: convert unwritten extents
+ * to written ones.
+ *
+ * @work is embedded in an ext4_io_end_t; that io_end is handed to
+ * ext4_do_flush_completed_IO() as the worker's own io_end, together with
+ * the inode it belongs to.  The flush result is not propagated.
+ */
+static void ext4_end_io_work(struct work_struct *work)
+{
+	ext4_io_end_t *io_end;
+
+	io_end = container_of(work, ext4_io_end_t, work);
+	(void) ext4_do_flush_completed_IO(io_end->inode, io_end);
+}
+
+int ext4_flush_unwritten_io(struct inode *inode)
+{
+ int ret;
+ WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) &&
+ !(inode->i_state & I_FREEING));
+ ret = ext4_do_flush_completed_IO(inode, NULL);
+ ext4_unwritten_wait(inode);
+ return ret;
}
ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
@@ -195,9 +265,7 @@ static void buffer_io_error(struct buffer_head *bh)
static void ext4_end_bio(struct bio *bio, int error)
{
ext4_io_end_t *io_end = bio->bi_private;
- struct workqueue_struct *wq;
struct inode *inode;
- unsigned long flags;
int i;
sector_t bi_sector = bio->bi_sector;
@@ -255,14 +323,7 @@ static void ext4_end_bio(struct bio *bio, int error)
return;
}
- /* Add the io_end to per-inode completed io list*/
- spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
- spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-
- wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
- /* queue the work to convert unwritten extents to written */
- queue_work(wq, &io_end->work);
+ ext4_add_complete_io(io_end);
}
void ext4_io_submit(struct ext4_io_submit *io)
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 7ea6cbb4412..d99387b89ed 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -45,6 +45,28 @@ void ext4_resize_end(struct super_block *sb)
smp_mb__after_clear_bit();
}
+/*
+ * Round @group down to a multiple of the number of group descriptors per
+ * block, i.e. clear its low EXT4_DESC_PER_BLOCK_BITS(sb) bits.
+ */
+static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb,
+					     ext4_group_t group)
+{
+	ext4_group_t meta_bg_index = group >> EXT4_DESC_PER_BLOCK_BITS(sb);
+
+	return meta_bg_index << EXT4_DESC_PER_BLOCK_BITS(sb);
+}
+
+/*
+ * First block number of the group returned by ext4_meta_bg_first_group()
+ * for @group.
+ */
+static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb,
+						ext4_group_t group)
+{
+	ext4_group_t first_group = ext4_meta_bg_first_group(sb, group);
+
+	return ext4_group_first_block_no(sb, first_group);
+}
+
+/*
+ * Count the overhead blocks of @group: the descriptor blocks reported by
+ * ext4_bg_num_gdb(), plus, when ext4_bg_has_super() is true for the group,
+ * one more block and the superblock's s_reserved_gdt_blocks.
+ */
+static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb,
+						ext4_group_t group)
+{
+	ext4_grpblk_t overhead = ext4_bg_num_gdb(sb, group);
+
+	if (!ext4_bg_has_super(sb, group))
+		return overhead;
+
+	overhead += 1 + le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks);
+	return overhead;
+}
+
#define outside(b, first, last) ((b) < (first) || (b) >= (last))
#define inside(b, first, last) ((b) >= (first) && (b) < (last))
@@ -57,9 +79,7 @@ static int verify_group_input(struct super_block *sb,
ext4_fsblk_t end = start + input->blocks_count;
ext4_group_t group = input->group;
ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
- unsigned overhead = ext4_bg_has_super(sb, group) ?
- (1 + ext4_bg_num_gdb(sb, group) +
- le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
+ unsigned overhead = ext4_group_overhead_blocks(sb, group);
ext4_fsblk_t metaend = start + overhead;
struct buffer_head *bh = NULL;
ext4_grpblk_t free_blocks_count, offset;
@@ -200,13 +220,15 @@ static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd)
* be a partial of a flex group.
*
* @sb: super block of fs to which the groups belongs
+ *
+ * Returns 0 on a successful allocation of the metadata blocks in the
+ * block group.
*/
-static void ext4_alloc_group_tables(struct super_block *sb,
+static int ext4_alloc_group_tables(struct super_block *sb,
struct ext4_new_flex_group_data *flex_gd,
int flexbg_size)
{
struct ext4_new_group_data *group_data = flex_gd->groups;
- struct ext4_super_block *es = EXT4_SB(sb)->s_es;
ext4_fsblk_t start_blk;
ext4_fsblk_t last_blk;
ext4_group_t src_group;
@@ -226,23 +248,24 @@ static void ext4_alloc_group_tables(struct super_block *sb,
(last_group & ~(flexbg_size - 1))));
next_group:
group = group_data[0].group;
+ if (src_group >= group_data[0].group + flex_gd->count)
+ return -ENOSPC;
start_blk = ext4_group_first_block_no(sb, src_group);
last_blk = start_blk + group_data[src_group - group].blocks_count;
- overhead = ext4_bg_has_super(sb, src_group) ?
- (1 + ext4_bg_num_gdb(sb, src_group) +
- le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
+ overhead = ext4_group_overhead_blocks(sb, src_group);
start_blk += overhead;
- BUG_ON(src_group >= group_data[0].group + flex_gd->count);
/* We collect contiguous blocks as much as possible. */
src_group++;
- for (; src_group <= last_group; src_group++)
- if (!ext4_bg_has_super(sb, src_group))
+ for (; src_group <= last_group; src_group++) {
+ overhead = ext4_group_overhead_blocks(sb, src_group);
+ if (overhead != 0)
last_blk += group_data[src_group - group].blocks_count;
else
break;
+ }
/* Allocate block bitmaps */
for (; bb_index < flex_gd->count; bb_index++) {
@@ -300,6 +323,7 @@ next_group:
group_data[i].free_blocks_count);
}
}
+ return 0;
}
static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
@@ -433,11 +457,13 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
ext4_group_t group, count;
struct buffer_head *bh = NULL;
int reserved_gdb, i, j, err = 0, err2;
+ int meta_bg;
BUG_ON(!flex_gd->count || !group_data ||
group_data[0].group != sbi->s_groups_count);
reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
+ meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
/* This transaction may be extended/restarted along the way */
handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
@@ -447,12 +473,25 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
group = group_data[0].group;
for (i = 0; i < flex_gd->count; i++, group++) {
unsigned long gdblocks;
+ ext4_grpblk_t overhead;
gdblocks = ext4_bg_num_gdb(sb, group);
start = ext4_group_first_block_no(sb, group);
+ if (meta_bg == 0 && !ext4_bg_has_super(sb, group))
+ goto handle_itb;
+
+ if (meta_bg == 1) {
+ ext4_group_t first_group;
+ first_group = ext4_meta_bg_first_group(sb, group);
+ if (first_group != group + 1 &&
+ first_group != group + EXT4_DESC_PER_BLOCK(sb) - 1)
+ goto handle_itb;
+ }
+
+ block = start + ext4_bg_has_super(sb, group);
/* Copy all of the GDT blocks into the backup in this group */
- for (j = 0, block = start + 1; j < gdblocks; j++, block++) {
+ for (j = 0; j < gdblocks; j++, block++) {
struct buffer_head *gdb;
ext4_debug("update backup group %#04llx\n", block);
@@ -493,6 +532,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
goto out;
}
+handle_itb:
/* Initialize group tables of the grop @group */
if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED))
goto handle_bb;
@@ -521,11 +561,11 @@ handle_bb:
err = PTR_ERR(bh);
goto out;
}
- if (ext4_bg_has_super(sb, group)) {
+ overhead = ext4_group_overhead_blocks(sb, group);
+ if (overhead != 0) {
ext4_debug("mark backup superblock %#04llx (+0)\n",
start);
- ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb +
- 1);
+ ext4_set_bits(bh->b_data, 0, overhead);
}
ext4_mark_bitmap_end(group_data[i].blocks_count,
sb->s_blocksize * 8, bh->b_data);
@@ -743,7 +783,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
err = ext4_journal_get_write_access(handle, gdb_bh);
if (unlikely(err))
- goto exit_sbh;
+ goto exit_dind;
err = ext4_journal_get_write_access(handle, dind);
if (unlikely(err))
@@ -752,7 +792,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
/* ext4_reserve_inode_write() gets a reference on the iloc */
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (unlikely(err))
- goto exit_dindj;
+ goto exit_dind;
n_group_desc = ext4_kvmalloc((gdb_num + 1) *
sizeof(struct buffer_head *),
@@ -798,7 +838,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
ext4_kvfree(o_group_desc);
le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
- err = ext4_handle_dirty_super_now(handle, sb);
+ err = ext4_handle_dirty_super(handle, sb);
if (err)
ext4_std_error(sb, err);
@@ -806,12 +846,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
exit_inode:
ext4_kvfree(n_group_desc);
- /* ext4_handle_release_buffer(handle, iloc.bh); */
brelse(iloc.bh);
-exit_dindj:
- /* ext4_handle_release_buffer(handle, dind); */
-exit_sbh:
- /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */
exit_dind:
brelse(dind);
exit_bh:
@@ -822,6 +857,45 @@ exit_bh:
}
/*
+ * add_new_gdb_meta_bg is the sister of add_new_gdb.
+ *
+ * It reads the group descriptor block for @group, located at
+ * ext4_meta_bg_first_block_no(sb, group) + ext4_bg_has_super(sb, group),
+ * obtains journal write access to it through @handle, and then installs
+ * a grown copy of the in-memory s_group_desc array with the new buffer in
+ * slot group / EXT4_DESC_PER_BLOCK(sb), incrementing s_gdb_count.
+ *
+ * Write access is taken before the new array is published so that every
+ * failure path can simply drop the buffer and the freshly allocated array
+ * without leaving a released buffer reachable from s_group_desc.
+ *
+ * Returns 0 on success, -EIO if the descriptor block cannot be read,
+ * -ENOMEM if the new array cannot be allocated, or the error returned by
+ * ext4_journal_get_write_access().
+ */
+static int add_new_gdb_meta_bg(struct super_block *sb,
+			       handle_t *handle, ext4_group_t group)
+{
+	ext4_fsblk_t gdblock;
+	struct buffer_head *gdb_bh;
+	struct buffer_head **o_group_desc, **n_group_desc;
+	unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+	int err;
+
+	gdblock = ext4_meta_bg_first_block_no(sb, group) +
+		  ext4_bg_has_super(sb, group);
+	gdb_bh = sb_bread(sb, gdblock);
+	if (!gdb_bh)
+		return -EIO;
+
+	n_group_desc = ext4_kvmalloc((gdb_num + 1) *
+				     sizeof(struct buffer_head *),
+				     GFP_NOFS);
+	if (!n_group_desc) {
+		err = -ENOMEM;
+		ext4_warning(sb, "not enough memory for %lu groups",
+			     gdb_num + 1);
+		goto out_brelse;
+	}
+
+	/* Must succeed before the buffer becomes visible via s_group_desc. */
+	err = ext4_journal_get_write_access(handle, gdb_bh);
+	if (unlikely(err))
+		goto out_free;
+
+	o_group_desc = EXT4_SB(sb)->s_group_desc;
+	memcpy(n_group_desc, o_group_desc,
+	       EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
+	n_group_desc[gdb_num] = gdb_bh;
+	EXT4_SB(sb)->s_group_desc = n_group_desc;
+	EXT4_SB(sb)->s_gdb_count++;
+	ext4_kvfree(o_group_desc);
+	return 0;
+
+out_free:
+	ext4_kvfree(n_group_desc);
+out_brelse:
+	brelse(gdb_bh);
+	return err;
+}
+
+/*
* Called when we are adding a new group which has a backup copy of each of
* the GDT blocks (i.e. sparse group) and there are reserved GDT blocks.
* We need to add these reserved backup GDT blocks to the resize inode, so
@@ -890,14 +964,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
}
for (i = 0; i < reserved_gdb; i++) {
- if ((err = ext4_journal_get_write_access(handle, primary[i]))) {
- /*
- int j;
- for (j = 0; j < i; j++)
- ext4_handle_release_buffer(handle, primary[j]);
- */
+ if ((err = ext4_journal_get_write_access(handle, primary[i])))
goto exit_bh;
- }
}
if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
@@ -949,16 +1017,16 @@ exit_free:
* do not copy the full number of backups at this time. The resize
* which changed s_groups_count will backup again.
*/
-static void update_backups(struct super_block *sb,
- int blk_off, char *data, int size)
+static void update_backups(struct super_block *sb, int blk_off, char *data,
+ int size, int meta_bg)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- const ext4_group_t last = sbi->s_groups_count;
+ ext4_group_t last;
const int bpg = EXT4_BLOCKS_PER_GROUP(sb);
unsigned three = 1;
unsigned five = 5;
unsigned seven = 7;
- ext4_group_t group;
+ ext4_group_t group = 0;
int rest = sb->s_blocksize - size;
handle_t *handle;
int err = 0, err2;
@@ -970,10 +1038,17 @@ static void update_backups(struct super_block *sb,
goto exit_err;
}
- ext4_superblock_csum_set(sb, (struct ext4_super_block *)data);
+ if (meta_bg == 0) {
+ group = ext4_list_backups(sb, &three, &five, &seven);
+ last = sbi->s_groups_count;
+ } else {
+ group = ext4_meta_bg_first_group(sb, group) + 1;
+ last = (ext4_group_t)(group + EXT4_DESC_PER_BLOCK(sb) - 2);
+ }
- while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) {
+ while (group < sbi->s_groups_count) {
struct buffer_head *bh;
+ ext4_fsblk_t backup_block;
/* Out of journal space, and can't get more - abort - so sad */
if (ext4_handle_valid(handle) &&
@@ -982,13 +1057,20 @@ static void update_backups(struct super_block *sb,
(err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
break;
- bh = sb_getblk(sb, group * bpg + blk_off);
+ if (meta_bg == 0)
+ backup_block = group * bpg + blk_off;
+ else
+ backup_block = (ext4_group_first_block_no(sb, group) +
+ ext4_bg_has_super(sb, group));
+
+ bh = sb_getblk(sb, backup_block);
if (!bh) {
err = -EIO;
break;
}
- ext4_debug("update metadata backup %#04lx\n",
- (unsigned long)bh->b_blocknr);
+ ext4_debug("update metadata backup %llu(+%llu)\n",
+ backup_block, backup_block -
+ ext4_group_first_block_no(sb, group));
if ((err = ext4_journal_get_write_access(handle, bh)))
break;
lock_buffer(bh);
@@ -1001,6 +1083,13 @@ static void update_backups(struct super_block *sb,
if (unlikely(err))
ext4_std_error(sb, err);
brelse(bh);
+
+ if (meta_bg == 0)
+ group = ext4_list_backups(sb, &three, &five, &seven);
+ else if (group == last)
+ break;
+ else
+ group = last;
}
if ((err2 = ext4_journal_stop(handle)) && !err)
err = err2;
@@ -1043,7 +1132,9 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
struct ext4_super_block *es = sbi->s_es;
struct buffer_head *gdb_bh;
int i, gdb_off, gdb_num, err = 0;
+ int meta_bg;
+ meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
for (i = 0; i < count; i++, group++) {
int reserved_gdb = ext4_bg_has_super(sb, group) ?
le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
@@ -1063,8 +1154,11 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group))
err = reserve_backup_gdb(handle, resize_inode, group);
- } else
+ } else if (meta_bg != 0) {
+ err = add_new_gdb_meta_bg(sb, handle, group);
+ } else {
err = add_new_gdb(handle, resize_inode, group);
+ }
if (err)
break;
}
@@ -1076,17 +1170,12 @@ static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
struct buffer_head *bh = sb_getblk(sb, block);
if (!bh)
return NULL;
-
- if (bitmap_uptodate(bh))
- return bh;
-
- lock_buffer(bh);
- if (bh_submit_read(bh) < 0) {
- unlock_buffer(bh);
- brelse(bh);
- return NULL;
+ if (!bh_uptodate_or_lock(bh)) {
+ if (bh_submit_read(bh) < 0) {
+ brelse(bh);
+ return NULL;
+ }
}
- unlock_buffer(bh);
return bh;
}
@@ -1112,8 +1201,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb,
bh = ext4_get_bitmap(sb, group_data->block_bitmap);
if (!bh)
return -EIO;
- ext4_block_bitmap_csum_set(sb, group, gdp, bh,
- EXT4_BLOCKS_PER_GROUP(sb) / 8);
+ ext4_block_bitmap_csum_set(sb, group, gdp, bh);
brelse(bh);
return 0;
@@ -1161,6 +1249,9 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
ext4_free_group_clusters_set(sb, gdp,
EXT4_B2C(sbi, group_data->free_blocks_count));
ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
+ if (ext4_has_group_desc_csum(sb))
+ ext4_itable_unused_set(sb, gdp,
+ EXT4_INODES_PER_GROUP(sb));
gdp->bg_flags = cpu_to_le16(*bg_flags);
ext4_group_desc_csum_set(sb, group, gdp);
@@ -1216,7 +1307,7 @@ static void ext4_update_super(struct super_block *sb,
}
reserved_blocks = ext4_r_blocks_count(es) * 100;
- do_div(reserved_blocks, ext4_blocks_count(es));
+ reserved_blocks = div64_u64(reserved_blocks, ext4_blocks_count(es));
reserved_blocks *= blocks_count;
do_div(reserved_blocks, 100);
@@ -1227,6 +1318,7 @@ static void ext4_update_super(struct super_block *sb,
le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) *
flex_gd->count);
+ ext4_debug("free blocks count %llu", ext4_free_blocks_count(es));
/*
* We need to protect s_groups_count against other CPUs seeing
* inconsistent state in the superblock.
@@ -1261,6 +1353,8 @@ static void ext4_update_super(struct super_block *sb,
percpu_counter_add(&sbi->s_freeinodes_counter,
EXT4_INODES_PER_GROUP(sb) * flex_gd->count);
+ ext4_debug("free blocks count %llu",
+ percpu_counter_read(&sbi->s_freeclusters_counter));
if (EXT4_HAS_INCOMPAT_FEATURE(sb,
EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
sbi->s_log_groups_per_flex) {
@@ -1272,6 +1366,11 @@ static void ext4_update_super(struct super_block *sb,
&sbi->s_flex_groups[flex_group].free_inodes);
}
+ /*
+ * Update the fs overhead information
+ */
+ ext4_calculate_overhead(sb);
+
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: added group %u:"
"%llu blocks(%llu free %llu reserved)\n", flex_gd->count,
@@ -1344,16 +1443,24 @@ exit_journal:
err = err2;
if (!err) {
- int i;
+ int gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+ int gdb_num_end = ((group + flex_gd->count - 1) /
+ EXT4_DESC_PER_BLOCK(sb));
+ int meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_META_BG);
+ sector_t old_gdb = 0;
+
update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
- sizeof(struct ext4_super_block));
- for (i = 0; i < flex_gd->count; i++, group++) {
+ sizeof(struct ext4_super_block), 0);
+ for (; gdb_num <= gdb_num_end; gdb_num++) {
struct buffer_head *gdb_bh;
- int gdb_num;
- gdb_num = group / EXT4_BLOCKS_PER_GROUP(sb);
+
gdb_bh = sbi->s_group_desc[gdb_num];
+ if (old_gdb == gdb_bh->b_blocknr)
+ continue;
update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
- gdb_bh->b_size);
+ gdb_bh->b_size, meta_bg);
+ old_gdb = gdb_bh->b_blocknr;
}
}
exit:
@@ -1397,9 +1504,7 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
group_data[i].group = group + i;
group_data[i].blocks_count = blocks_per_group;
- overhead = ext4_bg_has_super(sb, group + i) ?
- (1 + ext4_bg_num_gdb(sb, group + i) +
- le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
+ overhead = ext4_group_overhead_blocks(sb, group + i);
group_data[i].free_blocks_count = blocks_per_group - overhead;
if (ext4_has_group_desc_csum(sb))
flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
@@ -1487,6 +1592,14 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
if (err)
goto out;
+ err = ext4_alloc_flex_bg_array(sb, input->group + 1);
+ if (err)
+ return err;
+
+ err = ext4_mb_alloc_groupinfo(sb, input->group + 1);
+ if (err)
+ goto out;
+
flex_gd.count = 1;
flex_gd.groups = input;
flex_gd.bg_flags = &bg_flags;
@@ -1539,11 +1652,13 @@ errout:
err = err2;
if (!err) {
+ ext4_fsblk_t first_block;
+ first_block = ext4_group_first_block_no(sb, 0);
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
"blocks\n", ext4_blocks_count(es));
- update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
- sizeof(struct ext4_super_block));
+ update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block,
+ (char *)es, sizeof(struct ext4_super_block), 0);
}
return err;
}
@@ -1626,6 +1741,94 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
return err;
} /* ext4_group_extend */
+
+/*
+ * Number of group descriptor blocks needed to hold the descriptors of
+ * @groups block groups (i.e. @groups divided by the descriptors per
+ * block, rounded up).
+ */
+static int num_desc_blocks(struct super_block *sb, ext4_group_t groups)
+{
+	return DIV_ROUND_UP(groups, EXT4_DESC_PER_BLOCK(sb));
+}
+
+/*
+ * Convert the file system to the meta_bg layout.
+ *
+ * @sb:    super block of the file system being resized
+ * @inode: the resize inode, or NULL if the file system has none
+ *
+ * Clears the resize_inode feature, sets the meta_bg feature and records
+ * the number of descriptor blocks currently in use in s_first_meta_bg.
+ * If a resize inode is supplied it must not reserve any more gdt blocks
+ * (s_reserved_gdt_blocks == 0) and must own exactly one block, its
+ * double indirect block; that block is released here.
+ *
+ * Returns 0 on success or a negative errno.  A failure while updating
+ * the superblock or the resize inode takes precedence over the result
+ * of stopping the journal handle.
+ */
+static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
+{
+	handle_t *handle;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	struct ext4_inode_info *ei = NULL;
+	ext4_fsblk_t nr;
+	int i, ret, err = 0;
+	int credits = 1;	/* the superblock */
+
+	ext4_msg(sb, KERN_INFO, "Converting file system to meta_bg");
+	if (inode) {
+		/* Only look at the ext4 inode once we know there is one */
+		ei = EXT4_I(inode);
+
+		if (es->s_reserved_gdt_blocks) {
+			ext4_error(sb, "Unexpected non-zero "
+				   "s_reserved_gdt_blocks");
+			return -EPERM;
+		}
+
+		/* Do a quick sanity check of the resize inode */
+		if (inode->i_blocks != 1 << (inode->i_blkbits - 9))
+			goto invalid_resize_inode;
+		for (i = 0; i < EXT4_N_BLOCKS; i++) {
+			/* The double indirect slot must be the only one set */
+			if (i == EXT4_DIND_BLOCK) {
+				if (ei->i_data[i])
+					continue;
+				else
+					goto invalid_resize_inode;
+			}
+			if (ei->i_data[i])
+				goto invalid_resize_inode;
+		}
+		credits += 3;	/* block bitmap, bg descriptor, resize inode */
+	}
+
+	handle = ext4_journal_start_sb(sb, credits);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+	if (err)
+		goto errout;
+
+	EXT4_CLEAR_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE);
+	EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+	sbi->s_es->s_first_meta_bg =
+		cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count));
+
+	err = ext4_handle_dirty_super(handle, sb);
+	if (err) {
+		ext4_std_error(sb, err);
+		goto errout;
+	}
+
+	if (inode) {
+		/* Give back the resize inode's only block */
+		nr = le32_to_cpu(ei->i_data[EXT4_DIND_BLOCK]);
+		ext4_free_blocks(handle, inode, NULL, nr, 1,
+				 EXT4_FREE_BLOCKS_METADATA |
+				 EXT4_FREE_BLOCKS_FORGET);
+		ei->i_data[EXT4_DIND_BLOCK] = 0;
+		inode->i_blocks = 0;
+
+		err = ext4_mark_inode_dirty(handle, inode);
+		if (err)
+			ext4_std_error(sb, err);
+	}
+
+errout:
+	ret = ext4_journal_stop(handle);
+	if (!err)
+		err = ret;
+	/*
+	 * Return the combined status: returning only the journal stop
+	 * result would report success for a conversion that failed.
+	 */
+	return err;
+
+invalid_resize_inode:
+	ext4_error(sb, "corrupted/inconsistent resize inode");
+	return -EINVAL;
+}
+
/*
* ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count
*
@@ -1638,21 +1841,31 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
struct buffer_head *bh;
- struct inode *resize_inode;
- ext4_fsblk_t o_blocks_count;
- ext4_group_t o_group;
- ext4_group_t n_group;
- ext4_grpblk_t offset, add;
+ struct inode *resize_inode = NULL;
+ ext4_grpblk_t add, offset;
unsigned long n_desc_blocks;
unsigned long o_desc_blocks;
- unsigned long desc_blocks;
- int err = 0, flexbg_size = 1;
+ ext4_group_t o_group;
+ ext4_group_t n_group;
+ ext4_fsblk_t o_blocks_count;
+ ext4_fsblk_t n_blocks_count_retry = 0;
+ unsigned long last_update_time = 0;
+ int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex;
+ int meta_bg;
+ /* See if the device is actually as big as what was requested */
+ bh = sb_bread(sb, n_blocks_count - 1);
+ if (!bh) {
+ ext4_warning(sb, "can't read last block, resize aborted");
+ return -ENOSPC;
+ }
+ brelse(bh);
+
+retry:
o_blocks_count = ext4_blocks_count(es);
- if (test_opt(sb, DEBUG))
- ext4_msg(sb, KERN_DEBUG, "resizing filesystem from %llu "
- "to %llu blocks", o_blocks_count, n_blocks_count);
+ ext4_msg(sb, KERN_INFO, "resizing filesystem from %llu "
+ "to %llu blocks", o_blocks_count, n_blocks_count);
if (n_blocks_count < o_blocks_count) {
/* On-line shrinking not supported */
@@ -1667,32 +1880,49 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
- n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) /
- EXT4_DESC_PER_BLOCK(sb);
- o_desc_blocks = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
- EXT4_DESC_PER_BLOCK(sb);
- desc_blocks = n_desc_blocks - o_desc_blocks;
+ n_desc_blocks = num_desc_blocks(sb, n_group + 1);
+ o_desc_blocks = num_desc_blocks(sb, sbi->s_groups_count);
- if (desc_blocks &&
- (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) ||
- le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks)) {
- ext4_warning(sb, "No reserved GDT blocks, can't resize");
- return -EPERM;
- }
+ meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+
+ if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)) {
+ if (meta_bg) {
+ ext4_error(sb, "resize_inode and meta_bg enabled "
+ "simultaneously");
+ return -EINVAL;
+ }
+ if (n_desc_blocks > o_desc_blocks +
+ le16_to_cpu(es->s_reserved_gdt_blocks)) {
+ n_blocks_count_retry = n_blocks_count;
+ n_desc_blocks = o_desc_blocks +
+ le16_to_cpu(es->s_reserved_gdt_blocks);
+ n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb);
+ n_blocks_count = n_group * EXT4_BLOCKS_PER_GROUP(sb);
+ n_group--; /* set to last group number */
+ }
- resize_inode = ext4_iget(sb, EXT4_RESIZE_INO);
- if (IS_ERR(resize_inode)) {
- ext4_warning(sb, "Error opening resize inode");
- return PTR_ERR(resize_inode);
+ if (!resize_inode)
+ resize_inode = ext4_iget(sb, EXT4_RESIZE_INO);
+ if (IS_ERR(resize_inode)) {
+ ext4_warning(sb, "Error opening resize inode");
+ return PTR_ERR(resize_inode);
+ }
}
- /* See if the device is actually as big as what was requested */
- bh = sb_bread(sb, n_blocks_count - 1);
- if (!bh) {
- ext4_warning(sb, "can't read last block, resize aborted");
- return -ENOSPC;
+ if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) {
+ err = ext4_convert_meta_bg(sb, resize_inode);
+ if (err)
+ goto out;
+ if (resize_inode) {
+ iput(resize_inode);
+ resize_inode = NULL;
+ }
+ if (n_blocks_count_retry) {
+ n_blocks_count = n_blocks_count_retry;
+ n_blocks_count_retry = 0;
+ goto retry;
+ }
}
- brelse(bh);
/* extend the last group */
if (n_group == o_group)
@@ -1705,12 +1935,15 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
goto out;
}
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
- es->s_log_groups_per_flex)
- flexbg_size = 1 << es->s_log_groups_per_flex;
+ if (ext4_blocks_count(es) == n_blocks_count)
+ goto out;
- o_blocks_count = ext4_blocks_count(es);
- if (o_blocks_count == n_blocks_count)
+ err = ext4_alloc_flex_bg_array(sb, n_group + 1);
+ if (err)
+ return err;
+
+ err = ext4_mb_alloc_groupinfo(sb, n_group + 1);
+ if (err)
goto out;
flex_gd = alloc_flex_gd(flexbg_size);
@@ -1724,19 +1957,33 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
*/
while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
flexbg_size)) {
- ext4_alloc_group_tables(sb, flex_gd, flexbg_size);
+ if (jiffies - last_update_time > HZ * 10) {
+ if (last_update_time)
+ ext4_msg(sb, KERN_INFO,
+ "resized to %llu blocks",
+ ext4_blocks_count(es));
+ last_update_time = jiffies;
+ }
+ if (ext4_alloc_group_tables(sb, flex_gd, flexbg_size) != 0)
+ break;
err = ext4_flex_group_add(sb, resize_inode, flex_gd);
if (unlikely(err))
break;
}
+ if (!err && n_blocks_count_retry) {
+ n_blocks_count = n_blocks_count_retry;
+ n_blocks_count_retry = 0;
+ free_flex_gd(flex_gd);
+ flex_gd = NULL;
+ goto retry;
+ }
+
out:
if (flex_gd)
free_flex_gd(flex_gd);
-
- iput(resize_inode);
- if (test_opt(sb, DEBUG))
- ext4_msg(sb, KERN_DEBUG, "resized filesystem from %llu "
- "upto %llu blocks", o_blocks_count, n_blocks_count);
+ if (resize_inode != NULL)
+ iput(resize_inode);
+ ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count);
return err;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index eb7aa3e4ef0..3d4fb81bacd 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,7 +45,7 @@
#include <linux/freezer.h>
#include "ext4.h"
-#include "ext4_extents.h"
+#include "ext4_extents.h" /* Needed for trace points definition */
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
@@ -74,7 +74,6 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
static int ext4_remount(struct super_block *sb, int *flags, char *data);
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
static int ext4_unfreeze(struct super_block *sb);
-static void ext4_write_super(struct super_block *sb);
static int ext4_freeze(struct super_block *sb);
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
const char *dev_name, void *data);
@@ -144,9 +143,10 @@ int ext4_superblock_csum_verify(struct super_block *sb,
return es->s_checksum == ext4_superblock_csum(sb, es);
}
-void ext4_superblock_csum_set(struct super_block *sb,
- struct ext4_super_block *es)
+void ext4_superblock_csum_set(struct super_block *sb)
{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
return;
@@ -327,38 +327,17 @@ static void ext4_put_nojournal(handle_t *handle)
/*
* Wrappers for jbd2_journal_start/end.
- *
- * The only special thing we need to do here is to make sure that all
- * journal_end calls result in the superblock being marked dirty, so
- * that sync() will call the filesystem's write_super callback if
- * appropriate.
- *
- * To avoid j_barrier hold in userspace when a user calls freeze(),
- * ext4 prevents a new handle from being started by s_frozen, which
- * is in an upper layer.
*/
handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
{
journal_t *journal;
- handle_t *handle;
trace_ext4_journal_start(sb, nblocks, _RET_IP_);
if (sb->s_flags & MS_RDONLY)
return ERR_PTR(-EROFS);
+ WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
journal = EXT4_SB(sb)->s_journal;
- handle = ext4_journal_current_handle();
-
- /*
- * If a handle has been started, it should be allowed to
- * finish, otherwise deadlock could happen between freeze
- * and others(e.g. truncate) due to the restart of the
- * journal handle if the filesystem is forzen and active
- * handles are not stopped.
- */
- if (!handle)
- vfs_check_frozen(sb, SB_FREEZE_TRANS);
-
if (!journal)
return ext4_get_nojournal();
/*
@@ -373,12 +352,6 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
return jbd2_journal_start(journal, nblocks);
}
-/*
- * The only special thing we need to do here is to make sure that all
- * jbd2_journal_stop calls result in the superblock being marked dirty, so
- * that sync() will call the filesystem's write_super callback if
- * appropriate.
- */
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
{
struct super_block *sb;
@@ -448,7 +421,7 @@ static void __save_error_info(struct super_block *sb, const char *func,
*/
if (!es->s_error_count)
mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
- es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1);
+ le32_add_cpu(&es->s_error_count, 1);
}
static void save_error_info(struct super_block *sb, const char *func,
@@ -878,7 +851,6 @@ static void ext4_put_super(struct super_block *sb)
flush_workqueue(sbi->dio_unwritten_wq);
destroy_workqueue(sbi->dio_unwritten_wq);
- lock_super(sb);
if (sbi->s_journal) {
err = jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
@@ -896,7 +868,7 @@ static void ext4_put_super(struct super_block *sb)
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
es->s_state = cpu_to_le16(sbi->s_mount_state);
}
- if (sb->s_dirt || !(sb->s_flags & MS_RDONLY))
+ if (!(sb->s_flags & MS_RDONLY))
ext4_commit_super(sb, 1);
if (sbi->s_proc) {
@@ -945,7 +917,6 @@ static void ext4_put_super(struct super_block *sb)
* Now that we are completely done shutting down the
* superblock, we need to actually destroy the kobject.
*/
- unlock_super(sb);
kobject_put(&sbi->s_kobj);
wait_for_completion(&sbi->s_kobj_unregister);
if (sbi->s_chksum_driver)
@@ -968,14 +939,16 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
return NULL;
ei->vfs_inode.i_version = 1;
- ei->vfs_inode.i_data.writeback_index = 0;
memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
INIT_LIST_HEAD(&ei->i_prealloc_list);
spin_lock_init(&ei->i_prealloc_lock);
+ ext4_es_init_tree(&ei->i_es_tree);
+ rwlock_init(&ei->i_es_lock);
ei->i_reserved_data_blocks = 0;
ei->i_reserved_meta_blocks = 0;
ei->i_allocated_meta_blocks = 0;
ei->i_da_metadata_calc_len = 0;
+ ei->i_da_metadata_calc_last_lblock = 0;
spin_lock_init(&(ei->i_block_reservation_lock));
#ifdef CONFIG_QUOTA
ei->i_reserved_quota = 0;
@@ -983,11 +956,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->jinode = NULL;
INIT_LIST_HEAD(&ei->i_completed_io_list);
spin_lock_init(&ei->i_completed_io_lock);
- ei->cur_aio_dio = NULL;
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_ioend_count, 0);
- atomic_set(&ei->i_aiodio_unwritten, 0);
+ atomic_set(&ei->i_unwritten, 0);
return &ei->vfs_inode;
}
@@ -1025,9 +997,7 @@ static void init_once(void *foo)
struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
INIT_LIST_HEAD(&ei->i_orphan);
-#ifdef CONFIG_EXT4_FS_XATTR
init_rwsem(&ei->xattr_sem);
-#endif
init_rwsem(&ei->i_data_sem);
inode_init_once(&ei->vfs_inode);
}
@@ -1046,6 +1016,11 @@ static int init_inodecache(void)
static void destroy_inodecache(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(ext4_inode_cachep);
}
@@ -1055,6 +1030,7 @@ void ext4_clear_inode(struct inode *inode)
clear_inode(inode);
dquot_drop(inode);
ext4_discard_preallocations(inode);
+ ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
if (EXT4_I(inode)->jinode) {
jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
EXT4_I(inode)->jinode);
@@ -1137,12 +1113,18 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot);
static int ext4_write_info(struct super_block *sb, int type);
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
struct path *path);
+static int ext4_quota_on_sysfile(struct super_block *sb, int type,
+ int format_id);
static int ext4_quota_off(struct super_block *sb, int type);
+static int ext4_quota_off_sysfile(struct super_block *sb, int type);
static int ext4_quota_on_mount(struct super_block *sb, int type);
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off);
static ssize_t ext4_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off);
+static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
+ unsigned int flags);
+static int ext4_enable_quotas(struct super_block *sb);
static const struct dquot_operations ext4_quota_operations = {
.get_reserved_space = ext4_get_reserved_space,
@@ -1164,6 +1146,16 @@ static const struct quotactl_ops ext4_qctl_operations = {
.get_dqblk = dquot_get_dqblk,
.set_dqblk = dquot_set_dqblk
};
+
+/*
+ * quotactl operations for file systems whose quota information is kept
+ * in hidden system files: turning quota on and off goes through the
+ * *_sysfile helpers, everything else uses the generic dquot handlers.
+ */
+static const struct quotactl_ops ext4_qctl_sysfile_operations = {
+	.quota_on_meta	= ext4_quota_on_sysfile,
+	.quota_off	= ext4_quota_off_sysfile,
+	.quota_sync	= dquot_quota_sync,
+	.get_info	= dquot_get_dqinfo,
+	.set_info	= dquot_set_dqinfo,
+	.get_dqblk	= dquot_get_dqblk,
+	.set_dqblk	= dquot_set_dqblk,
+};
#endif
static const struct super_operations ext4_sops = {
@@ -1194,7 +1186,6 @@ static const struct super_operations ext4_nojournal_sops = {
.dirty_inode = ext4_dirty_inode,
.drop_inode = ext4_drop_inode,
.evict_inode = ext4_evict_inode,
- .write_super = ext4_write_super,
.put_super = ext4_put_super,
.statfs = ext4_statfs,
.remount_fs = ext4_remount,
@@ -1231,6 +1222,7 @@ enum {
Opt_inode_readahead_blks, Opt_journal_ioprio,
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
+ Opt_max_dir_size_kb,
};
static const match_table_t tokens = {
@@ -1304,6 +1296,7 @@ static const match_table_t tokens = {
{Opt_init_itable, "init_itable=%u"},
{Opt_init_itable, "init_itable"},
{Opt_noinit_itable, "noinit_itable"},
+ {Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
{Opt_removed, "check=none"}, /* mount option from ext2/3 */
{Opt_removed, "nocheck"}, /* mount option from ext2/3 */
{Opt_removed, "reservation"}, /* mount option from ext2/3 */
@@ -1454,13 +1447,8 @@ static const struct mount_opts {
{Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ},
{Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ},
{Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ},
-#ifdef CONFIG_EXT4_FS_XATTR
{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
{Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
-#else
- {Opt_user_xattr, 0, MOPT_NOSUPPORT},
- {Opt_nouser_xattr, 0, MOPT_NOSUPPORT},
-#endif
#ifdef CONFIG_EXT4_FS_POSIX_ACL
{Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
{Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
@@ -1484,6 +1472,7 @@ static const struct mount_opts {
{Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
+ {Opt_max_dir_size_kb, 0, MOPT_GTE0},
{Opt_err, 0, 0}
};
@@ -1599,6 +1588,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
if (!args->from)
arg = EXT4_DEF_LI_WAIT_MULT;
sbi->s_li_wait_mult = arg;
+ } else if (token == Opt_max_dir_size_kb) {
+ sbi->s_max_dir_size_kb = arg;
} else if (token == Opt_stripe) {
sbi->s_stripe = arg;
} else if (m->flags & MOPT_DATAJ) {
@@ -1654,9 +1645,7 @@ static int parse_options(char *options, struct super_block *sb,
unsigned int *journal_ioprio,
int is_remount)
{
-#ifdef CONFIG_QUOTA
struct ext4_sb_info *sbi = EXT4_SB(sb);
-#endif
char *p;
substring_t args[MAX_OPT_ARGS];
int token;
@@ -1671,7 +1660,7 @@ static int parse_options(char *options, struct super_block *sb,
* Initialize args struct so we know whether arg was
* found; some options take optional arguments.
*/
- args[0].to = args[0].from = 0;
+ args[0].to = args[0].from = NULL;
token = match_token(p, tokens, args);
if (handle_mount_opt(sb, p, token, args, journal_devnum,
journal_ioprio, is_remount) < 0)
@@ -1705,6 +1694,16 @@ static int parse_options(char *options, struct super_block *sb,
}
}
#endif
+ if (test_opt(sb, DIOREAD_NOLOCK)) {
+ int blocksize =
+ BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
+
+ if (blocksize < PAGE_CACHE_SIZE) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "dioread_nolock if block size != PAGE_SIZE");
+ return 0;
+ }
+ }
return 1;
}
@@ -1747,7 +1746,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
static const char *token2str(int token)
{
- static const struct match_token *t;
+ const struct match_token *t;
for (t = tokens; t->token != Opt_err; t++)
if (t->token == token && !strchr(t->pattern, '='))
@@ -1830,6 +1829,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
if (nodefs || (test_opt(sb, INIT_INODE_TABLE) &&
(sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
+ if (nodefs || sbi->s_max_dir_size_kb)
+ SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
ext4_show_quota_options(seq, sb);
return 0;
@@ -1921,34 +1922,56 @@ done:
return res;
}
+/*
+ * Make sure sbi->s_flex_groups can hold the flex groups of a file
+ * system with @ngroup block groups, growing the array when it is too
+ * small.  Existing entries are copied into the new array and the old
+ * one is freed.
+ *
+ * Returns 0 on success (also when flex groups are not in use or the
+ * array is already large enough) and -ENOMEM if the new array cannot
+ * be allocated, in which case the current array is left in place.
+ */
+int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct flex_groups *grown;
+	int nflex, bytes;
+
+	if (!sbi->s_log_groups_per_flex)
+		return 0;
+
+	/* Flex group index of the last block group, plus one */
+	nflex = ext4_flex_group(sbi, ngroup - 1) + 1;
+	if (nflex <= sbi->s_flex_groups_allocated)
+		return 0;
+
+	/* Round the allocation up so repeated resizes reallocate less */
+	bytes = roundup_pow_of_two(nflex * sizeof(struct flex_groups));
+	grown = ext4_kvzalloc(bytes, GFP_KERNEL);
+	if (!grown) {
+		ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups",
+			 bytes / (int) sizeof(struct flex_groups));
+		return -ENOMEM;
+	}
+
+	if (sbi->s_flex_groups) {
+		memcpy(grown, sbi->s_flex_groups,
+		       (sbi->s_flex_groups_allocated *
+			sizeof(struct flex_groups)));
+		ext4_kvfree(sbi->s_flex_groups);
+	}
+	sbi->s_flex_groups = grown;
+	sbi->s_flex_groups_allocated = bytes / sizeof(struct flex_groups);
+	return 0;
+}
+
static int ext4_fill_flex_info(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_group_desc *gdp = NULL;
- ext4_group_t flex_group_count;
ext4_group_t flex_group;
unsigned int groups_per_flex = 0;
- size_t size;
- int i;
+ int i, err;
sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
sbi->s_log_groups_per_flex = 0;
return 1;
}
- groups_per_flex = 1 << sbi->s_log_groups_per_flex;
-
- /* We allocate both existing and potentially added groups */
- flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
- ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
- EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
- size = flex_group_count * sizeof(struct flex_groups);
- sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL);
- if (sbi->s_flex_groups == NULL) {
- ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups",
- flex_group_count);
+ groups_per_flex = 1U << sbi->s_log_groups_per_flex;
+
+ err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
+ if (err)
goto failed;
- }
for (i = 0; i < sbi->s_groups_count; i++) {
gdp = ext4_get_group_desc(sb, i, NULL);
@@ -2151,10 +2174,12 @@ static void ext4_orphan_cleanup(struct super_block *sb,
}
if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
- if (es->s_last_orphan)
+ /* don't clear list on RO mount w/ errors */
+ if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
jbd_debug(1, "Errors on filesystem, "
"clearing orphan list.\n");
- es->s_last_orphan = 0;
+ es->s_last_orphan = 0;
+ }
jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
return;
}
@@ -2195,7 +2220,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,
__func__, inode->i_ino, inode->i_size);
jbd_debug(2, "truncating inode %lu to %lld bytes\n",
inode->i_ino, inode->i_size);
+ mutex_lock(&inode->i_mutex);
ext4_truncate(inode);
+ mutex_unlock(&inode->i_mutex);
nr_truncates++;
} else {
ext4_msg(sb, KERN_DEBUG,
@@ -2535,6 +2562,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
static struct attribute *ext4_attrs[] = {
@@ -2550,6 +2578,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(mb_stream_req),
ATTR_LIST(mb_group_prealloc),
ATTR_LIST(max_writeback_mb_bump),
+ ATTR_LIST(extent_max_zeroout_kb),
ATTR_LIST(trigger_fs_error),
NULL,
};
@@ -2557,10 +2586,12 @@ static struct attribute *ext4_attrs[] = {
/* Features this copy of ext4 supports */
EXT4_INFO_ATTR(lazy_itable_init);
EXT4_INFO_ATTR(batched_discard);
+EXT4_INFO_ATTR(meta_bg_resize);
static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(lazy_itable_init),
ATTR_LIST(batched_discard),
+ ATTR_LIST(meta_bg_resize),
NULL,
};
@@ -2661,6 +2692,16 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
"extents feature\n");
return 0;
}
+
+#ifndef CONFIG_QUOTA
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
+ !readonly) {
+ ext4_msg(sb, KERN_ERR,
+ "Filesystem with quota feature cannot be mounted RDWR "
+ "without CONFIG_QUOTA");
+ return 0;
+ }
+#endif /* CONFIG_QUOTA */
return 1;
}
@@ -2723,6 +2764,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
sb = elr->lr_super;
ngroups = EXT4_SB(sb)->s_groups_count;
+ sb_start_write(sb);
for (group = elr->lr_next_group; group < ngroups; group++) {
gdp = ext4_get_group_desc(sb, group, NULL);
if (!gdp) {
@@ -2749,6 +2791,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
elr->lr_next_sched = jiffies + elr->lr_timeout;
elr->lr_next_group = group + 1;
}
+ sb_end_write(sb);
return ret;
}
@@ -3085,6 +3128,121 @@ static int set_journal_csum_feature_set(struct super_block *sb)
return ret;
}
+/*
+ * Note: calculating the overhead so we can be compatible with
+ * historical BSD practice is quite difficult in the face of
+ * clusters/bigalloc. This is because multiple metadata blocks from
+ * different block groups can end up in the same allocation cluster.
+ * Calculating the exact overhead in the face of clustered allocation
+ * requires either O(all block bitmaps) in memory or O(number of block
+ * groups**2) in time. We will still calculate the overhead for
+ * older file systems --- and if we come across a bigalloc file
+ * system with zero in s_overhead_clusters the estimate will be close to
+ * correct especially for very large cluster sizes --- but for newer
+ * file systems, it's better to calculate this figure once at mkfs
+ * time, and store it in the superblock. If the superblock value is
+ * present (even for non-bigalloc file systems), we will use it.
+ *
+ * count_overhead() returns the overhead attributed to block group @grp.
+ * Without bigalloc this is a plain sum: the superblock copy (if the
+ * group has one), its group descriptor blocks, the inode table blocks
+ * and 2 more blocks (presumably the block and inode bitmaps).
+ * With bigalloc, @buf is used as a bitmap of the clusters of @grp: the
+ * cluster of every metadata block found inside @grp's block range is
+ * marked, and the result is the number of clusters per group minus what
+ * ext4_count_free() reports for @buf. The caller is expected to pass a
+ * zeroed @buf large enough for EXT4_CLUSTERS_PER_GROUP(sb) bits.
+ */
+static int count_overhead(struct super_block *sb, ext4_group_t grp,
+ char *buf)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *gdp;
+ ext4_fsblk_t first_block, last_block, b;
+ ext4_group_t i, ngroups = ext4_get_groups_count(sb);
+ int s, j, count = 0;
+
+ /* No clusters to share: the overhead can be computed directly */
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC))
+ return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
+ sbi->s_itb_per_group + 2);
+
+ /* Block range [first_block, last_block] covered by group grp */
+ first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
+ (grp * EXT4_BLOCKS_PER_GROUP(sb));
+ last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1;
+ /*
+ * Walk the descriptors of all groups: the metadata of group i is
+ * only marked when it is located inside grp's block range.
+ */
+ for (i = 0; i < ngroups; i++) {
+ gdp = ext4_get_group_desc(sb, i, NULL);
+ b = ext4_block_bitmap(sb, gdp);
+ if (b >= first_block && b <= last_block) {
+ ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
+ count++;
+ }
+ b = ext4_inode_bitmap(sb, gdp);
+ if (b >= first_block && b <= last_block) {
+ ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
+ count++;
+ }
+ b = ext4_inode_table(sb, gdp);
+ /* Mark one bit per inode table block (several may share a cluster) */
+ if (b >= first_block && b + sbi->s_itb_per_group <= last_block)
+ for (j = 0; j < sbi->s_itb_per_group; j++, b++) {
+ int c = EXT4_B2C(sbi, b - first_block);
+ ext4_set_bit(c, buf);
+ count++;
+ }
+ /* Superblock and descriptor blocks are only added for grp itself */
+ if (i != grp)
+ continue;
+ s = 0;
+ if (ext4_bg_has_super(sb, grp)) {
+ ext4_set_bit(s++, buf);
+ count++;
+ }
+ /*
+ * NOTE(review): s counts blocks from the start of the group; the
+ * superblock bit above uses it directly while the descriptor
+ * blocks are converted with EXT4_B2C -- confirm this is intended.
+ */
+ for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) {
+ ext4_set_bit(EXT4_B2C(sbi, s++), buf);
+ count++;
+ }
+ }
+ /* count is only used to tell whether any bit was set in buf */
+ if (!count)
+ return 0;
+ return EXT4_CLUSTERS_PER_GROUP(sb) -
+ ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8);
+}
+
+/*
+ * Work out how many clusters are taken up by file system structures
+ * and cache the result in sbi->s_overhead.
+ *
+ * The total is made of everything in front of the first data block,
+ * the per-group overhead reported by count_overhead() for every block
+ * group and, when a journal is loaded, the journal itself.
+ *
+ * Returns 0 on success, or -ENOMEM if the scratch page handed to
+ * count_overhead() cannot be allocated.
+ */
+int ext4_calculate_overhead(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	ext4_group_t grp, ngroups = ext4_get_groups_count(sb);
+	ext4_fsblk_t total;
+	char *scratch;
+
+	scratch = (char *) get_zeroed_page(GFP_KERNEL);
+	if (!scratch)
+		return -ENOMEM;
+
+	/* Everything before the first data block counts as overhead */
+	total = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));
+
+	/* Then add what each block group contributes */
+	for (grp = 0; grp < ngroups; grp++) {
+		int clusters = count_overhead(sb, grp, scratch);
+
+		total += clusters;
+		/* Hand a clean page to the next group if this one was used */
+		if (clusters)
+			memset(scratch, 0, PAGE_SIZE);
+		cond_resched();
+	}
+
+	/* The journal is overhead as well */
+	if (sbi->s_journal)
+		total += EXT4_B2C(sbi, sbi->s_journal->j_maxlen);
+
+	sbi->s_overhead = total;
+	smp_wmb();
+	free_page((unsigned long) scratch);
+	return 0;
+}
+
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
{
char *orig_data = kstrdup(data, GFP_KERNEL);
@@ -3106,7 +3264,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
unsigned int i;
int needs_recovery, has_huge_files, has_bigalloc;
__u64 blocks_count;
- int err;
+ int err = 0;
unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
ext4_group_t first_not_zeroed;
@@ -3122,9 +3280,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
sb->s_fs_info = sbi;
sbi->s_sb = sb;
- sbi->s_mount_opt = 0;
- sbi->s_resuid = make_kuid(&init_user_ns, EXT4_DEF_RESUID);
- sbi->s_resgid = make_kgid(&init_user_ns, EXT4_DEF_RESGID);
sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
sbi->s_sb_block = sb_block;
if (sb->s_bdev->bd_part)
@@ -3135,6 +3290,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
for (cp = sb->s_id; (cp = strchr(cp, '/'));)
*cp = '!';
+ /* -EINVAL is default */
ret = -EINVAL;
blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
if (!blocksize) {
@@ -3219,9 +3375,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (def_mount_opts & EXT4_DEFM_UID16)
set_opt(sb, NO_UID32);
/* xattr user namespace & acls are now defaulted on */
-#ifdef CONFIG_EXT4_FS_XATTR
set_opt(sb, XATTR_USER);
-#endif
#ifdef CONFIG_EXT4_FS_POSIX_ACL
set_opt(sb, POSIX_ACL);
#endif
@@ -3257,7 +3411,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* enable delayed allocation by default
* Use -o nodelalloc to turn it off
*/
- if (!IS_EXT3_SB(sb) &&
+ if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) &&
((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
set_opt(sb, DELALLOC);
@@ -3296,15 +3450,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
clear_opt(sb, DELALLOC);
}
- blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
- if (test_opt(sb, DIOREAD_NOLOCK)) {
- if (blocksize < PAGE_SIZE) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "dioread_nolock if block size != PAGE_SIZE");
- goto failed_mount;
- }
- }
-
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
@@ -3346,6 +3491,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
goto failed_mount;
+ blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
if (blocksize < EXT4_MIN_BLOCK_SIZE ||
blocksize > EXT4_MAX_BLOCK_SIZE) {
ext4_msg(sb, KERN_ERR,
@@ -3512,7 +3658,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
" too large to mount safely on this system");
if (sizeof(sector_t) < 8)
ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
- ret = err;
goto failed_mount;
}
@@ -3620,12 +3765,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
if (err) {
ext4_msg(sb, KERN_ERR, "insufficient memory");
- ret = err;
goto failed_mount3;
}
sbi->s_stripe = ext4_get_stripe_size(sbi);
sbi->s_max_writeback_mb_bump = 128;
+ sbi->s_extent_max_zeroout_kb = 32;
/*
* set up enough so that it can read an inode
@@ -3640,12 +3785,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_QUOTA
sb->s_qcop = &ext4_qctl_operations;
sb->dq_op = &ext4_quota_operations;
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+ /* Use qctl operations for hidden quota files. */
+ sb->s_qcop = &ext4_qctl_sysfile_operations;
+ }
#endif
memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
mutex_init(&sbi->s_orphan_lock);
- sbi->s_resize_flags = 0;
sb->s_root = NULL;
@@ -3735,6 +3884,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
no_journal:
/*
+ * Get the # of file system overhead blocks from the
+ * superblock if present.
+ */
+ if (es->s_overhead_clusters)
+ sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
+ else {
+ err = ext4_calculate_overhead(sb);
+ if (err)
+ goto failed_mount_wq;
+ }
+
+ /*
* The maximum number of concurrent works can be high and
* concurrency isn't really necessary. Limit it to 1.
*/
@@ -3742,6 +3903,7 @@ no_journal:
alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
if (!EXT4_SB(sb)->dio_unwritten_wq) {
printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
+ ret = -ENOMEM;
goto failed_mount_wq;
}
@@ -3840,6 +4002,24 @@ no_journal:
} else
descr = "out journal";
+#ifdef CONFIG_QUOTA
+ /* Enable quota usage during mount. */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
+ !(sb->s_flags & MS_RDONLY)) {
+ err = ext4_enable_quotas(sb);
+ if (err)
+ goto failed_mount7;
+ }
+#endif /* CONFIG_QUOTA */
+
+ if (test_opt(sb, DISCARD)) {
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
+ if (!blk_queue_discard(q))
+ ext4_msg(sb, KERN_WARNING,
+ "mounting with \"discard\" option, but "
+ "the device does not support discard");
+ }
+
ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
"Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
*sbi->s_es->s_mount_opts ? "; " : "", orig_data);
@@ -3906,7 +4086,7 @@ out_fail:
kfree(sbi);
out_free_orig:
kfree(orig_data);
- return ret;
+ return err ? err : ret;
}
/*
@@ -4203,9 +4383,8 @@ static int ext4_commit_super(struct super_block *sb, int sync)
es->s_free_inodes_count =
cpu_to_le32(percpu_counter_sum_positive(
&EXT4_SB(sb)->s_freeinodes_counter));
- sb->s_dirt = 0;
BUFFER_TRACE(sbh, "marking dirty");
- ext4_superblock_csum_set(sb, es);
+ ext4_superblock_csum_set(sb);
mark_buffer_dirty(sbh);
if (sync) {
error = sync_dirty_buffer(sbh);
@@ -4286,6 +4465,7 @@ static void ext4_clear_journal_err(struct super_block *sb,
ext4_commit_super(sb, 1);
jbd2_journal_clear_err(journal);
+ jbd2_journal_update_sb_errno(journal);
}
}
@@ -4302,21 +4482,12 @@ int ext4_force_commit(struct super_block *sb)
return 0;
journal = EXT4_SB(sb)->s_journal;
- if (journal) {
- vfs_check_frozen(sb, SB_FREEZE_TRANS);
+ if (journal)
ret = ext4_journal_force_commit(journal);
- }
return ret;
}
-static void ext4_write_super(struct super_block *sb)
-{
- lock_super(sb);
- ext4_commit_super(sb, 1);
- unlock_super(sb);
-}
-
static int ext4_sync_fs(struct super_block *sb, int wait)
{
int ret = 0;
@@ -4325,6 +4496,11 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
trace_ext4_sync_fs(sb, wait);
flush_workqueue(sbi->dio_unwritten_wq);
+ /*
+ * Writeback quota in non-journalled quota case - journalled quota has
+ * no dirty dquots
+ */
+ dquot_writeback_dquots(sb, -1);
if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
if (wait)
jbd2_log_wait_commit(sbi->s_journal, target);
@@ -4337,9 +4513,8 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
* gives us a chance to flush the journal completely and mark the fs clean.
*
* Note that only this function cannot bring a filesystem to be in a clean
- * state independently, because ext4 prevents a new handle from being started
- * by @sb->s_frozen, which stays in an upper layer. It thus needs help from
- * the upper layer.
+ * state independently. It relies on upper layer to stop all data & metadata
+ * modifications.
*/
static int ext4_freeze(struct super_block *sb)
{
@@ -4366,7 +4541,7 @@ static int ext4_freeze(struct super_block *sb)
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
error = ext4_commit_super(sb, 1);
out:
- /* we rely on s_frozen to stop further updates */
+ /* we rely on upper layer to stop further updates */
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
return error;
}
@@ -4380,11 +4555,9 @@ static int ext4_unfreeze(struct super_block *sb)
if (sb->s_flags & MS_RDONLY)
return 0;
- lock_super(sb);
/* Reset the needs_recovery flag before the fs is unlocked. */
EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
ext4_commit_super(sb, 1);
- unlock_super(sb);
return 0;
}
@@ -4420,7 +4593,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
char *orig_data = kstrdup(data, GFP_KERNEL);
/* Store the original options */
- lock_super(sb);
old_sb_flags = sb->s_flags;
old_opts.s_mount_opt = sbi->s_mount_opt;
old_opts.s_mount_opt2 = sbi->s_mount_opt2;
@@ -4559,7 +4731,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
ext4_setup_system_zone(sb);
- if (sbi->s_journal == NULL)
+ if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY))
ext4_commit_super(sb, 1);
#ifdef CONFIG_QUOTA
@@ -4568,10 +4740,17 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (old_opts.s_qf_names[i] &&
old_opts.s_qf_names[i] != sbi->s_qf_names[i])
kfree(old_opts.s_qf_names[i]);
+ if (enable_quota) {
+ if (sb_any_quota_suspended(sb))
+ dquot_resume(sb, -1);
+ else if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+ err = ext4_enable_quotas(sb);
+ if (err)
+ goto restore_opts;
+ }
+ }
#endif
- unlock_super(sb);
- if (enable_quota)
- dquot_resume(sb, -1);
ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
kfree(orig_data);
@@ -4595,72 +4774,25 @@ restore_opts:
sbi->s_qf_names[i] = old_opts.s_qf_names[i];
}
#endif
- unlock_super(sb);
kfree(orig_data);
return err;
}
-/*
- * Note: calculating the overhead so we can be compatible with
- * historical BSD practice is quite difficult in the face of
- * clusters/bigalloc. This is because multiple metadata blocks from
- * different block group can end up in the same allocation cluster.
- * Calculating the exact overhead in the face of clustered allocation
- * requires either O(all block bitmaps) in memory or O(number of block
- * groups**2) in time. We will still calculate the superblock for
- * older file systems --- and if we come across with a bigalloc file
- * system with zero in s_overhead_clusters the estimate will be close to
- * correct especially for very large cluster sizes --- but for newer
- * file systems, it's better to calculate this figure once at mkfs
- * time, and store it in the superblock. If the superblock value is
- * present (even for non-bigalloc file systems), we will use it.
- */
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
- struct ext4_group_desc *gdp;
+ ext4_fsblk_t overhead = 0;
u64 fsid;
s64 bfree;
- if (test_opt(sb, MINIX_DF)) {
- sbi->s_overhead_last = 0;
- } else if (es->s_overhead_clusters) {
- sbi->s_overhead_last = le32_to_cpu(es->s_overhead_clusters);
- } else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
- ext4_group_t i, ngroups = ext4_get_groups_count(sb);
- ext4_fsblk_t overhead = 0;
-
- /*
- * Compute the overhead (FS structures). This is constant
- * for a given filesystem unless the number of block groups
- * changes so we cache the previous value until it does.
- */
-
- /*
- * All of the blocks before first_data_block are
- * overhead
- */
- overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));
-
- /*
- * Add the overhead found in each block group
- */
- for (i = 0; i < ngroups; i++) {
- gdp = ext4_get_group_desc(sb, i, NULL);
- overhead += ext4_num_overhead_clusters(sb, i, gdp);
- cond_resched();
- }
- sbi->s_overhead_last = overhead;
- smp_wmb();
- sbi->s_blocks_last = ext4_blocks_count(es);
- }
+ if (!test_opt(sb, MINIX_DF))
+ overhead = sbi->s_overhead;
buf->f_type = EXT4_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
- buf->f_blocks = (ext4_blocks_count(es) -
- EXT4_C2B(sbi, sbi->s_overhead_last));
+ buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead);
bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
/* prevent underflow in case that few free space is available */
@@ -4693,7 +4825,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
static inline struct inode *dquot_to_inode(struct dquot *dquot)
{
- return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
+ return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
}
static int ext4_write_dquot(struct dquot *dquot)
@@ -4830,6 +4962,74 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
return dquot_quota_on(sb, type, format_id, path);
}
+static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
+ unsigned int flags)
+{
+ int err;
+ struct inode *qf_inode;
+ unsigned long qf_inums[MAXQUOTAS] = {
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
+ };
+
+ BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA));
+
+ if (!qf_inums[type])
+ return -EPERM;
+
+ qf_inode = ext4_iget(sb, qf_inums[type]);
+ if (IS_ERR(qf_inode)) {
+ ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]);
+ return PTR_ERR(qf_inode);
+ }
+
+ err = dquot_enable(qf_inode, type, format_id, flags);
+ iput(qf_inode);
+
+ return err;
+}
+
+/* Enable usage tracking for all quota types. */
+static int ext4_enable_quotas(struct super_block *sb)
+{
+ int type, err = 0;
+ unsigned long qf_inums[MAXQUOTAS] = {
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
+ };
+
+ sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
+ for (type = 0; type < MAXQUOTAS; type++) {
+ if (qf_inums[type]) {
+ err = ext4_quota_enable(sb, type, QFMT_VFS_V1,
+ DQUOT_USAGE_ENABLED);
+ if (err) {
+ ext4_warning(sb,
+ "Failed to enable quota (type=%d) "
+ "tracking. Please run e2fsck to fix.",
+ type);
+ return err;
+ }
+ }
+ }
+ return 0;
+}
+
+/*
+ * quota_on function that is used when QUOTA feature is set.
+ */
+static int ext4_quota_on_sysfile(struct super_block *sb, int type,
+ int format_id)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
+ return -EINVAL;
+
+ /*
+ * USAGE was enabled at mount time. Only need to enable LIMITS now.
+ */
+ return ext4_quota_enable(sb, type, format_id, DQUOT_LIMITS_ENABLED);
+}
+
static int ext4_quota_off(struct super_block *sb, int type)
{
struct inode *inode = sb_dqopt(sb)->files[type];
@@ -4856,6 +5056,18 @@ out:
return dquot_quota_off(sb, type);
}
+/*
+ * quota_off function that is used when QUOTA feature is set.
+ */
+static int ext4_quota_off_sysfile(struct super_block *sb, int type)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
+ return -EINVAL;
+
+ /* Disable only the limits. */
+ return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
+}
+
/* Read data from quotafile - avoid pagecache and such because we cannot afford
* acquiring the locks... As quota files are never truncated and quota code
* itself serializes the operations (and no one else should touch the files)
@@ -5072,6 +5284,7 @@ static int __init ext4_init_fs(void)
ext4_li_info = NULL;
mutex_init(&ext4_li_mtx);
+ /* Build-time check for flags consistency */
ext4_check_flag_values();
for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
@@ -5079,15 +5292,22 @@ static int __init ext4_init_fs(void)
init_waitqueue_head(&ext4__ioend_wq[i]);
}
- err = ext4_init_pageio();
+ err = ext4_init_es();
if (err)
return err;
+
+ err = ext4_init_pageio();
+ if (err)
+ goto out7;
+
err = ext4_init_system_zone();
if (err)
goto out6;
ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
- if (!ext4_kset)
+ if (!ext4_kset) {
+ err = -ENOMEM;
goto out5;
+ }
ext4_proc_root = proc_mkdir("fs/ext4", NULL);
err = ext4_init_feat_adverts();
@@ -5129,6 +5349,9 @@ out5:
ext4_exit_system_zone();
out6:
ext4_exit_pageio();
+out7:
+ ext4_exit_es();
+
return err;
}
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index ed9354aff27..ff371193201 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -35,22 +35,18 @@ const struct inode_operations ext4_symlink_inode_operations = {
.follow_link = page_follow_link_light,
.put_link = page_put_link,
.setattr = ext4_setattr,
-#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
-#endif
};
const struct inode_operations ext4_fast_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = ext4_follow_link,
.setattr = ext4_setattr,
-#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
-#endif
};
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index e56c9ed7d6e..3a91ebc2b66 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -61,11 +61,6 @@
#include "xattr.h"
#include "acl.h"
-#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
-#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
-#define BFIRST(bh) ENTRY(BHDR(bh)+1)
-#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
-
#ifdef EXT4_XATTR_DEBUG
# define ea_idebug(inode, f...) do { \
printk(KERN_DEBUG "inode %s:%lu: ", \
@@ -127,19 +122,16 @@ static __le32 ext4_xattr_block_csum(struct inode *inode,
struct ext4_xattr_header *hdr)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- struct ext4_inode_info *ei = EXT4_I(inode);
__u32 csum, old;
old = hdr->h_checksum;
hdr->h_checksum = 0;
- if (le32_to_cpu(hdr->h_refcount) != 1) {
- block_nr = cpu_to_le64(block_nr);
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&block_nr,
- sizeof(block_nr));
- } else
- csum = ei->i_csum_seed;
+ block_nr = cpu_to_le64(block_nr);
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&block_nr,
+ sizeof(block_nr));
csum = ext4_chksum(sbi, csum, (__u8 *)hdr,
EXT4_BLOCK_SIZE(inode->i_sb));
+
hdr->h_checksum = old;
return cpu_to_le32(csum);
}
@@ -315,7 +307,7 @@ cleanup:
return error;
}
-static int
+int
ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
void *buffer, size_t buffer_size)
{
@@ -584,21 +576,6 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
return (*min_offs - ((void *)last - base) - sizeof(__u32));
}
-struct ext4_xattr_info {
- int name_index;
- const char *name;
- const void *value;
- size_t value_len;
-};
-
-struct ext4_xattr_search {
- struct ext4_xattr_entry *first;
- void *base;
- void *end;
- struct ext4_xattr_entry *here;
- int not_found;
-};
-
static int
ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
{
@@ -651,9 +628,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
size. Just replace. */
s->here->e_value_size =
cpu_to_le32(i->value_len);
- memset(val + size - EXT4_XATTR_PAD, 0,
- EXT4_XATTR_PAD); /* Clear pad bytes. */
- memcpy(val, i->value, i->value_len);
+ if (i->value == EXT4_ZERO_XATTR_VALUE) {
+ memset(val, 0, size);
+ } else {
+ /* Clear pad bytes first. */
+ memset(val + size - EXT4_XATTR_PAD, 0,
+ EXT4_XATTR_PAD);
+ memcpy(val, i->value, i->value_len);
+ }
return 0;
}
@@ -692,9 +674,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
size_t size = EXT4_XATTR_SIZE(i->value_len);
void *val = s->base + min_offs - size;
s->here->e_value_offs = cpu_to_le16(min_offs - size);
- memset(val + size - EXT4_XATTR_PAD, 0,
- EXT4_XATTR_PAD); /* Clear the pad bytes. */
- memcpy(val, i->value, i->value_len);
+ if (i->value == EXT4_ZERO_XATTR_VALUE) {
+ memset(val, 0, size);
+ } else {
+ /* Clear the pad bytes first. */
+ memset(val + size - EXT4_XATTR_PAD, 0,
+ EXT4_XATTR_PAD);
+ memcpy(val, i->value, i->value_len);
+ }
}
}
return 0;
@@ -797,7 +784,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
int offset = (char *)s->here - bs->bh->b_data;
unlock_buffer(bs->bh);
- ext4_handle_release_buffer(handle, bs->bh);
if (ce) {
mb_cache_entry_release(ce);
ce = NULL;
@@ -953,14 +939,8 @@ bad_block:
#undef header
}
-struct ext4_xattr_ibody_find {
- struct ext4_xattr_search s;
- struct ext4_iloc iloc;
-};
-
-static int
-ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
- struct ext4_xattr_ibody_find *is)
+int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is)
{
struct ext4_xattr_ibody_header *header;
struct ext4_inode *raw_inode;
@@ -988,10 +968,47 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
return 0;
}
-static int
-ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
- struct ext4_xattr_info *i,
- struct ext4_xattr_ibody_find *is)
+int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
+ struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is)
+{
+ struct ext4_xattr_ibody_header *header;
+ struct ext4_xattr_search *s = &is->s;
+ int error;
+
+ if (EXT4_I(inode)->i_extra_isize == 0)
+ return -ENOSPC;
+ error = ext4_xattr_set_entry(i, s);
+ if (error) {
+ if (error == -ENOSPC &&
+ ext4_has_inline_data(inode)) {
+ error = ext4_try_to_evict_inline_data(handle, inode,
+ EXT4_XATTR_LEN(strlen(i->name) +
+ EXT4_XATTR_SIZE(i->value_len)));
+ if (error)
+ return error;
+ error = ext4_xattr_ibody_find(inode, i, is);
+ if (error)
+ return error;
+ error = ext4_xattr_set_entry(i, s);
+ }
+ if (error)
+ return error;
+ }
+ header = IHDR(inode, ext4_raw_inode(&is->iloc));
+ if (!IS_LAST_ENTRY(s->first)) {
+ header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
+ ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+ } else {
+ header->h_magic = cpu_to_le32(0);
+ ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
+ }
+ return 0;
+}
+
+static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
+ struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is)
{
struct ext4_xattr_ibody_header *header;
struct ext4_xattr_search *s = &is->s;
@@ -1147,9 +1164,17 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
{
handle_t *handle;
int error, retries = 0;
+ int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
retry:
- handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+ /*
+ * In case of inline data, we may push out the data to a block,
+ * So reserve the journal space first.
+ */
+ if (ext4_has_inline_data(inode))
+ credits += ext4_writepage_trans_blocks(inode) + 1;
+
+ handle = ext4_journal_start(inode, credits);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
} else {
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 91f31ca7d9a..69eda787a96 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -21,6 +21,7 @@
#define EXT4_XATTR_INDEX_TRUSTED 4
#define EXT4_XATTR_INDEX_LUSTRE 5
#define EXT4_XATTR_INDEX_SECURITY 6
+#define EXT4_XATTR_INDEX_SYSTEM 7
struct ext4_xattr_header {
__le32 h_magic; /* magic number for identification */
@@ -65,7 +66,32 @@ struct ext4_xattr_entry {
EXT4_I(inode)->i_extra_isize))
#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
-# ifdef CONFIG_EXT4_FS_XATTR
+#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
+#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
+#define BFIRST(bh) ENTRY(BHDR(bh)+1)
+#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
+
+#define EXT4_ZERO_XATTR_VALUE ((void *)-1)
+
+struct ext4_xattr_info {
+ int name_index;
+ const char *name;
+ const void *value;
+ size_t value_len;
+};
+
+struct ext4_xattr_search {
+ struct ext4_xattr_entry *first;
+ void *base;
+ void *end;
+ struct ext4_xattr_entry *here;
+ int not_found;
+};
+
+struct ext4_xattr_ibody_find {
+ struct ext4_xattr_search s;
+ struct ext4_iloc iloc;
+};
extern const struct xattr_handler ext4_xattr_user_handler;
extern const struct xattr_handler ext4_xattr_trusted_handler;
@@ -90,60 +116,82 @@ extern void ext4_exit_xattr(void);
extern const struct xattr_handler *ext4_xattr_handlers[];
-# else /* CONFIG_EXT4_FS_XATTR */
-
-static inline int
-ext4_xattr_get(struct inode *inode, int name_index, const char *name,
- void *buffer, size_t size, int flags)
-{
- return -EOPNOTSUPP;
-}
-
-static inline int
-ext4_xattr_set(struct inode *inode, int name_index, const char *name,
- const void *value, size_t size, int flags)
-{
- return -EOPNOTSUPP;
-}
-
-static inline int
-ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
- const char *name, const void *value, size_t size, int flags)
-{
- return -EOPNOTSUPP;
-}
-
-static inline void
-ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
-{
-}
-
-static inline void
-ext4_xattr_put_super(struct super_block *sb)
-{
-}
-
-static __init inline int
-ext4_init_xattr(void)
-{
- return 0;
-}
-
-static inline void
-ext4_exit_xattr(void)
-{
-}
-
-static inline int
-ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
- struct ext4_inode *raw_inode, handle_t *handle)
-{
- return -EOPNOTSUPP;
-}
-
-#define ext4_xattr_handlers NULL
-
-# endif /* CONFIG_EXT4_FS_XATTR */
+extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is);
+extern int ext4_xattr_ibody_get(struct inode *inode, int name_index,
+ const char *name,
+ void *buffer, size_t buffer_size);
+extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
+ struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is);
+
+extern int ext4_has_inline_data(struct inode *inode);
+extern int ext4_get_inline_size(struct inode *inode);
+extern int ext4_get_max_inline_size(struct inode *inode);
+extern int ext4_find_inline_data_nolock(struct inode *inode);
+extern void ext4_write_inline_data(struct inode *inode,
+ struct ext4_iloc *iloc,
+ void *buffer, loff_t pos,
+ unsigned int len);
+extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
+ unsigned int len);
+extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
+ unsigned int len);
+extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
+
+extern int ext4_readpage_inline(struct inode *inode, struct page *page);
+extern int ext4_try_to_write_inline_data(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned flags,
+ struct page **pagep);
+extern int ext4_write_inline_data_end(struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned copied,
+ struct page *page);
+extern struct buffer_head *
+ext4_journalled_write_inline_data(struct inode *inode,
+ unsigned len,
+ struct page *page);
+extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned flags,
+ struct page **pagep,
+ void **fsdata);
+extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
+ unsigned len, unsigned copied,
+ struct page *page);
+extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode);
+extern int ext4_try_create_inline_dir(handle_t *handle,
+ struct inode *parent,
+ struct inode *inode);
+extern int ext4_read_inline_dir(struct file *filp,
+ void *dirent, filldir_t filldir,
+ int *has_inline_data);
+extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
+ const struct qstr *d_name,
+ struct ext4_dir_entry_2 **res_dir,
+ int *has_inline_data);
+extern int ext4_delete_inline_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh,
+ int *has_inline_data);
+extern int empty_inline_dir(struct inode *dir, int *has_inline_data);
+extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
+ struct ext4_dir_entry_2 **parent_de,
+ int *retval);
+extern int ext4_inline_data_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ int *has_inline);
+extern int ext4_try_to_evict_inline_data(handle_t *handle,
+ struct inode *inode,
+ int needed);
+extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline);
+
+extern int ext4_convert_inline_data(struct inode *inode);
#ifdef CONFIG_EXT4_FS_SECURITY
extern int ext4_init_security(handle_t *handle, struct inode *inode,