From b2b5ef5c8e89f19b68c174bf246f3ca212dbf0bc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:45 +0200 Subject: btrfs: Convert to new freezing mechanism We convert btrfs_file_aio_write() to use new freeze check. We also add proper freeze protection to btrfs_page_mkwrite(). We also add freeze protection to the transaction mechanism to avoid starting transactions on frozen filesystem. At minimum this is necessary to stop iput() of unlinked file to change frozen filesystem during truncation. Checks in cleaner_kthread() and transaction_kthread() can be safely removed since btrfs_freeze() will lock the mutexes and thus block the threads (and they shouldn't have anything to do anyway). CC: linux-btrfs@vger.kernel.org CC: Chris Mason Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/btrfs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9aa01ec2138..5caf285c6e4 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1379,7 +1379,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, ssize_t err = 0; size_t count, ocount; - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); @@ -1469,6 +1469,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, num_written = err; } out: + sb_end_write(inode->i_sb); current->backing_dev_info = NULL; return num_written ? num_written : err; } -- cgit v1.2.3 From 224ecce517af3a952321202cdf304c12e138caca Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 16 Aug 2012 16:32:06 -0400 Subject: Btrfs: fix possible corruption when fsyncing written prealloced extents While working on my fsync patch my fsync tester kept hitting mismatching md5sums when I would randomly write to a prealloc'ed region, syncfs() and then write to the prealloced region some more and then fsync() and then immediately reboot. This is because the tree logging code will skip writing csums for file extents who's generation is less than the current running transaction. When we mark extents as written we haven't been updating their generation so they were always being skipped. This wouldn't happen if you were to preallocate and then write in the same transaction, but if you for example prealloced a VM you could definitely run into this problem. This patch makes my fsync tester happy again. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5caf285c6e4..b7c885c8423 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -935,12 +935,16 @@ again: btrfs_set_item_key_safe(trans, root, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - end); btrfs_set_file_extent_offset(leaf, fi, end - orig_offset); fi = btrfs_item_ptr(leaf, path->slots[0] - 1, struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, end - other_start); btrfs_mark_buffer_dirty(leaf); @@ -958,12 +962,16 @@ again: struct btrfs_file_extent_item); btrfs_set_file_extent_num_bytes(leaf, fi, start - key.offset); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); path->slots[0]++; new_key.offset = start; btrfs_set_item_key_safe(trans, root, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, other_end - start); btrfs_set_file_extent_offset(leaf, fi, @@ -991,12 +999,14 @@ again: leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0] - 1, struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); @@ -1056,12 +1066,14 @@ again: struct btrfs_file_extent_item); btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_mark_buffer_dirty(leaf); } else { fi = btrfs_item_ptr(leaf, del_slot - 1, struct btrfs_file_extent_item); btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); btrfs_mark_buffer_dirty(leaf); -- cgit v1.2.3 From 5dc562c541e1026df9d43913c2f6b91156e22d32 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 17 Aug 2012 13:14:17 -0400 Subject: Btrfs: turbo charge fsync At least for the vm workload. Currently on fsync we will 1) Truncate all items in the log tree for the given inode if they exist and 2) Copy all items for a given inode into the log The problem with this is that for things like VMs you can have lots of extents from the fragmented writing behavior, and worst yet you may have only modified a few extents, not the entire thing. This patch fixes this problem by tracking which transid modified our extent, and then when we do the tree logging we find all of the extents we've modified in our current transaction, sort them and commit them. We also only truncate up to the xattrs of the inode and copy that stuff in normally, and then just drop any extents in the range we have that exist in the log already. Here are some numbers of a 50 meg fio job that does random writes and fsync()s after every write Original Patched SATA drive 82KB/s 140KB/s Fusion drive 431KB/s 2532KB/s So around 2-6 times faster depending on your hardware. There are a few corner cases, for example if you truncate at all we have to do it the old way since there is no way to be sure what is in the log is ok. This probably could be done smarter, but if you write-fsync-truncate-write-fsync you deserve what you get. All this work is in RAM of course so if your inode gets evicted from cache and you read it in and fsync it we'll do it the slow way if we are still in the same transaction that we last modified the inode in. The biggest cool part of this is that it requires no changes to the recovery code, so if you fsync with this patch and crash and load an old kernel, it will run the recovery and be a-ok. I have tested this pretty thoroughly with an fsync tester and everything comes back fine, as well as xfstests. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 62 +++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 19 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b7c885c8423..399f9d71a92 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -459,13 +459,14 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, * [start, end]. Existing extents are split as required. */ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, - int skip_pinned) + int skip_pinned) { struct extent_map *em; struct extent_map *split = NULL; struct extent_map *split2 = NULL; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; u64 len = end - start + 1; + u64 gen; int ret; int testend = 1; unsigned long flags; @@ -490,6 +491,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, break; } flags = em->flags; + gen = em->generation; if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { if (testend && em->start + em->len >= start + len) { free_extent_map(em); @@ -518,12 +520,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->block_len = em->block_len; else split->block_len = split->len; - + split->generation = gen; split->bdev = em->bdev; split->flags = flags; split->compress_type = em->compress_type; ret = add_extent_mapping(em_tree, split); BUG_ON(ret); /* Logic error */ + list_move(&split->list, &em_tree->modified_extents); free_extent_map(split); split = split2; split2 = NULL; @@ -537,6 +540,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->bdev = em->bdev; split->flags = flags; split->compress_type = em->compress_type; + split->generation = gen; if (compressed) { split->block_len = em->block_len; @@ -550,6 +554,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, ret = add_extent_mapping(em_tree, split); BUG_ON(ret); /* Logic error */ + list_move(&split->list, &em_tree->modified_extents); free_extent_map(split); split = NULL; } @@ -576,13 +581,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, * it is either truncated or split. Anything entirely inside the range * is deleted from the tree. */ -int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, - u64 start, u64 end, u64 *hint_byte, int drop_cache) +int __btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, u64 start, u64 end, + u64 *hint_byte, int drop_cache) { - struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; - struct btrfs_path *path; struct btrfs_key key; struct btrfs_key new_key; u64 ino = btrfs_ino(inode); @@ -597,14 +602,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, int recow; int ret; int modify_tree = -1; + int update_refs = (root->ref_cows || root == root->fs_info->tree_root); if (drop_cache) btrfs_drop_extent_cache(inode, start, end - 1, 0); - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - if (start >= BTRFS_I(inode)->disk_i_size) modify_tree = 0; @@ -707,7 +709,7 @@ next_slot: extent_end - start); btrfs_mark_buffer_dirty(leaf); - if (disk_bytenr > 0) { + if (update_refs && disk_bytenr > 0) { ret = btrfs_inc_extent_ref(trans, root, disk_bytenr, num_bytes, 0, root->root_key.objectid, @@ -734,7 +736,7 @@ next_slot: btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - end); btrfs_mark_buffer_dirty(leaf); - if (disk_bytenr > 0) { + if (update_refs && disk_bytenr > 0) { inode_sub_bytes(inode, end - key.offset); *hint_byte = disk_bytenr; } @@ -753,7 +755,7 @@ next_slot: btrfs_set_file_extent_num_bytes(leaf, fi, start - key.offset); btrfs_mark_buffer_dirty(leaf); - if (disk_bytenr > 0) { + if (update_refs && disk_bytenr > 0) { inode_sub_bytes(inode, extent_end - start); *hint_byte = disk_bytenr; } @@ -777,12 +779,13 @@ next_slot: del_nr++; } - if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + if (update_refs && + extent_type == BTRFS_FILE_EXTENT_INLINE) { inode_sub_bytes(inode, extent_end - key.offset); extent_end = ALIGN(extent_end, root->sectorsize); - } else if (disk_bytenr > 0) { + } else if (update_refs && disk_bytenr > 0) { ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes, 0, root->root_key.objectid, @@ -806,7 +809,7 @@ next_slot: del_nr); if (ret) { btrfs_abort_transaction(trans, root, ret); - goto out; + break; } del_nr = 0; @@ -825,7 +828,22 @@ next_slot: btrfs_abort_transaction(trans, root, ret); } -out: + btrfs_release_path(path); + return ret; +} + +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, u64 start, + u64 end, u64 *hint_byte, int drop_cache) +{ + struct btrfs_path *path; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = __btrfs_drop_extents(trans, root, inode, path, start, end, + hint_byte, drop_cache); btrfs_free_path(path); return ret; } @@ -892,8 +910,6 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, int ret; u64 ino = btrfs_ino(inode); - btrfs_drop_extent_cache(inode, start, end - 1, 0); - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1556,6 +1572,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) BTRFS_I(inode)->last_trans <= root->fs_info->last_trans_committed) { BTRFS_I(inode)->last_trans = 0; + + /* + * We'v had everything committed since the last time we were + * modified so clear this flag in case it was set for whatever + * reason, it's no longer relevant. + */ + clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); mutex_unlock(&inode->i_mutex); goto out; } -- cgit v1.2.3 From 2671485d395c07fca104c972785898d7c52fc942 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 29 Aug 2012 12:24:27 -0400 Subject: Btrfs: remove unused hint byte argument for btrfs_drop_extents I audited all users of btrfs_drop_extents and found that nobody actually uses the hint_byte argument. I'm sure it was used for something at some point but it's not used now, and the way the pinning works the disk bytenr would never be immediately useful anyway so lets just remove it. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 399f9d71a92..58598c24995 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -584,7 +584,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, u64 start, u64 end, - u64 *hint_byte, int drop_cache) + int drop_cache) { struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; @@ -716,7 +716,6 @@ next_slot: new_key.objectid, start - extent_offset, 0); BUG_ON(ret); /* -ENOMEM */ - *hint_byte = disk_bytenr; } key.offset = start; } @@ -736,10 +735,8 @@ next_slot: btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - end); btrfs_mark_buffer_dirty(leaf); - if (update_refs && disk_bytenr > 0) { + if (update_refs && disk_bytenr > 0) inode_sub_bytes(inode, end - key.offset); - *hint_byte = disk_bytenr; - } break; } @@ -755,10 +752,8 @@ next_slot: btrfs_set_file_extent_num_bytes(leaf, fi, start - key.offset); btrfs_mark_buffer_dirty(leaf); - if (update_refs && disk_bytenr > 0) { + if (update_refs && disk_bytenr > 0) inode_sub_bytes(inode, extent_end - start); - *hint_byte = disk_bytenr; - } if (end == extent_end) break; @@ -794,7 +789,6 @@ next_slot: BUG_ON(ret); /* -ENOMEM */ inode_sub_bytes(inode, extent_end - key.offset); - *hint_byte = disk_bytenr; } if (end == extent_end) @@ -834,7 +828,7 @@ next_slot: int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, - u64 end, u64 *hint_byte, int drop_cache) + u64 end, int drop_cache) { struct btrfs_path *path; int ret; @@ -843,7 +837,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; ret = __btrfs_drop_extents(trans, root, inode, path, start, end, - hint_byte, drop_cache); + drop_cache); btrfs_free_path(path); return ret; } -- cgit v1.2.3 From 2aaa66558172b017f36bf38ae69372813dedee9d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 29 Aug 2012 14:27:18 -0400 Subject: Btrfs: add hole punching This patch adds hole punching via fallocate. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 332 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 328 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 58598c24995..57026a6e9c9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -39,6 +39,7 @@ #include "tree-log.h" #include "locking.h" #include "compat.h" +#include "volumes.h" /* * when auto defrag is enabled we @@ -584,7 +585,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, u64 start, u64 end, - int drop_cache) + u64 *drop_end, int drop_cache) { struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; @@ -822,6 +823,8 @@ next_slot: btrfs_abort_transaction(trans, root, ret); } + if (drop_end) + *drop_end = min(end, extent_end); btrfs_release_path(path); return ret; } @@ -836,7 +839,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - ret = __btrfs_drop_extents(trans, root, inode, path, start, end, + ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL, drop_cache); btrfs_free_path(path); return ret; @@ -1645,6 +1648,324 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) return 0; } +static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf, + int slot, u64 start, u64 end) +{ + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + + if (slot < 0 || slot >= btrfs_header_nritems(leaf)) + return 0; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) + return 0; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) + return 0; + + if (btrfs_file_extent_disk_bytenr(leaf, fi)) + return 0; + + if (key.offset == end) + return 1; + if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start) + return 1; + return 0; +} + +static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, + struct btrfs_path *path, u64 offset, u64 end) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct extent_map *hole_em; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct btrfs_key key; + int ret; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = offset; + + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + return ret; + BUG_ON(!ret); + + leaf = path->nodes[0]; + if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) { + u64 num_bytes; + + path->slots[0]--; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + + end - offset; + btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_mark_buffer_dirty(leaf); + goto out; + } + + if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) { + u64 num_bytes; + + path->slots[0]++; + key.offset = offset; + btrfs_set_item_key_safe(trans, root, path, &key); + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - + offset; + btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_mark_buffer_dirty(leaf); + goto out; + } + btrfs_release_path(path); + + ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, + 0, 0, end - offset, 0, end - offset, + 0, 0, 0); + if (ret) + return ret; + +out: + btrfs_release_path(path); + + hole_em = alloc_extent_map(); + if (!hole_em) { + btrfs_drop_extent_cache(inode, offset, end - 1, 0); + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + } else { + hole_em->start = offset; + hole_em->len = end - offset; + hole_em->orig_start = offset; + + hole_em->block_start = EXTENT_MAP_HOLE; + hole_em->block_len = 0; + hole_em->bdev = root->fs_info->fs_devices->latest_bdev; + hole_em->compress_type = BTRFS_COMPRESS_NONE; + hole_em->generation = trans->transid; + + do { + btrfs_drop_extent_cache(inode, offset, end - 1, 0); + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, hole_em); + if (!ret) + list_move(&hole_em->list, + &em_tree->modified_extents); + write_unlock(&em_tree->lock); + } while (ret == -EEXIST); + free_extent_map(hole_em); + if (ret) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + } + + return 0; +} + +static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_state *cached_state = NULL; + struct btrfs_path *path; + struct btrfs_block_rsv *rsv; + struct btrfs_trans_handle *trans; + u64 mask = BTRFS_I(inode)->root->sectorsize - 1; + u64 lockstart = (offset + mask) & ~mask; + u64 lockend = ((offset + len) & ~mask) - 1; + u64 cur_offset = lockstart; + u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); + u64 drop_end; + unsigned long nr; + int ret = 0; + int err = 0; + bool same_page = (offset >> PAGE_CACHE_SHIFT) == + ((offset + len) >> PAGE_CACHE_SHIFT); + + btrfs_wait_ordered_range(inode, offset, len); + + mutex_lock(&inode->i_mutex); + if (offset >= inode->i_size) { + mutex_unlock(&inode->i_mutex); + return 0; + } + + /* + * Only do this if we are in the same page and we aren't doing the + * entire page. + */ + if (same_page && len < PAGE_CACHE_SIZE) { + ret = btrfs_truncate_page(inode, offset, len, 0); + mutex_unlock(&inode->i_mutex); + return ret; + } + + /* zero back part of the first page */ + ret = btrfs_truncate_page(inode, offset, 0, 0); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } + + /* zero the front end of the last page */ + ret = btrfs_truncate_page(inode, offset + len, 0, 1); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } + + if (lockend < lockstart) { + mutex_unlock(&inode->i_mutex); + return 0; + } + + while (1) { + struct btrfs_ordered_extent *ordered; + + truncate_pagecache_range(inode, lockstart, lockend); + + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, + 0, &cached_state); + ordered = btrfs_lookup_first_ordered_extent(inode, lockend); + + /* + * We need to make sure we have no ordered extents in this range + * and nobody raced in and read a page in this range, if we did + * we need to try again. + */ + if ((!ordered || + (ordered->file_offset + ordered->len < lockstart || + ordered->file_offset > lockend)) && + !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, EXTENT_UPTODATE, 0, + cached_state)) { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state, GFP_NOFS); + btrfs_wait_ordered_range(inode, lockstart, + lockend - lockstart + 1); + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + rsv = btrfs_alloc_block_rsv(root); + if (!rsv) { + ret = -ENOMEM; + goto out_free; + } + rsv->size = btrfs_calc_trunc_metadata_size(root, 1); + rsv->failfast = 1; + + /* + * 1 - update the inode + * 1 - removing the extents in the range + * 1 - adding the hole extent + */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_free; + } + + ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, + min_size); + BUG_ON(ret); + trans->block_rsv = rsv; + + while (cur_offset < lockend) { + ret = __btrfs_drop_extents(trans, root, inode, path, + cur_offset, lockend + 1, + &drop_end, 1); + if (ret != -ENOSPC) + break; + + trans->block_rsv = &root->fs_info->trans_block_rsv; + + ret = fill_holes(trans, inode, path, cur_offset, drop_end); + if (ret) { + err = ret; + break; + } + + cur_offset = drop_end; + + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + err = ret; + break; + } + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); + + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + break; + } + + ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, + rsv, min_size); + BUG_ON(ret); /* shouldn't happen */ + trans->block_rsv = rsv; + } + + if (ret) { + err = ret; + goto out_trans; + } + + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = fill_holes(trans, inode, path, cur_offset, drop_end); + if (ret) { + err = ret; + goto out_trans; + } + +out_trans: + if (!trans) + goto out_free; + + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = btrfs_update_inode(trans, root, inode); + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); +out_free: + btrfs_free_path(path); + btrfs_free_block_rsv(root, rsv); +out: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state, GFP_NOFS); + mutex_unlock(&inode->i_mutex); + if (ret && !err) + err = ret; + return err; +} + static long btrfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { @@ -1663,10 +1984,13 @@ static long btrfs_fallocate(struct file *file, int mode, alloc_start = offset & ~mask; alloc_end = (offset + len + mask) & ~mask; - /* We only support the FALLOC_FL_KEEP_SIZE mode */ - if (mode & ~FALLOC_FL_KEEP_SIZE) + /* Make sure we aren't being give some crap mode */ + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; + if (mode & FALLOC_FL_PUNCH_HOLE) + return btrfs_punch_hole(inode, offset, len); + /* * Make sure we have enough space before we do the * allocation. -- cgit v1.2.3 From 7014cdb49305eda0767d2ae6136f8c191ea8fd81 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 30 Aug 2012 20:06:49 -0400 Subject: Btrfs: btrfs_drop_extent_cache should never fail I noticed this when I was doing the fsync stuff, we allocate split extents if we drop an extent range that is in the middle of an existing extent. This BUG()'s if we fail to allocate memory, but the fact is this is just a cache, we will just regenerate the cache if we need it, the important part is that we free the range we are given. This can be done without allocations, so if we fail to allocate splits just skip the splitting stage and free our em and look for more extents to drop. This also makes btrfs_drop_extent_cache a void since nobody was checking the return value anyway. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 57026a6e9c9..a50e98733e2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -459,8 +459,8 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, * this drops all the extents in the cache that intersect the range * [start, end]. Existing extents are split as required. */ -int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, - int skip_pinned) +void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned) { struct extent_map *em; struct extent_map *split = NULL; @@ -479,11 +479,14 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, testend = 0; } while (1) { + int no_splits = 0; + if (!split) split = alloc_extent_map(); if (!split2) split2 = alloc_extent_map(); - BUG_ON(!split || !split2); /* -ENOMEM */ + if (!split || !split2) + no_splits = 1; write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); @@ -509,6 +512,8 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); clear_bit(EXTENT_FLAG_PINNED, &em->flags); remove_extent_mapping(em_tree, em); + if (no_splits) + goto next; if (em->block_start < EXTENT_MAP_LAST_BYTE && em->start < start) { @@ -559,6 +564,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, free_extent_map(split); split = NULL; } +next: write_unlock(&em_tree->lock); /* once for us */ @@ -570,7 +576,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, free_extent_map(split); if (split2) free_extent_map(split2); - return 0; } /* -- cgit v1.2.3 From 66d8f3dd1c87813d7f1cf8b774cb03e9b8d7e87e Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Sep 2012 04:02:28 -0600 Subject: Btrfs: add a new "type" field into the block reservation structure Sometimes we need choose the method of the reservation according to the type of the block reservation, such as the reservation for the delayed inode update. Now we identify the type just by comparing the address of the reservation variants, it is very ugly if it is a temporary one because we need compare it with all the common reservation variants. So we add a new "type" field to keep the type the reservation variants. Signed-off-by: Miao Xie --- fs/btrfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a50e98733e2..a9d7815cf58 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1874,7 +1874,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out; } - rsv = btrfs_alloc_block_rsv(root); + rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); if (!rsv) { ret = -ENOMEM; goto out_free; -- cgit v1.2.3 From 2ecb79239bcd04c9d410f4cdce16adb6840b19da Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Sep 2012 04:04:27 -0600 Subject: Btrfs: fix unprotected ->log_batch We forget to protect ->log_batch when syncing a file, this patch fix this problem by atomic operation. And ->log_batch is used to check if there are parallel sync operations or not, so it is unnecessary to reset it to 0 after the sync operation of the current log tree complete. Signed-off-by: Miao Xie --- fs/btrfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a9d7815cf58..793bc89c660 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1551,9 +1551,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * ordered range does a filemape_write_and_wait_range which is why we * don't do it above like other file systems. */ - root->log_batch++; + atomic_inc(&root->log_batch); btrfs_wait_ordered_range(inode, start, end); - root->log_batch++; + atomic_inc(&root->log_batch); /* * check the transaction that last modified this inode -- cgit v1.2.3 From 903889f462409c816893abd02d88636f7b4a7774 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Sep 2012 04:04:57 -0600 Subject: Btrfs: fix wrong size for the reservation when doing, file pre-allocation. When we ran fsstress(a program in xfstests), the filesystem hung up when it is full. It was because the space reserved in btrfs_fallocate() was wrong, btrfs_fallocate() just used the size of the pre-allocation to reserve the space, didn't took the block size aligning into account, so the size of the reserved space was less than the allocated space, it caused the over reserve problem and made the filesystem hung up when invoking cow_file_range(). Fix it. Signed-off-by: Miao Xie --- fs/btrfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 793bc89c660..00279c9a7f3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2000,7 +2000,7 @@ static long btrfs_fallocate(struct file *file, int mode, * Make sure we have enough space before we do the * allocation. */ - ret = btrfs_check_data_free_space(inode, len); + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1); if (ret) return ret; @@ -2107,7 +2107,7 @@ static long btrfs_fallocate(struct file *file, int mode, out: mutex_unlock(&inode->i_mutex); /* Let go of our reservation. */ - btrfs_free_reserved_data_space(inode, len); + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1); return ret; } -- cgit v1.2.3 From 9e8a4a8b0b9484e8d14674fc62c9ad8ac9dbce5b Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 5 Sep 2012 19:10:51 -0600 Subject: Btrfs: use flag EXTENT_DEFRAG for snapshot-aware defrag We're going to use this flag EXTENT_DEFRAG to indicate which range belongs to defragment so that we can implement snapshow-aware defrag: We set the EXTENT_DEFRAG flag when dirtying the extents that need defragmented, so later on writeback thread can differentiate between normal writeback and writeback started by defragmentation. Original-Signed-off-by: Li Zefan Signed-off-by: Liu Bo --- fs/btrfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 00279c9a7f3..0a4b03d8fcd 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1203,8 +1203,8 @@ again: clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, - GFP_NOFS); + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + 0, 0, &cached_state, GFP_NOFS); unlock_extent_cached(&BTRFS_I(inode)->io_tree, start_pos, last_pos - 1, &cached_state, GFP_NOFS); -- cgit v1.2.3 From 90abccf2c6e6e9c5a5d519eaed95292afa30aa11 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 13 Sep 2012 04:53:47 -0600 Subject: Revert "Btrfs: do not do filemap_write_and_wait_range in fsync" This reverts commit 0885ef5b5601e9b007c383e77c172769b1f214fd After applying the above patch, the performance slowed down because the dirty page flush can only be done by one task, so revert it. The following is the test result of sysbench: Before After 24MB/s 39MB/s Signed-off-by: Miao Xie --- fs/btrfs/file.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0a4b03d8fcd..d0fc4c5aaf1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1544,12 +1544,20 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_btrfs_sync_file(file, datasync); + /* + * We write the dirty pages in the range and wait until they complete + * out of the ->i_mutex. If so, we can flush the dirty pages by + * multi-task, and make the performance up. + */ + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + mutex_lock(&inode->i_mutex); /* - * we wait first, since the writeback may change the inode, also wait - * ordered range does a filemape_write_and_wait_range which is why we - * don't do it above like other file systems. + * We flush the dirty pages again to avoid some dirty pages in the + * range being left. */ atomic_inc(&root->log_batch); btrfs_wait_ordered_range(inode, start, end); -- cgit v1.2.3 From c3308f84c1743eabb91f4976a314d118d5ea2342 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Sep 2012 14:51:22 -0400 Subject: Btrfs: fix punch hole when no extent exists I saw the warning in btrfs_drop_extent_cache where our end is less than our start while running xfstests 68 in a loop. This is because we unconditionally do drop_end = min(end, extent_end) in __btrfs_drop_extents(), even though we may not have found an extent in the range we were looking to drop. So keep track of wether or not we found something, and if we didn't just use our end. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d0fc4c5aaf1..110d3cb7b6f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -609,6 +609,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int ret; int modify_tree = -1; int update_refs = (root->ref_cows || root == root->fs_info->tree_root); + int found = 0; if (drop_cache) btrfs_drop_extent_cache(inode, start, end - 1, 0); @@ -674,6 +675,7 @@ next_slot: goto next_slot; } + found = 1; search_start = max(key.offset, start); if (recow || !modify_tree) { modify_tree = -1; @@ -829,7 +831,7 @@ next_slot: } if (drop_end) - *drop_end = min(end, extent_end); + *drop_end = found ? min(end, extent_end) : end; btrfs_release_path(path); return ret; } -- cgit v1.2.3 From 0b173bc4daa8f8ec03a85abf5e47b23502ff80af Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:46 -0700 Subject: mm: kill vma flag VM_CAN_NONLINEAR Move actual pte filling for non-linear file mappings into the new special vma operation: ->remap_pages(). Filesystems must implement this method to get non-linear mapping support, if it uses filemap_fault() then generic_file_remap_pages() can be used. Now device drivers can implement this method and obtain nonlinear vma support. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf #arch/tile Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5caf285c6e4..f6b40e86121 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1599,6 +1599,7 @@ out: static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = btrfs_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) @@ -1610,7 +1611,6 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) file_accessed(filp); vma->vm_ops = &btrfs_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } -- cgit v1.2.3 From d0e1d66b5aa1ec9f556f951aa9a114cc192cd01c Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 11 Dec 2012 16:00:21 -0800 Subject: writeback: remove nr_pages_dirtied arg from balance_dirty_pages_ratelimited_nr() There is no reason to pass the nr_pages_dirtied argument, because nr_pages_dirtied value from the caller is unused in balance_dirty_pages_ratelimited_nr(). Signed-off-by: Namjae Jeon Signed-off-by: Vivek Trivedi Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/file.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9ab1bed8811..a8ee75cb96e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1346,8 +1346,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, cond_resched(); - balance_dirty_pages_ratelimited_nr(inode->i_mapping, - dirty_pages); + balance_dirty_pages_ratelimited(inode->i_mapping); if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) btrfs_btree_balance_dirty(root, 1); -- cgit v1.2.3 From 9f3959c53d57d010ae6f4205fbd0159cb7976a83 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 1 Nov 2012 06:38:48 +0000 Subject: Btrfs: get right arguments for btrfs_wait_ordered_range btrfs_wait_ordered_range expects for 'len' instead of 'end'. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9ab1bed8811..d2df98124d0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1562,7 +1562,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * range being left. */ atomic_inc(&root->log_batch); - btrfs_wait_ordered_range(inode, start, end); + btrfs_wait_ordered_range(inode, start, end - start + 1); atomic_inc(&root->log_batch); /* -- cgit v1.2.3 From e1f5790e0588bc5b11eb57f95bfde8702049dd0d Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 8 Nov 2012 04:47:33 +0000 Subject: Btrfs: set hole punching time properly Even if the hole punching is executed, the modification time of the file is not updated. So, current time is set to inode. Signed-off-by: Tsutomu Itoh Signed-off-by: Chris Mason --- fs/btrfs/file.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d2df98124d0..883cf826cf2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1964,6 +1964,9 @@ out_trans: if (!trans) goto out_free; + inode_inc_iversion(inode); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + trans->block_rsv = &root->fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); nr = trans->blocks_used; -- cgit v1.2.3 From b53d3f5db2b79637acadc06a330db6c2c60863f5 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 14 Nov 2012 14:34:34 +0000 Subject: Btrfs: cleanup for btrfs_btree_balance_dirty - 'nr' is no more used. - btrfs_btree_balance_dirty() and __btrfs_btree_balance_dirty() can share a bunch of code. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/file.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 883cf826cf2..bd7f1b01e05 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1349,7 +1349,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, balance_dirty_pages_ratelimited_nr(inode->i_mapping, dirty_pages); if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) - btrfs_btree_balance_dirty(root, 1); + btrfs_btree_balance_dirty(root); pos += copied; num_written += copied; @@ -1803,7 +1803,6 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) u64 cur_offset = lockstart; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); u64 drop_end; - unsigned long nr; int ret = 0; int err = 0; bool same_page = (offset >> PAGE_CACHE_SHIFT) == @@ -1931,9 +1930,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) break; } - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); trans = btrfs_start_transaction(root, 3); if (IS_ERR(trans)) { @@ -1969,9 +1967,8 @@ out_trans: trans->block_rsv = &root->fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); out_free: btrfs_free_path(path); btrfs_free_block_rsv(root, rsv); -- cgit v1.2.3 From 9247f3170b2c3d648707c93bbebcd763fac17c06 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 09:24:43 +0000 Subject: Btrfs: use slabs for auto defrag allocation The auto defrag allocation is in the fast path of the IO, so use slabs to improve the speed of the allocation. And besides that, it can do check for leaked objects when the module is removed. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/file.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index bd7f1b01e05..15117eae85c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -41,6 +41,7 @@ #include "compat.h" #include "volumes.h" +static struct kmem_cache *btrfs_inode_defrag_cachep; /* * when auto defrag is enabled we * queue up these defrag structs to remember which @@ -127,7 +128,7 @@ static void __btrfs_add_inode_defrag(struct inode *inode, return; exists: - kfree(defrag); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); return; } @@ -157,7 +158,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, else transid = BTRFS_I(inode)->root->last_trans; - defrag = kzalloc(sizeof(*defrag), GFP_NOFS); + defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); if (!defrag) return -ENOMEM; @@ -169,7 +170,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) __btrfs_add_inode_defrag(inode, defrag); else - kfree(defrag); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); spin_unlock(&root->fs_info->defrag_inodes_lock); return 0; } @@ -315,7 +316,8 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) next: spin_lock(&fs_info->defrag_inodes_lock); next_free: - kfree(defrag); + if (defrag) + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); } spin_unlock(&fs_info->defrag_inodes_lock); @@ -2293,3 +2295,21 @@ const struct file_operations btrfs_file_operations = { .compat_ioctl = btrfs_ioctl, #endif }; + +void btrfs_auto_defrag_exit(void) +{ + if (btrfs_inode_defrag_cachep) + kmem_cache_destroy(btrfs_inode_defrag_cachep); +} + +int btrfs_auto_defrag_init(void) +{ + btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", + sizeof(struct inode_defrag), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_inode_defrag_cachep) + return -ENOMEM; + + return 0; +} -- cgit v1.2.3 From 8ddc473433b5e8ce8693db9f6e251f5a28267528 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 09:25:38 +0000 Subject: Btrfs: fix unprotected defragable inode insertion We forget to get the defrag lock when we re-add the defragable inode, Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/file.c | 70 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 15 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 15117eae85c..00918321e39 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -91,7 +91,7 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1, * If an existing record is found the defrag item you * pass in is freed */ -static void __btrfs_add_inode_defrag(struct inode *inode, +static int __btrfs_add_inode_defrag(struct inode *inode, struct inode_defrag *defrag) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -119,18 +119,24 @@ static void __btrfs_add_inode_defrag(struct inode *inode, entry->transid = defrag->transid; if (defrag->last_offset > entry->last_offset) entry->last_offset = defrag->last_offset; - goto exists; + return -EEXIST; } } set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); rb_link_node(&defrag->rb_node, parent, p); rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); - return; + return 0; +} -exists: - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return; +static inline int __need_auto_defrag(struct btrfs_root *root) +{ + if (!btrfs_test_opt(root, AUTO_DEFRAG)) + return 0; + + if (btrfs_fs_closing(root->fs_info)) + return 0; + return 1; } /* @@ -143,11 +149,9 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct btrfs_root *root = BTRFS_I(inode)->root; struct inode_defrag *defrag; u64 transid; + int ret; - if (!btrfs_test_opt(root, AUTO_DEFRAG)) - return 0; - - if (btrfs_fs_closing(root->fs_info)) + if (!__need_auto_defrag(root)) return 0; if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) @@ -167,14 +171,50 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, defrag->root = root->root_key.objectid; spin_lock(&root->fs_info->defrag_inodes_lock); - if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) - __btrfs_add_inode_defrag(inode, defrag); - else + if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) { + /* + * If we set IN_DEFRAG flag and evict the inode from memory, + * and then re-read this inode, this new inode doesn't have + * IN_DEFRAG flag. At the case, we may find the existed defrag. + */ + ret = __btrfs_add_inode_defrag(inode, defrag); + if (ret) + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } else { kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } spin_unlock(&root->fs_info->defrag_inodes_lock); return 0; } +/* + * Requeue the defrag object. If there is a defrag object that points to + * the same inode in the tree, we will merge them together (by + * __btrfs_add_inode_defrag()) and free the one that we want to requeue. + */ +void btrfs_requeue_inode_defrag(struct inode *inode, + struct inode_defrag *defrag) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + if (!__need_auto_defrag(root)) + goto out; + + /* + * Here we don't check the IN_DEFRAG flag, because we need merge + * them together. + */ + spin_lock(&root->fs_info->defrag_inodes_lock); + ret = __btrfs_add_inode_defrag(inode, defrag); + spin_unlock(&root->fs_info->defrag_inodes_lock); + if (ret) + goto out; + return; +out: + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); +} + /* * must be called with the defrag_inodes lock held */ @@ -294,7 +334,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) */ if (num_defrag == defrag_batch) { defrag->last_offset = range.start; - __btrfs_add_inode_defrag(inode, defrag); + btrfs_requeue_inode_defrag(inode, defrag); /* * we don't want to kfree defrag, we added it back to * the rbtree @@ -308,7 +348,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) */ defrag->last_offset = 0; defrag->cycled = 1; - __btrfs_add_inode_defrag(inode, defrag); + btrfs_requeue_inode_defrag(inode, defrag); defrag = NULL; } -- cgit v1.2.3 From 26176e7c2aa923327becdc25b5aca2cb907ac932 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 09:26:20 +0000 Subject: Btrfs: restructure btrfs_run_defrag_inodes() This patch restructure btrfs_run_defrag_inodes() and make the code of the auto defragment more readable. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/file.c | 197 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 107 insertions(+), 90 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 00918321e39..3c6f7479cd5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -216,11 +216,11 @@ out: } /* - * must be called with the defrag_inodes lock held + * pick the defragable inode that we want, if it doesn't exist, we will get + * the next one. */ -struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, - u64 root, u64 ino, - struct rb_node **next) +static struct inode_defrag * +btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino) { struct inode_defrag *entry = NULL; struct inode_defrag tmp; @@ -231,7 +231,8 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, tmp.ino = ino; tmp.root = root; - p = info->defrag_inodes.rb_node; + spin_lock(&fs_info->defrag_inodes_lock); + p = fs_info->defrag_inodes.rb_node; while (p) { parent = p; entry = rb_entry(parent, struct inode_defrag, rb_node); @@ -242,52 +243,128 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, else if (ret > 0) p = parent->rb_right; else - return entry; + goto out; } - if (next) { - while (parent && __compare_inode_defrag(&tmp, entry) > 0) { - parent = rb_next(parent); + if (parent && __compare_inode_defrag(&tmp, entry) > 0) { + parent = rb_next(parent); + if (parent) entry = rb_entry(parent, struct inode_defrag, rb_node); - } - *next = parent; + else + entry = NULL; } - return NULL; +out: + if (entry) + rb_erase(parent, &fs_info->defrag_inodes); + spin_unlock(&fs_info->defrag_inodes_lock); + return entry; } -/* - * run through the list of inodes in the FS that need - * defragging - */ -int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) { struct inode_defrag *defrag; + struct rb_node *node; + + spin_lock(&fs_info->defrag_inodes_lock); + node = rb_first(&fs_info->defrag_inodes); + while (node) { + rb_erase(node, &fs_info->defrag_inodes); + defrag = rb_entry(node, struct inode_defrag, rb_node); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + + if (need_resched()) { + spin_unlock(&fs_info->defrag_inodes_lock); + cond_resched(); + spin_lock(&fs_info->defrag_inodes_lock); + } + + node = rb_first(&fs_info->defrag_inodes); + } + spin_unlock(&fs_info->defrag_inodes_lock); +} + +#define BTRFS_DEFRAG_BATCH 1024 + +static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, + struct inode_defrag *defrag) +{ struct btrfs_root *inode_root; struct inode *inode; - struct rb_node *n; struct btrfs_key key; struct btrfs_ioctl_defrag_range_args range; - u64 first_ino = 0; - u64 root_objectid = 0; int num_defrag; - int defrag_batch = 1024; + /* get the inode */ + key.objectid = defrag->root; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = (u64)-1; + inode_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(inode_root)) { + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + return PTR_ERR(inode_root); + } + + key.objectid = defrag->ino; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); + if (IS_ERR(inode)) { + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + return PTR_ERR(inode); + } + + /* do a chunk of defrag */ + clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); memset(&range, 0, sizeof(range)); range.len = (u64)-1; + range.start = defrag->last_offset; + num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, + BTRFS_DEFRAG_BATCH); + /* + * if we filled the whole defrag batch, there + * must be more work to do. Queue this defrag + * again + */ + if (num_defrag == BTRFS_DEFRAG_BATCH) { + defrag->last_offset = range.start; + btrfs_requeue_inode_defrag(inode, defrag); + } else if (defrag->last_offset && !defrag->cycled) { + /* + * we didn't fill our defrag batch, but + * we didn't start at zero. Make sure we loop + * around to the start of the file. + */ + defrag->last_offset = 0; + defrag->cycled = 1; + btrfs_requeue_inode_defrag(inode, defrag); + } else { + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } + + iput(inode); + return 0; +} + +/* + * run through the list of inodes in the FS that need + * defragging + */ +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) +{ + struct inode_defrag *defrag; + u64 first_ino = 0; + u64 root_objectid = 0; atomic_inc(&fs_info->defrag_running); - spin_lock(&fs_info->defrag_inodes_lock); while(1) { - n = NULL; + if (!__need_auto_defrag(fs_info->tree_root)) + break; /* find an inode to defrag */ - defrag = btrfs_find_defrag_inode(fs_info, root_objectid, - first_ino, &n); + defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, + first_ino); if (!defrag) { - if (n) { - defrag = rb_entry(n, struct inode_defrag, - rb_node); - } else if (root_objectid || first_ino) { + if (root_objectid || first_ino) { root_objectid = 0; first_ino = 0; continue; @@ -296,71 +373,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) } } - /* remove it from the rbtree */ first_ino = defrag->ino + 1; root_objectid = defrag->root; - rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); - - if (btrfs_fs_closing(fs_info)) - goto next_free; - spin_unlock(&fs_info->defrag_inodes_lock); - - /* get the inode */ - key.objectid = defrag->root; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); - key.offset = (u64)-1; - inode_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(inode_root)) - goto next; - - key.objectid = defrag->ino; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); - key.offset = 0; - - inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); - if (IS_ERR(inode)) - goto next; - - /* do a chunk of defrag */ - clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); - range.start = defrag->last_offset; - num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, - defrag_batch); - /* - * if we filled the whole defrag batch, there - * must be more work to do. Queue this defrag - * again - */ - if (num_defrag == defrag_batch) { - defrag->last_offset = range.start; - btrfs_requeue_inode_defrag(inode, defrag); - /* - * we don't want to kfree defrag, we added it back to - * the rbtree - */ - defrag = NULL; - } else if (defrag->last_offset && !defrag->cycled) { - /* - * we didn't fill our defrag batch, but - * we didn't start at zero. Make sure we loop - * around to the start of the file. - */ - defrag->last_offset = 0; - defrag->cycled = 1; - btrfs_requeue_inode_defrag(inode, defrag); - defrag = NULL; - } - - iput(inode); -next: - spin_lock(&fs_info->defrag_inodes_lock); -next_free: - if (defrag) - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + __btrfs_run_defrag_inode(fs_info, defrag); } - spin_unlock(&fs_info->defrag_inodes_lock); - atomic_dec(&fs_info->defrag_running); /* -- cgit v1.2.3 From b66f00da0cfceb856c17706b77906b63437f6fda Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 09:27:29 +0000 Subject: Btrfs: fix freeze vs auto defrag If we freeze the fs, the auto defragment should not run. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/file.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 3c6f7479cd5..d415a052ca9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -318,8 +318,11 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, memset(&range, 0, sizeof(range)); range.len = (u64)-1; range.start = defrag->last_offset; + + sb_start_write(fs_info->sb); num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, BTRFS_DEFRAG_BATCH); + sb_end_write(fs_info->sb); /* * if we filled the whole defrag batch, there * must be more work to do. Queue this defrag -- cgit v1.2.3 From 797f4277113bff142b6c64a55abaef64d7d67d5c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 28 Nov 2012 10:28:07 +0000 Subject: Btrfs: use existing align macros in btrfs_allocate() The kernel developers have implemented some often-used align macros, we should use them instead of the complex code. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d415a052ca9..a43d0aef6ee 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2054,12 +2054,12 @@ static long btrfs_fallocate(struct file *file, int mode, u64 alloc_end; u64 alloc_hint = 0; u64 locked_end; - u64 mask = BTRFS_I(inode)->root->sectorsize - 1; struct extent_map *em; + int blocksize = BTRFS_I(inode)->root->sectorsize; int ret; - alloc_start = offset & ~mask; - alloc_end = (offset + len + mask) & ~mask; + alloc_start = round_down(offset, blocksize); + alloc_end = round_up(offset + len, blocksize); /* Make sure we aren't being give some crap mode */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) @@ -2140,7 +2140,7 @@ static long btrfs_fallocate(struct file *file, int mode, } last_byte = min(extent_map_end(em), alloc_end); actual_end = min_t(u64, extent_map_end(em), offset + len); - last_byte = (last_byte + mask) & ~mask; + last_byte = ALIGN(last_byte, blocksize); if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && -- cgit v1.2.3 From 0ff6fabdb0a862b22df4dd75873578392478e64d Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 28 Nov 2012 10:28:54 +0000 Subject: Btrfs: fix off-by-one error of the reserved size of btrfs_allocate() alloc_end is not the real end of the current extent, it is the start of the next adjoining extent. So we needn't +1 when calculating the size the space that is about to be reserved. Signed-off-by: Miao Xie Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a43d0aef6ee..8e3d6788d6d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2072,7 +2072,7 @@ static long btrfs_fallocate(struct file *file, int mode, * Make sure we have enough space before we do the * allocation. */ - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1); + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); if (ret) return ret; @@ -2179,7 +2179,7 @@ static long btrfs_fallocate(struct file *file, int mode, out: mutex_unlock(&inode->i_mutex); /* Let go of our reservation. */ - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1); + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); return ret; } -- cgit v1.2.3 From 6347b3c433a4cff00eb2299c7f2c7d1d8b24c1fc Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 5 Dec 2012 10:53:45 +0000 Subject: Btrfs: fix off-by-one error of the same page check in btrfs_punch_hole() (start + len) is the start of the adjacent extent, not the end of the current extent, so we should not use it to check the hole is on the same page or not. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8e3d6788d6d..d75412bf7c4 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1867,8 +1867,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) u64 drop_end; int ret = 0; int err = 0; - bool same_page = (offset >> PAGE_CACHE_SHIFT) == - ((offset + len) >> PAGE_CACHE_SHIFT); + bool same_page = ((offset >> PAGE_CACHE_SHIFT) == + ((offset + len - 1) >> PAGE_CACHE_SHIFT)); btrfs_wait_ordered_range(inode, offset, len); -- cgit v1.2.3 From 0061280d2c7240805cfd7b1f493da967c97c2f34 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 5 Dec 2012 10:54:12 +0000 Subject: Btrfs: fix the page that is beyond EOF Steps to reproduce: # mkfs.btrfs # mount # dd if=/dev/zero of=/ bs=512 seek=5 count=8 # fallocate -p -o 2048 -l 16384 / # dd if=/dev/zero of=/ bs=4096 seek=3 count=8 conv=notrunc,nocreat # umount # dmesg WARNING: at fs/btrfs/inode.c:7140 btrfs_destroy_inode+0x2eb/0x330 The reason is that we inputed a range which is beyond the end of the file. And because the end of this range was not page-aligned, we had to truncate the last page in this range, this operation is similar to a buffered file write. In other words, we reserved enough space and clear the data which was in the hole range on that page. But when we expanded that test file, write the data into the same page, we forgot that we have reserved enough space for the buffered write of that page because in most cases there is no page that is beyond the end of the file. As a result, we reserved the space twice. In fact, we needn't truncate the page if it is beyond the end of the file, just release the allocated space in that range. Fix the above problem by this way. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/file.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d75412bf7c4..700ffd266da 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1859,9 +1859,9 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) struct btrfs_path *path; struct btrfs_block_rsv *rsv; struct btrfs_trans_handle *trans; - u64 mask = BTRFS_I(inode)->root->sectorsize - 1; - u64 lockstart = (offset + mask) & ~mask; - u64 lockend = ((offset + len) & ~mask) - 1; + u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); + u64 lockend = round_down(offset + len, + BTRFS_I(inode)->root->sectorsize) - 1; u64 cur_offset = lockstart; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); u64 drop_end; @@ -1896,10 +1896,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } /* zero the front end of the last page */ - ret = btrfs_truncate_page(inode, offset + len, 0, 1); - if (ret) { - mutex_unlock(&inode->i_mutex); - return ret; + if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) { + ret = btrfs_truncate_page(inode, offset + len, 0, 1); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } } if (lockend < lockstart) { -- cgit v1.2.3 From 7426cc04d407621773af3a0403e57642e40c36bf Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 5 Dec 2012 10:54:52 +0000 Subject: Btrfs: punch hole past the end of the file Since we can pre-allocate the space past EOF, we should be able to reclaim that space if we need. This patch implements it by removing the EOF check. Though the manual of fallocate command says we can use truncate command to reclaim the pre-allocated space which past EOF, but because truncate command changes the file size, we must run several commands to reclaim the space if we don't want to change the file size, so it is not a good choice. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/file.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 700ffd266da..71c2dc1ea15 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1873,26 +1873,28 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) btrfs_wait_ordered_range(inode, offset, len); mutex_lock(&inode->i_mutex); - if (offset >= inode->i_size) { - mutex_unlock(&inode->i_mutex); - return 0; - } - + /* + * We needn't truncate any page which is beyond the end of the file + * because we are sure there is no data there. + */ /* * Only do this if we are in the same page and we aren't doing the * entire page. */ if (same_page && len < PAGE_CACHE_SIZE) { - ret = btrfs_truncate_page(inode, offset, len, 0); + if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) + ret = btrfs_truncate_page(inode, offset, len, 0); mutex_unlock(&inode->i_mutex); return ret; } /* zero back part of the first page */ - ret = btrfs_truncate_page(inode, offset, 0, 0); - if (ret) { - mutex_unlock(&inode->i_mutex); - return ret; + if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) { + ret = btrfs_truncate_page(inode, offset, 0, 0); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } } /* zero the front end of the last page */ -- cgit v1.2.3 From b812ce28796f746f14ba6cc451250c422db447b2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Nov 2012 13:56:32 -0500 Subject: Btrfs: inline csums if we're fsyncing The tree logging stuff needs the csums to be on the ordered extents in order to log them properly, so mark that we're sync and inline the csum creation so we don't have to wait on the csumming to be done when logging extents that are still in flight. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/file.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 71c2dc1ea15..7f4654a1520 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1472,6 +1472,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, ssize_t num_written = 0; ssize_t err = 0; size_t count, ocount; + bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); sb_start_write(inode->i_sb); @@ -1529,6 +1530,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, } } + if (sync) + atomic_inc(&BTRFS_I(inode)->sync_writers); + if (unlikely(file->f_flags & O_DIRECT)) { num_written = __btrfs_direct_write(iocb, iov, nr_segs, pos, ppos, count, ocount); @@ -1563,6 +1567,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, num_written = err; } out: + if (sync) + atomic_dec(&BTRFS_I(inode)->sync_writers); sb_end_write(inode->i_sb); current->backing_dev_info = NULL; return num_written ? num_written : err; @@ -1613,7 +1619,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * out of the ->i_mutex. If so, we can flush the dirty pages by * multi-task, and make the performance up. */ + atomic_inc(&BTRFS_I(inode)->sync_writers); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + atomic_dec(&BTRFS_I(inode)->sync_writers); if (ret) return ret; -- cgit v1.2.3 From b493968096944a11422c4d80fb87af537ca1cac7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 3 Dec 2012 10:31:19 -0500 Subject: Btrfs: keep track of the extents original block length If we've written to a prealloc extent we need to know the original block len for the extent. We can't figure this out currently since ->block_len is just set to the extent length. So introduce ->orig_block_len so that we know how many bytes were in the original extent for proper extent logging that future patches will need. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/file.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7f4654a1520..6810145f4e9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -588,6 +588,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->block_len = em->block_len; else split->block_len = split->len; + split->orig_block_len = max(split->block_len, + em->orig_block_len); split->generation = gen; split->bdev = em->bdev; split->flags = flags; @@ -609,6 +611,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->flags = flags; split->compress_type = em->compress_type; split->generation = gen; + split->orig_block_len = max(em->block_len, + em->orig_block_len); if (compressed) { split->block_len = em->block_len; @@ -1838,6 +1842,7 @@ out: hole_em->block_start = EXTENT_MAP_HOLE; hole_em->block_len = 0; + hole_em->orig_block_len = 0; hole_em->bdev = root->fs_info->fs_devices->latest_bdev; hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = trans->transid; -- cgit v1.2.3 From 70c8a91ce21b83ccd2d9e7c968775430ead4353d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 11 Oct 2012 16:54:30 -0400 Subject: Btrfs: log changed inodes based on the extent map tree We don't really need to copy extents from the source tree since we have all of the information already available to us in the extent_map tree. So instead just write the extents straight to the log tree and don't bother to copy the extent items from the source tree. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 6810145f4e9..c56088ece50 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -621,7 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, } else { split->block_len = split->len; split->block_start = em->block_start + diff; - split->orig_start = split->start; + split->orig_start = em->orig_start; } ret = add_extent_mapping(em_tree, split); -- cgit v1.2.3 From 6c760c072403f446ff829ec9e89568943a3c2ef2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Nov 2012 10:53:21 -0500 Subject: Btrfs: do not call file_update_time in aio_write This starts a transaction and dirties the inode everytime we call it, which is super expensive if you have a write heavy workload. We will be updating the inode when the IO completes and we reserve the space for the inode update when we reserve space for the write, so there is no chance of loss of information or enospc issues. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/file.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index c56088ece50..20452c110d7 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1464,6 +1464,24 @@ out: return written ? written : err; } +static void update_time_for_write(struct inode *inode) +{ + struct timespec now; + + if (IS_NOCMTIME(inode)) + return; + + now = current_fs_time(inode->i_sb); + if (!timespec_equal(&inode->i_mtime, &now)) + inode->i_mtime = now; + + if (!timespec_equal(&inode->i_ctime, &now)) + inode->i_ctime = now; + + if (IS_I_VERSION(inode)) + inode_inc_iversion(inode); +} + static ssize_t btrfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) @@ -1519,11 +1537,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, goto out; } - err = file_update_time(file); - if (err) { - mutex_unlock(&inode->i_mutex); - goto out; - } + /* + * We reserve space for updating the inode when we reserve space for the + * extent we are going to write, so we will enospc out there. We don't + * need to start yet another transaction to update the inode as we will + * update the inode when we finish writing whatever data we write. + */ + update_time_for_write(inode); start_pos = round_down(pos, root->sectorsize); if (start_pos > i_size_read(inode)) { @@ -1563,8 +1583,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, * this will either be one more than the running transaction * or the generation used for the next transaction if there isn't * one running right now. + * + * We also have to set last_sub_trans to the current log transid, + * otherwise subsequent syncs to a file that's been synced in this + * transaction will appear to have already occured. */ BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; + BTRFS_I(inode)->last_sub_trans = root->log_transid; if (num_written > 0 || num_written == -EIOCBQUEUED) { err = generic_write_sync(file, pos, num_written); if (err < 0 && num_written > 0) -- cgit v1.2.3 From 965c8e59cfcf845ecde2265a1d1bfee5f011d302 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Dec 2012 15:59:39 -0800 Subject: lseek: the "whence" argument is called "whence" But the kernel decided to call it "origin" instead. Fix most of the sites. Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/file.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a8ee75cb96e..9c6673a9231 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2120,7 +2120,7 @@ out: return ret; } -static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) +static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) { struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_map *em; @@ -2154,7 +2154,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) * before the position we want in case there is outstanding delalloc * going on here. */ - if (origin == SEEK_HOLE && start != 0) { + if (whence == SEEK_HOLE && start != 0) { if (start <= root->sectorsize) em = btrfs_get_extent_fiemap(inode, NULL, 0, 0, root->sectorsize, 0); @@ -2188,13 +2188,13 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) } } - if (origin == SEEK_HOLE) { + if (whence == SEEK_HOLE) { *offset = start; free_extent_map(em); break; } } else { - if (origin == SEEK_DATA) { + if (whence == SEEK_DATA) { if (em->block_start == EXTENT_MAP_DELALLOC) { if (start >= inode->i_size) { free_extent_map(em); @@ -2231,16 +2231,16 @@ out: return ret; } -static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) +static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; int ret; mutex_lock(&inode->i_mutex); - switch (origin) { + switch (whence) { case SEEK_END: case SEEK_CUR: - offset = generic_file_llseek(file, offset, origin); + offset = generic_file_llseek(file, offset, whence); goto out; case SEEK_DATA: case SEEK_HOLE: @@ -2249,7 +2249,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) return -ENXIO; } - ret = find_desired_extent(inode, &offset, origin); + ret = find_desired_extent(inode, &offset, whence); if (ret) { mutex_unlock(&inode->i_mutex); return ret; -- cgit v1.2.3 From 1214b53f90131fee1f950010c43e92455fe598ab Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 7 Jan 2013 03:53:08 +0000 Subject: Btrfs: fix off-by-one in lseek Lock end is inclusive. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 20452c110d7..fa48051484b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2242,6 +2242,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) if (lockend <= lockstart) lockend = lockstart + root->sectorsize; + lockend--; len = lockend - lockstart + 1; len = max_t(u64, len, root->sectorsize); -- cgit v1.2.3 From f9e4fb53938de5db01950c9dfe479703b2f5c964 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 7 Jan 2013 10:10:12 +0000 Subject: Btrfs: fix a bug when llseek for delalloc bytes behind prealloc extents xfstests case 285 complains. It it because btrfs did not try to find unwritten delalloc bytes(only dirty pages, not yet writeback) behind prealloc extents, it ends up finding nothing while we're with SEEK_DATA. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index fa48051484b..841cfe3be0e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2309,9 +2309,12 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) } } - *offset = start; - free_extent_map(em); - break; + if (!test_bit(EXTENT_FLAG_PREALLOC, + &em->flags)) { + *offset = start; + free_extent_map(em); + break; + } } } -- cgit v1.2.3 From 0a3404dcff29a7589e8d6ce8c36f97c5411f85b4 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 28 Jan 2013 12:34:55 +0000 Subject: Btrfs: fix wrong sync_writers decrement in btrfs_file_aio_write() If the checks at the beginning of btrfs_file_aio_write() fail, we needn't decrease ->sync_writers, because we have not increased it. Fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 841cfe3be0e..a902faab716 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1595,9 +1595,10 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, if (err < 0 && num_written > 0) num_written = err; } -out: + if (sync) atomic_dec(&BTRFS_I(inode)->sync_writers); +out: sb_end_write(inode->i_sb); current->backing_dev_info = NULL; return num_written ? num_written : err; -- cgit v1.2.3 From 6f1c36055f96e80031c7fdda3fd5be826b8d7782 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 29 Jan 2013 03:22:10 +0000 Subject: Btrfs: fix race between snapshot deletion and getting inode While running snapshot testscript created by Mitch and David, the race between autodefrag and snapshot deletion can lead to corruption of dead_root list so that we can get crash on btrfs_clean_old_snapshots(). And besides autodefrag, scrub also does the same thing, ie. read root first and get inode. Here is the story(take autodefrag as an example): (1) when we delete a snapshot or subvolume, it will set its root's refs to zero and do a iput() on its own inode, and if this inode happens to be the only active in-meory one in root's inode rbtree, it will add itself to the global dead_roots list for later cleanup. (2) after (1), the autodefrag thread may read another inode for defrag and the inode is just in the deleted snapshot/subvolume, but all of these are without checking if the root is still valid(refs > 0). So the end up result is adding the deleted snapshot/subvolume's root to the global dead_roots list AGAIN. Fortunately, we already have a srcu lock to avoid the race, ie. subvol_srcu. So all we need to do is to take the lock to protect 'read root and get inode', since we synchronize to wait for the rcu grace period before adding something to the global dead_roots list. Reported-by: Mitch Harder Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a902faab716..b06d289f998 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -293,15 +293,24 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct btrfs_key key; struct btrfs_ioctl_defrag_range_args range; int num_defrag; + int index; + int ret; /* get the inode */ key.objectid = defrag->root; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + inode_root = btrfs_read_fs_root_no_name(fs_info, &key); if (IS_ERR(inode_root)) { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return PTR_ERR(inode_root); + ret = PTR_ERR(inode_root); + goto cleanup; + } + if (btrfs_root_refs(&inode_root->root_item) == 0) { + ret = -ENOENT; + goto cleanup; } key.objectid = defrag->ino; @@ -309,9 +318,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); if (IS_ERR(inode)) { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return PTR_ERR(inode); + ret = PTR_ERR(inode); + goto cleanup; } + srcu_read_unlock(&fs_info->subvol_srcu, index); /* do a chunk of defrag */ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); @@ -346,6 +356,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, iput(inode); return 0; +cleanup: + srcu_read_unlock(&fs_info->subvol_srcu, index); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + return ret; } /* -- cgit v1.2.3