From b9959295151625c17723103afd79077e80b24ddd Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Mon, 25 Jun 2012 21:25:22 -0600 Subject: Btrfs: return error of btrfs_update_inode() to caller We didn't check error of btrfs_update_inode(), but that error looks easy to bubble back up. Reviewed-by: David Sterba Signed-off-by: Tsutomu Itoh Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8abeae4224f..c86670f4f28 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -637,7 +637,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, } inode_set_bytes(inode, saved_nbytes); - btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, inode); out: if (inode) iput(inode); @@ -1133,7 +1133,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (ret == 0) { btrfs_inc_nlink(inode); - btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, inode); } else if (ret == -EEXIST) { ret = 0; } else { -- cgit v1.2.3 From 5dc562c541e1026df9d43913c2f6b91156e22d32 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 17 Aug 2012 13:14:17 -0400 Subject: Btrfs: turbo charge fsync At least for the vm workload. Currently on fsync we will 1) Truncate all items in the log tree for the given inode if they exist and 2) Copy all items for a given inode into the log The problem with this is that for things like VMs you can have lots of extents from the fragmented writing behavior, and worst yet you may have only modified a few extents, not the entire thing. This patch fixes this problem by tracking which transid modified our extent, and then when we do the tree logging we find all of the extents we've modified in our current transaction, sort them and commit them. We also only truncate up to the xattrs of the inode and copy that stuff in normally, and then just drop any extents in the range we have that exist in the log already. Here are some numbers of a 50 meg fio job that does random writes and fsync()s after every write Original Patched SATA drive 82KB/s 140KB/s Fusion drive 431KB/s 2532KB/s So around 2-6 times faster depending on your hardware. There are a few corner cases, for example if you truncate at all we have to do it the old way since there is no way to be sure what is in the log is ok. This probably could be done smarter, but if you write-fsync-truncate-write-fsync you deserve what you get. All this work is in RAM of course so if your inode gets evicted from cache and you read it in and fsync it we'll do it the slow way if we are still in the same transaction that we last modified the inode in. The biggest cool part of this is that it requires no changes to the recovery code, so if you fsync with this patch and crash and load an old kernel, it will run the recovery and be a-ok. I have tested this pretty thoroughly with an fsync tester and everything comes back fine, as well as xfstests. 
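The fio job file itself is not part of this log; as a rough stand-in, the sketch below drives the same kind of workload from plain C: random 4 KiB writes into a 50 MiB file with an fsync() after every write, so each write forces a tree-log commit. The file name, write count and RNG seed are illustrative assumptions, not values from the original test.

/*
 * Hedged sketch of the benchmark workload described above: random
 * 4 KiB writes into a 50 MiB file, fsync() after every write.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

#define FILE_SIZE (50 * 1024 * 1024)
#define BLOCK_SIZE 4096
#define NR_WRITES 1000

int main(void)
{
        char buf[BLOCK_SIZE];
        int fd, i;

        memset(buf, 0xab, sizeof(buf));
        fd = open("fsync-test.img", O_CREAT | O_WRONLY, 0644);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (ftruncate(fd, FILE_SIZE) < 0) {
                perror("ftruncate");
                return 1;
        }
        srand(1234);
        for (i = 0; i < NR_WRITES; i++) {
                off_t off = ((off_t)(rand() % (FILE_SIZE / BLOCK_SIZE))) * BLOCK_SIZE;

                if (pwrite(fd, buf, sizeof(buf), off) != sizeof(buf)) {
                        perror("pwrite");
                        return 1;
                }
                if (fsync(fd) < 0) {    /* one log commit per write */
                        perror("fsync");
                        return 1;
                }
        }
        close(fd);
        return 0;
}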
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 220 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 214 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c86670f4f28..f2ff02c5513 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -18,6 +18,7 @@ #include #include +#include #include "ctree.h" #include "transaction.h" #include "disk-io.h" @@ -550,7 +551,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ - ret = btrfs_drop_extents(trans, inode, start, extent_end, + ret = btrfs_drop_extents(trans, root, inode, start, extent_end, &alloc_hint, 1); BUG_ON(ret); @@ -2803,6 +2804,194 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, return ret; } +static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct extent_map *em1, *em2; + + em1 = list_entry(a, struct extent_map, list); + em2 = list_entry(b, struct extent_map, list); + + if (em1->start < em2->start) + return -1; + else if (em1->start > em2->start) + return 1; + return 0; +} + +struct log_args { + struct extent_buffer *src; + u64 next_offset; + int start_slot; + int nr; +}; + +static int log_one_extent(struct btrfs_trans_handle *trans, + struct inode *inode, struct btrfs_root *root, + struct extent_map *em, struct btrfs_path *path, + struct btrfs_path *dst_path, struct log_args *args) +{ + struct btrfs_root *log = root->log_root; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 start = em->start; + u64 len = em->len; + u64 num_bytes; + int nritems; + int ret; + + if (BTRFS_I(inode)->logged_trans == trans->transid) { + u64 tmp; + ret = __btrfs_drop_extents(trans, log, inode, dst_path, start, + start + len, &tmp, 0); + if (ret) + return ret; + } + + while (len) { + if (args->nr) + goto next_slot; + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + if (ret) { + /* + * This shouldn't happen, but it might so warn and + * return an error. + */ + WARN_ON(1); + return -ENOENT; + } + args->src = path->nodes[0]; +next_slot: + fi = btrfs_item_ptr(args->src, path->slots[0], + struct btrfs_file_extent_item); + if (args->nr && + args->start_slot + args->nr == path->slots[0]) { + args->nr++; + } else if (args->nr) { + ret = copy_items(trans, log, dst_path, args->src, + args->start_slot, args->nr, + LOG_INODE_ALL); + if (ret) + return ret; + args->nr = 1; + args->start_slot = path->slots[0]; + } else if (!args->nr) { + args->nr = 1; + args->start_slot = path->slots[0]; + } + nritems = btrfs_header_nritems(path->nodes[0]); + path->slots[0]++; + num_bytes = btrfs_file_extent_num_bytes(args->src, fi); + if (len < num_bytes) { + /* I _think_ this is ok, envision we write to a + * preallocated space that is adjacent to a previously + * written preallocated space that gets merged when we + * mark this preallocated space written. If we do not + * have the adjacent extent in cache then when we copy + * this extent it could end up being larger than our EM + * thinks it is, which is a-ok, so just set len to 0. 
+ */ + len = 0; + } else { + len -= num_bytes; + } + start += btrfs_file_extent_num_bytes(args->src, fi); + args->next_offset = start; + + if (path->slots[0] < nritems) { + if (len) + goto next_slot; + break; + } + + if (args->nr) { + ret = copy_items(trans, log, dst_path, args->src, + args->start_slot, args->nr, + LOG_INODE_ALL); + if (ret) + return ret; + args->nr = 0; + btrfs_release_path(path); + } + } + + return 0; +} + +static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path) +{ + struct log_args args; + struct btrfs_root *log = root->log_root; + struct extent_map *em, *n; + struct list_head extents; + struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; + u64 test_gen; + int ret = 0; + + INIT_LIST_HEAD(&extents); + + memset(&args, 0, sizeof(args)); + + write_lock(&tree->lock); + test_gen = root->fs_info->last_trans_committed; + + list_for_each_entry_safe(em, n, &tree->modified_extents, list) { + list_del_init(&em->list); + if (em->generation <= test_gen) + continue; + list_add_tail(&em->list, &extents); + } + + list_sort(NULL, &extents, extent_cmp); + + while (!list_empty(&extents)) { + em = list_entry(extents.next, struct extent_map, list); + + list_del_init(&em->list); + + /* + * If we had an error we just need to delete everybody from our + * private list. + */ + if (ret) + continue; + + /* + * If the previous EM and the last extent we left off on aren't + * sequential then we need to copy the items we have and redo + * our search + */ + if (args.nr && em->start != args.next_offset) { + ret = copy_items(trans, log, dst_path, args.src, + args.start_slot, args.nr, + LOG_INODE_ALL); + if (ret) + continue; + btrfs_release_path(path); + args.nr = 0; + } + + ret = log_one_extent(trans, inode, root, em, path, dst_path, &args); + } + + if (!ret && args.nr) + ret = copy_items(trans, log, dst_path, args.src, + args.start_slot, args.nr, LOG_INODE_ALL); + btrfs_release_path(path); + WARN_ON(!list_empty(&extents)); + write_unlock(&tree->lock); + return ret; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. 
@@ -2832,6 +3021,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, int nritems; int ins_start_slot = 0; int ins_nr; + bool fast_search = false; u64 ino = btrfs_ino(inode); log = root->log_root; @@ -2851,10 +3041,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, max_key.objectid = ino; - /* today the code can only do partial logging of directories */ - if (!S_ISDIR(inode->i_mode)) - inode_only = LOG_INODE_ALL; + /* today the code can only do partial logging of directories */ if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) max_key.type = BTRFS_XATTR_ITEM_KEY; else @@ -2881,7 +3069,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, max_key_type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key_type); } else { - ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); + if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags)) { + ret = btrfs_truncate_inode_items(trans, log, + inode, 0, 0); + } else { + fast_search = true; + max_key.type = BTRFS_XATTR_ITEM_KEY; + ret = drop_objectid_items(trans, log, path, ino, + BTRFS_XATTR_ITEM_KEY); + } } if (ret) { err = ret; @@ -2960,7 +3157,18 @@ next_slot: } ins_nr = 0; } - WARN_ON(ins_nr); + + if (fast_search) { + btrfs_release_path(path); + btrfs_release_path(dst_path); + ret = btrfs_log_changed_extents(trans, root, inode, path, + dst_path); + if (ret) { + err = ret; + goto out_unlock; + } + } + if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { btrfs_release_path(path); btrfs_release_path(dst_path); -- cgit v1.2.3 From 0fa83cdb1d72a94ea84ab6380747de6ac7cc8753 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Aug 2012 14:48:11 -0400 Subject: Btrfs: only warn if we hit an error when doing the tree logging I hit this a couple times while working on my fsync patch (all my bugs, not normal operation), but with my new stuff we could have new errors from cases I have not encountered, so instead of BUG()'ing we should be WARN()'ing so that we are notified there is a problem but the user doesn't lose their data. We can easily commit the transaction in the case that the tree logging fails and still be fine, so let's try and be as nice to the user as possible. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f2ff02c5513..94db438494d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3346,7 +3346,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, end_trans: dput(old_parent); if (ret < 0) { - BUG_ON(ret != -ENOSPC); + WARN_ON(ret != -ENOSPC); root->fs_info->last_trans_log_full_commit = trans->transid; ret = 1; } -- cgit v1.2.3 From 06d3d22b456c2f87aeb1eb4517eeabb47e21fcc9 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 27 Aug 2012 10:52:19 -0600 Subject: Btrfs: cleanup extents after we finish logging inode This is based on Josef's "Btrfs: turbo charge fsync". We should cleanup those extents after we've finished logging inode, otherwise we may do redundant work on them. 
Signed-off-by: Liu Bo --- fs/btrfs/tree-log.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 94db438494d..58075d711d2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3167,6 +3167,12 @@ next_slot: err = ret; goto out_unlock; } + } else { + struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em, *n; + + list_for_each_entry_safe(em, n, &tree->modified_extents, list) + list_del_init(&em->list); } if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { -- cgit v1.2.3 From 4e2f84e63dc138eca91e89ccbc34f37732ce58f7 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 27 Aug 2012 10:52:20 -0600 Subject: Btrfs: improve fsync by filtering extents that we want This is based on Josef's "Btrfs: turbo charge fsync". The above Josef's patch performs very good in random sync write test, because we won't have too much extents to merge. However, it does not performs good on the test: dd if=/dev/zero of=foobar bs=4k count=12500 oflag=sync The reason is when we do sequencial sync write, we need to merge the current extent just with the previous one, so that we can get accumulated extents to log: A(4k) --> AA(8k) --> AAA(12k) --> AAAA(16k) ... So we'll have to flush more and more checksum into log tree, which is the bottleneck according to my tests. But we can avoid this by telling fsync the real extents that are needed to be logged. With this, I did the above dd sync write test (size=50m), w/o (orig) w/ (josef's) w/ (this) SATA 104KB/s 109KB/s 121KB/s ramdisk 1.5MB/s 1.5MB/s 10.7MB/s (613%) Signed-off-by: Liu Bo --- fs/btrfs/tree-log.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 58075d711d2..71e71539ffb 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2833,8 +2833,8 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_root *log = root->log_root; struct btrfs_file_extent_item *fi; struct btrfs_key key; - u64 start = em->start; - u64 len = em->len; + u64 start = em->mod_start; + u64 len = em->mod_len; u64 num_bytes; int nritems; int ret; @@ -2970,7 +2970,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, * sequential then we need to copy the items we have and redo * our search */ - if (args.nr && em->start != args.next_offset) { + if (args.nr && em->mod_start != args.next_offset) { ret = copy_items(trans, log, dst_path, args.src, args.start_slot, args.nr, LOG_INODE_ALL); -- cgit v1.2.3 From 46d8bc34248f3a94dea910137d1ddf5fb1e3a1cc Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 29 Aug 2012 01:07:55 -0600 Subject: Btrfs: fix a bug in checking whether a inode is already in log This is based on Josef's "Btrfs: turbo charge fsync". The current btrfs checks if an inode is in log by comparing root's last_log_commit to inode's last_sub_trans[2]. But the problem is that this root->last_log_commit is shared among inodes. Say we have N inodes to be logged, after the first inode, root's last_log_commit is updated and the N-1 remained files will be skipped. This fixes the bug by keeping a local copy of root's last_log_commit inside each inode and this local copy will be maintained itself. [1]: we regard each log transaction as a subset of btrfs's transaction, i.e. 
sub_trans Signed-off-by: Liu Bo --- fs/btrfs/tree-log.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 71e71539ffb..fc0df95c286 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3185,6 +3185,7 @@ next_slot: } } BTRFS_I(inode)->logged_trans = trans->transid; + BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; out_unlock: mutex_unlock(&BTRFS_I(inode)->log_mutex); -- cgit v1.2.3 From d2794405113cd04f13a58e32e4d541dc87ec86e4 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 29 Aug 2012 01:07:56 -0600 Subject: Btrfs: check if an inode has no checksum when logging it This is based on Josef's "Btrfs: turbo charge fsync". If an inode is a BTRFS_INODE_NODATASUM one, we don't need to look for csum items any more. Signed-off-by: Liu Bo --- fs/btrfs/tree-log.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index fc0df95c286..5e2ffb95cc6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2680,13 +2680,14 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, } static noinline int copy_items(struct btrfs_trans_handle *trans, - struct btrfs_root *log, + struct inode *inode, struct btrfs_path *dst_path, struct extent_buffer *src, int start_slot, int nr, int inode_only) { unsigned long src_offset; unsigned long dst_offset; + struct btrfs_root *log = BTRFS_I(inode)->root->log_root; struct btrfs_file_extent_item *extent; struct btrfs_inode_item *inode_item; int ret; @@ -2695,6 +2696,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, char *ins_data; int i; struct list_head ordered_sums; + int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; INIT_LIST_HEAD(&ordered_sums); @@ -2745,7 +2747,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * or deletes of this inode don't have to relog the inode * again */ - if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { + if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY && + !skip_csum) { int found_type; extent = btrfs_item_ptr(src, start_slot + i, struct btrfs_file_extent_item); @@ -2873,7 +2876,7 @@ next_slot: args->start_slot + args->nr == path->slots[0]) { args->nr++; } else if (args->nr) { - ret = copy_items(trans, log, dst_path, args->src, + ret = copy_items(trans, inode, dst_path, args->src, args->start_slot, args->nr, LOG_INODE_ALL); if (ret) @@ -2910,7 +2913,7 @@ next_slot: } if (args->nr) { - ret = copy_items(trans, log, dst_path, args->src, + ret = copy_items(trans, inode, dst_path, args->src, args->start_slot, args->nr, LOG_INODE_ALL); if (ret) @@ -2930,7 +2933,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_path *dst_path) { struct log_args args; - struct btrfs_root *log = root->log_root; struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; @@ -2971,7 +2973,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, * our search */ if (args.nr && em->mod_start != args.next_offset) { - ret = copy_items(trans, log, dst_path, args.src, + ret = copy_items(trans, inode, dst_path, args.src, args.start_slot, args.nr, LOG_INODE_ALL); if (ret) @@ -2984,7 +2986,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, } if (!ret && args.nr) - ret = copy_items(trans, log, dst_path, args.src, + ret = 
copy_items(trans, inode, dst_path, args.src, args.start_slot, args.nr, LOG_INODE_ALL); btrfs_release_path(path); WARN_ON(!list_empty(&extents)); @@ -3109,7 +3111,7 @@ again: goto next_slot; } - ret = copy_items(trans, log, dst_path, src, ins_start_slot, + ret = copy_items(trans, inode, dst_path, src, ins_start_slot, ins_nr, inode_only); if (ret) { err = ret; @@ -3127,7 +3129,7 @@ next_slot: goto again; } if (ins_nr) { - ret = copy_items(trans, log, dst_path, src, + ret = copy_items(trans, inode, dst_path, src, ins_start_slot, ins_nr, inode_only); if (ret) { @@ -3148,8 +3150,7 @@ next_slot: break; } if (ins_nr) { - ret = copy_items(trans, log, dst_path, src, - ins_start_slot, + ret = copy_items(trans, inode, dst_path, src, ins_start_slot, ins_nr, inode_only); if (ret) { err = ret; -- cgit v1.2.3 From 2671485d395c07fca104c972785898d7c52fc942 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 29 Aug 2012 12:24:27 -0400 Subject: Btrfs: remove unused hint byte argument for btrfs_drop_extents I audited all users of btrfs_drop_extents and found that nobody actually uses the hint_byte argument. I'm sure it was used for something at some point but it's not used now, and the way the pinning works the disk bytenr would never be immediately useful anyway so lets just remove it. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5e2ffb95cc6..0c39f58973d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -485,7 +485,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, int found_type; u64 mask = root->sectorsize - 1; u64 extent_end; - u64 alloc_hint; u64 start = key->offset; u64 saved_nbytes; struct btrfs_file_extent_item *item; @@ -551,8 +550,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ - ret = btrfs_drop_extents(trans, root, inode, start, extent_end, - &alloc_hint, 1); + ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1); BUG_ON(ret); if (found_type == BTRFS_FILE_EXTENT_REG || @@ -2843,9 +2841,8 @@ static int log_one_extent(struct btrfs_trans_handle *trans, int ret; if (BTRFS_I(inode)->logged_trans == trans->transid) { - u64 tmp; ret = __btrfs_drop_extents(trans, log, inode, dst_path, start, - start + len, &tmp, 0); + start + len, 0); if (ret) return ret; } -- cgit v1.2.3 From 2aaa66558172b017f36bf38ae69372813dedee9d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 29 Aug 2012 14:27:18 -0400 Subject: Btrfs: add hole punching This patch adds hole punching via fallocate. 
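As a hedged userspace companion to this change, the sketch below punches a hole in an existing file with fallocate(2); FALLOC_FL_PUNCH_HOLE must be combined with FALLOC_FL_KEEP_SIZE. The file name and offsets are illustrative assumptions, not taken from the patch.

/*
 * Hedged sketch: punch a 1 MiB hole at offset 4 MiB in an existing
 * file. The path and offsets are illustrative only.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("testfile", O_RDWR);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                      4 * 1024 * 1024, 1 * 1024 * 1024) < 0) {
                perror("fallocate");
                return 1;
        }
        close(fd);
        return 0;
}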
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 0c39f58973d..f9b0fc91166 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2842,7 +2842,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (BTRFS_I(inode)->logged_trans == trans->transid) { ret = __btrfs_drop_extents(trans, log, inode, dst_path, start, - start + len, 0); + start + len, NULL, 0); if (ret) return ret; } -- cgit v1.2.3 From 2ecb79239bcd04c9d410f4cdce16adb6840b19da Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Sep 2012 04:04:27 -0600 Subject: Btrfs: fix unprotected ->log_batch We forget to protect ->log_batch when syncing a file, this patch fix this problem by atomic operation. And ->log_batch is used to check if there are parallel sync operations or not, so it is unnecessary to reset it to 0 after the sync operation of the current log tree complete. Signed-off-by: Miao Xie --- fs/btrfs/tree-log.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f9b0fc91166..038a5229404 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -147,7 +147,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, root->log_multiple_pids = true; } - root->log_batch++; + atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); mutex_unlock(&root->log_mutex); return 0; @@ -166,7 +166,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, err = ret; } mutex_unlock(&root->fs_info->tree_log_mutex); - root->log_batch++; + atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); mutex_unlock(&root->log_mutex); return err; @@ -2036,7 +2036,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (atomic_read(&root->log_commit[(index1 + 1) % 2])) wait_log_commit(trans, root, root->log_transid - 1); while (1) { - unsigned long batch = root->log_batch; + int batch = atomic_read(&root->log_batch); /* when we're on an ssd, just kick the log commit out */ if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { mutex_unlock(&root->log_mutex); @@ -2044,7 +2044,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); } wait_for_writer(trans, root); - if (batch == root->log_batch) + if (batch == atomic_read(&root->log_batch)) break; } @@ -2073,7 +2073,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_root_node(&log->root_item, log->node); - root->log_batch = 0; root->log_transid++; log->log_transid = root->log_transid; root->log_start_pid = 0; @@ -2086,7 +2085,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&root->log_mutex); mutex_lock(&log_root_tree->log_mutex); - log_root_tree->log_batch++; + atomic_inc(&log_root_tree->log_batch); atomic_inc(&log_root_tree->log_writers); mutex_unlock(&log_root_tree->log_mutex); @@ -2156,7 +2155,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_super_log_root_level(root->fs_info->super_for_commit, btrfs_header_level(log_root_tree->node)); - log_root_tree->log_batch = 0; log_root_tree->log_transid++; smp_mb(); -- cgit v1.2.3 From ff44c6e36dc9dcc02652a1105b120bdf08cea9f7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Sep 2012 12:59:20 -0400 Subject: Btrfs: do not hold the write_lock on the extent tree while logging Dave Sterba pointed out a sleeping while atomic bug while doing fsync. 
This is because I'm an idiot and didn't realize that rwlock's were spin locks, so we've been holding this thing while doing allocations and such which is not good. This patch fixes this by dropping the write lock before we do anything heavy and re-acquire it when it is done. We also need to take a ref on the em's in case their corresponding pages are evicted and mark them as being logged so that releasepage does not remove them and doesn't remove them from our local list. Thanks, Reported-by: Dave Sterba Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 038a5229404..ed1f7ce7219 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2945,6 +2945,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, list_del_init(&em->list); if (em->generation <= test_gen) continue; + /* Need a ref to keep it from getting evicted from cache */ + atomic_inc(&em->refs); + set_bit(EXTENT_FLAG_LOGGING, &em->flags); list_add_tail(&em->list, &extents); } @@ -2954,13 +2957,18 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, em = list_entry(extents.next, struct extent_map, list); list_del_init(&em->list); + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); /* * If we had an error we just need to delete everybody from our * private list. */ - if (ret) + if (ret) { + free_extent_map(em); continue; + } + + write_unlock(&tree->lock); /* * If the previous EM and the last extent we left off on aren't @@ -2971,21 +2979,26 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, ret = copy_items(trans, inode, dst_path, args.src, args.start_slot, args.nr, LOG_INODE_ALL); - if (ret) + if (ret) { + free_extent_map(em); + write_lock(&tree->lock); continue; + } btrfs_release_path(path); args.nr = 0; } ret = log_one_extent(trans, inode, root, em, path, dst_path, &args); + free_extent_map(em); + write_lock(&tree->lock); } + WARN_ON(!list_empty(&extents)); + write_unlock(&tree->lock); if (!ret && args.nr) ret = copy_items(trans, inode, dst_path, args.src, args.start_slot, args.nr, LOG_INODE_ALL); btrfs_release_path(path); - WARN_ON(!list_empty(&extents)); - write_unlock(&tree->lock); return ret; } -- cgit v1.2.3 From 0aa4a17d8232e7d8a42763b53bb9943bc0484b18 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Sep 2012 15:42:38 -0400 Subject: Btrfs: handle not finding the extent exactly when logging changed extents I started hitting warnings when running xfstest 68 in a loop because there were EM's that were not lined up properly with the physical extents. This is ok, if we do something like punch a hole or write to a preallocated space or something like that we can have an EM that doesn't cover the entire physical extent. So fix the tree logging stuff to cope with this case so we don't just commit the transaction. With this patch I no longer see the warnings from the tree logging code. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 46 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ed1f7ce7219..e0ef92f91d6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2833,6 +2833,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item *fi; struct btrfs_key key; u64 start = em->mod_start; + u64 search_start = start; u64 len = em->mod_len; u64 num_bytes; int nritems; @@ -2848,23 +2849,55 @@ static int log_one_extent(struct btrfs_trans_handle *trans, while (len) { if (args->nr) goto next_slot; +again: key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = start; + key.offset = search_start; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return ret; + if (ret) { /* - * This shouldn't happen, but it might so warn and - * return an error. + * A rare case were we can have an em for a section of a + * larger extent so we need to make sure that this em + * falls within the extent we've found. If not we just + * bail and go back to ye-olde way of doing things but + * it happens often enough in testing that we need to do + * this dance to make sure. */ - WARN_ON(1); - return -ENOENT; + do { + if (path->slots[0] == 0) { + btrfs_release_path(path); + if (search_start == 0) + return -ENOENT; + search_start--; + goto again; + } + + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) { + btrfs_release_path(path); + return -ENOENT; + } + } while (key.offset > start); + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + num_bytes = btrfs_file_extent_num_bytes(path->nodes[0], + fi); + if (key.offset + num_bytes <= start) { + btrfs_release_path(path); + return -ENOENT; + } } args->src = path->nodes[0]; next_slot: + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); fi = btrfs_item_ptr(args->src, path->slots[0], struct btrfs_file_extent_item); if (args->nr && @@ -2898,8 +2931,9 @@ next_slot: } else { len -= num_bytes; } - start += btrfs_file_extent_num_bytes(args->src, fi); + start = key.offset + num_bytes; args->next_offset = start; + search_start = start; if (path->slots[0] < nritems) { if (len) -- cgit v1.2.3 From 5a1d7843ca4b3a9009bea87f85ad33854b910aea Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 17 Aug 2012 14:04:41 -0700 Subject: btrfs: improved readablity for add_inode_ref Moved part of the code into a sub function and replaced most of the gotos by ifs, hoping that it will be easier to read now. Signed-off-by: Jan Schmidt Signed-off-by: Mark Fasheh --- fs/btrfs/tree-log.c | 178 ++++++++++++++++++++++++++++------------------------ 1 file changed, 97 insertions(+), 81 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e0ef92f91d6..15dae589e59 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -785,76 +785,18 @@ out: return match; } - -/* - * replay one inode back reference item found in the log tree. - * eb, slot and key refer to the buffer and key found in the log tree. - * root is the destination we are replaying into, and path is for temp - * use by this function. (it should be released on return). 
- */ -static noinline int add_inode_ref(struct btrfs_trans_handle *trans, +static inline int __add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_root *log, struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) + struct btrfs_root *log_root, + struct inode *dir, struct inode *inode, + struct btrfs_key *key, + struct extent_buffer *eb, + struct btrfs_inode_ref *ref, + char *name, int namelen, int *search_done) { - struct btrfs_inode_ref *ref; - struct btrfs_dir_item *di; - struct inode *dir; - struct inode *inode; - unsigned long ref_ptr; - unsigned long ref_end; - char *name; - int namelen; int ret; - int search_done = 0; - - /* - * it is possible that we didn't log all the parent directories - * for a given inode. If we don't find the dir, just don't - * copy the back ref in. The link count fixup code will take - * care of the rest - */ - dir = read_one_inode(root, key->offset); - if (!dir) - return -ENOENT; - - inode = read_one_inode(root, key->objectid); - if (!inode) { - iput(dir); - return -EIO; - } - - ref_ptr = btrfs_item_ptr_offset(eb, slot); - ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); - -again: - ref = (struct btrfs_inode_ref *)ref_ptr; - - namelen = btrfs_inode_ref_name_len(eb, ref); - name = kmalloc(namelen, GFP_NOFS); - BUG_ON(!name); - - read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); - - /* if we already have a perfect match, we're done */ - if (inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), - btrfs_inode_ref_index(eb, ref), - name, namelen)) { - goto out; - } - - /* - * look for a conflicting back reference in the metadata. - * if we find one we have to unlink that name of the file - * before we add our new link. Later on, we overwrite any - * existing back reference, and we don't want to create - * dangling pointers in the directory. - */ - - if (search_done) - goto insert; + struct btrfs_dir_item *di; ret = btrfs_search_slot(NULL, root, key, path, 0, 0); if (ret == 0) { @@ -869,7 +811,7 @@ again: * if so, just jump out, we're done */ if (key->objectid == key->offset) - goto out_nowrite; + return 1; /* check all the names in this back reference to see * if they are in the log. if so, we allow them to stay @@ -888,7 +830,7 @@ again: (unsigned long)(victim_ref + 1), victim_name_len); - if (!backref_in_log(log, key, victim_name, + if (!backref_in_log(log_root, key, victim_name, victim_name_len)) { btrfs_inc_nlink(inode); btrfs_release_path(path); @@ -907,7 +849,7 @@ again: * NOTE: we have searched root tree and checked the * coresponding ref, it does not need to check again. */ - search_done = 1; + *search_done = 1; } btrfs_release_path(path); @@ -930,25 +872,99 @@ again: } btrfs_release_path(path); -insert: - /* insert our name */ - ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, - btrfs_inode_ref_index(eb, ref)); - BUG_ON(ret); + return 0; +} + +/* + * replay one inode back reference item found in the log tree. + * eb, slot and key refer to the buffer and key found in the log tree. + * root is the destination we are replaying into, and path is for temp + * use by this function. (it should be released on return). 
+ */ +static noinline int add_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + struct btrfs_inode_ref *ref; + struct inode *dir; + struct inode *inode; + unsigned long ref_ptr; + unsigned long ref_end; + char *name; + int namelen; + int ret; + int search_done = 0; + + /* + * it is possible that we didn't log all the parent directories + * for a given inode. If we don't find the dir, just don't + * copy the back ref in. The link count fixup code will take + * care of the rest + */ + dir = read_one_inode(root, key->offset); + if (!dir) + return -ENOENT; - btrfs_update_inode(trans, root, inode); + inode = read_one_inode(root, key->objectid); + if (!inode) { + iput(dir); + return -EIO; + } -out: - ref_ptr = (unsigned long)(ref + 1) + namelen; - kfree(name); - if (ref_ptr < ref_end) - goto again; + ref_ptr = btrfs_item_ptr_offset(eb, slot); + ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); + + while (ref_ptr < ref_end) { + ref = (struct btrfs_inode_ref *)ref_ptr; + + namelen = btrfs_inode_ref_name_len(eb, ref); + name = kmalloc(namelen, GFP_NOFS); + BUG_ON(!name); + + read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); + + /* if we already have a perfect match, we're done */ + if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), + btrfs_inode_ref_index(eb, ref), + name, namelen)) { + /* + * look for a conflicting back reference in the + * metadata. if we find one we have to unlink that name + * of the file before we add our new link. Later on, we + * overwrite any existing back reference, and we don't + * want to create dangling pointers in the directory. + */ + + if (!search_done) { + ret = __add_inode_ref(trans, root, path, log, + dir, inode, key, eb, ref, + name, namelen, + &search_done); + if (ret == 1) + goto out; + BUG_ON(ret); + } + + /* insert our name */ + ret = btrfs_add_link(trans, dir, inode, name, namelen, + 0, btrfs_inode_ref_index(eb, ref)); + BUG_ON(ret); + + btrfs_update_inode(trans, root, inode); + } + + ref_ptr = (unsigned long)(ref + 1) + namelen; + kfree(name); + } /* finally write the back reference in the inode */ ret = overwrite_item(trans, root, path, eb, slot, key); BUG_ON(ret); -out_nowrite: +out: btrfs_release_path(path); iput(dir); iput(inode); -- cgit v1.2.3 From f186373fef005cee948a4a39e6a14c2e5f517298 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 8 Aug 2012 11:32:27 -0700 Subject: btrfs: extended inode refs This patch adds basic support for extended inode refs. This includes support for link and unlink of the refs, which basically gets us support for rename as well. Inode creation does not need changing - extended refs are only added after the ref array is full. 
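For reference, the sketch below spells out the extended ref item layout implied by the accessors used in the diff that follows (parent, index, name_len, name); the field widths and packing shown here are an assumption, not a copy of the real ctree.h definition.

/*
 * Assumed layout of the on-disk extended inode ref item. The item key
 * is (inode objectid, BTRFS_INODE_EXTREF_KEY, hash(parent, name)), and
 * several such entries can be packed back to back in one item.
 */
struct btrfs_inode_extref {
        __le64 parent_objectid; /* objectid of the parent directory */
        __le64 index;           /* dir index of this name in the parent */
        __le16 name_len;        /* length of the name that follows */
        __u8   name[0];         /* the link name itself */
} __attribute__((__packed__));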
Signed-off-by: Mark Fasheh --- fs/btrfs/tree-log.c | 345 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 293 insertions(+), 52 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 15dae589e59..1d7b3484432 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -24,8 +24,10 @@ #include "disk-io.h" #include "locking.h" #include "print-tree.h" +#include "backref.h" #include "compat.h" #include "tree-log.h" +#include "hash.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -743,6 +745,7 @@ out: */ static noinline int backref_in_log(struct btrfs_root *log, struct btrfs_key *key, + u64 ref_objectid, char *name, int namelen) { struct btrfs_path *path; @@ -763,8 +766,17 @@ static noinline int backref_in_log(struct btrfs_root *log, if (ret != 0) goto out; - item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + + if (key->type == BTRFS_INODE_EXTREF_KEY) { + if (btrfs_find_name_in_ext_backref(path, ref_objectid, + name, namelen, NULL)) + match = 1; + + goto out; + } + + item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); ptr_end = ptr + item_size; while (ptr < ptr_end) { ref = (struct btrfs_inode_ref *)ptr; @@ -790,27 +802,36 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_root *log_root, struct inode *dir, struct inode *inode, - struct btrfs_key *key, struct extent_buffer *eb, - struct btrfs_inode_ref *ref, - char *name, int namelen, int *search_done) + u64 inode_objectid, u64 parent_objectid, + u64 ref_index, char *name, int namelen, + int *search_done) { int ret; + char *victim_name; + int victim_name_len; + struct extent_buffer *leaf; struct btrfs_dir_item *di; + struct btrfs_key search_key; + struct btrfs_inode_extref *extref; - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); +again: + /* Search old style refs */ + search_key.objectid = inode_objectid; + search_key.type = BTRFS_INODE_REF_KEY; + search_key.offset = parent_objectid; + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret == 0) { - char *victim_name; - int victim_name_len; struct btrfs_inode_ref *victim_ref; unsigned long ptr; unsigned long ptr_end; - struct extent_buffer *leaf = path->nodes[0]; + + leaf = path->nodes[0]; /* are we trying to overwrite a back ref for the root directory * if so, just jump out, we're done */ - if (key->objectid == key->offset) + if (search_key.objectid == search_key.offset) return 1; /* check all the names in this back reference to see @@ -830,7 +851,9 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, (unsigned long)(victim_ref + 1), victim_name_len); - if (!backref_in_log(log_root, key, victim_name, + if (!backref_in_log(log_root, &search_key, + parent_objectid, + victim_name, victim_name_len)) { btrfs_inc_nlink(inode); btrfs_release_path(path); @@ -838,9 +861,14 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, ret = btrfs_unlink_inode(trans, root, dir, inode, victim_name, victim_name_len); + BUG_ON(ret); btrfs_run_delayed_items(trans, root); + kfree(victim_name); + *search_done = 1; + goto again; } kfree(victim_name); + ptr = (unsigned long)(victim_ref + 1) + victim_name_len; } BUG_ON(ret); @@ -853,10 +881,74 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, } btrfs_release_path(path); + /* Same search but for extended refs */ + extref = btrfs_lookup_inode_extref(NULL, 
root, path, name, namelen, + inode_objectid, parent_objectid, 0, + 0); + if (!IS_ERR_OR_NULL(extref)) { + u32 item_size; + u32 cur_offset = 0; + unsigned long base; + struct inode *victim_parent; + + leaf = path->nodes[0]; + + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + base = btrfs_item_ptr_offset(leaf, path->slots[0]); + + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *)base + cur_offset; + + victim_name_len = btrfs_inode_extref_name_len(leaf, extref); + + if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) + goto next; + + victim_name = kmalloc(victim_name_len, GFP_NOFS); + read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, + victim_name_len); + + search_key.objectid = inode_objectid; + search_key.type = BTRFS_INODE_EXTREF_KEY; + search_key.offset = btrfs_extref_hash(parent_objectid, + victim_name, + victim_name_len); + ret = 0; + if (!backref_in_log(log_root, &search_key, + parent_objectid, victim_name, + victim_name_len)) { + ret = -ENOENT; + victim_parent = read_one_inode(root, + parent_objectid); + if (victim_parent) { + btrfs_inc_nlink(inode); + btrfs_release_path(path); + + ret = btrfs_unlink_inode(trans, root, + victim_parent, + inode, + victim_name, + victim_name_len); + btrfs_run_delayed_items(trans, root); + } + BUG_ON(ret); + iput(victim_parent); + kfree(victim_name); + *search_done = 1; + goto again; + } + kfree(victim_name); + BUG_ON(ret); +next: + cur_offset += victim_name_len + sizeof(*extref); + } + *search_done = 1; + } + btrfs_release_path(path); + /* look for a conflicting sequence number */ di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), - btrfs_inode_ref_index(eb, ref), - name, namelen, 0); + ref_index, name, namelen, 0); if (di && !IS_ERR(di)) { ret = drop_one_dir_item(trans, root, path, dir, di); BUG_ON(ret); @@ -875,6 +967,48 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, return 0; } +static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, + u32 *namelen, char **name, u64 *index, + u64 *parent_objectid) +{ + struct btrfs_inode_extref *extref; + + extref = (struct btrfs_inode_extref *)ref_ptr; + + *namelen = btrfs_inode_extref_name_len(eb, extref); + *name = kmalloc(*namelen, GFP_NOFS); + if (*name == NULL) + return -ENOMEM; + + read_extent_buffer(eb, *name, (unsigned long)&extref->name, + *namelen); + + *index = btrfs_inode_extref_index(eb, extref); + if (parent_objectid) + *parent_objectid = btrfs_inode_extref_parent(eb, extref); + + return 0; +} + +static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, + u32 *namelen, char **name, u64 *index) +{ + struct btrfs_inode_ref *ref; + + ref = (struct btrfs_inode_ref *)ref_ptr; + + *namelen = btrfs_inode_ref_name_len(eb, ref); + *name = kmalloc(*namelen, GFP_NOFS); + if (*name == NULL) + return -ENOMEM; + + read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); + + *index = btrfs_inode_ref_index(eb, ref); + + return 0; +} + /* * replay one inode back reference item found in the log tree. * eb, slot and key refer to the buffer and key found in the log tree. 
@@ -888,7 +1022,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - struct btrfs_inode_ref *ref; struct inode *dir; struct inode *inode; unsigned long ref_ptr; @@ -897,6 +1030,27 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, int namelen; int ret; int search_done = 0; + int log_ref_ver = 0; + u64 parent_objectid; + u64 inode_objectid; + u64 ref_index; + int ref_struct_size; + + ref_ptr = btrfs_item_ptr_offset(eb, slot); + ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); + + if (key->type == BTRFS_INODE_EXTREF_KEY) { + struct btrfs_inode_extref *r; + + ref_struct_size = sizeof(struct btrfs_inode_extref); + log_ref_ver = 1; + r = (struct btrfs_inode_extref *)ref_ptr; + parent_objectid = btrfs_inode_extref_parent(eb, r); + } else { + ref_struct_size = sizeof(struct btrfs_inode_ref); + parent_objectid = key->offset; + } + inode_objectid = key->objectid; /* * it is possible that we didn't log all the parent directories @@ -904,32 +1058,38 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * copy the back ref in. The link count fixup code will take * care of the rest */ - dir = read_one_inode(root, key->offset); + dir = read_one_inode(root, parent_objectid); if (!dir) return -ENOENT; - inode = read_one_inode(root, key->objectid); + inode = read_one_inode(root, inode_objectid); if (!inode) { iput(dir); return -EIO; } - ref_ptr = btrfs_item_ptr_offset(eb, slot); - ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); - while (ref_ptr < ref_end) { - ref = (struct btrfs_inode_ref *)ref_ptr; - - namelen = btrfs_inode_ref_name_len(eb, ref); - name = kmalloc(namelen, GFP_NOFS); - BUG_ON(!name); - - read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); + if (log_ref_ver) { + ret = extref_get_fields(eb, ref_ptr, &namelen, &name, + &ref_index, &parent_objectid); + /* + * parent object can change from one array + * item to another. + */ + if (!dir) + dir = read_one_inode(root, parent_objectid); + if (!dir) + return -ENOENT; + } else { + ret = ref_get_fields(eb, ref_ptr, &namelen, &name, + &ref_index); + } + if (ret) + return ret; /* if we already have a perfect match, we're done */ if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), - btrfs_inode_ref_index(eb, ref), - name, namelen)) { + ref_index, name, namelen)) { /* * look for a conflicting back reference in the * metadata. 
if we find one we have to unlink that name @@ -940,8 +1100,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (!search_done) { ret = __add_inode_ref(trans, root, path, log, - dir, inode, key, eb, ref, - name, namelen, + dir, inode, eb, + inode_objectid, + parent_objectid, + ref_index, name, namelen, &search_done); if (ret == 1) goto out; @@ -950,14 +1112,18 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, /* insert our name */ ret = btrfs_add_link(trans, dir, inode, name, namelen, - 0, btrfs_inode_ref_index(eb, ref)); + 0, ref_index); BUG_ON(ret); btrfs_update_inode(trans, root, inode); } - ref_ptr = (unsigned long)(ref + 1) + namelen; + ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; kfree(name); + if (log_ref_ver) { + iput(dir); + dir = NULL; + } } /* finally write the back reference in the inode */ @@ -981,25 +1147,55 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans, return ret; } +static int count_inode_extrefs(struct btrfs_root *root, + struct inode *inode, struct btrfs_path *path) +{ + int ret = 0; + int name_len; + unsigned int nlink = 0; + u32 item_size; + u32 cur_offset = 0; + u64 inode_objectid = btrfs_ino(inode); + u64 offset = 0; + unsigned long ptr; + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; -/* - * There are a few corners where the link count of the file can't - * be properly maintained during replay. So, instead of adding - * lots of complexity to the log code, we just scan the backrefs - * for any file that has been through replay. - * - * The scan will update the link count on the inode to reflect the - * number of back refs found. If it goes down to zero, the iput - * will free the inode. - */ -static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode) + while (1) { + ret = btrfs_find_one_extref(root, inode_objectid, offset, path, + &extref, &offset); + if (ret) + break; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *) (ptr + cur_offset); + name_len = btrfs_inode_extref_name_len(leaf, extref); + + nlink++; + + cur_offset += name_len + sizeof(*extref); + } + + offset++; + btrfs_release_path(path); + } + btrfs_release_path(path); + + if (ret < 0) + return ret; + return nlink; +} + +static int count_inode_refs(struct btrfs_root *root, + struct inode *inode, struct btrfs_path *path) { - struct btrfs_path *path; int ret; struct btrfs_key key; - u64 nlink = 0; + unsigned int nlink = 0; unsigned long ptr; unsigned long ptr_end; int name_len; @@ -1009,10 +1205,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -1046,6 +1238,50 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, btrfs_release_path(path); } btrfs_release_path(path); + + return nlink; +} + +/* + * There are a few corners where the link count of the file can't + * be properly maintained during replay. So, instead of adding + * lots of complexity to the log code, we just scan the backrefs + * for any file that has been through replay. 
+ * + * The scan will update the link count on the inode to reflect the + * number of back refs found. If it goes down to zero, the iput + * will free the inode. + */ +static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + struct btrfs_path *path; + int ret; + u64 nlink = 0; + u64 ino = btrfs_ino(inode); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = count_inode_refs(root, inode, path); + if (ret < 0) + goto out; + + nlink = ret; + + ret = count_inode_extrefs(root, inode, path); + if (ret == -ENOENT) + ret = 0; + + if (ret < 0) + goto out; + + nlink += ret; + + ret = 0; + if (nlink != inode->i_nlink) { set_nlink(inode, nlink); btrfs_update_inode(trans, root, inode); @@ -1061,9 +1297,10 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, ret = insert_orphan_item(trans, root, ino); BUG_ON(ret); } - btrfs_free_path(path); - return 0; +out: + btrfs_free_path(path); + return ret; } static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, @@ -1710,6 +1947,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, ret = add_inode_ref(wc->trans, root, log, path, eb, i, &key); BUG_ON(ret && ret != -ENOENT); + } else if (key.type == BTRFS_INODE_EXTREF_KEY) { + ret = add_inode_ref(wc->trans, root, log, path, + eb, i, &key); + BUG_ON(ret && ret != -ENOENT); } else if (key.type == BTRFS_EXTENT_DATA_KEY) { ret = replay_one_extent(wc->trans, root, path, eb, i, &key); -- cgit v1.2.3 From e6138876ad8327250d77291b3262fee356267211 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 27 Sep 2012 17:07:30 -0400 Subject: Btrfs: cache extent state when writing out dirty metadata pages Everytime we write out dirty pages we search for an offset in the tree, convert the bits in the state, and then when we wait we search for the offset again and clear the bits. So for every dirty range in the io tree we are doing 4 rb searches, which is suboptimal. With this patch we are only doing 2 searches for every cycle (modulo weird things happening). Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1d7b3484432..f4b9e54b1da 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2463,7 +2463,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans, while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, - 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); + 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW, + NULL); if (ret) break; -- cgit v1.2.3 From 6f1fed775316d58ba356b2ce62de600ad00f003a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Sep 2012 11:07:06 -0400 Subject: Btrfs: don't lookup csums for prealloc extents The tree logging stuff was looking up csums to copy over for prealloc extents which is just work we don't need to be doing. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f4b9e54b1da..31b46a9e94c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3011,8 +3011,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, continue; found_type = btrfs_file_extent_type(src, extent); - if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) { + if (found_type == BTRFS_FILE_EXTENT_REG) { u64 ds, dl, cs, cl; ds = btrfs_file_extent_disk_bytenr(src, extent); -- cgit v1.2.3 From 18ec90d63f43cf785f6e73e7a7db546bff3fa380 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 28 Sep 2012 11:56:28 -0400 Subject: Btrfs: be smarter about dropping things from the tree log When we truncate existing items in the tree log we've been searching for each individual item and removing them. This is unnecessary churn and searching, just keep track of the slot we are on and how many items we need to delete and delete them all at once. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 31b46a9e94c..179fda96460 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2901,6 +2901,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, int ret; struct btrfs_key key; struct btrfs_key found_key; + int start_slot; key.objectid = objectid; key.type = max_key_type; @@ -2922,8 +2923,18 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, if (found_key.objectid != objectid) break; - ret = btrfs_del_item(trans, log, path); - if (ret) + found_key.offset = 0; + found_key.type = 0; + ret = btrfs_bin_search(path->nodes[0], &found_key, 0, + &start_slot); + + ret = btrfs_del_items(trans, log, path, start_slot, + path->slots[0] - start_slot + 1); + /* + * If start slot isn't 0 then we don't need to re-search, we've + * found the last guy with the objectid in this tree. + */ + if (ret || start_slot != 0) break; btrfs_release_path(path); } -- cgit v1.2.3 From 94edf4ae43a5f9405fe3570f670e26ce1b188476 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 25 Sep 2012 14:56:25 -0400 Subject: Btrfs: don't bother committing delayed inode updates when fsyncing We can just copy the in memory inode into the tree log directly, no sense in updating the fs tree so we can copy it into the tree log tree. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 84 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 65 insertions(+), 19 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 179fda96460..47911fd1831 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2944,6 +2944,55 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, return ret; } +static void fill_inode_item(struct btrfs_trans_handle *trans, + struct extent_buffer *leaf, + struct btrfs_inode_item *item, + struct inode *inode, int log_inode_only) +{ + btrfs_set_inode_uid(leaf, item, inode->i_uid); + btrfs_set_inode_gid(leaf, item, inode->i_gid); + btrfs_set_inode_mode(leaf, item, inode->i_mode); + btrfs_set_inode_nlink(leaf, item, inode->i_nlink); + + btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_nsec); + + btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_nsec); + + btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_nsec); + + btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); + + btrfs_set_inode_sequence(leaf, item, inode->i_version); + btrfs_set_inode_transid(leaf, item, trans->transid); + btrfs_set_inode_rdev(leaf, item, inode->i_rdev); + btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); + btrfs_set_inode_block_group(leaf, item, 0); + + if (log_inode_only) { + /* set the generation to zero so the recover code + * can tell the difference between an logging + * just to say 'this inode exists' and a logging + * to say 'update this inode with these values' + */ + btrfs_set_inode_generation(leaf, item, 0); + btrfs_set_inode_size(leaf, item, 0); + } else { + btrfs_set_inode_generation(leaf, item, + BTRFS_I(inode)->generation); + btrfs_set_inode_size(leaf, item, inode->i_size); + } + +} + static noinline int copy_items(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *dst_path, @@ -2990,24 +3039,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src_offset = btrfs_item_ptr_offset(src, start_slot + i); - copy_extent_buffer(dst_path->nodes[0], src, dst_offset, - src_offset, ins_sizes[i]); - - if (inode_only == LOG_INODE_EXISTS && - ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { + if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_path->slots[0], struct btrfs_inode_item); - btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); - - /* set the generation to zero so the recover code - * can tell the difference between an logging - * just to say 'this inode exists' and a logging - * to say 'update this inode with these values' - */ - btrfs_set_inode_generation(dst_path->nodes[0], - inode_item, 0); + fill_inode_item(trans, dst_path->nodes[0], inode_item, + inode, inode_only == LOG_INODE_EXISTS); + } else { + copy_extent_buffer(dst_path->nodes[0], src, dst_offset, + src_offset, ins_sizes[i]); } + /* take a reference on file data extents so that truncates * or deletes of this inode don't have to relog the inode * again @@ -3361,11 +3403,15 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, max_key.type = (u8)-1; max_key.offset = (u64)-1; - ret = btrfs_commit_inode_delayed_items(trans, inode); - if 
(ret) { - btrfs_free_path(path); - btrfs_free_path(dst_path); - return ret; + /* Only run delayed items if we are a dir or a new file */ + if (S_ISDIR(inode->i_mode) || + BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { + ret = btrfs_commit_inode_delayed_items(trans, inode); + if (ret) { + btrfs_free_path(path); + btrfs_free_path(dst_path); + return ret; + } } mutex_lock(&BTRFS_I(inode)->log_mutex); -- cgit v1.2.3 From 5af3e8cce8b7ba0a2819e18c9146c8c0b452d479 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Wed, 1 Aug 2012 18:56:49 +0200 Subject: Btrfs: make filesystem read-only when submitting barrier fails So far the return code of barrier_all_devices() is ignored, which means that errors are ignored. The result can be a corrupt filesystem which is not consistent. This commit adds code to evaluate the return code of barrier_all_devices(). The normal btrfs_error() mechanism is used to switch the filesystem into read-only mode when errors are detected. In order to decide whether barrier_all_devices() should return error or success, the number of disks that are allowed to fail the barrier submission is calculated. This calculation accounts for the worst RAID level of metadata, system and data. If single, dup or RAID0 is in use, a single disk error is already considered to be fatal. Otherwise a single disk error is tolerated. The calculation of the number of disks that are tolerated to fail the barrier operation is performed when the filesystem gets mounted, when a balance operation is started and finished, and when devices are added or removed. Signed-off-by: Stefan Behrens --- fs/btrfs/tree-log.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 47911fd1831..67eab2d4d8a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2425,9 +2425,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * in and cause problems either. */ btrfs_scrub_pause_super(root); - write_ctree_super(trans, root->fs_info->tree_root, 1); + ret = write_ctree_super(trans, root->fs_info->tree_root, 1); btrfs_scrub_continue_super(root); - ret = 0; + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_wake_log_root; + } mutex_lock(&root->log_mutex); if (root->last_log_commit < log_transid) -- cgit v1.2.3 From f46dbe3dee853f8a860f889cb2b7ff4c624f2a7a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 9 Oct 2012 11:17:20 -0400 Subject: btrfs: init ref_index to zero in add_inode_ref Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 67eab2d4d8a..e9ebb472b28 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1033,7 +1033,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, int log_ref_ver = 0; u64 parent_objectid; u64 inode_objectid; - u64 ref_index; + u64 ref_index = 0; int ref_struct_size; ref_ptr = btrfs_item_ptr_offset(eb, slot); -- cgit v1.2.3 From e9069f470803eeb5e243a05bc717452c6218bd71 Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Fri, 12 Oct 2012 14:25:24 -0700 Subject: btrfs: Fix compilation with user namespace support enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compiling with user namespace support btrfs fails like: fs/btrfs/tree-log.c: In function ‘fill_inode_item’: fs/btrfs/tree-log.c:2955:2: error: incompatible type for argument 3 of ‘btrfs_set_inode_uid’ fs/btrfs/ctree.h:2026:1: note: expected ‘u32’ but argument is of type ‘kuid_t’ fs/btrfs/tree-log.c:2956:2: error: incompatible type for argument 3 of ‘btrfs_set_inode_gid’ fs/btrfs/ctree.h:2027:1: note: expected ‘u32’ but argument is of type ‘kgid_t’ Fix this by using i_uid_read and i_gid_read in Cc: Chris Mason Cc: Josef Bacik Signed-off-by: "Eric W. Biederman" --- fs/btrfs/tree-log.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e9ebb472b28..81e407d9677 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2952,8 +2952,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *item, struct inode *inode, int log_inode_only) { - btrfs_set_inode_uid(leaf, item, inode->i_uid); - btrfs_set_inode_gid(leaf, item, inode->i_gid); + btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); + btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); btrfs_set_inode_mode(leaf, item, inode->i_mode); btrfs_set_inode_nlink(leaf, item, inode->i_nlink); -- cgit v1.2.3 From 183f37fa3503332740c76f1b493f4304ec889358 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 1 Nov 2012 06:38:47 +0000 Subject: Btrfs: do not log extents when we only log new names When we log new names, we need to log just enough to recreate the inode during log replay, and there is no need to log extents along with it. This actually fixes a bug revealed by xfstests 241, where it shows that we're logging some extents that have not updated metadata, so we don't get proper EXTENT_DATA items to be copied to log tree. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 81e407d9677..4ec41ecb4d6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3435,7 +3435,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); } else { - fast_search = true; + if (inode_only == LOG_INODE_ALL) + fast_search = true; max_key.type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, BTRFS_XATTR_ITEM_KEY); -- cgit v1.2.3 From 315a9850da2b89c83971b26fe54a60f22bdd91ad Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 1 Nov 2012 07:33:59 +0000 Subject: Btrfs: fix wrong file extent length There are two types of the file extent - inline extent and regular extent, When we log file extents, we didn't take inline extent into account, fix it. 
Signed-off-by: Miao Xie Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4ec41ecb4d6..bcf0e48b193 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3143,7 +3143,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_path *dst_path, struct log_args *args) { struct btrfs_root *log = root->log_root; - struct btrfs_file_extent_item *fi; struct btrfs_key key; u64 start = em->mod_start; u64 search_start = start; @@ -3199,10 +3198,7 @@ again: } } while (key.offset > start); - fi = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - num_bytes = btrfs_file_extent_num_bytes(path->nodes[0], - fi); + num_bytes = btrfs_file_extent_length(path); if (key.offset + num_bytes <= start) { btrfs_release_path(path); return -ENOENT; @@ -3211,8 +3207,7 @@ again: args->src = path->nodes[0]; next_slot: btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - fi = btrfs_item_ptr(args->src, path->slots[0], - struct btrfs_file_extent_item); + num_bytes = btrfs_file_extent_length(path); if (args->nr && args->start_slot + args->nr == path->slots[0]) { args->nr++; @@ -3230,7 +3225,6 @@ next_slot: } nritems = btrfs_header_nritems(path->nodes[0]); path->slots[0]++; - num_bytes = btrfs_file_extent_num_bytes(args->src, fi); if (len < num_bytes) { /* I _think_ this is ok, envision we write to a * preallocated space that is adjacent to a previously -- cgit v1.2.3 From bbe1426764e5dfaa57e7b12cc954acdb3fb7f94b Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 1 Nov 2012 07:34:54 +0000 Subject: Btrfs: fix unprotected extent map operation when logging file extents We forget to protect the modified_extents list, fix it. Signed-off-by: Miao Xie Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index bcf0e48b193..d1947af67bc 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3526,8 +3526,10 @@ next_slot: struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em, *n; + write_lock(&tree->lock); list_for_each_entry_safe(em, n, &tree->modified_extents, list) list_del_init(&em->list); + write_unlock(&tree->lock); } if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { -- cgit v1.2.3 From 5269b67e3d809dcaa4c6763a343423bb1b7b3fe6 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 1 Nov 2012 07:35:23 +0000 Subject: Btrfs: fix missing log when BTRFS_INODE_NEEDS_FULL_SYNC is set If we set BTRFS_INODE_NEEDS_FULL_SYNC, we should log all the extent, but now we forget to take it into account, and set a wrong max key, if so, we will skip the file extent metadata when doing logging. Fix it. 
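
The "fix unprotected extent map operation" patch above is the standard pattern of taking the tree's write lock around any walk that unlinks entries from a shared list. A hedged userspace analogue using pthreads; the list layout and names (em, em_tree, clear_modified) are invented for the sketch and only mirror the shape of the fix:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct em {                 /* stand-in for an extent map entry */
        unsigned long start;
        struct em *next;
    };

    struct em_tree {            /* stand-in for the extent map tree */
        pthread_rwlock_t lock;
        struct em *modified;    /* extents touched in this transaction */
    };

    /* Drop every entry from the modified list; must not race with writers. */
    static void clear_modified(struct em_tree *tree)
    {
        struct em *em, *next;

        pthread_rwlock_wrlock(&tree->lock);   /* the missing write lock */
        for (em = tree->modified; em; em = next) {
            next = em->next;
            free(em);
        }
        tree->modified = NULL;
        pthread_rwlock_unlock(&tree->lock);
    }

    int main(void)
    {
        struct em_tree tree = { .modified = NULL };
        pthread_rwlock_init(&tree.lock, NULL);

        for (unsigned long i = 0; i < 4; i++) {
            struct em *em = calloc(1, sizeof(*em));
            em->start = i * 4096;
            em->next = tree.modified;
            tree.modified = em;
        }
        clear_modified(&tree);
        printf("modified list emptied under the write lock\n");
        pthread_rwlock_destroy(&tree.lock);
        return 0;
    }
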
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d1947af67bc..40b9efd20e4 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3394,7 +3394,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, /* today the code can only do partial logging of directories */ - if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode) || + (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags) && + inode_only == LOG_INODE_EXISTS)) max_key.type = BTRFS_XATTR_ITEM_KEY; else max_key.type = (u8)-1; -- cgit v1.2.3 From e99761514999f64aff1985460967f93d9e8417f4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 11 Oct 2012 15:53:56 -0400 Subject: Btrfs: only log the inode item if we can get away with it Currently we copy all the file information into the log, inode item, the refs, xattrs etc. Except most of this doesn't change from fsync to fsync, just the inode item changes. So set a flag if an xattr changes or a link is added, and otherwise only log the inode item. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 40b9efd20e4..f05fca778cb 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3429,14 +3429,20 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } else { if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags)) { + clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags); ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); } else { if (inode_only == LOG_INODE_ALL) fast_search = true; - max_key.type = BTRFS_XATTR_ITEM_KEY; + if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags)) + max_key.type = BTRFS_XATTR_ITEM_KEY; + else + max_key.type = BTRFS_INODE_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, - BTRFS_XATTR_ITEM_KEY); + max_key.type); } } if (ret) { -- cgit v1.2.3 From a95249b392c3ab843d7b25ab6817ecc9ea0b82ee Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 11 Oct 2012 16:17:34 -0400 Subject: Btrfs: don't bother copying if we're only logging the inode We don't copy inode items anwyay, we just copy them straight into the log from the in memory inode. So if we know we're only logging the inode, don't bother dropping anything, just try to insert it and either if it succeeds or we get EEXIST we can update the inode item in the log and carry on. 
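
The control flow described here is a plain insert-or-update: attempt the insert, treat EEXIST as success, then overwrite the item (new or pre-existing) in place. A small sketch of that pattern against an invented toy store; insert_item(), log_item() and the store layout are illustrative stand-ins, not the btrfs calls.

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define NSLOTS 8

    struct store {              /* toy stand-in for the log tree */
        int      used[NSLOTS];
        uint64_t key[NSLOTS];
        uint64_t val[NSLOTS];
    };

    /* Insert key with a zero value; -EEXIST if the key is already present. */
    static int insert_item(struct store *s, uint64_t key, int *slot)
    {
        int free_slot = -1;

        for (int i = 0; i < NSLOTS; i++) {
            if (s->used[i] && s->key[i] == key) {
                *slot = i;
                return -EEXIST;
            }
            if (!s->used[i] && free_slot < 0)
                free_slot = i;
        }
        if (free_slot < 0)
            return -ENOSPC;
        s->used[free_slot] = 1;
        s->key[free_slot] = key;
        *slot = free_slot;
        return 0;
    }

    /*
     * Insert-or-update: only a real error aborts; EEXIST just means the item
     * is already there and we fall through to the overwrite.
     */
    static int log_item(struct store *s, uint64_t key, uint64_t val)
    {
        int slot, ret;

        ret = insert_item(s, key, &slot);
        if (ret && ret != -EEXIST)
            return ret;
        s->val[slot] = val;     /* fill in the current in-memory state */
        return 0;
    }

    int main(void)
    {
        struct store s;
        memset(&s, 0, sizeof(s));

        log_item(&s, 257, 1);   /* first fsync inserts */
        log_item(&s, 257, 2);   /* later fsync hits EEXIST and updates */
        printf("key 257 logged with value %llu\n",
               (unsigned long long)s.val[0]);
        return 0;
    }
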
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f05fca778cb..ab7168ee618 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2996,6 +2996,26 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, } +static int log_inode_item(struct btrfs_trans_handle *trans, + struct btrfs_root *log, struct btrfs_path *path, + struct inode *inode) +{ + struct btrfs_inode_item *inode_item; + struct btrfs_key key; + int ret; + + memcpy(&key, &BTRFS_I(inode)->location, sizeof(key)); + ret = btrfs_insert_empty_item(trans, log, path, &key, + sizeof(*inode_item)); + if (ret && ret != -EEXIST) + return ret; + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + fill_inode_item(trans, path->nodes[0], inode_item, inode, 0); + btrfs_release_path(path); + return 0; +} + static noinline int copy_items(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *dst_path, @@ -3433,17 +3453,24 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, &BTRFS_I(inode)->runtime_flags); ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); - } else { + } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags)) { if (inode_only == LOG_INODE_ALL) fast_search = true; - if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags)) - max_key.type = BTRFS_XATTR_ITEM_KEY; - else - max_key.type = BTRFS_INODE_ITEM_KEY; + max_key.type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key.type); + } else { + if (inode_only == LOG_INODE_ALL) + fast_search = true; + ret = log_inode_item(trans, log, dst_path, inode); + if (ret) { + err = ret; + goto out_unlock; + } + goto log_extents; } + } if (ret) { err = ret; @@ -3522,6 +3549,7 @@ next_slot: ins_nr = 0; } +log_extents: if (fast_search) { btrfs_release_path(path); btrfs_release_path(dst_path); -- cgit v1.2.3 From 70c8a91ce21b83ccd2d9e7c968775430ead4353d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 11 Oct 2012 16:54:30 -0400 Subject: Btrfs: log changed inodes based on the extent map tree We don't really need to copy extents from the source tree since we have all of the information already available to us in the extent_map tree. So instead just write the extents straight to the log tree and don't bother to copy the extent items from the source tree. 
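
The shape of this change is: take the in-memory list of extents modified in the transaction, sort it by file offset (the extent_cmp() comparator), and emit a log record directly from each map entry without re-reading the fs tree copy. A compressed userspace sketch of that collect/sort/emit loop, with the struct fields and emit_extent() invented for illustration:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct emap {               /* stand-in for an in-memory extent map */
        uint64_t start;         /* file offset */
        uint64_t len;
        uint64_t block_start;   /* disk location */
    };

    /* Sort modified extents by file offset. */
    static int emap_cmp(const void *a, const void *b)
    {
        const struct emap *ea = a, *eb = b;

        if (ea->start < eb->start)
            return -1;
        if (ea->start > eb->start)
            return 1;
        return 0;
    }

    /* Write one log record straight from the in-memory map entry. */
    static void emit_extent(const struct emap *em)
    {
        printf("log extent: file [%llu, %llu) -> disk %llu\n",
               (unsigned long long)em->start,
               (unsigned long long)(em->start + em->len),
               (unsigned long long)em->block_start);
    }

    int main(void)
    {
        /* extents modified in this transaction, in arbitrary order */
        struct emap mods[] = {
            { .start = 8192, .len = 4096, .block_start = 1 << 20 },
            { .start = 0,    .len = 4096, .block_start = 2 << 20 },
            { .start = 4096, .len = 4096, .block_start = 3 << 20 },
        };
        size_t n = sizeof(mods) / sizeof(mods[0]);

        qsort(mods, n, sizeof(mods[0]), emap_cmp);
        for (size_t i = 0; i < n; i++)
            emit_extent(&mods[i]);
        return 0;
    }
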
Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 338 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 194 insertions(+), 144 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ab7168ee618..72444811d27 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3150,145 +3150,220 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) return 0; } -struct log_args { - struct extent_buffer *src; - u64 next_offset; - int start_slot; - int nr; -}; +static int drop_adjacent_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct extent_map *em, + struct btrfs_path *path) +{ + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + struct btrfs_key key, new_key; + struct btrfs_map_token token; + u64 extent_end; + u64 extent_offset = 0; + int extent_type; + int del_slot = 0; + int del_nr = 0; + int ret = 0; + + while (1) { + btrfs_init_map_token(&token); + leaf = path->nodes[0]; + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + if (del_nr) { + ret = btrfs_del_items(trans, root, path, + del_slot, del_nr); + if (ret) + return ret; + del_nr = 0; + } + + ret = btrfs_next_leaf_write(trans, root, path, 1); + if (ret < 0) + return ret; + if (ret > 0) + return 0; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY || + key.offset >= em->start + em->len) + break; + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_token_file_extent_type(leaf, fi, &token); + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_offset = btrfs_token_file_extent_offset(leaf, + fi, &token); + extent_end = key.offset + + btrfs_token_file_extent_num_bytes(leaf, fi, + &token); + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + extent_end = key.offset + + btrfs_file_extent_inline_len(leaf, fi); + } else { + BUG(); + } + + if (extent_end <= em->len + em->start) { + if (!del_nr) { + del_slot = path->slots[0]; + } + del_nr++; + continue; + } + + /* + * Ok so we'll ignore previous items if we log a new extent, + * which can lead to overlapping extents, so if we have an + * existing extent we want to adjust we _have_ to check the next + * guy to make sure we even need this extent anymore, this keeps + * us from panicing in set_item_key_safe. 
+ */ + if (path->slots[0] < btrfs_header_nritems(leaf) - 1) { + struct btrfs_key tmp_key; + + btrfs_item_key_to_cpu(leaf, &tmp_key, + path->slots[0] + 1); + if (tmp_key.objectid == btrfs_ino(inode) && + tmp_key.type == BTRFS_EXTENT_DATA_KEY && + tmp_key.offset <= em->start + em->len) { + if (!del_nr) + del_slot = path->slots[0]; + del_nr++; + continue; + } + } + + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + memcpy(&new_key, &key, sizeof(new_key)); + new_key.offset = em->start + em->len; + btrfs_set_item_key_safe(trans, root, path, &new_key); + extent_offset += em->start + em->len - key.offset; + btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, + &token); + btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end - + (em->start + em->len), + &token); + btrfs_mark_buffer_dirty(leaf); + } + + if (del_nr) + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + + return ret; +} static int log_one_extent(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_root *root, - struct extent_map *em, struct btrfs_path *path, - struct btrfs_path *dst_path, struct log_args *args) + struct extent_map *em, struct btrfs_path *path) { struct btrfs_root *log = root->log_root; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + struct list_head ordered_sums; struct btrfs_key key; - u64 start = em->mod_start; - u64 search_start = start; - u64 len = em->mod_len; - u64 num_bytes; - int nritems; + u64 csum_offset = em->mod_start - em->start; + u64 csum_len = em->mod_len; + u64 extent_offset = em->start - em->orig_start; + u64 block_len; int ret; + bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - if (BTRFS_I(inode)->logged_trans == trans->transid) { - ret = __btrfs_drop_extents(trans, log, inode, dst_path, start, - start + len, NULL, 0); - if (ret) - return ret; + INIT_LIST_HEAD(&ordered_sums); + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = em->start; + path->really_keep_locks = 1; + + ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi)); + if (ret && ret != -EEXIST) { + path->really_keep_locks = 0; + return ret; + } + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, em->generation); + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + skip_csum = true; + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_PREALLOC); + } else { + btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); + if (em->block_start == 0) + skip_csum = true; + } + + block_len = max(em->block_len, em->orig_block_len); + if (em->compress_type != BTRFS_COMPRESS_NONE) { + btrfs_set_file_extent_disk_bytenr(leaf, fi, em->block_start); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, block_len); + } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { + btrfs_set_file_extent_disk_bytenr(leaf, fi, + em->block_start - + extent_offset); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, block_len); + } else { + btrfs_set_file_extent_disk_bytenr(leaf, fi, 0); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, 0); } - while (len) { - if (args->nr) - goto next_slot; -again: - key.objectid = btrfs_ino(inode); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = search_start; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - return ret; - - if (ret) { - /* - * A rare case were we can have an em for a section of a - * larger extent so we need to make sure that this em - * falls within the extent we've 
found. If not we just - * bail and go back to ye-olde way of doing things but - * it happens often enough in testing that we need to do - * this dance to make sure. - */ - do { - if (path->slots[0] == 0) { - btrfs_release_path(path); - if (search_start == 0) - return -ENOENT; - search_start--; - goto again; - } + btrfs_set_file_extent_offset(leaf, fi, em->start - em->orig_start); + btrfs_set_file_extent_num_bytes(leaf, fi, em->len); + btrfs_set_file_extent_ram_bytes(leaf, fi, em->len); + btrfs_set_file_extent_compression(leaf, fi, em->compress_type); + btrfs_set_file_extent_encryption(leaf, fi, 0); + btrfs_set_file_extent_other_encoding(leaf, fi, 0); + btrfs_mark_buffer_dirty(leaf); - path->slots[0]--; - btrfs_item_key_to_cpu(path->nodes[0], &key, - path->slots[0]); - if (key.objectid != btrfs_ino(inode) || - key.type != BTRFS_EXTENT_DATA_KEY) { - btrfs_release_path(path); - return -ENOENT; - } - } while (key.offset > start); + /* + * Have to check the extent to the right of us to make sure it doesn't + * fall in our current range. We're ok if the previous extent is in our + * range since the recovery stuff will run us in key order and thus just + * drop the part we overwrote. + */ + ret = drop_adjacent_extents(trans, log, inode, em, path); + btrfs_release_path(path); + path->really_keep_locks = 0; + if (ret) { + return ret; + } - num_bytes = btrfs_file_extent_length(path); - if (key.offset + num_bytes <= start) { - btrfs_release_path(path); - return -ENOENT; - } - } - args->src = path->nodes[0]; -next_slot: - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - num_bytes = btrfs_file_extent_length(path); - if (args->nr && - args->start_slot + args->nr == path->slots[0]) { - args->nr++; - } else if (args->nr) { - ret = copy_items(trans, inode, dst_path, args->src, - args->start_slot, args->nr, - LOG_INODE_ALL); - if (ret) - return ret; - args->nr = 1; - args->start_slot = path->slots[0]; - } else if (!args->nr) { - args->nr = 1; - args->start_slot = path->slots[0]; - } - nritems = btrfs_header_nritems(path->nodes[0]); - path->slots[0]++; - if (len < num_bytes) { - /* I _think_ this is ok, envision we write to a - * preallocated space that is adjacent to a previously - * written preallocated space that gets merged when we - * mark this preallocated space written. If we do not - * have the adjacent extent in cache then when we copy - * this extent it could end up being larger than our EM - * thinks it is, which is a-ok, so just set len to 0. - */ - len = 0; - } else { - len -= num_bytes; - } - start = key.offset + num_bytes; - args->next_offset = start; - search_start = start; + if (skip_csum) + return 0; - if (path->slots[0] < nritems) { - if (len) - goto next_slot; - break; - } + /* block start is already adjusted for the file extent offset. 
*/ + ret = btrfs_lookup_csums_range(log->fs_info->csum_root, + em->block_start + csum_offset, + em->block_start + csum_offset + + csum_len - 1, &ordered_sums, 0); + if (ret) + return ret; - if (args->nr) { - ret = copy_items(trans, inode, dst_path, args->src, - args->start_slot, args->nr, - LOG_INODE_ALL); - if (ret) - return ret; - args->nr = 0; - btrfs_release_path(path); - } + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + if (!ret) + ret = btrfs_csum_file_blocks(trans, log, sums); + list_del(&sums->list); + kfree(sums); } - return 0; + return ret; } static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - struct btrfs_path *path, - struct btrfs_path *dst_path) + struct btrfs_path *path) { - struct log_args args; struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; @@ -3297,8 +3372,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&extents); - memset(&args, 0, sizeof(args)); - write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; @@ -3331,34 +3404,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, write_unlock(&tree->lock); - /* - * If the previous EM and the last extent we left off on aren't - * sequential then we need to copy the items we have and redo - * our search - */ - if (args.nr && em->mod_start != args.next_offset) { - ret = copy_items(trans, inode, dst_path, args.src, - args.start_slot, args.nr, - LOG_INODE_ALL); - if (ret) { - free_extent_map(em); - write_lock(&tree->lock); - continue; - } - btrfs_release_path(path); - args.nr = 0; - } - - ret = log_one_extent(trans, inode, root, em, path, dst_path, &args); + ret = log_one_extent(trans, inode, root, em, path); free_extent_map(em); write_lock(&tree->lock); } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); - if (!ret && args.nr) - ret = copy_items(trans, inode, dst_path, args.src, - args.start_slot, args.nr, LOG_INODE_ALL); btrfs_release_path(path); return ret; } @@ -3551,10 +3603,8 @@ next_slot: log_extents: if (fast_search) { - btrfs_release_path(path); btrfs_release_path(dst_path); - ret = btrfs_log_changed_extents(trans, root, inode, path, - dst_path); + ret = btrfs_log_changed_extents(trans, root, inode, dst_path); if (ret) { err = ret; goto out_unlock; -- cgit v1.2.3 From 0b1c6ccadee4ea4adb98799f3430fc72e57a187f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 23 Oct 2012 16:03:44 -0400 Subject: Btrfs: use tokens where we can in the tree log If we are syncing over and over the overhead of doing all those maps in fill_inode_item and log_changed_extents really starts to hurt, so use map tokens so we can avoid all the extra mapping. Since the token maps from our offset to the end of the page make sure to set the first thing in the item first so we really only do one map. 
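
The token is essentially a one-entry cache of the last mapping: it remembers which byte range is currently mapped and only re-maps when a write falls outside that range, which is why setting the lowest-offset field first lets a single mapping cover the rest of the item. A simplified userspace model of that caching idea; map_token, map_range() and set_u64() are invented names, not the kernel interfaces.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define PAGE_SIZE 4096

    static unsigned char page[PAGE_SIZE];   /* pretend this is the leaf page */
    static int map_calls;                   /* count how often we "map" */

    struct map_token {                      /* cached mapping state */
        unsigned char *kaddr;               /* NULL if nothing is mapped */
        size_t start, len;                  /* byte range the mapping covers */
    };

    /* Pretend-expensive mapping of [offset, end of page). */
    static unsigned char *map_range(size_t offset, struct map_token *tok)
    {
        map_calls++;
        tok->kaddr = page + offset;
        tok->start = offset;
        tok->len = PAGE_SIZE - offset;
        return tok->kaddr;
    }

    /* Write a field, reusing the cached mapping when the offset falls inside it. */
    static void set_u64(struct map_token *tok, size_t offset, uint64_t val)
    {
        unsigned char *kaddr;

        if (tok->kaddr && offset >= tok->start &&
            offset + sizeof(val) <= tok->start + tok->len)
            kaddr = tok->kaddr + (offset - tok->start);
        else
            kaddr = map_range(offset, tok);
        memcpy(kaddr, &val, sizeof(val));
    }

    int main(void)
    {
        struct map_token tok = { 0 };

        /* Setting the lowest offset first means one mapping covers the rest. */
        set_u64(&tok, 128, 1);   /* maps [128, 4096) */
        set_u64(&tok, 136, 2);   /* cache hit */
        set_u64(&tok, 160, 3);   /* cache hit */
        printf("3 fields written with %d mapping(s)\n", map_calls);
        return 0;
    }
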
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 127 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 73 insertions(+), 54 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 72444811d27..83186c7e45d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2952,33 +2952,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *item, struct inode *inode, int log_inode_only) { - btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); - btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); - btrfs_set_inode_mode(leaf, item, inode->i_mode); - btrfs_set_inode_nlink(leaf, item, inode->i_nlink); - - btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), - inode->i_atime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), - inode->i_atime.tv_nsec); - - btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), - inode->i_mtime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), - inode->i_mtime.tv_nsec); - - btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), - inode->i_ctime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), - inode->i_ctime.tv_nsec); - - btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); - - btrfs_set_inode_sequence(leaf, item, inode->i_version); - btrfs_set_inode_transid(leaf, item, trans->transid); - btrfs_set_inode_rdev(leaf, item, inode->i_rdev); - btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); - btrfs_set_inode_block_group(leaf, item, 0); + struct btrfs_map_token token; + + btrfs_init_map_token(&token); if (log_inode_only) { /* set the generation to zero so the recover code @@ -2986,14 +2962,43 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, * just to say 'this inode exists' and a logging * to say 'update this inode with these values' */ - btrfs_set_inode_generation(leaf, item, 0); - btrfs_set_inode_size(leaf, item, 0); + btrfs_set_token_inode_generation(leaf, item, 0, &token); + btrfs_set_token_inode_size(leaf, item, 0, &token); } else { - btrfs_set_inode_generation(leaf, item, - BTRFS_I(inode)->generation); - btrfs_set_inode_size(leaf, item, inode->i_size); - } - + btrfs_set_token_inode_generation(leaf, item, + BTRFS_I(inode)->generation, + &token); + btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); + } + + btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); + btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); + btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); + btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); + + btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_nsec, &token); + + btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_nsec, &token); + + btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_nsec, &token); + + btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), + &token); + + btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); + btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); + btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); + 
btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); + btrfs_set_token_inode_block_group(leaf, item, 0, &token); } static int log_inode_item(struct btrfs_trans_handle *trans, @@ -3267,6 +3272,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; struct list_head ordered_sums; + struct btrfs_map_token token; struct btrfs_key key; u64 csum_offset = em->mod_start - em->start; u64 csum_len = em->mod_len; @@ -3276,6 +3282,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; INIT_LIST_HEAD(&ordered_sums); + btrfs_init_map_token(&token); key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = em->start; @@ -3289,37 +3296,49 @@ static int log_one_extent(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - btrfs_set_file_extent_generation(leaf, fi, em->generation); + btrfs_set_token_file_extent_generation(leaf, fi, em->generation, + &token); if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { skip_csum = true; - btrfs_set_file_extent_type(leaf, fi, - BTRFS_FILE_EXTENT_PREALLOC); + btrfs_set_token_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_PREALLOC, + &token); } else { - btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); + btrfs_set_token_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG, + &token); if (em->block_start == 0) skip_csum = true; } block_len = max(em->block_len, em->orig_block_len); if (em->compress_type != BTRFS_COMPRESS_NONE) { - btrfs_set_file_extent_disk_bytenr(leaf, fi, em->block_start); - btrfs_set_file_extent_disk_num_bytes(leaf, fi, block_len); + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, + em->block_start, + &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, + &token); } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { - btrfs_set_file_extent_disk_bytenr(leaf, fi, - em->block_start - - extent_offset); - btrfs_set_file_extent_disk_num_bytes(leaf, fi, block_len); + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, + em->block_start - + extent_offset, &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, + &token); } else { - btrfs_set_file_extent_disk_bytenr(leaf, fi, 0); - btrfs_set_file_extent_disk_num_bytes(leaf, fi, 0); - } - - btrfs_set_file_extent_offset(leaf, fi, em->start - em->orig_start); - btrfs_set_file_extent_num_bytes(leaf, fi, em->len); - btrfs_set_file_extent_ram_bytes(leaf, fi, em->len); - btrfs_set_file_extent_compression(leaf, fi, em->compress_type); - btrfs_set_file_extent_encryption(leaf, fi, 0); - btrfs_set_file_extent_other_encoding(leaf, fi, 0); + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, + &token); + } + + btrfs_set_token_file_extent_offset(leaf, fi, + em->start - em->orig_start, + &token); + btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); + btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token); + btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, + &token); + btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); + btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); btrfs_mark_buffer_dirty(leaf); /* -- cgit v1.2.3 From 201a90389424d6771d24fc5d72f7e34cb4a8f967 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 24 Jan 2013 12:02:07 -0500 Subject: Btrfs: do not allow logged 
extents to be merged or removed We drop the extent map tree lock while we're logging extents, so somebody could come in and merge another extent into this one and screw up our logging, or they could even remove us from the list which would keep us from logging the extent or freeing our ref on it, so we need to make sure to not clear LOGGING until after the extent is logged, and then we can merge it to adjacent extents. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 83186c7e45d..de8899b04d6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3410,13 +3410,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, em = list_entry(extents.next, struct extent_map, list); list_del_init(&em->list); - clear_bit(EXTENT_FLAG_LOGGING, &em->flags); /* * If we had an error we just need to delete everybody from our * private list. */ if (ret) { + clear_em_logging(tree, em); free_extent_map(em); continue; } @@ -3424,8 +3424,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, write_unlock(&tree->lock); ret = log_one_extent(trans, inode, root, em, path); - free_extent_map(em); write_lock(&tree->lock); + clear_em_logging(tree, em); + free_extent_map(em); } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); -- cgit v1.2.3 From 192000dda22e02225772e862b92e7c09e5a17d08 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Sun, 6 Jan 2013 03:38:22 +0000 Subject: Btrfs: use right range to find checksum for compressed extents For compressed extents, the range of checksum is covered by disk length, and the disk length is different with ram length, so we need to use disk length instead to get us the right checksum. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index de8899b04d6..9027bb1e746 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3357,6 +3357,11 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (skip_csum) return 0; + if (em->compress_type) { + csum_offset = 0; + csum_len = block_len; + } + /* block start is already adjusted for the file extent offset. */ ret = btrfs_lookup_csums_range(log->fs_info->csum_root, em->block_start + csum_offset, -- cgit v1.2.3
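
The checksum lookup in the last patch keys off on-disk bytes, so for a compressed extent the range of interest is the whole compressed block (offset 0, disk length), not the logical sub-range that was modified. A short sketch of that range selection; the field names loosely mirror the extent map but the struct and csum_range() are invented for illustration.

    #include <stdint.h>
    #include <stdio.h>

    struct emap {
        uint64_t start;      /* logical start of the extent */
        uint64_t mod_start;  /* logical start of the modified part */
        uint64_t mod_len;    /* logical length of the modified part */
        uint64_t block_len;  /* on-disk (possibly compressed) length */
        int      compressed;
    };

    /*
     * Checksums cover on-disk bytes. For a regular extent the modified
     * logical range maps 1:1 onto disk bytes; for a compressed extent it
     * does not, so the whole compressed block has to be covered.
     */
    static void csum_range(const struct emap *em, uint64_t *off, uint64_t *len)
    {
        if (em->compressed) {
            *off = 0;
            *len = em->block_len;
        } else {
            *off = em->mod_start - em->start;
            *len = em->mod_len;
        }
    }

    int main(void)
    {
        struct emap reg  = { .start = 0, .mod_start = 4096, .mod_len = 4096,
                             .block_len = 16384, .compressed = 0 };
        struct emap comp = { .start = 0, .mod_start = 4096, .mod_len = 4096,
                             .block_len = 8192, .compressed = 1 };
        uint64_t off, len;

        csum_range(&reg, &off, &len);
        printf("regular:    csum offset %llu len %llu\n",
               (unsigned long long)off, (unsigned long long)len);
        csum_range(&comp, &off, &len);
        printf("compressed: csum offset %llu len %llu\n",
               (unsigned long long)off, (unsigned long long)len);
        return 0;
    }
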