From 5dc562c541e1026df9d43913c2f6b91156e22d32 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 17 Aug 2012 13:14:17 -0400
Subject: Btrfs: turbo charge fsync

At least for the vm workload.  Currently on fsync we will

1) Truncate all items in the log tree for the given inode if they exist

and

2) Copy all items for a given inode into the log

The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing.  This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them.  We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already.

Here are some numbers for a 50 meg fio job that does random writes and
fsync()s after every write:

                  Original   Patched
SATA drive        82KB/s     140KB/s
Fusion drive      431KB/s    2532KB/s

So around 2-6 times faster depending on your hardware.  There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok.  This could
probably be done smarter, but if you write-fsync-truncate-write-fsync you
deserve what you get.  All this work is in RAM of course, so if your inode
gets evicted from cache and you read it in and fsync it, we'll do it the
slow way if we are still in the same transaction that we last modified the
inode in.

The biggest cool part of this is that it requires no changes to the
recovery code, so if you fsync with this patch and crash and load an old
kernel, it will run the recovery and be a-ok.  I have tested this pretty
thoroughly with an fsync tester and everything comes back fine, as well as
xfstests.  Thanks,

Signed-off-by: Josef Bacik
---
 fs/btrfs/extent_map.c | 34 ++++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/extent_map.c')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 7c97b330145..1fe82cfc1d9 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -35,6 +35,7 @@ void extent_map_exit(void)
 void extent_map_tree_init(struct extent_map_tree *tree)
 {
         tree->map = RB_ROOT;
+        INIT_LIST_HEAD(&tree->modified_extents);
         rwlock_init(&tree->lock);
 }
 
@@ -54,7 +55,9 @@ struct extent_map *alloc_extent_map(void)
         em->in_tree = 0;
         em->flags = 0;
         em->compress_type = BTRFS_COMPRESS_NONE;
+        em->generation = 0;
         atomic_set(&em->refs, 1);
+        INIT_LIST_HEAD(&em->list);
         return em;
 }
 
@@ -72,6 +75,7 @@ void free_extent_map(struct extent_map *em)
         WARN_ON(atomic_read(&em->refs) == 0);
         if (atomic_dec_and_test(&em->refs)) {
                 WARN_ON(em->in_tree);
+                WARN_ON(!list_empty(&em->list));
                 kmem_cache_free(extent_map_cache, em);
         }
 }
@@ -198,6 +202,12 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
                         em->block_len += merge->block_len;
                         em->block_start = merge->block_start;
                         merge->in_tree = 0;
+                        if (merge->generation > em->generation) {
+                                em->generation = merge->generation;
+                                list_move(&em->list, &tree->modified_extents);
+                        }
+
+                        list_del_init(&merge->list);
                         rb_erase(&merge->rb_node, &tree->map);
                         free_extent_map(merge);
                 }
@@ -211,11 +221,29 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
                 em->block_len += merge->len;
                 rb_erase(&merge->rb_node, &tree->map);
                 merge->in_tree = 0;
+                if (merge->generation > em->generation) {
+                        em->generation = merge->generation;
+                        list_move(&em->list, &tree->modified_extents);
+                }
+                list_del_init(&merge->list);
                 free_extent_map(merge);
         }
 }
 
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+/**
+ * unpint_extent_cache - unpin an extent from the cache
+ * @tree:       tree to unpin the extent in
+ * @start:      logical offset in the file
+ * @len:        length of the extent
+ * @gen:        generation that this extent has been modified in
+ * @prealloc:   if this is set we need to clear the prealloc flag
+ *
+ * Called after an extent has been written to disk properly.  Set the generation
+ * to the generation that actually added the file item to the inode so we know
+ * we need to sync this extent when we call fsync().
+ */
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
+                       u64 gen)
 {
         int ret = 0;
         struct extent_map *em;
@@ -228,10 +256,11 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
         if (!em)
                 goto out;
 
+        list_move(&em->list, &tree->modified_extents);
+        em->generation = gen;
         clear_bit(EXTENT_FLAG_PINNED, &em->flags);
         try_merge_map(tree, em);
-        free_extent_map(em);
 
 out:
         write_unlock(&tree->lock);
@@ -358,6 +387,7 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 
         WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
         rb_erase(&em->rb_node, &tree->map);
+        list_del_init(&em->list);
         em->in_tree = 0;
         return ret;
 }
--
cgit v1.2.3
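The heart of the patch is bookkeeping, not recovery: each extent map gets stamped with the transid that last modified it and is kept on a per-tree modified_extents list, so fsync can walk just that list instead of copying every item for the inode. Below is a minimal userspace C sketch of that idea; struct extent, the hand-rolled singly linked list and fsync_collect() are illustrative stand-ins invented for this note, not the kernel's types (the real code uses struct extent_map, struct list_head and the logging paths in fs/btrfs/tree-log.c).

/* Toy model of the per-inode modified-extent list behind fast fsync.
 * Userspace sketch only; build with: cc sketch.c */
#include <stdio.h>

struct extent {
        unsigned long start, len;
        unsigned long generation;  /* transid that last modified this extent */
        struct extent *next;       /* stands in for em->list membership */
};

/* "Log" only extents touched in the current transaction; anything older
 * is already safe on disk from a previous commit. */
static void fsync_collect(const struct extent *modified,
                          unsigned long cur_transid)
{
        const struct extent *em;

        for (em = modified; em; em = em->next) {
                if (em->generation < cur_transid)
                        continue;
                printf("log extent [%lu, %lu) gen %lu\n",
                       em->start, em->start + em->len, em->generation);
        }
}

int main(void)
{
        struct extent oldext = { 0, 4096, 8, NULL };       /* from transid 8 */
        struct extent newext = { 4096, 4096, 9, &oldext }; /* modified now   */

        fsync_collect(&newext, 9);  /* logs only the second extent */
        return 0;
}

Only one extent gets written to the log even though the inode has two, which is exactly why the random-write fio numbers above improve.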
From 4e2f84e63dc138eca91e89ccbc34f37732ce58f7 Mon Sep 17 00:00:00 2001
From: Liu Bo
Date: Mon, 27 Aug 2012 10:52:20 -0600
Subject: Btrfs: improve fsync by filtering extents that we want

This is based on Josef's "Btrfs: turbo charge fsync".

Josef's patch above performs very well in the random sync write test,
because we won't have too many extents to merge.

However, it does not perform well on this test:

dd if=/dev/zero of=foobar bs=4k count=12500 oflag=sync

The reason is that when we do sequential sync writes, we need to merge the
current extent just with the previous one, so that we get accumulated
extents to log:

A(4k) --> AA(8k) --> AAA(12k) --> AAAA(16k) ...

So we'll have to flush more and more checksums into the log tree, which is
the bottleneck according to my tests.

But we can avoid this by telling fsync the real extents that need to be
logged.

With this, I did the above dd sync write test (size=50m),

          w/o (orig)   w/ (josef's)   w/ (this)
SATA      104KB/s      109KB/s        121KB/s
ramdisk   1.5MB/s      1.5MB/s        10.7MB/s (613%)

Signed-off-by: Liu Bo
---
 fs/btrfs/extent_map.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'fs/btrfs/extent_map.c')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 1fe82cfc1d9..ac606f076eb 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -203,6 +203,8 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
                         em->block_start = merge->block_start;
                         merge->in_tree = 0;
                         if (merge->generation > em->generation) {
+                                em->mod_start = em->start;
+                                em->mod_len = em->len;
                                 em->generation = merge->generation;
                                 list_move(&em->list, &tree->modified_extents);
                         }
@@ -222,6 +224,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
                 rb_erase(&merge->rb_node, &tree->map);
                 merge->in_tree = 0;
                 if (merge->generation > em->generation) {
+                        em->mod_len = em->len;
                         em->generation = merge->generation;
                         list_move(&em->list, &tree->modified_extents);
                 }
@@ -247,6 +250,7 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
 {
         int ret = 0;
         struct extent_map *em;
+        bool prealloc = false;
 
         write_lock(&tree->lock);
         em = lookup_extent_mapping(tree, start, len);
@@ -259,8 +263,21 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
         list_move(&em->list, &tree->modified_extents);
         em->generation = gen;
         clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+        em->mod_start = em->start;
+        em->mod_len = em->len;
+
+        if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+                prealloc = true;
+                clear_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+        }
 
         try_merge_map(tree, em);
+
+        if (prealloc) {
+                em->mod_start = em->start;
+                em->mod_len = em->len;
+        }
+
         free_extent_map(em);
 out:
         write_unlock(&tree->lock);
@@ -298,6 +315,9 @@ int add_extent_mapping(struct extent_map_tree *tree,
         }
         atomic_inc(&em->refs);
 
+        em->mod_start = em->start;
+        em->mod_len = em->len;
+
         try_merge_map(tree, em);
 out:
         return ret;
--
cgit v1.2.3
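The mod_start/mod_len fields carry exactly the sub-range that still has to be logged, so a merged extent does not drag already-logged checksums back into the log tree. Here is a hedged userspace sketch of that accounting; the field names mirror the patch, but struct em, merge_forward() and the scenario are invented for illustration.

/* Toy model of mod_start/mod_len: when a freshly written extent merges
 * with an already-logged neighbour, remember only the newly dirtied
 * range, so A(4k) -> AA(8k) logs one 4k delta instead of the whole
 * accumulated extent. */
#include <stdio.h>

struct em {
        unsigned long start, len;         /* whole extent after merging */
        unsigned long mod_start, mod_len; /* sub-range fsync must log   */
};

/* Fold 'next' (just written) into 'prev' (logged by an earlier fsync). */
static void merge_forward(struct em *prev, const struct em *next)
{
        prev->len += next->len;
        /* Checksums for prev's old range are already in the log tree,
         * so only the new tail remains to be logged. */
        prev->mod_start = next->start;
        prev->mod_len = next->len;
}

int main(void)
{
        struct em prev = { 0, 8192, 0, 8192 };       /* logged previously */
        struct em next = { 8192, 4096, 8192, 4096 }; /* new sync write    */

        merge_forward(&prev, &next);
        printf("extent [0, %lu), log only [%lu, %lu)\n",
               prev.len, prev.mod_start, prev.mod_start + prev.mod_len);
        return 0;
}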
From 837e197283199de640857192ca32767cb6e24fe8 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Fri, 7 Sep 2012 03:00:48 -0600
Subject: btrfs: polish names of kmem caches

Usecase:

  watch 'grep btrfs < /proc/slabinfo'

This makes it easy to watch all btrfs caches in one go.

Signed-off-by: David Sterba
---
 fs/btrfs/extent_map.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/extent_map.c')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index ac606f076eb..8d1364d385d 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -11,7 +11,7 @@ static struct kmem_cache *extent_map_cache;
 
 int __init extent_map_init(void)
 {
-        extent_map_cache = kmem_cache_create("extent_map",
+        extent_map_cache = kmem_cache_create("btrfs_extent_map",
                         sizeof(struct extent_map), 0,
                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
         if (!extent_map_cache)
--
cgit v1.2.3
From ff44c6e36dc9dcc02652a1105b120bdf08cea9f7 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 14 Sep 2012 12:59:20 -0400
Subject: Btrfs: do not hold the write_lock on the extent tree while logging

Dave Sterba pointed out a sleeping-while-atomic bug while doing fsync.
This is because I'm an idiot and didn't realize that rwlocks were spin
locks, so we've been holding this thing while doing allocations and such,
which is not good.  This patch fixes this by dropping the write lock
before we do anything heavy and re-acquiring it when we are done.  We
also need to take a ref on the ems in case their corresponding pages are
evicted, and mark them as being logged so that releasepage does not
remove them from the tree or drop them from our local list.  Thanks,

Reported-by: Dave Sterba
Signed-off-by: Josef Bacik
---
 fs/btrfs/extent_map.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/extent_map.c')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 8d1364d385d..b8cbc8d5c7f 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -407,7 +407,8 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 
         WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
         rb_erase(&em->rb_node, &tree->map);
-        list_del_init(&em->list);
+        if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+                list_del_init(&em->list);
         em->in_tree = 0;
         return ret;
 }
--
cgit v1.2.3
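The pattern behind the fix is generic: pin the object and flag it while the lock is held, drop the lock for the heavy lifting, then reacquire it to undo both. Below is a loose userspace analogue with pthreads; the analogy is deliberately imperfect, since pthread_rwlock_t may sleep whereas the kernel's rwlock_t spins (which is precisely the bug here), and all of the names are illustrative rather than kernel API.

/* Toy model of the fix: never do heavy (possibly sleeping) work under a
 * spinning lock.  Take a reference and set a LOGGING flag under the
 * lock, drop the lock for the heavy work, then reacquire to undo both.
 * Build with: cc -pthread sketch.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct em {
        atomic_int refs;     /* models em->refs            */
        atomic_bool logging; /* models EXTENT_FLAG_LOGGING */
};

static pthread_rwlock_t tree_lock = PTHREAD_RWLOCK_INITIALIZER;

static void log_extent(struct em *em)
{
        pthread_rwlock_wrlock(&tree_lock);
        atomic_fetch_add(&em->refs, 1);   /* can't be freed under us      */
        atomic_store(&em->logging, true); /* releasepage must leave it be */
        pthread_rwlock_unlock(&tree_lock);

        /* Allocations and log writes happen here, with the lock dropped. */
        printf("logging extent, refs=%d\n", atomic_load(&em->refs));

        pthread_rwlock_wrlock(&tree_lock);
        atomic_store(&em->logging, false);
        atomic_fetch_sub(&em->refs, 1);
        pthread_rwlock_unlock(&tree_lock);
}

int main(void)
{
        struct em em = { 1, false };

        log_extent(&em);
        return 0;
}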
From 52b1de91ea07d1f5600b3cae789d5be9a1f61787 Mon Sep 17 00:00:00 2001
From: Liu Bo
Date: Tue, 30 Oct 2012 17:13:52 +0800
Subject: btrfs: unpin_extent_cache: fix the typo and unnecessary arguments

- unpint -> unpin
- prealloc is no longer used

Signed-off-by: Liu Bo
Signed-off-by: Jiri Kosina
---
 fs/btrfs/extent_map.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs/btrfs/extent_map.c')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b8cbc8d5c7f..ce9f7921672 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -234,12 +234,11 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 }
 
 /**
- * unpint_extent_cache - unpin an extent from the cache
+ * unpin_extent_cache - unpin an extent from the cache
  * @tree:       tree to unpin the extent in
  * @start:      logical offset in the file
  * @len:        length of the extent
  * @gen:        generation that this extent has been modified in
- * @prealloc:   if this is set we need to clear the prealloc flag
  *
  * Called after an extent has been written to disk properly.  Set the generation
  * to the generation that actually added the file item to the inode so we know
--
cgit v1.2.3

From b11e234d21e73df94099e473a080bca502b9a496 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Mon, 3 Dec 2012 10:58:15 -0500
Subject: Btrfs: do not mark ems as prealloc if we are writing to them

We are going to use EMs to log extents in the future, so we need to not
mark them as prealloc if they aren't actually prealloc extents.  Instead
mark them with FILLING so we know to amend mod_start/mod_len, and that way
we don't confuse the extent logging code.  Thanks,

Signed-off-by: Josef Bacik
Signed-off-by: Chris Mason
---
 fs/btrfs/extent_map.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/extent_map.c')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b8cbc8d5c7f..85ae2b6fe03 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -266,9 +266,9 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
         em->mod_start = em->start;
         em->mod_len = em->len;
 
-        if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+        if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
                 prealloc = true;
-                clear_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+                clear_bit(EXTENT_FLAG_FILLING, &em->flags);
         }
 
         try_merge_map(tree, em);
--
cgit v1.2.3

From 70c8a91ce21b83ccd2d9e7c968775430ead4353d Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Thu, 11 Oct 2012 16:54:30 -0400
Subject: Btrfs: log changed inodes based on the extent map tree

We don't really need to copy extents from the source tree since we have
all of the information already available to us in the extent_map tree.
So instead just write the extents straight to the log tree and don't
bother to copy the extent items from the source tree.

Signed-off-by: Josef Bacik
Signed-off-by: Chris Mason
---
 fs/btrfs/extent_map.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

(limited to 'fs/btrfs/extent_map.c')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 85ae2b6fe03..fff2c28497b 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -49,7 +49,7 @@ void extent_map_tree_init(struct extent_map_tree *tree)
 struct extent_map *alloc_extent_map(void)
 {
         struct extent_map *em;
-        em = kmem_cache_alloc(extent_map_cache, GFP_NOFS);
+        em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
         if (!em)
                 return NULL;
         em->in_tree = 0;
@@ -198,16 +198,15 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
                 merge = rb_entry(rb, struct extent_map, rb_node);
                 if (rb && mergable_maps(merge, em)) {
                         em->start = merge->start;
+                        em->orig_start = merge->orig_start;
                         em->len += merge->len;
                         em->block_len += merge->block_len;
                         em->block_start = merge->block_start;
                         merge->in_tree = 0;
-                        if (merge->generation > em->generation) {
-                                em->mod_start = em->start;
-                                em->mod_len = em->len;
-                                em->generation = merge->generation;
-                                list_move(&em->list, &tree->modified_extents);
-                        }
+                        em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
+                        em->mod_start = merge->mod_start;
+                        em->generation = max(em->generation, merge->generation);
+                        list_move(&em->list, &tree->modified_extents);
 
                         list_del_init(&merge->list);
                         rb_erase(&merge->rb_node, &tree->map);
@@ -223,11 +222,8 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
                 em->block_len += merge->len;
                 rb_erase(&merge->rb_node, &tree->map);
                 merge->in_tree = 0;
-                if (merge->generation > em->generation) {
-                        em->mod_len = em->len;
-                        em->generation = merge->generation;
-                        list_move(&em->list, &tree->modified_extents);
-                }
+                em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
+                em->generation = max(em->generation, merge->generation);
                 list_del_init(&merge->list);
                 free_extent_map(merge);
         }
--
cgit v1.2.3
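With the generation special cases gone, the merge arithmetic has to keep mod_start/mod_len correct by itself: the surviving extent's dirty range becomes the union of the two extents' not-yet-logged ranges. A small userspace check of that arithmetic follows; it assumes, as the merge path does, that the two extents adjoin with 'merge' directly in front of 'em', and struct em and fold_prev() are illustrative names only.

/* Toy check of the new merge accounting: nothing already logged is
 * re-logged and nothing dirty is dropped. */
#include <assert.h>
#include <stdio.h>

struct em { unsigned long mod_start, mod_len; };

/* Mirrors the first branch of try_merge_map() after this patch. */
static void fold_prev(struct em *em, const struct em *merge)
{
        /* The end of em's dirty range stays put; its start moves back
         * to the start of merge's dirty range. */
        em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
        em->mod_start = merge->mod_start;
}

int main(void)
{
        struct em em = { 8192, 4096 };    /* dirty [8192, 12288) */
        struct em merge = { 4096, 4096 }; /* dirty [4096, 8192)  */

        fold_prev(&em, &merge);
        assert(em.mod_start == 4096 && em.mod_len == 8192);
        printf("dirty range now [%lu, %lu)\n",
               em.mod_start, em.mod_start + em.mod_len);
        return 0;
}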
From 201a90389424d6771d24fc5d72f7e34cb4a8f967 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Thu, 24 Jan 2013 12:02:07 -0500
Subject: Btrfs: do not allow logged extents to be merged or removed

We drop the extent map tree lock while we're logging extents, so somebody
could come in and merge another extent into this one and screw up our
logging, or they could even remove us from the list, which would keep us
from logging the extent or freeing our ref on it.  So we need to make sure
not to clear LOGGING until after the extent is logged; only then can it be
merged with adjacent extents.  Thanks,

Signed-off-by: Josef Bacik
---
 fs/btrfs/extent_map.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/extent_map.c')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index fff2c28497b..ed88f5ee4be 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -171,6 +171,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
         if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
                 return 0;
 
+        if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
+            test_bit(EXTENT_FLAG_LOGGING, &next->flags))
+                return 0;
+
         if (extent_map_end(prev) == next->start &&
             prev->flags == next->flags &&
             prev->bdev == next->bdev &&
@@ -256,7 +260,8 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
         if (!em)
                 goto out;
 
-        list_move(&em->list, &tree->modified_extents);
+        if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+                list_move(&em->list, &tree->modified_extents);
         em->generation = gen;
         clear_bit(EXTENT_FLAG_PINNED, &em->flags);
         em->mod_start = em->start;
@@ -281,6 +286,12 @@ out:
 
 }
 
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
+{
+        clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+        try_merge_map(tree, em);
+}
+
 /**
  * add_extent_mapping - add new extent map to the extent tree
  * @tree:       tree to insert new map in
--
cgit v1.2.3

From 222c81dc3874b4fe98371be665d0447a36447653 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Mon, 28 Jan 2013 09:45:20 -0500
Subject: Btrfs: do not merge logged extents if we've removed them from the tree

You can run into a problem where, if somebody is fsyncing and writing out
the existing extents, you will have removed the extent map from the em
tree, but it's still valid for the current fsync, so we go ahead and write
it.  The problem is that we then unconditionally try to merge it back into
the em tree, but if we've removed it from the em tree that will cause
use-after-free problems.  Fix this to only merge if we are still a part of
the tree.  Thanks,

Signed-off-by: Josef Bacik
---
 fs/btrfs/extent_map.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/extent_map.c')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index ed88f5ee4be..9759911dd34 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -289,7 +289,8 @@ out:
 void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
 {
         clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
-        try_merge_map(tree, em);
+        if (em->in_tree)
+                try_merge_map(tree, em);
 }
 
 /**
--
cgit v1.2.3
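Taken together, the last two patches pin down the lifecycle of a logged extent: while EXTENT_FLAG_LOGGING is set the extent may be neither merged nor unspliced from the modified list, and once logging finishes it is merged back only if it is still in the tree. A minimal userspace model of that protocol follows; the flag and field names mirror the patches, but the functions themselves are illustrative sketches, not the kernel code.

/* Toy model of the EXTENT_FLAG_LOGGING lifecycle. */
#include <stdbool.h>
#include <stdio.h>

struct em {
        bool logging; /* models EXTENT_FLAG_LOGGING */
        bool in_tree; /* models em->in_tree         */
};

static void try_merge(struct em *em)
{
        if (em->logging)
                return;   /* mergable_maps() now refuses logged extents */
        printf("merged with neighbours\n");
}

/* Mirrors clear_em_logging() after the use-after-free fix. */
static void clear_em_logging(struct em *em)
{
        em->logging = false;
        if (em->in_tree)  /* it may have been removed while we logged */
                try_merge(em);
}

int main(void)
{
        struct em em = { .logging = true, .in_tree = true };

        try_merge(&em);        /* no-op: extent is still being logged */
        clear_em_logging(&em); /* now it is safe to merge again       */
        return 0;
}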