From 5dc562c541e1026df9d43913c2f6b91156e22d32 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 17 Aug 2012 13:14:17 -0400 Subject: Btrfs: turbo charge fsync At least for the vm workload. Currently on fsync we will 1) Truncate all items in the log tree for the given inode if they exist and 2) Copy all items for a given inode into the log. The problem with this is that for things like VMs you can have lots of extents from the fragmented writing behavior, and worse yet you may have only modified a few extents, not the entire thing. This patch fixes this problem by tracking which transid modified our extent, and then when we do the tree logging we find all of the extents we've modified in our current transaction, sort them and commit them. We also only truncate up to the xattrs of the inode and copy that stuff in normally, and then just drop any extents in the range we have that exist in the log already. Here are some numbers of a 50 meg fio job that does random writes and fsync()s after every write Original Patched SATA drive 82KB/s 140KB/s Fusion drive 431KB/s 2532KB/s So around 2-6 times faster depending on your hardware. There are a few corner cases, for example if you truncate at all we have to do it the old way since there is no way to be sure what is in the log is ok. This probably could be done smarter, but if you write-fsync-truncate-write-fsync you deserve what you get. All this work is in RAM of course so if your inode gets evicted from cache and you read it in and fsync it we'll do it the slow way if we are still in the same transaction that we last modified the inode in. The biggest cool part of this is that it requires no changes to the recovery code, so if you fsync with this patch and crash and load an old kernel, it will run the recovery and be a-ok. I have tested this pretty thoroughly with an fsync tester and everything comes back fine, as well as xfstests. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_map.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent_map.h') diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 1195f09761f..2388a60bd6e 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -23,15 +23,18 @@ struct extent_map { u64 orig_start; u64 block_start; u64 block_len; + u64 generation; unsigned long flags; struct block_device *bdev; atomic_t refs; unsigned int in_tree; unsigned int compress_type; + struct list_head list; }; struct extent_map_tree { struct rb_root map; + struct list_head modified_extents; rwlock_t lock; }; @@ -60,7 +63,7 @@ struct extent_map *alloc_extent_map(void); void free_extent_map(struct extent_map *em); int __init extent_map_init(void); void extent_map_exit(void); -int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len); +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); #endif -- cgit v1.2.3 From 4e2f84e63dc138eca91e89ccbc34f37732ce58f7 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 27 Aug 2012 10:52:20 -0600 Subject: Btrfs: improve fsync by filtering extents that we want This is based on Josef's "Btrfs: turbo charge fsync". Josef's patch above performs very well in the random sync write test, because we won't have too many extents to merge. However, it does not perform well on this test: dd if=/dev/zero of=foobar bs=4k count=12500 oflag=sync The reason is that when we do sequential sync writes, we need to merge the current extent just with the previous one, so that we can get accumulated extents to log: A(4k) --> AA(8k) --> AAA(12k) --> AAAA(16k) ... So we'll have to flush more and more checksums into the log tree, which is the bottleneck according to my tests. But we can avoid this by telling fsync which extents actually need to be logged. 
With this, I did the above dd sync write test (size=50m), w/o (orig) w/ (josef's) w/ (this) SATA 104KB/s 109KB/s 121KB/s ramdisk 1.5MB/s 1.5MB/s 10.7MB/s (613%) Signed-off-by: Liu Bo --- fs/btrfs/extent_map.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs/extent_map.h') diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 2388a60bd6e..8e6294b5135 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -20,6 +20,8 @@ struct extent_map { /* all of these are in bytes */ u64 start; u64 len; + u64 mod_start; + u64 mod_len; u64 orig_start; u64 block_start; u64 block_len; -- cgit v1.2.3 From ff44c6e36dc9dcc02652a1105b120bdf08cea9f7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Sep 2012 12:59:20 -0400 Subject: Btrfs: do not hold the write_lock on the extent tree while logging Dave Sterba pointed out a sleeping while atomic bug while doing fsync. This is because I'm an idiot and didn't realize that rwlocks were spin locks, so we've been holding this thing while doing allocations and such which is not good. This patch fixes this by dropping the write lock before we do anything heavy and re-acquire it when it is done. We also need to take a ref on the em's in case their corresponding pages are evicted and mark them as being logged so that releasepage does not remove them and doesn't remove them from our local list. 
Thanks, Reported-by: Dave Sterba Signed-off-by: Josef Bacik --- fs/btrfs/extent_map.h | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/extent_map.h') diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 8e6294b5135..679225555f7 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -13,6 +13,7 @@ #define EXTENT_FLAG_COMPRESSED 1 #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ +#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ struct extent_map { struct rb_node rb_node; -- cgit v1.2.3 From b493968096944a11422c4d80fb87af537ca1cac7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 3 Dec 2012 10:31:19 -0500 Subject: Btrfs: keep track of the extents original block length If we've written to a prealloc extent we need to know the original block len for the extent. We can't figure this out currently since ->block_len is just set to the extent length. So introduce ->orig_block_len so that we know how many bytes were in the original extent for proper extent logging that future patches will need. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_map.h | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/extent_map.h') diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 679225555f7..99a0dcb5ba2 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -24,6 +24,7 @@ struct extent_map { u64 mod_start; u64 mod_len; u64 orig_start; + u64 orig_block_len; u64 block_start; u64 block_len; u64 generation; -- cgit v1.2.3 From b11e234d21e73df94099e473a080bca502b9a496 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 3 Dec 2012 10:58:15 -0500 Subject: Btrfs: do not mark ems as prealloc if we are writing to them We are going to use EM's to log extents in the future, so we need to not mark them as prealloc if they aren't actually prealloc extents. 
Instead mark them with FILLING so we know to amend mod_start/mod_len and that way we don't confuse the extent logging code. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_map.h | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/extent_map.h') diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 99a0dcb5ba2..922943ce29e 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -14,6 +14,7 @@ #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ #define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ +#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */ struct extent_map { struct rb_node rb_node; -- cgit v1.2.3 From 201a90389424d6771d24fc5d72f7e34cb4a8f967 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 24 Jan 2013 12:02:07 -0500 Subject: Btrfs: do not allow logged extents to be merged or removed We drop the extent map tree lock while we're logging extents, so somebody could come in and merge another extent into this one and screw up our logging, or they could even remove us from the list which would keep us from logging the extent or freeing our ref on it, so we need to make sure to not clear LOGGING until after the extent is logged, and then we can merge it to adjacent extents. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_map.h | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/extent_map.h') diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 922943ce29e..c6598c89cff 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -69,6 +69,7 @@ void free_extent_map(struct extent_map *em); int __init extent_map_init(void); void extent_map_exit(void); int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); +void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em); struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); #endif -- cgit v1.2.3