From fbe104942d3ff44f6802e8e4a3fbf267c1fb9ac4 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Mon, 9 Jul 2012 16:29:29 -0400 Subject: ext4: split ext4_file_write into buffered IO and direct IO ext4_file_dio_write is defined in order to split buffered IO and direct IO in ext4. This patch just refactors some stuff in the write path. CC: Tao Ma CC: Eric Sandeen CC: Robin Dong Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/file.c | 60 +++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 22 deletions(-) (limited to 'fs/ext4/file.c') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8c7642a0005..a10dc7742ae 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -90,34 +90,16 @@ ext4_unaligned_aio(struct inode *inode, const struct iovec *iov, } static ssize_t -ext4_file_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; int unaligned_aio = 0; ssize_t ret; - /* - * If we have encountered a bitmap-format file, the size limit - * is smaller than s_maxbytes, which is for extent-mapped files. 
- */ - - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - size_t length = iov_length(iov, nr_segs); - - if ((pos > sbi->s_bitmap_maxbytes || - (pos == sbi->s_bitmap_maxbytes && length > 0))) - return -EFBIG; - - if (pos + length > sbi->s_bitmap_maxbytes) { - nr_segs = iov_shorten((struct iovec *)iov, nr_segs, - sbi->s_bitmap_maxbytes - pos); - } - } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) && - !is_sync_kiocb(iocb))) { + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && + !is_sync_kiocb(iocb)) unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos); - } /* Unaligned direct AIO must be serialized; see comment above */ if (unaligned_aio) { @@ -141,6 +123,40 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, return ret; } +static ssize_t +ext4_file_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; + ssize_t ret; + + /* + * If we have encountered a bitmap-format file, the size limit + * is smaller than s_maxbytes, which is for extent-mapped files. 
+ */ + + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + size_t length = iov_length(iov, nr_segs); + + if ((pos > sbi->s_bitmap_maxbytes || + (pos == sbi->s_bitmap_maxbytes && length > 0))) + return -EFBIG; + + if (pos + length > sbi->s_bitmap_maxbytes) { + nr_segs = iov_shorten((struct iovec *)iov, nr_segs, + sbi->s_bitmap_maxbytes - pos); + } + } + + if (unlikely(iocb->ki_filp->f_flags & O_DIRECT)) + ret = ext4_file_dio_write(iocb, iov, nr_segs, pos); + else + ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + + return ret; +} + static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = ext4_page_mkwrite, -- cgit v1.2.3 From e8b96eb5034a0ccebf36760f88e31ea3e3cdf1e4 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 30 Apr 2012 13:11:29 -0500 Subject: vfs: allow custom EOF in generic_file_llseek code For ext3/4 htree directories, using the vfs llseek function with SEEK_END goes to i_size like for any other file, but in reality we want the maximum possible hash value. Recent changes in ext4 have cut & pasted generic_file_llseek() back into fs/ext4/dir.c, but replicating this core code seems like a bad idea, especially since the copy has already diverged from the vfs. This patch updates generic_file_llseek_size to accept both a custom maximum offset, and a custom EOF position. With this in place, ext4_dir_llseek can pass in the appropriate maximum hash position for both maxsize and eof, and get what it wants. As far as I know, this does not fix any bugs - nfs in the kernel doesn't use SEEK_END, and I don't know of any user who does. But some ext4 folks seem keen on doing the right thing here, and I can't really argue. 
(Patch also fixes up some comments slightly) Signed-off-by: Eric Sandeen Signed-off-by: Al Viro --- fs/ext4/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/ext4/file.c') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8c7642a0005..f3dadd0a0d5 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -225,7 +225,8 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin) else maxbytes = inode->i_sb->s_maxbytes; - return generic_file_llseek_size(file, offset, origin, maxbytes); + return generic_file_llseek_size(file, offset, origin, + maxbytes, i_size_read(inode)); } const struct file_operations ext4_file_operations = { -- cgit v1.2.3 From ec7268ce21b379a248705548573393e4f346b20b Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 30 Apr 2012 13:14:03 -0500 Subject: ext4: use core vfs llseek code for dir seeks Use the new functionality in generic_file_llseek_size() to accept a custom EOF position, and un-cut-and-paste all the vfs llseek code from ext4. Also fix up comments on ext4_llseek() to reflect reality. Signed-off-by: Eric Sandeen Signed-off-by: Al Viro --- fs/ext4/file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/ext4/file.c') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index f3dadd0a0d5..782eecb57e4 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -211,9 +211,9 @@ static int ext4_file_open(struct inode * inode, struct file * filp) } /* - * ext4_llseek() copied from generic_file_llseek() to handle both - * block-mapped and extent-mapped maxbytes values. This should - * otherwise be identical with generic_file_llseek(). + * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values + * by calling generic_file_llseek_size() with the appropriate maxbytes + * value for each. 
*/ loff_t ext4_llseek(struct file *file, loff_t offset, int origin) { -- cgit v1.2.3 From 4bd809dbbf177ad0c450d702466b1da63e1b4b7e Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sun, 22 Jul 2012 20:19:31 -0400 Subject: ext4: don't take the i_mutex lock when doing DIO overwrites Aligned and overwrite direct I/O can be parallelized. In ext4_file_dio_write, we first check whether these conditions are satisfied or not. If so, we take i_data_sem and release i_mutex lock directly. Meanwhile iocb->private is set to indicate that this is a dio overwrite, and it will be handled in ext4_ext_direct_IO. [ Added fix from Dan Carpenter to fix locking bug on the error path. ] CC: Tao Ma CC: Eric Sandeen CC: Robin Dong Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Signed-off-by: Dan Carpenter --- fs/ext4/file.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) (limited to 'fs/ext4/file.c') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index a10dc7742ae..1c81509f5bd 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -93,9 +93,13 @@ static ssize_t ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct blk_plug plug; int unaligned_aio = 0; ssize_t ret; + int overwrite = 0; + size_t length = iov_length(iov, nr_segs); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && !is_sync_kiocb(iocb)) @@ -115,7 +119,50 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, ext4_aiodio_wait(inode); } - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + BUG_ON(iocb->ki_pos != pos); + + mutex_lock(&inode->i_mutex); + blk_start_plug(&plug); + + iocb->private = &overwrite; + + /* check whether we do a DIO overwrite or not */ + if (ext4_should_dioread_nolock(inode) && !unaligned_aio && + !file->f_mapping->nrpages && 
pos + length <= i_size_read(inode)) { + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; + int err, len; + + map.m_lblk = pos >> blkbits; + map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits) + - map.m_lblk; + len = map.m_len; + + err = ext4_map_blocks(NULL, inode, &map, 0); + /* + * 'err==len' means that all of blocks has been preallocated no + * matter they are initialized or not. For excluding + * uninitialized extents, we need to check m_flags. There are + * two conditions that indicate for initialized extents. + * 1) If we hit extent cache, EXT4_MAP_MAPPED flag is returned; + * 2) If we do a real lookup, non-flags are returned. + * So we should check these two conditions. + */ + if (err == len && (map.m_flags & EXT4_MAP_MAPPED)) + overwrite = 1; + } + + ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + mutex_unlock(&inode->i_mutex); + + if (ret > 0 || ret == -EIOCBQUEUED) { + ssize_t err; + + err = generic_write_sync(file, pos, ret); + if (err < 0 && ret > 0) + ret = err; + } + blk_finish_plug(&plug); if (unaligned_aio) mutex_unlock(ext4_aio_mutex(inode)); -- cgit v1.2.3 From 044ce47fec90ec0f25605e87a5d72cca14568bc3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 22 Jul 2012 20:31:31 -0400 Subject: ext4: convert last user of ext4_mark_super_dirty() to ext4_handle_dirty_super() The last user of ext4_mark_super_dirty() in ext4_file_open() is so rare it can well be modifying the superblock properly by journalling the change. Change it and get rid of ext4_mark_super_dirty() as it's not needed anymore. Artem: small amendments. Artem: tested using xfstests for both journalled and non-journalled ext4. 
Signed-off-by: Jan Kara Signed-off-by: Artem Bityutskiy Signed-off-by: "Theodore Ts'o" Tested-by: Artem Bityutskiy --- fs/ext4/file.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs/ext4/file.c') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 1c81509f5bd..f77e795fed6 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -244,9 +244,21 @@ static int ext4_file_open(struct inode * inode, struct file * filp) path.dentry = mnt->mnt_root; cp = d_path(&path, buf, sizeof(buf)); if (!IS_ERR(cp)) { + handle_t *handle; + int err; + + handle = ext4_journal_start_sb(sb, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) { + ext4_journal_stop(handle); + return err; + } strlcpy(sbi->s_es->s_last_mounted, cp, sizeof(sbi->s_es->s_last_mounted)); - ext4_mark_super_dirty(sb); + ext4_handle_dirty_super(handle, sb); + ext4_journal_stop(handle); } } /* -- cgit v1.2.3 From e27f41e1b789e60e7d8cc9c81fd93ca49ef31f13 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Fri, 28 Sep 2012 23:24:52 -0400 Subject: ext4: give i_aiodio_unwritten a more appropriate name AIO/DIO prefix is wrong because it accounts for unwritten extents which also may be scheduled from buffered write endio Reviewed-by: Jan Kara Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/ext4/file.c') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3b0e3bdaabf..39335bda404 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -55,11 +55,11 @@ static int ext4_release_file(struct inode *inode, struct file *filp) return 0; } -static void ext4_aiodio_wait(struct inode *inode) +static void ext4_unwritten_wait(struct inode *inode) { wait_queue_head_t *wq = ext4_ioend_wq(inode); - wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0)); + wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)); } /* @@ 
-116,7 +116,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, "performance will be poor.", inode->i_ino, current->comm); mutex_lock(ext4_aio_mutex(inode)); - ext4_aiodio_wait(inode); + ext4_unwritten_wait(inode); } BUG_ON(iocb->ki_pos != pos); -- cgit v1.2.3 From c278531d39f3158bfee93dc67da0b77e09776de2 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Fri, 5 Oct 2012 11:31:55 -0400 Subject: ext4: fix ext4_flush_completed_IO wait semantics BUG #1) All places where we call ext4_flush_completed_IO are broken because buffered io and DIO/AIO go through three stages 1) submitted io, 2) completed io (in i_completed_io_list) conversion pending 3) finished io (conversion done) And by calling ext4_flush_completed_IO we will flush only requests which were in (2) stage, which is wrong because: 1) punch_hole and truncate _must_ wait for all outstanding unwritten io regardless of its state. 2) fsync and nolock_dio_read should also wait because there is a time window between end_page_writeback() and ext4_add_complete_io() As a result integrity fsync is broken in case of buffered write to fallocated region: fsync blkdev_completion ->filemap_write_and_wait_range ->ext4_end_bio ->end_page_writeback <-- filemap_write_and_wait_range return ->ext4_flush_completed_IO sees empty i_completed_io_list but pended conversion still exist ->ext4_add_complete_io BUG #2) Race window becomes wider due to the 'ext4: completed_io locking cleanup V4' patch series This patch makes the following changes: 1) ext4_flush_completed_io() now first tries to flush completed io and then waits for any outstanding unwritten io via ext4_unwritten_wait() 2) Rename the function to a more appropriate name. 
3) Assert that all callers of ext4_flush_unwritten_io should hold i_mutex to prevent endless wait Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/file.c') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 39335bda404..ca6f07afe60 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -55,7 +55,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp) return 0; } -static void ext4_unwritten_wait(struct inode *inode) +void ext4_unwritten_wait(struct inode *inode) { wait_queue_head_t *wq = ext4_ioend_wq(inode); -- cgit v1.2.3 From 0b173bc4daa8f8ec03a85abf5e47b23502ff80af Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:46 -0700 Subject: mm: kill vma flag VM_CAN_NONLINEAR Move actual pte filling for non-linear file mappings into the new special vma operation: ->remap_pages(). Filesystems must implement this method to get non-linear mapping support, if it uses filemap_fault() then generic_file_remap_pages() can be used. Now device drivers can implement this method and obtain nonlinear vma support. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf #arch/tile Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. 
Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext4/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/file.c') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ca6f07afe60..bf3966bccd3 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -207,6 +207,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = ext4_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) @@ -217,7 +218,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) return -ENOEXEC; file_accessed(file); vma->vm_ops = &ext4_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } -- cgit v1.2.3 From c8c0df241cc2719b1262e627f999638411934f60 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Thu, 8 Nov 2012 21:57:40 -0500 Subject: ext4: introduce lseek SEEK_DATA/SEEK_HOLE support This patch makes ext4 really support SEEK_DATA/SEEK_HOLE flags. Block-mapped and extent-mapped files are fully implemented together because ext4_map_blocks hides these differences. After applying this patch, it will cause a failure in xfstest #285 when the file is block-mapped because block-mapped files do not support fallocate(2). I had tried to use ext4_ext_walk_space() to retrieve the offset for an extent-mapped file. But in the end I decided to keep using ext4_map_blocks() to support SEEK_DATA/SEEK_HOLE because ext4_map_blocks() can hide the difference between block-mapped and extent-mapped files. 
Moreover, in next step, extent status tree will track all extent status, and we can get all mappings from this tree. So I think that using ext4_map_blocks() is a better choice. CC: Hugh Dickins Signed-off-by: Jie Liu Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/file.c | 334 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 332 insertions(+), 2 deletions(-) (limited to 'fs/ext4/file.c') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index bf3966bccd3..2f5759eb9f8 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -285,6 +286,324 @@ static int ext4_file_open(struct inode * inode, struct file * filp) return dquot_file_open(inode, filp); } +/* + * Here we use ext4_map_blocks() to get a block mapping for a extent-based + * file rather than ext4_ext_walk_space() because we can introduce + * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped file at the same + * function. When extent status tree has been fully implemented, it will + * track all extent status for a file and we can directly use it to + * retrieve the offset for SEEK_DATA/SEEK_HOLE. + */ + +/* + * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we would need to + * lookup page cache to check whether or not there has some data between + * [startoff, endoff] because, if this range contains an unwritten extent, + * we determine this extent as a data or a hole according to whether the + * page cache has data or not. 
+ */ +static int ext4_find_unwritten_pgoff(struct inode *inode, + int origin, + struct ext4_map_blocks *map, + loff_t *offset) +{ + struct pagevec pvec; + unsigned int blkbits; + pgoff_t index; + pgoff_t end; + loff_t endoff; + loff_t startoff; + loff_t lastoff; + int found = 0; + + blkbits = inode->i_sb->s_blocksize_bits; + startoff = *offset; + lastoff = startoff; + endoff = (map->m_lblk + map->m_len) << blkbits; + + index = startoff >> PAGE_CACHE_SHIFT; + end = endoff >> PAGE_CACHE_SHIFT; + + pagevec_init(&pvec, 0); + do { + int i, num; + unsigned long nr_pages; + + num = min_t(pgoff_t, end - index, PAGEVEC_SIZE); + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, + (pgoff_t)num); + if (nr_pages == 0) { + if (origin == SEEK_DATA) + break; + + BUG_ON(origin != SEEK_HOLE); + /* + * If this is the first time to go into the loop and + * offset is not beyond the end offset, it will be a + * hole at this offset + */ + if (lastoff == startoff || lastoff < endoff) + found = 1; + break; + } + + /* + * If this is the first time to go into the loop and + * offset is smaller than the first page offset, it will be a + * hole at this offset. + */ + if (lastoff == startoff && origin == SEEK_HOLE && + lastoff < page_offset(pvec.pages[0])) { + found = 1; + break; + } + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + /* + * If the current offset is not beyond the end of given + * range, it will be a hole. 
+ */ + if (lastoff < endoff && origin == SEEK_HOLE && + page->index > end) { + found = 1; + *offset = lastoff; + goto out; + } + + lock_page(page); + + if (unlikely(page->mapping != inode->i_mapping)) { + unlock_page(page); + continue; + } + + if (!page_has_buffers(page)) { + unlock_page(page); + continue; + } + + if (page_has_buffers(page)) { + lastoff = page_offset(page); + bh = head = page_buffers(page); + do { + if (buffer_uptodate(bh) || + buffer_unwritten(bh)) { + if (origin == SEEK_DATA) + found = 1; + } else { + if (origin == SEEK_HOLE) + found = 1; + } + if (found) { + *offset = max_t(loff_t, + startoff, lastoff); + unlock_page(page); + goto out; + } + lastoff += bh->b_size; + bh = bh->b_this_page; + } while (bh != head); + } + + lastoff = page_offset(page) + PAGE_SIZE; + unlock_page(page); + } + + /* + * The no. of pages is less than our desired, that would be a + * hole in there. + */ + if (nr_pages < num && origin == SEEK_HOLE) { + found = 1; + *offset = lastoff; + break; + } + + index = pvec.pages[i - 1]->index + 1; + pagevec_release(&pvec); + } while (index <= end); + +out: + pagevec_release(&pvec); + return found; +} + +/* + * ext4_seek_data() retrieves the offset for SEEK_DATA. 
+ */ +static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) +{ + struct inode *inode = file->f_mapping->host; + struct ext4_map_blocks map; + struct extent_status es; + ext4_lblk_t start, last, end; + loff_t dataoff, isize; + int blkbits; + int ret = 0; + + mutex_lock(&inode->i_mutex); + + isize = i_size_read(inode); + if (offset >= isize) { + mutex_unlock(&inode->i_mutex); + return -ENXIO; + } + + blkbits = inode->i_sb->s_blocksize_bits; + start = offset >> blkbits; + last = start; + end = isize >> blkbits; + dataoff = offset; + + do { + map.m_lblk = last; + map.m_len = end - last + 1; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { + if (last != start) + dataoff = last << blkbits; + break; + } + + /* + * If there is a delay extent at this offset, + * it will be as a data. + */ + es.start = last; + (void)ext4_es_find_extent(inode, &es); + if (last >= es.start && + last < es.start + es.len) { + if (last != start) + dataoff = last << blkbits; + break; + } + + /* + * If there is a unwritten extent at this offset, + * it will be as a data or a hole according to page + * cache that has data or not. + */ + if (map.m_flags & EXT4_MAP_UNWRITTEN) { + int unwritten; + unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, + &map, &dataoff); + if (unwritten) + break; + } + + last++; + dataoff = last << blkbits; + } while (last <= end); + + mutex_unlock(&inode->i_mutex); + + if (dataoff > isize) + return -ENXIO; + + if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) + return -EINVAL; + if (dataoff > maxsize) + return -EINVAL; + + if (dataoff != file->f_pos) { + file->f_pos = dataoff; + file->f_version = 0; + } + + return dataoff; +} + +/* + * ext4_seek_hole() retrieves the offset for SEEK_HOLE. 
+ */ +static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) +{ + struct inode *inode = file->f_mapping->host; + struct ext4_map_blocks map; + struct extent_status es; + ext4_lblk_t start, last, end; + loff_t holeoff, isize; + int blkbits; + int ret = 0; + + mutex_lock(&inode->i_mutex); + + isize = i_size_read(inode); + if (offset >= isize) { + mutex_unlock(&inode->i_mutex); + return -ENXIO; + } + + blkbits = inode->i_sb->s_blocksize_bits; + start = offset >> blkbits; + last = start; + end = isize >> blkbits; + holeoff = offset; + + do { + map.m_lblk = last; + map.m_len = end - last + 1; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { + last += ret; + holeoff = last << blkbits; + continue; + } + + /* + * If there is a delay extent at this offset, + * we will skip this extent. + */ + es.start = last; + (void)ext4_es_find_extent(inode, &es); + if (last >= es.start && + last < es.start + es.len) { + last = es.start + es.len; + holeoff = last << blkbits; + continue; + } + + /* + * If there is a unwritten extent at this offset, + * it will be as a data or a hole according to page + * cache that has data or not. 
+ */ + if (map.m_flags & EXT4_MAP_UNWRITTEN) { + int unwritten; + unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, + &map, &holeoff); + if (!unwritten) { + last += ret; + holeoff = last << blkbits; + continue; + } + } + + /* find a hole */ + break; + } while (last <= end); + + mutex_unlock(&inode->i_mutex); + + if (holeoff > isize) + holeoff = isize; + + if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) + return -EINVAL; + if (holeoff > maxsize) + return -EINVAL; + + if (holeoff != file->f_pos) { + file->f_pos = holeoff; + file->f_version = 0; + } + + return holeoff; +} + /* * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values * by calling generic_file_llseek_size() with the appropriate maxbytes @@ -300,8 +619,19 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin) else maxbytes = inode->i_sb->s_maxbytes; - return generic_file_llseek_size(file, offset, origin, - maxbytes, i_size_read(inode)); + switch (origin) { + case SEEK_SET: + case SEEK_CUR: + case SEEK_END: + return generic_file_llseek_size(file, offset, origin, + maxbytes, i_size_read(inode)); + case SEEK_DATA: + return ext4_seek_data(file, offset, maxbytes); + case SEEK_HOLE: + return ext4_seek_hole(file, offset, maxbytes); + } + + return -EINVAL; } const struct file_operations ext4_file_operations = { -- cgit v1.2.3 From 939da1084458246d2e29dd921c2012c177000e96 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 10 Dec 2012 16:30:43 -0500 Subject: ext4: Remove CONFIG_EXT4_FS_XATTR Ted has sent out a RFC about removing this feature. Eric and Jan confirmed that both RedHat and SUSE enable this feature in all their product. David also said that "As far as I know, it's enabled in all Android kernels that use ext4." So it seems OK for us. And what's more, as inline data depends its implementation on xattr, and to be frank, I don't run any test again inline data enabled while xattr disabled. 
So I think we should add inline data and remove this config option in the same release. [ The savings if you disable CONFIG_EXT4_FS_XATTR is only 27k, which isn't much in the grand scheme of things. Since no one seems to be testing this configuration except for some automated compile farms, on balance we are better removing this config option, and so that it is effectively always enabled. -- tytso ] Cc: David Brown Cc: Eric Sandeen Reviewed-by: Jan Kara Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/file.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/ext4/file.c') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2f5759eb9f8..b64a60bf105 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -656,12 +656,10 @@ const struct file_operations ext4_file_operations = { const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, -#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif .get_acl = ext4_get_acl, .fiemap = ext4_fiemap, }; -- cgit v1.2.3 From 965c8e59cfcf845ecde2265a1d1bfee5f011d302 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Dec 2012 15:59:39 -0800 Subject: lseek: the "whence" argument is called "whence" But the kernel decided to call it "origin" instead. Fix most of the sites. Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext4/file.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'fs/ext4/file.c') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index b64a60bf105..d07c27ca594 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -303,7 +303,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) * page cache has data or not. 
*/ static int ext4_find_unwritten_pgoff(struct inode *inode, - int origin, + int whence, struct ext4_map_blocks *map, loff_t *offset) { @@ -333,10 +333,10 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, (pgoff_t)num); if (nr_pages == 0) { - if (origin == SEEK_DATA) + if (whence == SEEK_DATA) break; - BUG_ON(origin != SEEK_HOLE); + BUG_ON(whence != SEEK_HOLE); /* * If this is the first time to go into the loop and * offset is not beyond the end offset, it will be a @@ -352,7 +352,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, * offset is smaller than the first page offset, it will be a * hole at this offset. */ - if (lastoff == startoff && origin == SEEK_HOLE && + if (lastoff == startoff && whence == SEEK_HOLE && lastoff < page_offset(pvec.pages[0])) { found = 1; break; @@ -366,7 +366,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, * If the current offset is not beyond the end of given * range, it will be a hole. */ - if (lastoff < endoff && origin == SEEK_HOLE && + if (lastoff < endoff && whence == SEEK_HOLE && page->index > end) { found = 1; *offset = lastoff; @@ -391,10 +391,10 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, do { if (buffer_uptodate(bh) || buffer_unwritten(bh)) { - if (origin == SEEK_DATA) + if (whence == SEEK_DATA) found = 1; } else { - if (origin == SEEK_HOLE) + if (whence == SEEK_HOLE) found = 1; } if (found) { @@ -416,7 +416,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, * The no. of pages is less than our desired, that would be a * hole in there. */ - if (nr_pages < num && origin == SEEK_HOLE) { + if (nr_pages < num && whence == SEEK_HOLE) { found = 1; *offset = lastoff; break; @@ -609,7 +609,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) * by calling generic_file_llseek_size() with the appropriate maxbytes * value for each. 
*/ -loff_t ext4_llseek(struct file *file, loff_t offset, int origin) +loff_t ext4_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; loff_t maxbytes; @@ -619,11 +619,11 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin) else maxbytes = inode->i_sb->s_maxbytes; - switch (origin) { + switch (whence) { case SEEK_SET: case SEEK_CUR: case SEEK_END: - return generic_file_llseek_size(file, offset, origin, + return generic_file_llseek_size(file, offset, whence, maxbytes, i_size_read(inode)); case SEEK_DATA: return ext4_seek_data(file, offset, maxbytes); -- cgit v1.2.3 From a28a9178e8fcd9b94f7333184ce78e816c8cb2af Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 25 Dec 2012 13:33:13 -0500 Subject: ext4: remove unaligned AIO warning printk Although I put this in, I now think it was a bad decision. For most users, there is very little to be done in this case. They get the message, once per day, with no real context or proposed action. TBH, it generates support calls when it probably does not need to; the message sounds more dire than the situation really is. Just nuke it. Normal investigation via blktrace or whatnot can reveal poor IO patterns if bad performance is encountered. 
Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/file.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs/ext4/file.c') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index b64a60bf105..1c0aad7db1e 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -108,14 +108,6 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, /* Unaligned direct AIO must be serialized; see comment above */ if (unaligned_aio) { - static unsigned long unaligned_warn_time; - - /* Warn about this once per day */ - if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ)) - ext4_msg(inode->i_sb, KERN_WARNING, - "Unaligned AIO/DIO on inode %ld by %s; " - "performance will be poor.", - inode->i_ino, current->comm); mutex_lock(ext4_aio_mutex(inode)); ext4_unwritten_wait(inode); } -- cgit v1.2.3