aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ext4/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--fs/ext4/inode.c836
1 files changed, 533 insertions, 303 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 02bc8cbe728..cbfe13bf5b2 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -233,6 +233,11 @@ void ext4_evict_inode(struct inode *inode)
if (is_bad_inode(inode))
goto no_delete;
+ /*
+ * Protect us against freezing - iput() caller didn't have to have any
+ * protection against it
+ */
+ sb_start_intwrite(inode->i_sb);
handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);
if (IS_ERR(handle)) {
ext4_std_error(inode->i_sb, PTR_ERR(handle));
@@ -242,6 +247,7 @@ void ext4_evict_inode(struct inode *inode)
* cleaned up.
*/
ext4_orphan_del(NULL, inode);
+ sb_end_intwrite(inode->i_sb);
goto no_delete;
}
@@ -273,6 +279,7 @@ void ext4_evict_inode(struct inode *inode)
stop_handle:
ext4_journal_stop(handle);
ext4_orphan_del(NULL, inode);
+ sb_end_intwrite(inode->i_sb);
goto no_delete;
}
}
@@ -301,6 +308,7 @@ void ext4_evict_inode(struct inode *inode)
else
ext4_free_inode(handle, inode);
ext4_journal_stop(handle);
+ sb_end_intwrite(inode->i_sb);
return;
no_delete:
ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
@@ -346,6 +354,15 @@ void ext4_da_update_reserve_space(struct inode *inode,
used = ei->i_reserved_data_blocks;
}
+ if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) {
+ ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, allocated %d "
+ "with only %d reserved metadata blocks\n", __func__,
+ inode->i_ino, ei->i_allocated_meta_blocks,
+ ei->i_reserved_meta_blocks);
+ WARN_ON(1);
+ ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks;
+ }
+
/* Update per-inode reservations */
ei->i_reserved_data_blocks -= used;
ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
@@ -467,49 +484,6 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
}
/*
- * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map.
- */
-static void set_buffers_da_mapped(struct inode *inode,
- struct ext4_map_blocks *map)
-{
- struct address_space *mapping = inode->i_mapping;
- struct pagevec pvec;
- int i, nr_pages;
- pgoff_t index, end;
-
- index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
- end = (map->m_lblk + map->m_len - 1) >>
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
- pagevec_init(&pvec, 0);
- while (index <= end) {
- nr_pages = pagevec_lookup(&pvec, mapping, index,
- min(end - index + 1,
- (pgoff_t)PAGEVEC_SIZE));
- if (nr_pages == 0)
- break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- struct buffer_head *bh, *head;
-
- if (unlikely(page->mapping != mapping) ||
- !PageDirty(page))
- break;
-
- if (page_has_buffers(page)) {
- bh = head = page_buffers(page);
- do {
- set_buffer_da_mapped(bh);
- bh = bh->b_this_page;
- } while (bh != head);
- }
- index++;
- }
- pagevec_release(&pvec);
- }
-}
-
-/*
* The ext4_map_blocks() function tries to look up the requested blocks,
* and returns if the blocks are already mapped.
*
@@ -544,7 +518,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* Try to see if we can get the block without requesting a new
* file system block.
*/
- down_read((&EXT4_I(inode)->i_data_sem));
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ down_read((&EXT4_I(inode)->i_data_sem));
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
retval = ext4_ext_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -552,10 +527,20 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
retval = ext4_ind_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
}
- up_read((&EXT4_I(inode)->i_data_sem));
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ up_read((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
- int ret = check_block_validity(inode, map);
+ int ret;
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+ /* delayed alloc may be allocated by fallocate and
+ * coverted to initialized by directIO.
+ * we need to handle delayed extent here.
+ */
+ down_write((&EXT4_I(inode)->i_data_sem));
+ goto delayed_mapped;
+ }
+ ret = check_block_validity(inode, map);
if (ret != 0)
return ret;
}
@@ -633,12 +618,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
- /* If we have successfully mapped the delayed allocated blocks,
- * set the BH_Da_Mapped bit on them. Its important to do this
- * under the protection of i_data_sem.
- */
- if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
- set_buffers_da_mapped(inode, map);
+ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+ int ret;
+delayed_mapped:
+ /* delayed allocation blocks has been allocated */
+ ret = ext4_es_remove_extent(inode, map->m_lblk,
+ map->m_len);
+ if (ret < 0)
+ retval = ret;
+ }
}
up_write((&EXT4_I(inode)->i_data_sem));
@@ -661,10 +649,13 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
int ret = 0, started = 0;
int dio_credits;
+ if (ext4_has_inline_data(inode))
+ return -ERANGE;
+
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
- if (flags && !handle) {
+ if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) {
/* Direct IO write... */
if (map.m_len > DIO_MAX_BLOCKS)
map.m_len = DIO_MAX_BLOCKS;
@@ -713,11 +704,13 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
err = ext4_map_blocks(handle, inode, &map,
create ? EXT4_GET_BLOCKS_CREATE : 0);
+ /* ensure we send some value back into *errp */
+ *errp = 0;
+
if (err < 0)
*errp = err;
if (err <= 0)
return NULL;
- *errp = 0;
bh = sb_getblk(inode->i_sb, map.m_pblk);
if (!bh) {
@@ -777,13 +770,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
return NULL;
}
-static int walk_page_buffers(handle_t *handle,
- struct buffer_head *head,
- unsigned from,
- unsigned to,
- int *partial,
- int (*fn)(handle_t *handle,
- struct buffer_head *bh))
+int ext4_walk_page_buffers(handle_t *handle,
+ struct buffer_head *head,
+ unsigned from,
+ unsigned to,
+ int *partial,
+ int (*fn)(handle_t *handle,
+ struct buffer_head *bh))
{
struct buffer_head *bh;
unsigned block_start, block_end;
@@ -833,8 +826,8 @@ static int walk_page_buffers(handle_t *handle,
* is elevated. We'll still have enough credits for the tiny quotafile
* write.
*/
-static int do_journal_get_write_access(handle_t *handle,
- struct buffer_head *bh)
+int do_journal_get_write_access(handle_t *handle,
+ struct buffer_head *bh)
{
int dirty = buffer_dirty(bh);
int ret;
@@ -857,7 +850,7 @@ static int do_journal_get_write_access(handle_t *handle,
return ret;
}
-static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
static int ext4_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
@@ -881,6 +874,17 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+ ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
+ flags, pagep);
+ if (ret < 0)
+ goto out;
+ if (ret == 1) {
+ ret = 0;
+ goto out;
+ }
+ }
+
retry:
handle = ext4_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) {
@@ -898,6 +902,7 @@ retry:
ret = -ENOMEM;
goto out;
}
+
*pagep = page;
if (ext4_should_dioread_nolock(inode))
@@ -906,8 +911,9 @@ retry:
ret = __block_write_begin(page, pos, len, ext4_get_block);
if (!ret && ext4_should_journal_data(inode)) {
- ret = walk_page_buffers(handle, page_buffers(page),
- from, to, NULL, do_journal_get_write_access);
+ ret = ext4_walk_page_buffers(handle, page_buffers(page),
+ from, to, NULL,
+ do_journal_get_write_access);
}
if (ret) {
@@ -962,7 +968,12 @@ static int ext4_generic_write_end(struct file *file,
struct inode *inode = mapping->host;
handle_t *handle = ext4_journal_current_handle();
- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+ if (ext4_has_inline_data(inode))
+ copied = ext4_write_inline_data_end(inode, pos, len,
+ copied, page);
+ else
+ copied = block_write_end(file, mapping, pos,
+ len, copied, page, fsdata);
/*
* No need to use i_size_read() here, the i_size
@@ -1113,16 +1124,21 @@ static int ext4_journalled_write_end(struct file *file,
BUG_ON(!ext4_handle_valid(handle));
- if (copied < len) {
- if (!PageUptodate(page))
- copied = 0;
- page_zero_new_buffers(page, from+copied, to);
- }
+ if (ext4_has_inline_data(inode))
+ copied = ext4_write_inline_data_end(inode, pos, len,
+ copied, page);
+ else {
+ if (copied < len) {
+ if (!PageUptodate(page))
+ copied = 0;
+ page_zero_new_buffers(page, from+copied, to);
+ }
- ret = walk_page_buffers(handle, page_buffers(page), from,
- to, &partial, write_end_fn);
- if (!partial)
- SetPageUptodate(page);
+ ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
+ to, &partial, write_end_fn);
+ if (!partial)
+ SetPageUptodate(page);
+ }
new_i_size = pos + copied;
if (new_i_size > inode->i_size)
i_size_write(inode, pos+copied);
@@ -1171,6 +1187,17 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned int md_needed;
int ret;
+ ext4_lblk_t save_last_lblock;
+ int save_len;
+
+ /*
+ * We will charge metadata quota at writeout time; this saves
+ * us from metadata over-estimation, though we may go over by
+ * a small amount in the end. Here we just reserve for data.
+ */
+ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
+ if (ret)
+ return ret;
/*
* recalculate the amount of metadata blocks to reserve
@@ -1179,32 +1206,31 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
*/
repeat:
spin_lock(&ei->i_block_reservation_lock);
+ /*
+ * ext4_calc_metadata_amount() has side effects, which we have
+ * to be prepared undo if we fail to claim space.
+ */
+ save_len = ei->i_da_metadata_calc_len;
+ save_last_lblock = ei->i_da_metadata_calc_last_lblock;
md_needed = EXT4_NUM_B2C(sbi,
ext4_calc_metadata_amount(inode, lblock));
trace_ext4_da_reserve_space(inode, md_needed);
- spin_unlock(&ei->i_block_reservation_lock);
/*
- * We will charge metadata quota at writeout time; this saves
- * us from metadata over-estimation, though we may go over by
- * a small amount in the end. Here we just reserve for data.
- */
- ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
- if (ret)
- return ret;
- /*
* We do still charge estimated metadata to the sb though;
* we cannot afford to run out of free blocks.
*/
if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) {
- dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
+ ei->i_da_metadata_calc_len = save_len;
+ ei->i_da_metadata_calc_last_lblock = save_last_lblock;
+ spin_unlock(&ei->i_block_reservation_lock);
if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
yield();
goto repeat;
}
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
return -ENOSPC;
}
- spin_lock(&ei->i_block_reservation_lock);
ei->i_reserved_data_blocks++;
ei->i_reserved_meta_blocks += md_needed;
spin_unlock(&ei->i_block_reservation_lock);
@@ -1270,6 +1296,7 @@ static void ext4_da_page_release_reservation(struct page *page,
struct inode *inode = page->mapping->host;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int num_clusters;
+ ext4_fsblk_t lblk;
head = page_buffers(page);
bh = head;
@@ -1279,20 +1306,23 @@ static void ext4_da_page_release_reservation(struct page *page,
if ((offset <= curr_off) && (buffer_delay(bh))) {
to_release++;
clear_buffer_delay(bh);
- clear_buffer_da_mapped(bh);
}
curr_off = next_off;
} while ((bh = bh->b_this_page) != head);
+ if (to_release) {
+ lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ ext4_es_remove_extent(inode, lblk, to_release);
+ }
+
/* If we have released all the blocks belonging to a cluster, then we
* need to release the reserved space for that cluster. */
num_clusters = EXT4_NUM_B2C(sbi, to_release);
while (num_clusters > 0) {
- ext4_fsblk_t lblk;
lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
((num_clusters - 1) << sbi->s_cluster_bits);
if (sbi->s_cluster_ratio == 1 ||
- !ext4_find_delalloc_cluster(inode, lblk, 1))
+ !ext4_find_delalloc_cluster(inode, lblk))
ext4_da_release_space(inode, 1);
num_clusters--;
@@ -1398,8 +1428,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
clear_buffer_delay(bh);
bh->b_blocknr = pblock;
}
- if (buffer_da_mapped(bh))
- clear_buffer_da_mapped(bh);
if (buffer_unwritten(bh) ||
buffer_mapped(bh))
BUG_ON(bh->b_blocknr != pblock);
@@ -1469,9 +1497,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
struct pagevec pvec;
struct inode *inode = mpd->inode;
struct address_space *mapping = inode->i_mapping;
+ ext4_lblk_t start, last;
index = mpd->first_page;
end = mpd->next_page - 1;
+
+ start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ ext4_es_remove_extent(inode, start, last - start + 1);
+
+ pagevec_init(&pvec, 0);
while (index <= end) {
nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
if (nr_pages == 0)
@@ -1625,15 +1660,6 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
for (i = 0; i < map.m_len; i++)
unmap_underlying_metadata(bdev, map.m_pblk + i);
-
- if (ext4_should_order_data(mpd->inode)) {
- err = ext4_jbd2_file_inode(handle, mpd->inode);
- if (err) {
- /* Only if the journal is aborted */
- mpd->retval = err;
- goto submit_io;
- }
- }
}
/*
@@ -1764,7 +1790,19 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
* file system block.
*/
down_read((&EXT4_I(inode)->i_data_sem));
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ if (ext4_has_inline_data(inode)) {
+ /*
+ * We will soon create blocks for this page, and let
+ * us pretend as if the blocks aren't allocated yet.
+ * In case of clusters, we have to handle the work
+ * of mapping from cluster so that the reserved space
+ * is calculated properly.
+ */
+ if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) &&
+ ext4_find_delalloc_cluster(inode, map->m_lblk))
+ map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+ retval = 0;
+ } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
retval = ext4_ext_map_blocks(NULL, inode, map, 0);
else
retval = ext4_ind_map_blocks(NULL, inode, map, 0);
@@ -1783,6 +1821,10 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
goto out_unlock;
}
+ retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len);
+ if (retval)
+ goto out_unlock;
+
/* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
* and it should not appear on the bh->b_state.
*/
@@ -1811,8 +1853,8 @@ out_unlock:
* We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
* initialized properly.
*/
-static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create)
+int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create)
{
struct ext4_map_blocks map;
int ret = 0;
@@ -1886,15 +1928,29 @@ static int __ext4_journalled_writepage(struct page *page,
{
struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
- struct buffer_head *page_bufs;
+ struct buffer_head *page_bufs = NULL;
handle_t *handle = NULL;
- int ret = 0;
- int err;
+ int ret = 0, err = 0;
+ int inline_data = ext4_has_inline_data(inode);
+ struct buffer_head *inode_bh = NULL;
ClearPageChecked(page);
- page_bufs = page_buffers(page);
- BUG_ON(!page_bufs);
- walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
+
+ if (inline_data) {
+ BUG_ON(page->index != 0);
+ BUG_ON(len > ext4_get_max_inline_size(inode));
+ inode_bh = ext4_journalled_write_inline_data(inode, len, page);
+ if (inode_bh == NULL)
+ goto out;
+ } else {
+ page_bufs = page_buffers(page);
+ if (!page_bufs) {
+ BUG();
+ goto out;
+ }
+ ext4_walk_page_buffers(handle, page_bufs, 0, len,
+ NULL, bget_one);
+ }
/* As soon as we unlock the page, it can go away, but we have
* references to buffers so we are safe */
unlock_page(page);
@@ -1907,11 +1963,18 @@ static int __ext4_journalled_writepage(struct page *page,
BUG_ON(!ext4_handle_valid(handle));
- ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
- do_journal_get_write_access);
+ if (inline_data) {
+ ret = ext4_journal_get_write_access(handle, inode_bh);
+
+ err = ext4_handle_dirty_metadata(handle, inode, inode_bh);
- err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
- write_end_fn);
+ } else {
+ ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
+ do_journal_get_write_access);
+
+ err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
+ write_end_fn);
+ }
if (ret == 0)
ret = err;
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
@@ -1919,15 +1982,15 @@ static int __ext4_journalled_writepage(struct page *page,
if (!ret)
ret = err;
- walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
+ if (!ext4_has_inline_data(inode))
+ ext4_walk_page_buffers(handle, page_bufs, 0, len,
+ NULL, bput_one);
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
out:
+ brelse(inode_bh);
return ret;
}
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
-
/*
* Note that we don't need to start a transaction unless we're journaling data
* because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -1941,7 +2004,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
* This function can get called via...
* - ext4_da_writepages after taking page lock (have journal handle)
* - journal_submit_inode_data_buffers (no journal handle)
- * - shrink_page_list via pdflush (no journal handle)
+ * - shrink_page_list via the kswapd/direct reclaim (no journal handle)
* - grab_page_cache when doing write_begin (have journal handle)
*
* We don't do any block allocation in this function. If we have page with
@@ -2001,8 +2064,8 @@ static int ext4_writepage(struct page *page,
commit_write = 1;
}
page_bufs = page_buffers(page);
- if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
- ext4_bh_delay_or_unwritten)) {
+ if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_delay_or_unwritten)) {
/*
* We don't want to do block allocation, so redirty
* the page and return. We may reach here when we do
@@ -2068,7 +2131,8 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
* mpage_da_map_and_submit to map a single contiguous memory region
* and then write them.
*/
-static int write_cache_pages_da(struct address_space *mapping,
+static int write_cache_pages_da(handle_t *handle,
+ struct address_space *mapping,
struct writeback_control *wbc,
struct mpage_da_data *mpd,
pgoff_t *done_index)
@@ -2147,6 +2211,17 @@ static int write_cache_pages_da(struct address_space *mapping,
wait_on_page_writeback(page);
BUG_ON(PageWriteback(page));
+ /*
+ * If we have inline data and arrive here, it means that
+ * we will soon create the block for the 1st page, so
+ * we'd better clear the inline data here.
+ */
+ if (ext4_has_inline_data(inode)) {
+ BUG_ON(ext4_test_inode_state(inode,
+ EXT4_STATE_MAY_INLINE_DATA));
+ ext4_destroy_inline_data(handle, inode);
+ }
+
if (mpd->next_page != page->index)
mpd->first_page = page->index;
mpd->next_page = page->index + 1;
@@ -2353,7 +2428,8 @@ retry:
* contiguous region of logical blocks that need
* blocks to be allocated by ext4 and submit them.
*/
- ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
+ ret = write_cache_pages_da(handle, mapping,
+ wbc, &mpd, &done_index);
/*
* If we have a contiguous extent of pages and we
* haven't done the I/O yet, map the blocks and submit
@@ -2417,7 +2493,6 @@ out_writepages:
return ret;
}
-#define FALL_BACK_TO_NONDELALLOC 1
static int ext4_nonda_switch(struct super_block *sb)
{
s64 free_blocks, dirty_blocks;
@@ -2434,6 +2509,16 @@ static int ext4_nonda_switch(struct super_block *sb)
free_blocks = EXT4_C2B(sbi,
percpu_counter_read_positive(&sbi->s_freeclusters_counter));
dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
+ /*
+ * Start pushing delalloc when 1/2 of free blocks are dirty.
+ */
+ if (dirty_blocks && (free_blocks < 2 * dirty_blocks) &&
+ !writeback_in_progress(sb->s_bdi) &&
+ down_read_trylock(&sb->s_umount)) {
+ writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
+ up_read(&sb->s_umount);
+ }
+
if (2 * free_blocks < 3 * dirty_blocks ||
free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
/*
@@ -2442,13 +2527,6 @@ static int ext4_nonda_switch(struct super_block *sb)
*/
return 1;
}
- /*
- * Even if we don't switch but are nearing capacity,
- * start pushing delalloc when 1/2 of free blocks are dirty.
- */
- if (free_blocks < 2 * dirty_blocks)
- writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE);
-
return 0;
}
@@ -2471,6 +2549,19 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
}
*fsdata = (void *)0;
trace_ext4_da_write_begin(inode, pos, len, flags);
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+ ret = ext4_da_write_inline_data_begin(mapping, inode,
+ pos, len, flags,
+ pagep, fsdata);
+ if (ret < 0)
+ goto out;
+ if (ret == 1) {
+ ret = 0;
+ goto out;
+ }
+ }
+
retry:
/*
* With delayed allocation, we don't log the i_disksize update
@@ -2572,22 +2663,13 @@ static int ext4_da_write_end(struct file *file,
* changes. So let's piggyback the i_disksize mark_inode_dirty
* into that.
*/
-
new_i_size = pos + copied;
if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
- if (ext4_da_should_update_i_disksize(page, end)) {
+ if (ext4_has_inline_data(inode) ||
+ ext4_da_should_update_i_disksize(page, end)) {
down_write(&EXT4_I(inode)->i_data_sem);
- if (new_i_size > EXT4_I(inode)->i_disksize) {
- /*
- * Updating i_disksize when extending file
- * without needing block allocation
- */
- if (ext4_should_order_data(inode))
- ret = ext4_jbd2_file_inode(handle,
- inode);
-
+ if (new_i_size > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = new_i_size;
- }
up_write(&EXT4_I(inode)->i_data_sem);
/* We need to mark inode dirty even if
* new_i_size is less that inode->i_size
@@ -2596,8 +2678,16 @@ static int ext4_da_write_end(struct file *file,
ext4_mark_inode_dirty(handle, inode);
}
}
- ret2 = generic_write_end(file, mapping, pos, len, copied,
+
+ if (write_mode != CONVERT_INLINE_DATA &&
+ ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
+ ext4_has_inline_data(inode))
+ ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
+ page);
+ else
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
+
copied = ret2;
if (ret2 < 0)
ret = ret2;
@@ -2690,6 +2780,12 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
journal_t *journal;
int err;
+ /*
+ * We can get here for an inline file via the FIBMAP ioctl
+ */
+ if (ext4_has_inline_data(inode))
+ return 0;
+
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
test_opt(inode->i_sb, DELALLOC)) {
/*
@@ -2735,14 +2831,30 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
static int ext4_readpage(struct file *file, struct page *page)
{
+ int ret = -EAGAIN;
+ struct inode *inode = page->mapping->host;
+
trace_ext4_readpage(page);
- return mpage_readpage(page, ext4_get_block);
+
+ if (ext4_has_inline_data(inode))
+ ret = ext4_readpage_inline(inode, page);
+
+ if (ret == -EAGAIN)
+ return mpage_readpage(page, ext4_get_block);
+
+ return ret;
}
static int
ext4_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
+ struct inode *inode = mapping->host;
+
+ /* If the file has inline data, no need to do readpages. */
+ if (ext4_has_inline_data(inode))
+ return 0;
+
return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
}
@@ -2768,8 +2880,6 @@ static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offs
static void ext4_invalidatepage(struct page *page, unsigned long offset)
{
- journal_t *journal = EXT4_JOURNAL(page->mapping->host);
-
trace_ext4_invalidatepage(page, offset);
/*
@@ -2777,16 +2887,34 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset)
*/
if (ext4_should_dioread_nolock(page->mapping->host))
ext4_invalidatepage_free_endio(page, offset);
+
+ /* No journalling happens on data buffers when this function is used */
+ WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
+
+ block_invalidatepage(page, offset);
+}
+
+static int __ext4_journalled_invalidatepage(struct page *page,
+ unsigned long offset)
+{
+ journal_t *journal = EXT4_JOURNAL(page->mapping->host);
+
+ trace_ext4_journalled_invalidatepage(page, offset);
+
/*
* If it's a full truncate we just forget about the pending dirtying
*/
if (offset == 0)
ClearPageChecked(page);
- if (journal)
- jbd2_journal_invalidatepage(journal, page, offset);
- else
- block_invalidatepage(page, offset);
+ return jbd2_journal_invalidatepage(journal, page, offset);
+}
+
+/* Wrapper for aops... */
+static void ext4_journalled_invalidatepage(struct page *page,
+ unsigned long offset)
+{
+ WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0);
}
static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -2809,7 +2937,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
* We allocate an uinitialized extent if blocks haven't been allocated.
* The extent will be converted to initialized after the IO is complete.
*/
-static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+int ext4_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
@@ -2818,15 +2946,21 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
EXT4_GET_BLOCKS_IO_CREATE_EXT);
}
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ return _ext4_get_block(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_NO_LOCK);
+}
+
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ssize_t size, void *private, int ret,
bool is_async)
{
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
ext4_io_end_t *io_end = iocb->private;
- struct workqueue_struct *wq;
- unsigned long flags;
- struct ext4_inode_info *ei;
/* if not async direct IO or dio with 0 bytes write, just return */
if (!io_end || !size)
@@ -2855,24 +2989,14 @@ out:
io_end->iocb = iocb;
io_end->result = ret;
}
- wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
-
- /* Add the io_end to per-inode completed aio dio list*/
- ei = EXT4_I(io_end->inode);
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &ei->i_completed_io_list);
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- /* queue the work to convert unwritten extents to written */
- queue_work(wq, &io_end->work);
+ ext4_add_complete_io(io_end);
}
static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
{
ext4_io_end_t *io_end = bh->b_private;
- struct workqueue_struct *wq;
struct inode *inode;
- unsigned long flags;
if (!test_clear_buffer_uninit(bh) || !io_end)
goto out;
@@ -2891,15 +3015,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
*/
inode = io_end->inode;
ext4_set_io_unwritten_flag(inode, io_end);
-
- /* Add the io_end to per-inode completed io list*/
- spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
- spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-
- wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
- /* queue the work to convert unwritten extents to written */
- queue_work(wq, &io_end->work);
+ ext4_add_complete_io(io_end);
out:
bh->b_private = NULL;
bh->b_end_io = NULL;
@@ -2942,10 +3058,10 @@ retry:
* fall back to buffered IO.
*
* For holes, we fallocate those blocks, mark them as uninitialized
- * If those blocks were preallocated, we mark sure they are splited, but
+ * If those blocks were preallocated, we mark sure they are split, but
* still keep the range to write as uninitialized.
*
- * The unwrritten extents will be converted to written when DIO is completed.
+ * The unwritten extents will be converted to written when DIO is completed.
* For async direct IO, since the IO may still pending when return, we
* set up an end_io call back function, which will do the conversion
* when async direct IO completed.
@@ -2963,92 +3079,120 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
struct inode *inode = file->f_mapping->host;
ssize_t ret;
size_t count = iov_length(iov, nr_segs);
-
+ int overwrite = 0;
+ get_block_t *get_block_func = NULL;
+ int dio_flags = 0;
loff_t final_size = offset + count;
- if (rw == WRITE && final_size <= inode->i_size) {
- /*
- * We could direct write to holes and fallocate.
- *
- * Allocated blocks to fill the hole are marked as uninitialized
- * to prevent parallel buffered read to expose the stale data
- * before DIO complete the data IO.
- *
- * As to previously fallocated extents, ext4 get_block
- * will just simply mark the buffer mapped but still
- * keep the extents uninitialized.
- *
- * for non AIO case, we will convert those unwritten extents
- * to written after return back from blockdev_direct_IO.
- *
- * for async DIO, the conversion needs to be defered when
- * the IO is completed. The ext4 end_io callback function
- * will be called to take care of the conversion work.
- * Here for async case, we allocate an io_end structure to
- * hook to the iocb.
- */
- iocb->private = NULL;
- EXT4_I(inode)->cur_aio_dio = NULL;
- if (!is_sync_kiocb(iocb)) {
- ext4_io_end_t *io_end =
- ext4_init_io_end(inode, GFP_NOFS);
- if (!io_end)
- return -ENOMEM;
- io_end->flag |= EXT4_IO_END_DIRECT;
- iocb->private = io_end;
- /*
- * we save the io structure for current async
- * direct IO, so that later ext4_map_blocks()
- * could flag the io structure whether there
- * is a unwritten extents needs to be converted
- * when IO is completed.
- */
- EXT4_I(inode)->cur_aio_dio = iocb->private;
- }
- ret = __blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iov,
- offset, nr_segs,
- ext4_get_block_write,
- ext4_end_io_dio,
- NULL,
- DIO_LOCKING);
- if (iocb->private)
- EXT4_I(inode)->cur_aio_dio = NULL;
+ /* Use the old path for reads and writes beyond i_size. */
+ if (rw != WRITE || final_size > inode->i_size)
+ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+
+ BUG_ON(iocb->private == NULL);
+
+ /* If we do a overwrite dio, i_mutex locking can be released */
+ overwrite = *((int *)iocb->private);
+
+ if (overwrite) {
+ atomic_inc(&inode->i_dio_count);
+ down_read(&EXT4_I(inode)->i_data_sem);
+ mutex_unlock(&inode->i_mutex);
+ }
+
+ /*
+ * We could direct write to holes and fallocate.
+ *
+ * Allocated blocks to fill the hole are marked as
+ * uninitialized to prevent parallel buffered read to expose
+ * the stale data before DIO complete the data IO.
+ *
+ * As to previously fallocated extents, ext4 get_block will
+ * just simply mark the buffer mapped but still keep the
+ * extents uninitialized.
+ *
+ * For non AIO case, we will convert those unwritten extents
+ * to written after return back from blockdev_direct_IO.
+ *
+ * For async DIO, the conversion needs to be deferred when the
+ * IO is completed. The ext4 end_io callback function will be
+ * called to take care of the conversion work. Here for async
+ * case, we allocate an io_end structure to hook to the iocb.
+ */
+ iocb->private = NULL;
+ ext4_inode_aio_set(inode, NULL);
+ if (!is_sync_kiocb(iocb)) {
+ ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS);
+ if (!io_end) {
+ ret = -ENOMEM;
+ goto retake_lock;
+ }
+ io_end->flag |= EXT4_IO_END_DIRECT;
+ iocb->private = io_end;
/*
- * The io_end structure takes a reference to the inode,
- * that structure needs to be destroyed and the
- * reference to the inode need to be dropped, when IO is
- * complete, even with 0 byte write, or failed.
- *
- * In the successful AIO DIO case, the io_end structure will be
- * desctroyed and the reference to the inode will be dropped
- * after the end_io call back function is called.
- *
- * In the case there is 0 byte write, or error case, since
- * VFS direct IO won't invoke the end_io call back function,
- * we need to free the end_io structure here.
+ * we save the io structure for current async direct
+ * IO, so that later ext4_map_blocks() could flag the
+ * io structure whether there is a unwritten extents
+ * needs to be converted when IO is completed.
*/
- if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
- ext4_free_io_end(iocb->private);
- iocb->private = NULL;
- } else if (ret > 0 && ext4_test_inode_state(inode,
+ ext4_inode_aio_set(inode, io_end);
+ }
+
+ if (overwrite) {
+ get_block_func = ext4_get_block_write_nolock;
+ } else {
+ get_block_func = ext4_get_block_write;
+ dio_flags = DIO_LOCKING;
+ }
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iov,
+ offset, nr_segs,
+ get_block_func,
+ ext4_end_io_dio,
+ NULL,
+ dio_flags);
+
+ if (iocb->private)
+ ext4_inode_aio_set(inode, NULL);
+ /*
+ * The io_end structure takes a reference to the inode, that
+ * structure needs to be destroyed and the reference to the
+ * inode need to be dropped, when IO is complete, even with 0
+ * byte write, or failed.
+ *
+ * In the successful AIO DIO case, the io_end structure will
+ * be destroyed and the reference to the inode will be dropped
+ * after the end_io call back function is called.
+ *
+ * In the case there is 0 byte write, or error case, since VFS
+ * direct IO won't invoke the end_io call back function, we
+ * need to free the end_io structure here.
+ */
+ if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
+ ext4_free_io_end(iocb->private);
+ iocb->private = NULL;
+ } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
EXT4_STATE_DIO_UNWRITTEN)) {
- int err;
- /*
- * for non AIO case, since the IO is already
- * completed, we could do the conversion right here
- */
- err = ext4_convert_unwritten_extents(inode,
- offset, ret);
- if (err < 0)
- ret = err;
- ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
- }
- return ret;
+ int err;
+ /*
+ * for non AIO case, since the IO is already
+ * completed, we could do the conversion right here
+ */
+ err = ext4_convert_unwritten_extents(inode,
+ offset, ret);
+ if (err < 0)
+ ret = err;
+ ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+ }
+
+retake_lock:
+ /* take i_mutex locking again if we do a ovewrite dio */
+ if (overwrite) {
+ inode_dio_done(inode);
+ up_read(&EXT4_I(inode)->i_data_sem);
+ mutex_lock(&inode->i_mutex);
}
- /* for write the the end of file case, we fall back to old way */
- return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+ return ret;
}
static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
@@ -3065,6 +3209,10 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
if (ext4_should_journal_data(inode))
return 0;
+ /* Let buffer I/O handle the inline data case. */
+ if (ext4_has_inline_data(inode))
+ return 0;
+
trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -3132,7 +3280,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.write_end = ext4_journalled_write_end,
.set_page_dirty = ext4_journalled_set_page_dirty,
.bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
+ .invalidatepage = ext4_journalled_invalidatepage,
.releasepage = ext4_releasepage,
.direct_IO = ext4_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
@@ -3227,7 +3375,7 @@ int ext4_discard_partial_page_buffers(handle_t *handle,
* handle: The journal handle
* inode: The files inode
* page: A locked page that contains the offset "from"
- * from: The starting byte offset (from the begining of the file)
+ * from: The starting byte offset (from the beginning of the file)
* to begin discarding
* len: The length of bytes to discard
* flags: Optional flags that may be used:
@@ -3235,11 +3383,11 @@ int ext4_discard_partial_page_buffers(handle_t *handle,
* EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
* Only zero the regions of the page whose buffer heads
* have already been unmapped. This flag is appropriate
- * for updateing the contents of a page whose blocks may
+ * for updating the contents of a page whose blocks may
* have already been released, and we only want to zero
* out the regions that correspond to those released blocks.
*
- * Returns zero on sucess or negative on failure.
+ * Returns zero on success or negative on failure.
*/
static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
struct inode *inode, struct page *page, loff_t from,
@@ -3400,7 +3548,7 @@ int ext4_can_truncate(struct inode *inode)
* @offset: The offset where the hole will begin
* @len: The length of the hole
*
- * Returns: 0 on sucess or negative on failure
+ * Returns: 0 on success or negative on failure
*/
int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
@@ -3462,6 +3610,14 @@ void ext4_truncate(struct inode *inode)
if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
+ if (ext4_has_inline_data(inode)) {
+ int has_inline = 1;
+
+ ext4_inline_data_truncate(inode, &has_inline);
+ if (has_inline)
+ return;
+ }
+
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
ext4_ext_truncate(inode);
else
@@ -3687,6 +3843,19 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
}
}
+static inline void ext4_iget_extra_inode(struct inode *inode,
+ struct ext4_inode *raw_inode,
+ struct ext4_inode_info *ei)
+{
+ __le32 *magic = (void *)raw_inode +
+ EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
+ if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
+ ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+ ext4_find_inline_data_nolock(inode);
+ } else
+ EXT4_I(inode)->i_inline_off = 0;
+}
+
struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
{
struct ext4_iloc iloc;
@@ -3757,6 +3926,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
+ ei->i_inline_off = 0;
ei->i_dir_start_lookup = 0;
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
/* We now have enough fields to check if the inode was active or not.
@@ -3829,11 +3999,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
} else {
- __le32 *magic = (void *)raw_inode +
- EXT4_GOOD_OLD_INODE_SIZE +
- ei->i_extra_isize;
- if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
- ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+ ext4_iget_extra_inode(inode, raw_inode, ei);
}
}
@@ -3856,17 +4022,19 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_file_acl);
ret = -EIO;
goto bad_inode;
- } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- (S_ISLNK(inode->i_mode) &&
- !ext4_inode_is_fast_symlink(inode)))
- /* Validate extent which is part of inode */
- ret = ext4_ext_check_inode(inode);
- } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- (S_ISLNK(inode->i_mode) &&
- !ext4_inode_is_fast_symlink(inode))) {
- /* Validate block references which are part of inode */
- ret = ext4_ind_check_inode(inode);
+ } else if (!ext4_has_inline_data(inode)) {
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ (S_ISLNK(inode->i_mode) &&
+ !ext4_inode_is_fast_symlink(inode))))
+ /* Validate extent which is part of inode */
+ ret = ext4_ext_check_inode(inode);
+ } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ (S_ISLNK(inode->i_mode) &&
+ !ext4_inode_is_fast_symlink(inode))) {
+ /* Validate block references which are part of inode */
+ ret = ext4_ind_check_inode(inode);
+ }
}
if (ret)
goto bad_inode;
@@ -3922,7 +4090,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
if (i_blocks <= ~0U) {
/*
- * i_blocks can be represnted in a 32 bit variable
+ * i_blocks can be represented in a 32 bit variable
* as multiple of 512 bytes
*/
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
@@ -3966,6 +4134,7 @@ static int ext4_do_update_inode(handle_t *handle,
struct ext4_inode_info *ei = EXT4_I(inode);
struct buffer_head *bh = iloc->bh;
int err = 0, rc, block;
+ int need_datasync = 0;
uid_t i_uid;
gid_t i_gid;
@@ -4016,7 +4185,10 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_file_acl_high =
cpu_to_le16(ei->i_file_acl >> 32);
raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
- ext4_isize_set(raw_inode, ei->i_disksize);
+ if (ei->i_disksize != ext4_isize(raw_inode)) {
+ ext4_isize_set(raw_inode, ei->i_disksize);
+ need_datasync = 1;
+ }
if (ei->i_disksize > 0x7fffffffULL) {
struct super_block *sb = inode->i_sb;
if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -4034,7 +4206,7 @@ static int ext4_do_update_inode(handle_t *handle,
EXT4_SET_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
ext4_handle_sync(handle);
- err = ext4_handle_dirty_super_now(handle, sb);
+ err = ext4_handle_dirty_super(handle, sb);
}
}
raw_inode->i_generation = cpu_to_le32(inode->i_generation);
@@ -4049,9 +4221,10 @@ static int ext4_do_update_inode(handle_t *handle,
cpu_to_le32(new_encode_dev(inode->i_rdev));
raw_inode->i_block[2] = 0;
}
- } else
+ } else if (!ext4_has_inline_data(inode)) {
for (block = 0; block < EXT4_N_BLOCKS; block++)
raw_inode->i_block[block] = ei->i_data[block];
+ }
raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
if (ei->i_extra_isize) {
@@ -4069,7 +4242,7 @@ static int ext4_do_update_inode(handle_t *handle,
err = rc;
ext4_clear_inode_state(inode, EXT4_STATE_NEW);
- ext4_update_inode_fsync_trans(handle, inode, 0);
+ ext4_update_inode_fsync_trans(handle, inode, need_datasync);
out_brelse:
brelse(bh);
ext4_std_error(inode->i_sb, err);
@@ -4083,7 +4256,7 @@ out_brelse:
*
* - Within generic_file_write() for O_SYNC files.
* Here, there will be no transaction running. We wait for any running
- * trasnaction to commit.
+ * transaction to commit.
*
* - Within sys_sync(), kupdate and such.
* We wait on commit, if tol to.
@@ -4148,6 +4321,47 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
}
/*
+ * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate
+ * buffers that are attached to a page stradding i_size and are undergoing
+ * commit. In that case we have to wait for commit to finish and try again.
+ */
+static void ext4_wait_for_tail_page_commit(struct inode *inode)
+{
+ struct page *page;
+ unsigned offset;
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+ tid_t commit_tid = 0;
+ int ret;
+
+ offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+ /*
+ * All buffers in the last page remain valid? Then there's nothing to
+ * do. We do the check mainly to optimize the common PAGE_CACHE_SIZE ==
+ * blocksize case
+ */
+ if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits))
+ return;
+ while (1) {
+ page = find_lock_page(inode->i_mapping,
+ inode->i_size >> PAGE_CACHE_SHIFT);
+ if (!page)
+ return;
+ ret = __ext4_journalled_invalidatepage(page, offset);
+ unlock_page(page);
+ page_cache_release(page);
+ if (ret != -EBUSY)
+ return;
+ commit_tid = 0;
+ read_lock(&journal->j_state_lock);
+ if (journal->j_committing_transaction)
+ commit_tid = journal->j_committing_transaction->t_tid;
+ read_unlock(&journal->j_state_lock);
+ if (commit_tid)
+ jbd2_log_wait_commit(journal, commit_tid);
+ }
+}
+
+/*
* ext4_setattr()
*
* Called from notify_change.
@@ -4212,7 +4426,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_SIZE) {
- inode_dio_wait(inode);
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -4261,8 +4474,29 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_SIZE) {
- if (attr->ia_size != i_size_read(inode))
- truncate_setsize(inode, attr->ia_size);
+ if (attr->ia_size != inode->i_size) {
+ loff_t oldsize = inode->i_size;
+
+ i_size_write(inode, attr->ia_size);
+ /*
+ * Blocks are going to be removed from the inode. Wait
+ * for dio in flight. Temporarily disable
+ * dioread_nolock to prevent livelock.
+ */
+ if (orphan) {
+ if (!ext4_should_journal_data(inode)) {
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+ ext4_inode_resume_unlocked_dio(inode);
+ } else
+ ext4_wait_for_tail_page_commit(inode);
+ }
+ /*
+ * Truncate pagecache after we've waited for commit
+ * in data=journal mode to make pages freeable.
+ */
+ truncate_pagecache(inode, oldsize, inode->i_size);
+ }
ext4_truncate(inode);
}
@@ -4327,7 +4561,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
* worse case, the indexs blocks spread over different block groups
*
* If datablocks are discontiguous, they are possible to spread over
- * different block groups too. If they are contiuguous, with flexbg,
+ * different block groups too. If they are contiguous, with flexbg,
* they could still across block group boundary.
*
* Also account for superblock, inode, quota and xattr blocks
@@ -4503,14 +4737,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
* inode out, but prune_icache isn't a user-visible syncing function.
* Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
* we start and wait on commits.
- *
- * Is this efficient/effective? Well, we're being nice to the system
- * by cleaning up our inodes proactively so they can be reaped
- * without I/O. But we are potentially leaving up to five seconds'
- * worth of inodes floating about which prune_icache wants us to
- * write out. One way to fix that would be to get prune_icache()
- * to do a write_super() to free up some memory. It has the desired
- * effect.
*/
int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
{
@@ -4649,6 +4875,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return err;
}
+ /* Wait for all existing dio workers */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
jbd2_journal_lock_updates(journal);
/*
@@ -4668,6 +4898,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
+ ext4_inode_resume_unlocked_dio(inode);
/* Finally we can mark the inode as dirty. */
@@ -4701,11 +4932,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
get_block_t *get_block;
int retries = 0;
- /*
- * This check is racy but catches the common case. We rely on
- * __block_page_mkwrite() to do a reliable check.
- */
- vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
/* Delalloc case is easy... */
if (test_opt(inode->i_sb, DELALLOC) &&
!ext4_should_journal_data(inode) &&
@@ -4736,8 +4964,9 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
* journal_start/journal_stop which can block and take a long time
*/
if (page_has_buffers(page)) {
- if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
- ext4_bh_unmapped)) {
+ if (!ext4_walk_page_buffers(NULL, page_buffers(page),
+ 0, len, NULL,
+ ext4_bh_unmapped)) {
/* Wait so that we don't change page under IO */
wait_on_page_writeback(page);
ret = VM_FAULT_LOCKED;
@@ -4758,7 +4987,7 @@ retry_alloc:
}
ret = __block_page_mkwrite(vma, vmf, get_block);
if (!ret && ext4_should_journal_data(inode)) {
- if (walk_page_buffers(handle, page_buffers(page), 0,
+ if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
unlock_page(page);
ret = VM_FAULT_SIGBUS;
@@ -4773,5 +5002,6 @@ retry_alloc:
out_ret:
ret = block_page_mkwrite_return(ret);
out:
+ sb_end_pagefault(inode->i_sb);
return ret;
}