diff options
author | Patrick McHardy <kaber@trash.net> | 2013-03-31 18:10:34 +0200 |
---|---|---|
committer | Patrick McHardy <kaber@trash.net> | 2013-03-31 18:10:34 +0200 |
commit | 70711d223510ba1773cfe1d7770a56141c815ff8 (patch) | |
tree | 4a71f38a3a554ddecaa31b7d8c6bc49b7d1705b4 /fs/xfs | |
parent | d53b4ed072d9779cdf53582c46436dec06d0961f (diff) | |
parent | 19f949f52599ba7c3f67a5897ac6be14bfcb1200 (diff) |
Merge tag 'v3.8' of /home/kaber/src/repos/linux
Linux 3.8
Signed-off-by: Patrick McHardy <kaber@trash.net>
Conflicts:
include/linux/Kbuild
include/linux/netlink.h
Diffstat (limited to 'fs/xfs')
86 files changed, 6974 insertions, 4842 deletions
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 6100ec0fa1d..5a7ffe54f5d 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -2,6 +2,7 @@ config XFS_FS tristate "XFS filesystem support" depends on BLOCK select EXPORTFS + select LIBCRC32C help XFS is a high performance journaling filesystem which originated on the SGI IRIX platform. It is completely multi-threaded, can diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index d2bf974b1a2..d02201df855 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -37,9 +37,8 @@ xfs-y += xfs_aops.o \ xfs_file.o \ xfs_filestream.o \ xfs_fsops.o \ - xfs_fs_subr.o \ xfs_globals.o \ - xfs_iget.o \ + xfs_icache.o \ xfs_ioctl.o \ xfs_iomap.o \ xfs_iops.o \ @@ -47,7 +46,6 @@ xfs-y += xfs_aops.o \ xfs_message.o \ xfs_mru_cache.o \ xfs_super.o \ - xfs_sync.o \ xfs_xattr.o \ xfs_rename.o \ xfs_utils.o \ diff --git a/fs/xfs/uuid.h b/fs/xfs/uuid.h index 4732d71262c..104db0f3bed 100644 --- a/fs/xfs/uuid.h +++ b/fs/xfs/uuid.h @@ -26,4 +26,10 @@ extern int uuid_is_nil(uuid_t *uuid); extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); +static inline void +uuid_copy(uuid_t *dst, uuid_t *src) +{ + memcpy(dst, src, sizeof(uuid_t)); +} + #endif /* __XFS_SUPPORT_UUID_H__ */ diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index ac702a6eab9..1d32f1d5276 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -337,7 +337,7 @@ xfs_xattr_acl_get(struct dentry *dentry, const char *name, if (acl == NULL) return -ENODATA; - error = posix_acl_to_xattr(acl, value, size); + error = posix_acl_to_xattr(&init_user_ns, acl, value, size); posix_acl_release(acl); return error; @@ -361,7 +361,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, if (!value) goto set_acl; - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (!acl) { /* * acl_set_file(3) may request that we set default ACLs with diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 44d65c1533c..f2aeedb6a57 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -108,6 +108,8 @@ typedef struct xfs_agf { extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); +extern const struct xfs_buf_ops xfs_agf_buf_ops; + /* * Size of the unlinked inode hash table in the agi. */ @@ -161,6 +163,8 @@ typedef struct xfs_agi { extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); +extern const struct xfs_buf_ops xfs_agi_buf_ops; + /* * The third a.g. block contains the a.g. freelist, an array * of block pointers to blocks owned by the allocation btree code. @@ -233,6 +237,7 @@ typedef struct xfs_perag { #define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup in xfs_inode_ag_iterator */ #define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ +#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ #define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) #define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 4f33c32affe..393055fe3ae 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -430,6 +430,60 @@ xfs_alloc_fixup_trees( return 0; } +static void +xfs_agfl_verify( + struct xfs_buf *bp) +{ +#ifdef WHEN_CRCS_COME_ALONG + /* + * we cannot actually do any verification of the AGFL because mkfs does + * not initialise the AGFL to zero or NULL. Hence the only valid part of + * the AGFL is what the AGF says is active. We can't get to the AGF, so + * we can't verify just those entries are valid. + * + * This problem goes away when the CRC format change comes along as that + * requires the AGFL to be initialised by mkfs. At that point, we can + * verify the blocks in the agfl -active or not- lie within the bounds + * of the AG. Until then, just leave this check ifdef'd out. + */ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); + int agfl_ok = 1; + + int i; + + for (i = 0; i < XFS_AGFL_SIZE(mp); i++) { + if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK || + be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) + agfl_ok = 0; + } + + if (!agfl_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +#endif +} + +static void +xfs_agfl_write_verify( + struct xfs_buf *bp) +{ + xfs_agfl_verify(bp); +} + +static void +xfs_agfl_read_verify( + struct xfs_buf *bp) +{ + xfs_agfl_verify(bp); +} + +const struct xfs_buf_ops xfs_agfl_buf_ops = { + .verify_read = xfs_agfl_read_verify, + .verify_write = xfs_agfl_write_verify, +}; + /* * Read in the allocation group free block array. */ @@ -447,7 +501,7 @@ xfs_alloc_read_agfl( error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp); + XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); @@ -1866,6 +1920,7 @@ xfs_alloc_fix_freelist( /* * Initialize the args structure. */ + memset(&targs, 0, sizeof(targs)); targs.tp = tp; targs.mp = mp; targs.agbp = agbp; @@ -2090,6 +2145,63 @@ xfs_alloc_put_freelist( return 0; } +static void +xfs_agf_verify( + struct xfs_buf *bp) + { + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agf *agf; + int agf_ok; + + agf = XFS_BUF_TO_AGF(bp); + + agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && + XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && + be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && + be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp); + + /* + * during growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. + */ + if (bp->b_pag) + agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) == + bp->b_pag->pag_agno; + + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <= + be32_to_cpu(agf->agf_length); + + if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, + XFS_RANDOM_ALLOC_READ_AGF))) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +xfs_agf_read_verify( + struct xfs_buf *bp) +{ + xfs_agf_verify(bp); +} + +static void +xfs_agf_write_verify( + struct xfs_buf *bp) +{ + xfs_agf_verify(bp); +} + +const struct xfs_buf_ops xfs_agf_buf_ops = { + .verify_read = xfs_agf_read_verify, + .verify_write = xfs_agf_write_verify, +}; + /* * Read in the allocation group header (free/alloc section). */ @@ -2101,44 +2213,19 @@ xfs_read_agf( int flags, /* XFS_BUF_ */ struct xfs_buf **bpp) /* buffer for the ag freelist header */ { - struct xfs_agf *agf; /* ag freelist header */ - int agf_ok; /* set if agf is consistent */ int error; ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), flags, bpp); + XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops); if (error) return error; if (!*bpp) return 0; ASSERT(!(*bpp)->b_error); - agf = XFS_BUF_TO_AGF(*bpp); - - /* - * Validate the magic number of the agf block. - */ - agf_ok = - agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && - XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && - be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && - be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_seqno) == agno; - if (xfs_sb_version_haslazysbcount(&mp->m_sb)) - agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <= - be32_to_cpu(agf->agf_length); - if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, - XFS_RANDOM_ALLOC_READ_AGF))) { - XFS_CORRUPTION_ERROR("xfs_alloc_read_agf", - XFS_ERRLEVEL_LOW, mp, agf); - xfs_trans_brelse(tp, *bpp); - return XFS_ERROR(EFSCORRUPTED); - } xfs_buf_set_ref(*bpp, XFS_AGF_REF); return 0; } @@ -2207,7 +2294,7 @@ xfs_alloc_read_agf( * group or loop over the allocation groups to find the result. */ int /* error */ -__xfs_alloc_vextent( +xfs_alloc_vextent( xfs_alloc_arg_t *args) /* allocation argument structure */ { xfs_agblock_t agsize; /* allocation group size */ @@ -2417,46 +2504,6 @@ error0: return error; } -static void -xfs_alloc_vextent_worker( - struct work_struct *work) -{ - struct xfs_alloc_arg *args = container_of(work, - struct xfs_alloc_arg, work); - unsigned long pflags; - - /* we are in a transaction context here */ - current_set_flags_nested(&pflags, PF_FSTRANS); - - args->result = __xfs_alloc_vextent(args); - complete(args->done); - - current_restore_flags_nested(&pflags, PF_FSTRANS); -} - -/* - * Data allocation requests often come in with little stack to work on. Push - * them off to a worker thread so there is lots of stack to use. Metadata - * requests, OTOH, are generally from low stack usage paths, so avoid the - * context switch overhead here. - */ -int -xfs_alloc_vextent( - struct xfs_alloc_arg *args) -{ - DECLARE_COMPLETION_ONSTACK(done); - - if (!args->userdata) - return __xfs_alloc_vextent(args); - - - args->done = &done; - INIT_WORK_ONSTACK(&args->work, xfs_alloc_vextent_worker); - queue_work(xfs_alloc_wq, &args->work); - wait_for_completion(&done); - return args->result; -} - /* * Free an extent. * Just break up the extent address and hand off to xfs_free_ag_extent diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 93be4a667ca..99d0a610155 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -120,9 +120,6 @@ typedef struct xfs_alloc_arg { char isfl; /* set if is freelist blocks - !acctg */ char userdata; /* set if this is user data */ xfs_fsblock_t firstblock; /* io first block allocated */ - struct completion *done; - struct work_struct work; - int result; } xfs_alloc_arg_t; /* @@ -234,4 +231,7 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat); /* output: success/failure */ +extern const struct xfs_buf_ops xfs_agf_buf_ops; +extern const struct xfs_buf_ops xfs_agfl_buf_ops; + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index f1647caace8..b1ddef6b268 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -121,6 +121,8 @@ xfs_allocbt_free_block( xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_trans_agbtree_delta(cur->bc_tp, -1); + + xfs_trans_binval(cur->bc_tp, bp); return 0; } @@ -270,6 +272,82 @@ xfs_allocbt_key_diff( return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } +static void +xfs_allocbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; + unsigned int level; + int sblock_ok; /* block passes checks */ + + /* + * magic number and level verification + * + * During growfs operations, we can't verify the exact level as the + * perag is not fully initialised and hence not attached to the buffer. + * In this case, check against the maximum tree depth. + */ + level = be16_to_cpu(block->bb_level); + switch (block->bb_magic) { + case cpu_to_be32(XFS_ABTB_MAGIC): + if (pag) + sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi]; + else + sblock_ok = level < mp->m_ag_maxlevels; + break; + case cpu_to_be32(XFS_ABTC_MAGIC): + if (pag) + sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi]; + else + sblock_ok = level < mp->m_ag_maxlevels; + break; + default: + sblock_ok = 0; + break; + } + + /* numrecs verification */ + sblock_ok = sblock_ok && + be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0]; + + /* sibling pointer verification */ + sblock_ok = sblock_ok && + (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) && + block->bb_u.s.bb_leftsib && + (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) && + block->bb_u.s.bb_rightsib; + + if (!sblock_ok) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +xfs_allocbt_read_verify( + struct xfs_buf *bp) +{ + xfs_allocbt_verify(bp); +} + +static void +xfs_allocbt_write_verify( + struct xfs_buf *bp) +{ + xfs_allocbt_verify(bp); +} + +const struct xfs_buf_ops xfs_allocbt_buf_ops = { + .verify_read = xfs_allocbt_read_verify, + .verify_write = xfs_allocbt_write_verify, +}; + + #ifdef DEBUG STATIC int xfs_allocbt_keys_inorder( @@ -325,6 +403,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_allocbt_key_diff, + .buf_ops = &xfs_allocbt_buf_ops, #ifdef DEBUG .keys_inorder = xfs_allocbt_keys_inorder, .recs_inorder = xfs_allocbt_recs_inorder, diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h index a6caa0022c9..7e89a2b429d 100644 --- a/fs/xfs/xfs_alloc_btree.h +++ b/fs/xfs/xfs_alloc_btree.h @@ -51,20 +51,6 @@ typedef struct xfs_alloc_rec_incore { typedef __be32 xfs_alloc_ptr_t; /* - * Minimum and maximum blocksize and sectorsize. - * The blocksize upper limit is pretty much arbitrary. - * The sectorsize upper limit is due to sizeof(sb_sectsize). - */ -#define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */ -#define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */ -#define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG) -#define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG) -#define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */ -#define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */ -#define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG) -#define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG) - -/* * Block numbers in the AG: * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3. */ @@ -107,4 +93,6 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, xfs_agnumber_t, xfs_btnum_t); extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); +extern const struct xfs_buf_ops xfs_allocbt_buf_ops; + #endif /* __XFS_ALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 8dad722c004..5f707e53717 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -86,11 +86,11 @@ xfs_destroy_ioend( } if (ioend->io_iocb) { + inode_dio_done(ioend->io_inode); if (ioend->io_isasync) { aio_complete(ioend->io_iocb, ioend->io_error ? ioend->io_error : ioend->io_result, 0); } - inode_dio_done(ioend->io_inode); } mempool_free(ioend, xfs_ioend_pool); @@ -124,6 +124,12 @@ xfs_setfilesize_trans_alloc( ioend->io_append_trans = tp; /* + * We may pass freeze protection with a transaction. So tell lockdep + * we released it. + */ + rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 1, _THIS_IP_); + /* * We hand off the transaction to the completion thread now, so * clear the flag here. */ @@ -143,11 +149,13 @@ xfs_setfilesize( xfs_fsize_t isize; /* - * The transaction was allocated in the I/O submission thread, - * thus we need to mark ourselves as beeing in a transaction - * manually. + * The transaction may have been allocated in the I/O submission thread, + * thus we need to mark ourselves as beeing in a transaction manually. + * Similarly for freeze protection. */ current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); xfs_ilock(ip, XFS_ILOCK_EXCL); isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size); @@ -179,9 +187,10 @@ xfs_finish_ioend( if (atomic_dec_and_test(&ioend->io_remaining)) { struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; - if (ioend->io_type == IO_UNWRITTEN) + if (ioend->io_type == XFS_IO_UNWRITTEN) queue_work(mp->m_unwritten_workqueue, &ioend->io_work); - else if (ioend->io_append_trans) + else if (ioend->io_append_trans || + (ioend->io_isdirect && xfs_ioend_is_append(ioend))) queue_work(mp->m_data_workqueue, &ioend->io_work); else xfs_destroy_ioend(ioend); @@ -210,36 +219,32 @@ xfs_end_io( * For unwritten extents we need to issue transactions to convert a * range to normal written extens after the data I/O has finished. */ - if (ioend->io_type == IO_UNWRITTEN) { + if (ioend->io_type == XFS_IO_UNWRITTEN) { + error = xfs_iomap_write_unwritten(ip, ioend->io_offset, + ioend->io_size); + } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) { /* - * For buffered I/O we never preallocate a transaction when - * doing the unwritten extent conversion, but for direct I/O - * we do not know if we are converting an unwritten extent - * or not at the point where we preallocate the transaction. + * For direct I/O we do not know if we need to allocate blocks + * or not so we can't preallocate an append transaction as that + * results in nested reservations and log space deadlocks. Hence + * allocate the transaction here. While this is sub-optimal and + * can block IO completion for some time, we're stuck with doing + * it this way until we can pass the ioend to the direct IO + * allocation callbacks and avoid nesting that way. */ - if (ioend->io_append_trans) { - ASSERT(ioend->io_isdirect); - - current_set_flags_nested( - &ioend->io_append_trans->t_pflags, PF_FSTRANS); - xfs_trans_cancel(ioend->io_append_trans, 0); - } - - error = xfs_iomap_write_unwritten(ip, ioend->io_offset, - ioend->io_size); - if (error) { - ioend->io_error = -error; + error = xfs_setfilesize_trans_alloc(ioend); + if (error) goto done; - } + error = xfs_setfilesize(ioend); } else if (ioend->io_append_trans) { error = xfs_setfilesize(ioend); - if (error) - ioend->io_error = -error; } else { ASSERT(!xfs_ioend_is_append(ioend)); } done: + if (error) + ioend->io_error = -error; xfs_destroy_ioend(ioend); } @@ -312,7 +317,7 @@ xfs_map_blocks( if (XFS_FORCED_SHUTDOWN(mp)) return -XFS_ERROR(EIO); - if (type == IO_UNWRITTEN) + if (type == XFS_IO_UNWRITTEN) bmapi_flags |= XFS_BMAPI_IGSTATE; if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { @@ -323,10 +328,10 @@ xfs_map_blocks( ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || (ip->i_df.if_flags & XFS_IFEXTENTS)); - ASSERT(offset <= mp->m_maxioffset); + ASSERT(offset <= mp->m_super->s_maxbytes); - if (offset + count > mp->m_maxioffset) - count = mp->m_maxioffset - offset; + if (offset + count > mp->m_super->s_maxbytes) + count = mp->m_super->s_maxbytes - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); offset_fsb = XFS_B_TO_FSBT(mp, offset); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, @@ -336,7 +341,7 @@ xfs_map_blocks( if (error) return -XFS_ERROR(error); - if (type == IO_DELALLOC && + if (type == XFS_IO_DELALLOC && (!nimaps || isnullstartblock(imap->br_startblock))) { error = xfs_iomap_write_allocate(ip, offset, count, imap); if (!error) @@ -345,7 +350,7 @@ xfs_map_blocks( } #ifdef DEBUG - if (type == IO_UNWRITTEN) { + if (type == XFS_IO_UNWRITTEN) { ASSERT(nimaps); ASSERT(imap->br_startblock != HOLESTARTBLOCK); ASSERT(imap->br_startblock != DELAYSTARTBLOCK); @@ -466,11 +471,17 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) * * The fix is two passes across the ioend list - one to start writeback on the * buffer_heads, and then submit them for I/O on the second pass. + * + * If @fail is non-zero, it means that we have a situation where some part of + * the submission process has failed after we have marked paged for writeback + * and unlocked them. In this situation, we need to fail the ioend chain rather + * than submit it to IO. This typically only happens on a filesystem shutdown. */ STATIC void xfs_submit_ioend( struct writeback_control *wbc, - xfs_ioend_t *ioend) + xfs_ioend_t *ioend, + int fail) { xfs_ioend_t *head = ioend; xfs_ioend_t *next; @@ -491,6 +502,18 @@ xfs_submit_ioend( next = ioend->io_list; bio = NULL; + /* + * If we are failing the IO now, just mark the ioend with an + * error and finish it. This will run IO completion immediately + * as there is only one reference to the ioend at this point in + * time. + */ + if (fail) { + ioend->io_error = -fail; + xfs_finish_ioend(ioend); + continue; + } + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { if (!bio) { @@ -634,11 +657,11 @@ xfs_check_page_type( bh = head = page_buffers(page); do { if (buffer_unwritten(bh)) - acceptable += (type == IO_UNWRITTEN); + acceptable += (type == XFS_IO_UNWRITTEN); else if (buffer_delay(bh)) - acceptable += (type == IO_DELALLOC); + acceptable += (type == XFS_IO_DELALLOC); else if (buffer_dirty(bh) && buffer_mapped(bh)) - acceptable += (type == IO_OVERWRITE); + acceptable += (type == XFS_IO_OVERWRITE); else break; } while ((bh = bh->b_this_page) != head); @@ -721,11 +744,11 @@ xfs_convert_page( if (buffer_unwritten(bh) || buffer_delay(bh) || buffer_mapped(bh)) { if (buffer_unwritten(bh)) - type = IO_UNWRITTEN; + type = XFS_IO_UNWRITTEN; else if (buffer_delay(bh)) - type = IO_DELALLOC; + type = XFS_IO_DELALLOC; else - type = IO_OVERWRITE; + type = XFS_IO_OVERWRITE; if (!xfs_imap_valid(inode, imap, offset)) { done = 1; @@ -733,7 +756,7 @@ xfs_convert_page( } lock_buffer(bh); - if (type != IO_OVERWRITE) + if (type != XFS_IO_OVERWRITE) xfs_map_at_offset(inode, bh, imap, offset); xfs_add_to_ioend(inode, bh, offset, type, ioendp, done); @@ -831,7 +854,7 @@ xfs_aops_discard_page( struct buffer_head *bh, *head; loff_t offset = page_offset(page); - if (!xfs_check_page_type(page, IO_DELALLOC)) + if (!xfs_check_page_type(page, XFS_IO_DELALLOC)) goto out_invalidate; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) @@ -927,11 +950,26 @@ xfs_vm_writepage( end_index = offset >> PAGE_CACHE_SHIFT; last_index = (offset - 1) >> PAGE_CACHE_SHIFT; if (page->index >= end_index) { - if ((page->index >= end_index + 1) || - !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) { + unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1); + + /* + * Just skip the page if it is fully outside i_size, e.g. due + * to a truncate operation that is in progress. + */ + if (page->index >= end_index + 1 || offset_into_page == 0) { unlock_page(page); return 0; } + + /* + * The page straddles i_size. It must be zeroed out on each + * and every writepage invocation because it may be mmapped. + * "A file is mapped in multiples of the page size. For a file + * that is not a multiple of the page size, the remaining + * memory is zeroed when mapped, and writes to that region are + * not written out to the file." + */ + zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE); } end_offset = min_t(unsigned long long, @@ -941,7 +979,7 @@ xfs_vm_writepage( bh = head = page_buffers(page); offset = page_offset(page); - type = IO_OVERWRITE; + type = XFS_IO_OVERWRITE; if (wbc->sync_mode == WB_SYNC_NONE) nonblocking = 1; @@ -966,18 +1004,18 @@ xfs_vm_writepage( } if (buffer_unwritten(bh)) { - if (type != IO_UNWRITTEN) { - type = IO_UNWRITTEN; + if (type != XFS_IO_UNWRITTEN) { + type = XFS_IO_UNWRITTEN; imap_valid = 0; } } else if (buffer_delay(bh)) { - if (type != IO_DELALLOC) { - type = IO_DELALLOC; + if (type != XFS_IO_DELALLOC) { + type = XFS_IO_DELALLOC; imap_valid = 0; } } else if (buffer_uptodate(bh)) { - if (type != IO_OVERWRITE) { - type = IO_OVERWRITE; + if (type != XFS_IO_OVERWRITE) { + type = XFS_IO_OVERWRITE; imap_valid = 0; } } else { @@ -1013,7 +1051,7 @@ xfs_vm_writepage( } if (imap_valid) { lock_buffer(bh); - if (type != IO_OVERWRITE) + if (type != XFS_IO_OVERWRITE) xfs_map_at_offset(inode, bh, &imap, offset); xfs_add_to_ioend(inode, bh, offset, type, &ioend, new_ioend); @@ -1030,7 +1068,18 @@ xfs_vm_writepage( xfs_start_page_writeback(page, 1, count); - if (ioend && imap_valid) { + /* if there is no IO to be submitted for this page, we are done */ + if (!ioend) + return 0; + + ASSERT(iohead); + + /* + * Any errors from this point onwards need tobe reported through the IO + * completion path as we have marked the initial page as under writeback + * and unlocked it. + */ + if (imap_valid) { xfs_off_t end_index; end_index = imap.br_startoff + imap.br_blockcount; @@ -1049,20 +1098,15 @@ xfs_vm_writepage( wbc, end_index); } - if (iohead) { - /* - * Reserve log space if we might write beyond the on-disk - * inode size. - */ - if (ioend->io_type != IO_UNWRITTEN && - xfs_ioend_is_append(ioend)) { - err = xfs_setfilesize_trans_alloc(ioend); - if (err) - goto error; - } - xfs_submit_ioend(wbc, iohead); - } + /* + * Reserve log space if we might write beyond the on-disk inode size. + */ + err = 0; + if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) + err = xfs_setfilesize_trans_alloc(ioend); + + xfs_submit_ioend(wbc, iohead, err); return 0; @@ -1162,9 +1206,9 @@ __xfs_get_blocks( lockmode = xfs_ilock_map_shared(ip); } - ASSERT(offset <= mp->m_maxioffset); - if (offset + size > mp->m_maxioffset) - size = mp->m_maxioffset - offset; + ASSERT(offset <= mp->m_super->s_maxbytes); + if (offset + size > mp->m_super->s_maxbytes) + size = mp->m_super->s_maxbytes - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); @@ -1351,7 +1395,7 @@ xfs_end_io_direct_write( ioend->io_iocb = iocb; ioend->io_result = ret; if (private && size > 0) - ioend->io_type = IO_UNWRITTEN; + ioend->io_type = XFS_IO_UNWRITTEN; if (is_async) { ioend->io_isasync = 1; @@ -1378,25 +1422,21 @@ xfs_vm_direct_IO( size_t size = iov_length(iov, nr_segs); /* - * We need to preallocate a transaction for a size update - * here. In the case that this write both updates the size - * and converts at least on unwritten extent we will cancel - * the still clean transaction after the I/O has finished. + * We cannot preallocate a size update transaction here as we + * don't know whether allocation is necessary or not. Hence we + * can only tell IO completion that one is necessary if we are + * not doing unwritten extent conversion. */ - iocb->private = ioend = xfs_alloc_ioend(inode, IO_DIRECT); - if (offset + size > XFS_I(inode)->i_d.di_size) { - ret = xfs_setfilesize_trans_alloc(ioend); - if (ret) - goto out_destroy_ioend; + iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT); + if (offset + size > XFS_I(inode)->i_d.di_size) ioend->io_isdirect = 1; - } ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, xfs_get_blocks_direct, xfs_end_io_direct_write, NULL, 0); if (ret != -EIOCBQUEUED && iocb->private) - goto out_trans_cancel; + goto out_destroy_ioend; } else { ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, @@ -1406,12 +1446,6 @@ xfs_vm_direct_IO( return ret; -out_trans_cancel: - if (ioend->io_append_trans) { - current_set_flags_nested(&ioend->io_append_trans->t_pflags, - PF_FSTRANS); - xfs_trans_cancel(ioend->io_append_trans, 0); - } out_destroy_ioend: xfs_destroy_ioend(ioend); return ret; @@ -1584,7 +1618,7 @@ xfs_vm_bmap( trace_xfs_vm_bmap(XFS_I(inode)); xfs_ilock(ip, XFS_IOLOCK_SHARED); - xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); + filemap_write_and_wait(mapping); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return generic_block_bmap(mapping, block, xfs_get_blocks); } diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 84eafbcb0d9..c325abb8d61 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -24,17 +24,17 @@ extern mempool_t *xfs_ioend_pool; * Types of I/O for bmap clustering and I/O completion tracking. */ enum { - IO_DIRECT = 0, /* special case for direct I/O ioends */ - IO_DELALLOC, /* mapping covers delalloc region */ - IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */ - IO_OVERWRITE, /* mapping covers already allocated extent */ + XFS_IO_DIRECT = 0, /* special case for direct I/O ioends */ + XFS_IO_DELALLOC, /* covers delalloc region */ + XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ + XFS_IO_OVERWRITE, /* covers already allocated extent */ }; #define XFS_IO_TYPES \ { 0, "" }, \ - { IO_DELALLOC, "delalloc" }, \ - { IO_UNWRITTEN, "unwritten" }, \ - { IO_OVERWRITE, "overwrite" } + { XFS_IO_DELALLOC, "delalloc" }, \ + { XFS_IO_UNWRITTEN, "unwritten" }, \ + { XFS_IO_OVERWRITE, "overwrite" } /* * xfs_ioend struct manages large extent writes for XFS. diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index a17ff01b5ad..aaf472532b3 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -893,7 +893,7 @@ STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args) { xfs_inode_t *dp; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int retval, error, committed, forkoff; trace_xfs_attr_leaf_addname(args); @@ -903,11 +903,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ dp = args->dp; args->blkno = 0; - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) - return(error); - ASSERT(bp != NULL); + return error; /* * Look up the given attribute in the leaf block. Figure out if @@ -915,11 +913,11 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ retval = xfs_attr_leaf_lookup_int(bp, args); if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { - xfs_da_brelse(args->trans, bp); + xfs_trans_brelse(args->trans, bp); return(retval); } else if (retval == EEXIST) { if (args->flags & ATTR_CREATE) { /* pure create op */ - xfs_da_brelse(args->trans, bp); + xfs_trans_brelse(args->trans, bp); return(retval); } @@ -937,7 +935,6 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * if required. */ retval = xfs_attr_leaf_add(bp, args); - xfs_da_buf_done(bp); if (retval == ENOSPC) { /* * Promote the attribute list to the Btree format, then @@ -1032,12 +1029,12 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Read in the block containing the "old" attr, then * remove the "old" attr from that block (neat, huh!) */ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, - &bp, XFS_ATTR_FORK); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, + -1, &bp); if (error) - return(error); - ASSERT(bp != NULL); - (void)xfs_attr_leaf_remove(bp, args); + return error; + + xfs_attr_leaf_remove(bp, args); /* * If the result is small enough, shrink it all into the inode. @@ -1065,8 +1062,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ if (committed) xfs_trans_ijoin(args->trans, dp, 0); - } else - xfs_da_buf_done(bp); + } /* * Commit the remove and start the next trans in series. @@ -1092,7 +1088,7 @@ STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args) { xfs_inode_t *dp; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error, committed, forkoff; trace_xfs_attr_leaf_removename(args); @@ -1102,20 +1098,17 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) */ dp = args->dp; args->blkno = 0; - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); - if (error) { - return(error); - } + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) + return error; - ASSERT(bp != NULL); error = xfs_attr_leaf_lookup_int(bp, args); if (error == ENOATTR) { - xfs_da_brelse(args->trans, bp); + xfs_trans_brelse(args->trans, bp); return(error); } - (void)xfs_attr_leaf_remove(bp, args); + xfs_attr_leaf_remove(bp, args); /* * If the result is small enough, shrink it all into the inode. @@ -1141,8 +1134,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) */ if (committed) xfs_trans_ijoin(args->trans, dp, 0); - } else - xfs_da_buf_done(bp); + } return(0); } @@ -1155,23 +1147,23 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) STATIC int xfs_attr_leaf_get(xfs_da_args_t *args) { - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; + trace_xfs_attr_leaf_get(args); + args->blkno = 0; - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) - return(error); - ASSERT(bp != NULL); + return error; error = xfs_attr_leaf_lookup_int(bp, args); if (error != EEXIST) { - xfs_da_brelse(args->trans, bp); + xfs_trans_brelse(args->trans, bp); return(error); } error = xfs_attr_leaf_getvalue(bp, args); - xfs_da_brelse(args->trans, bp); + xfs_trans_brelse(args->trans, bp); if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) { error = xfs_attr_rmtval_get(args); } @@ -1184,25 +1176,18 @@ xfs_attr_leaf_get(xfs_da_args_t *args) STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context) { - xfs_attr_leafblock_t *leaf; int error; - xfs_dabuf_t *bp; + struct xfs_buf *bp; + + trace_xfs_attr_leaf_list(context); context->cursor->blkno = 0; - error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK); + error = xfs_attr_leaf_read(NULL, context->dp, 0, -1, &bp); if (error) return XFS_ERROR(error); - ASSERT(bp != NULL); - leaf = bp->data; - if (unlikely(leaf->hdr.info.magic != cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) { - XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW, - context->dp->i_mount, leaf); - xfs_da_brelse(NULL, bp); - return XFS_ERROR(EFSCORRUPTED); - } error = xfs_attr_leaf_list_int(bp, context); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return XFS_ERROR(error); } @@ -1489,7 +1474,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) xfs_da_state_t *state; xfs_da_state_blk_t *blk; xfs_inode_t *dp; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int retval, error, committed, forkoff; trace_xfs_attr_node_removename(args); @@ -1601,15 +1586,11 @@ xfs_attr_node_removename(xfs_da_args_t *args) */ ASSERT(state->path.active == 1); ASSERT(state->path.blk[0].bp); - xfs_da_buf_done(state->path.blk[0].bp); state->path.blk[0].bp = NULL; - error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, - XFS_ATTR_FORK); + error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp); if (error) goto out; - ASSERT((((xfs_attr_leafblock_t *)bp->data)->hdr.info.magic) == - cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { xfs_bmap_init(args->flist, args->firstblock); @@ -1635,7 +1616,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) if (committed) xfs_trans_ijoin(args->trans, dp, 0); } else - xfs_da_brelse(args->trans, bp); + xfs_trans_brelse(args->trans, bp); } error = 0; @@ -1657,6 +1638,8 @@ xfs_attr_fillstate(xfs_da_state_t *state) xfs_da_state_blk_t *blk; int level; + trace_xfs_attr_fillstate(state->args); + /* * Roll down the "path" in the state structure, storing the on-disk * block number for those buffers in the "path". @@ -1665,8 +1648,7 @@ xfs_attr_fillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->bp) { - blk->disk_blkno = xfs_da_blkno(blk->bp); - xfs_da_buf_done(blk->bp); + blk->disk_blkno = XFS_BUF_ADDR(blk->bp); blk->bp = NULL; } else { blk->disk_blkno = 0; @@ -1681,8 +1663,7 @@ xfs_attr_fillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->bp) { - blk->disk_blkno = xfs_da_blkno(blk->bp); - xfs_da_buf_done(blk->bp); + blk->disk_blkno = XFS_BUF_ADDR(blk->bp); blk->bp = NULL; } else { blk->disk_blkno = 0; @@ -1705,6 +1686,8 @@ xfs_attr_refillstate(xfs_da_state_t *state) xfs_da_state_blk_t *blk; int level, error; + trace_xfs_attr_refillstate(state->args); + /* * Roll down the "path" in the state structure, storing the on-disk * block number for those buffers in the "path". @@ -1713,7 +1696,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da_read_buf(state->args->trans, + error = xfs_da_node_read(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, &blk->bp, XFS_ATTR_FORK); @@ -1732,7 +1715,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da_read_buf(state->args->trans, + error = xfs_da_node_read(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, &blk->bp, XFS_ATTR_FORK); @@ -1761,6 +1744,8 @@ xfs_attr_node_get(xfs_da_args_t *args) int error, retval; int i; + trace_xfs_attr_node_get(args); + state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; @@ -1792,7 +1777,7 @@ xfs_attr_node_get(xfs_da_args_t *args) * If not in a transaction, we have to release all the buffers. */ for (i = 0; i < state->path.active; i++) { - xfs_da_brelse(args->trans, state->path.blk[i].bp); + xfs_trans_brelse(args->trans, state->path.blk[i].bp); state->path.blk[i].bp = NULL; } @@ -1808,7 +1793,9 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) xfs_da_intnode_t *node; xfs_da_node_entry_t *btree; int error, i; - xfs_dabuf_t *bp; + struct xfs_buf *bp; + + trace_xfs_attr_node_list(context); cursor = context->cursor; cursor->initted = 1; @@ -1820,35 +1807,35 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ bp = NULL; if (cursor->blkno > 0) { - error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, + error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1, &bp, XFS_ATTR_FORK); if ((error != 0) && (error != EFSCORRUPTED)) return(error); if (bp) { - node = bp->data; + node = bp->b_addr; switch (be16_to_cpu(node->hdr.info.magic)) { case XFS_DA_NODE_MAGIC: trace_xfs_attr_list_wrong_blk(context); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); bp = NULL; break; case XFS_ATTR_LEAF_MAGIC: - leaf = bp->data; + leaf = bp->b_addr; if (cursor->hashval > be32_to_cpu(leaf->entries[ be16_to_cpu(leaf->hdr.count)-1].hashval)) { trace_xfs_attr_list_wrong_blk(context); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); bp = NULL; } else if (cursor->hashval <= be32_to_cpu(leaf->entries[0].hashval)) { trace_xfs_attr_list_wrong_blk(context); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); bp = NULL; } break; default: trace_xfs_attr_list_wrong_blk(context); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); bp = NULL; } } @@ -1862,18 +1849,12 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) if (bp == NULL) { cursor->blkno = 0; for (;;) { - error = xfs_da_read_buf(NULL, context->dp, + error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1, &bp, XFS_ATTR_FORK); if (error) return(error); - if (unlikely(bp == NULL)) { - XFS_ERROR_REPORT("xfs_attr_node_list(2)", - XFS_ERRLEVEL_LOW, - context->dp->i_mount); - return(XFS_ERROR(EFSCORRUPTED)); - } - node = bp->data; + node = bp->b_addr; if (node->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) break; @@ -1883,7 +1864,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) XFS_ERRLEVEL_LOW, context->dp->i_mount, node); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return(XFS_ERROR(EFSCORRUPTED)); } btree = node->btree; @@ -1898,10 +1879,10 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) } } if (i == be16_to_cpu(node->hdr.count)) { - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return(0); } - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); } } ASSERT(bp != NULL); @@ -1912,36 +1893,22 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) * adding the information. */ for (;;) { - leaf = bp->data; - if (unlikely(leaf->hdr.info.magic != - cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) { - XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)", - XFS_ERRLEVEL_LOW, - context->dp->i_mount, leaf); - xfs_da_brelse(NULL, bp); - return(XFS_ERROR(EFSCORRUPTED)); - } + leaf = bp->b_addr; error = xfs_attr_leaf_list_int(bp, context); if (error) { - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return error; } if (context->seen_enough || leaf->hdr.info.forw == 0) break; cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); - xfs_da_brelse(NULL, bp); - error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, - &bp, XFS_ATTR_FORK); + xfs_trans_brelse(NULL, bp); + error = xfs_attr_leaf_read(NULL, context->dp, cursor->blkno, -1, + &bp); if (error) - return(error); - if (unlikely((bp == NULL))) { - XFS_ERROR_REPORT("xfs_attr_node_list(5)", - XFS_ERRLEVEL_LOW, - context->dp->i_mount); - return(XFS_ERROR(EFSCORRUPTED)); - } + return error; } - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return(0); } @@ -1965,6 +1932,8 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) int nmap, error, tmp, valuelen, blkcnt, i; xfs_dablk_t lblkno; + trace_xfs_attr_rmtval_get(args); + ASSERT(!(args->flags & ATTR_KERNOVAL)); mp = args->dp->i_mount; @@ -1986,7 +1955,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, - dblkno, blkcnt, 0, &bp); + dblkno, blkcnt, 0, &bp, NULL); if (error) return(error); @@ -2020,6 +1989,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) xfs_dablk_t lblkno; int blkcnt, valuelen, nmap, error, tmp, committed; + trace_xfs_attr_rmtval_set(args); + dp = args->dp; mp = dp->i_mount; src = args->value; @@ -2149,6 +2120,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) xfs_dablk_t lblkno; int valuelen, blkcnt, nmap, error, done, committed; + trace_xfs_attr_rmtval_remove(args); + mp = args->dp->i_mount; /* diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 7d89d800f51..ee24993c7d1 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -54,10 +54,11 @@ * Routines used for growing the Btree. */ STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block, - xfs_dabuf_t **bpp); -STATIC int xfs_attr_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args, - int freemap_index); -STATIC void xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer); + struct xfs_buf **bpp); +STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer, + xfs_da_args_t *args, int freemap_index); +STATIC void xfs_attr_leaf_compact(struct xfs_da_args *args, + struct xfs_buf *leaf_buffer); STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_state_blk_t *blk2); @@ -71,9 +72,9 @@ STATIC int xfs_attr_leaf_figure_balance(xfs_da_state_t *state, * Routines used for shrinking the Btree. */ STATIC int xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, - xfs_dabuf_t *bp, int level); + struct xfs_buf *bp, int level); STATIC int xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, - xfs_dabuf_t *bp); + struct xfs_buf *bp); STATIC int xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dablk_t blkno, int blkcnt); @@ -87,6 +88,52 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, xfs_mount_t *mp); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); +static void +xfs_attr_leaf_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_attr_leaf_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC); + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +xfs_attr_leaf_read_verify( + struct xfs_buf *bp) +{ + xfs_attr_leaf_verify(bp); +} + +static void +xfs_attr_leaf_write_verify( + struct xfs_buf *bp) +{ + xfs_attr_leaf_verify(bp); +} + +const struct xfs_buf_ops xfs_attr_leaf_buf_ops = { + .verify_read = xfs_attr_leaf_read_verify, + .verify_write = xfs_attr_leaf_write_verify, +}; + +int +xfs_attr_leaf_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops); +} + /*======================================================================== * Namespace helper routines *========================================================================*/ @@ -480,7 +527,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) char *tmpbuffer; int error, i, size; xfs_dablk_t blkno; - xfs_dabuf_t *bp; + struct xfs_buf *bp; xfs_ifork_t *ifp; trace_xfs_attr_sf_to_leaf(args); @@ -550,8 +597,6 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) error = 0; out: - if(bp) - xfs_da_buf_done(bp); kmem_free(tmpbuffer); return(error); } @@ -737,14 +782,16 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) * a shortform attribute list. */ int -xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp) +xfs_attr_shortform_allfit( + struct xfs_buf *bp, + struct xfs_inode *dp) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_entry_t *entry; xfs_attr_leaf_name_local_t *name_loc; int bytes, i; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); entry = &leaf->entries[0]; @@ -774,7 +821,10 @@ xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp) * Convert a leaf attribute list to shortform attribute list */ int -xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) +xfs_attr_leaf_to_shortform( + struct xfs_buf *bp, + xfs_da_args_t *args, + int forkoff) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_entry_t *entry; @@ -791,10 +841,10 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) ASSERT(tmpbuffer != NULL); ASSERT(bp != NULL); - memcpy(tmpbuffer, bp->data, XFS_LBSIZE(dp->i_mount)); + memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(dp->i_mount)); leaf = (xfs_attr_leafblock_t *)tmpbuffer; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - memset(bp->data, 0, XFS_LBSIZE(dp->i_mount)); + memset(bp->b_addr, 0, XFS_LBSIZE(dp->i_mount)); /* * Clean out the prior contents of the attribute list. @@ -855,7 +905,7 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) xfs_attr_leafblock_t *leaf; xfs_da_intnode_t *node; xfs_inode_t *dp; - xfs_dabuf_t *bp1, *bp2; + struct xfs_buf *bp1, *bp2; xfs_dablk_t blkno; int error; @@ -866,21 +916,19 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) error = xfs_da_grow_inode(args, &blkno); if (error) goto out; - error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1, - XFS_ATTR_FORK); + error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp1); if (error) goto out; - ASSERT(bp1 != NULL); + bp2 = NULL; error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2, XFS_ATTR_FORK); if (error) goto out; - ASSERT(bp2 != NULL); - memcpy(bp2->data, bp1->data, XFS_LBSIZE(dp->i_mount)); - xfs_da_buf_done(bp1); + bp2->b_ops = bp1->b_ops; + memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount)); bp1 = NULL; - xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); + xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); /* * Set up the new root node. @@ -888,21 +936,17 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) error = xfs_da_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK); if (error) goto out; - node = bp1->data; - leaf = bp2->data; + node = bp1->b_addr; + leaf = bp2->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); /* both on-disk, don't endian-flip twice */ node->btree[0].hashval = leaf->entries[be16_to_cpu(leaf->hdr.count)-1 ].hashval; node->btree[0].before = cpu_to_be32(blkno); node->hdr.count = cpu_to_be16(1); - xfs_da_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1); + xfs_trans_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1); error = 0; out: - if (bp1) - xfs_da_buf_done(bp1); - if (bp2) - xfs_da_buf_done(bp2); return(error); } @@ -916,12 +960,15 @@ out: * or a leaf in a node attribute list. */ STATIC int -xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp) +xfs_attr_leaf_create( + xfs_da_args_t *args, + xfs_dablk_t blkno, + struct xfs_buf **bpp) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_hdr_t *hdr; xfs_inode_t *dp; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; trace_xfs_attr_leaf_create(args); @@ -932,8 +979,8 @@ xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp) XFS_ATTR_FORK); if (error) return(error); - ASSERT(bp != NULL); - leaf = bp->data; + bp->b_ops = &xfs_attr_leaf_buf_ops; + leaf = bp->b_addr; memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); hdr = &leaf->hdr; hdr->info.magic = cpu_to_be16(XFS_ATTR_LEAF_MAGIC); @@ -947,7 +994,7 @@ xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp) hdr->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr->firstused) - sizeof(xfs_attr_leaf_hdr_t)); - xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1); + xfs_trans_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1); *bpp = bp; return(0); @@ -1014,7 +1061,9 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * Add a name to the leaf attribute list structure. */ int -xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args) +xfs_attr_leaf_add( + struct xfs_buf *bp, + struct xfs_da_args *args) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_hdr_t *hdr; @@ -1023,7 +1072,7 @@ xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args) trace_xfs_attr_leaf_add(args); - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT((args->index >= 0) && (args->index <= be16_to_cpu(leaf->hdr.count))); @@ -1068,7 +1117,7 @@ xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args) * Compact the entries to coalesce free space. * This may change the hdr->count via dropping INCOMPLETE entries. */ - xfs_attr_leaf_compact(args->trans, bp); + xfs_attr_leaf_compact(args, bp); /* * After compaction, the block is guaranteed to have only one @@ -1085,7 +1134,10 @@ xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args) * Add a name to a leaf attribute list structure. */ STATIC int -xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) +xfs_attr_leaf_add_work( + struct xfs_buf *bp, + xfs_da_args_t *args, + int mapindex) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_hdr_t *hdr; @@ -1096,7 +1148,9 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) xfs_mount_t *mp; int tmp, i; - leaf = bp->data; + trace_xfs_attr_leaf_add_work(args); + + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); hdr = &leaf->hdr; ASSERT((mapindex >= 0) && (mapindex < XFS_ATTR_LEAF_MAPSIZE)); @@ -1110,7 +1164,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) tmp = be16_to_cpu(hdr->count) - args->index; tmp *= sizeof(xfs_attr_leaf_entry_t); memmove((char *)(entry+1), (char *)entry, tmp); - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); } be16_add_cpu(&hdr->count, 1); @@ -1142,7 +1196,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) args->index2++; } } - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); ASSERT((args->index == 0) || (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval))); @@ -1174,7 +1228,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) args->rmtblkno = 1; args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen); } - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index), xfs_attr_leaf_entsize(leaf, args->index))); @@ -1198,7 +1252,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) } } be16_add_cpu(&hdr->usedbytes, xfs_attr_leaf_entsize(leaf, args->index)); - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); return(0); } @@ -1207,24 +1261,28 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) * Garbage collect a leaf attribute list block by copying it to a new buffer. */ STATIC void -xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp) +xfs_attr_leaf_compact( + struct xfs_da_args *args, + struct xfs_buf *bp) { - xfs_attr_leafblock_t *leaf_s, *leaf_d; - xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; - xfs_mount_t *mp; - char *tmpbuffer; + xfs_attr_leafblock_t *leaf_s, *leaf_d; + xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; + struct xfs_trans *trans = args->trans; + struct xfs_mount *mp = trans->t_mountp; + char *tmpbuffer; + + trace_xfs_attr_leaf_compact(args); - mp = trans->t_mountp; tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); ASSERT(tmpbuffer != NULL); - memcpy(tmpbuffer, bp->data, XFS_LBSIZE(mp)); - memset(bp->data, 0, XFS_LBSIZE(mp)); + memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp)); + memset(bp->b_addr, 0, XFS_LBSIZE(mp)); /* * Copy basic information */ leaf_s = (xfs_attr_leafblock_t *)tmpbuffer; - leaf_d = bp->data; + leaf_d = bp->b_addr; hdr_s = &leaf_s->hdr; hdr_d = &leaf_d->hdr; hdr_d->info = hdr_s->info; /* struct copy */ @@ -1247,7 +1305,7 @@ xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp) */ xfs_attr_leaf_moveents(leaf_s, 0, leaf_d, 0, be16_to_cpu(hdr_s->count), mp); - xfs_da_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1); + xfs_trans_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1); kmem_free(tmpbuffer); } @@ -1279,10 +1337,11 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, */ ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC); ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC); - leaf1 = blk1->bp->data; - leaf2 = blk2->bp->data; + leaf1 = blk1->bp->b_addr; + leaf2 = blk2->bp->b_addr; ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + ASSERT(leaf2->hdr.count == 0); args = state->args; trace_xfs_attr_leaf_rebalance(args); @@ -1298,8 +1357,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, tmp_blk = blk1; blk1 = blk2; blk2 = tmp_blk; - leaf1 = blk1->bp->data; - leaf2 = blk2->bp->data; + leaf1 = blk1->bp->b_addr; + leaf2 = blk2->bp->b_addr; swap = 1; } hdr1 = &leaf1->hdr; @@ -1336,9 +1395,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, max = be16_to_cpu(hdr2->firstused) - sizeof(xfs_attr_leaf_hdr_t); max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t); - if (space > max) { - xfs_attr_leaf_compact(args->trans, blk2->bp); - } + if (space > max) + xfs_attr_leaf_compact(args, blk2->bp); /* * Move high entries from leaf1 to low end of leaf2. @@ -1346,13 +1404,14 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_attr_leaf_moveents(leaf1, be16_to_cpu(hdr1->count) - count, leaf2, 0, count, state->mp); - xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); - xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); + xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); + xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); } else if (count > be16_to_cpu(hdr1->count)) { /* * I assert that since all callers pass in an empty * second buffer, this code should never execute. */ + ASSERT(0); /* * Figure the total bytes to be added to the destination leaf. @@ -1368,9 +1427,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, max = be16_to_cpu(hdr1->firstused) - sizeof(xfs_attr_leaf_hdr_t); max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t); - if (space > max) { - xfs_attr_leaf_compact(args->trans, blk1->bp); - } + if (space > max) + xfs_attr_leaf_compact(args, blk1->bp); /* * Move low entries from leaf2 to high end of leaf1. @@ -1378,8 +1436,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_attr_leaf_moveents(leaf2, 0, leaf1, be16_to_cpu(hdr1->count), count, state->mp); - xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); - xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); + xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); + xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); } /* @@ -1414,10 +1472,24 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, args->index2 = 0; args->blkno2 = blk2->blkno; } else { + /* + * On a double leaf split, the original attr location + * is already stored in blkno2/index2, so don't + * overwrite it overwise we corrupt the tree. + */ blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); - args->index = args->index2 = blk2->index; - args->blkno = args->blkno2 = blk2->blkno; + args->index = blk2->index; + args->blkno = blk2->blkno; + if (!state->extravalid) { + /* + * set the new attr location to match the old + * one and let the higher level split code + * decide where in the leaf to place it. + */ + args->index2 = blk2->index; + args->blkno2 = blk2->blkno; + } } } else { ASSERT(state->inleaf == 1); @@ -1448,8 +1520,8 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state, /* * Set up environment. */ - leaf1 = blk1->bp->data; - leaf2 = blk2->bp->data; + leaf1 = blk1->bp->b_addr; + leaf2 = blk2->bp->b_addr; hdr1 = &leaf1->hdr; hdr2 = &leaf2->hdr; foundit = 0; @@ -1551,7 +1623,9 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) xfs_da_blkinfo_t *info; int count, bytes, forward, error, retval, i; xfs_dablk_t blkno; - xfs_dabuf_t *bp; + struct xfs_buf *bp; + + trace_xfs_attr_leaf_toosmall(state->args); /* * Check for the degenerate case of the block being over 50% full. @@ -1559,7 +1633,7 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * to coalesce with a sibling. */ blk = &state->path.blk[ state->path.active-1 ]; - info = blk->bp->data; + info = blk->bp->b_addr; ASSERT(info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); leaf = (xfs_attr_leafblock_t *)info; count = be16_to_cpu(leaf->hdr.count); @@ -1612,23 +1686,21 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) blkno = be32_to_cpu(info->back); if (blkno == 0) continue; - error = xfs_da_read_buf(state->args->trans, state->args->dp, - blkno, -1, &bp, XFS_ATTR_FORK); + error = xfs_attr_leaf_read(state->args->trans, state->args->dp, + blkno, -1, &bp); if (error) return(error); - ASSERT(bp != NULL); leaf = (xfs_attr_leafblock_t *)info; count = be16_to_cpu(leaf->hdr.count); bytes = state->blocksize - (state->blocksize>>2); bytes -= be16_to_cpu(leaf->hdr.usedbytes); - leaf = bp->data; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + leaf = bp->b_addr; count += be16_to_cpu(leaf->hdr.count); bytes -= be16_to_cpu(leaf->hdr.usedbytes); bytes -= count * sizeof(xfs_attr_leaf_entry_t); bytes -= sizeof(xfs_attr_leaf_hdr_t); - xfs_da_brelse(state->args->trans, bp); + xfs_trans_brelse(state->args->trans, bp); if (bytes >= 0) break; /* fits with at least 25% to spare */ } @@ -1666,7 +1738,9 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * If two leaves are 37% full, when combined they will leave 25% free. */ int -xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) +xfs_attr_leaf_remove( + struct xfs_buf *bp, + xfs_da_args_t *args) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_hdr_t *hdr; @@ -1676,7 +1750,9 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) int tablesize, tmp, i; xfs_mount_t *mp; - leaf = bp->data; + trace_xfs_attr_leaf_remove(args); + + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); hdr = &leaf->hdr; mp = args->trans->t_mountp; @@ -1769,7 +1845,7 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) */ memset(xfs_attr_leaf_name(leaf, args->index), 0, entsize); be16_add_cpu(&hdr->usedbytes, -entsize); - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index), entsize)); @@ -1777,7 +1853,7 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) * sizeof(xfs_attr_leaf_entry_t); memmove((char *)entry, (char *)(entry+1), tmp); be16_add_cpu(&hdr->count, -1); - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); entry = &leaf->entries[be16_to_cpu(hdr->count)]; memset((char *)entry, 0, sizeof(xfs_attr_leaf_entry_t)); @@ -1807,7 +1883,7 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) } else { hdr->holes = 1; /* mark as needing compaction */ } - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); /* @@ -1840,8 +1916,8 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, mp = state->mp; ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC); ASSERT(save_blk->magic == XFS_ATTR_LEAF_MAGIC); - drop_leaf = drop_blk->bp->data; - save_leaf = save_blk->bp->data; + drop_leaf = drop_blk->bp->b_addr; + save_leaf = save_blk->bp->b_addr; ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); drop_hdr = &drop_leaf->hdr; @@ -1906,7 +1982,7 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, kmem_free(tmpbuffer); } - xfs_da_log_buf(state->args->trans, save_blk->bp, 0, + xfs_trans_log_buf(state->args->trans, save_blk->bp, 0, state->blocksize - 1); /* @@ -1934,7 +2010,9 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, * Don't change the args->value unless we find the attribute. */ int -xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) +xfs_attr_leaf_lookup_int( + struct xfs_buf *bp, + xfs_da_args_t *args) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_entry_t *entry; @@ -1945,7 +2023,7 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) trace_xfs_attr_leaf_lookup(args); - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(be16_to_cpu(leaf->hdr.count) < (XFS_LBSIZE(args->dp->i_mount)/8)); @@ -2041,7 +2119,9 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) * list structure. */ int -xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args) +xfs_attr_leaf_getvalue( + struct xfs_buf *bp, + xfs_da_args_t *args) { int valuelen; xfs_attr_leafblock_t *leaf; @@ -2049,7 +2129,7 @@ xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args) xfs_attr_leaf_name_local_t *name_loc; xfs_attr_leaf_name_remote_t *name_rmt; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(be16_to_cpu(leaf->hdr.count) < (XFS_LBSIZE(args->dp->i_mount)/8)); @@ -2247,12 +2327,14 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, * Return 0 unless leaf2 should go before leaf1. */ int -xfs_attr_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp) +xfs_attr_leaf_order( + struct xfs_buf *leaf1_bp, + struct xfs_buf *leaf2_bp) { xfs_attr_leafblock_t *leaf1, *leaf2; - leaf1 = leaf1_bp->data; - leaf2 = leaf2_bp->data; + leaf1 = leaf1_bp->b_addr; + leaf2 = leaf2_bp->b_addr; ASSERT((leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) && (leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC))); if ((be16_to_cpu(leaf1->hdr.count) > 0) && @@ -2272,11 +2354,13 @@ xfs_attr_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp) * Pick up the last hashvalue from a leaf block. */ xfs_dahash_t -xfs_attr_leaf_lasthash(xfs_dabuf_t *bp, int *count) +xfs_attr_leaf_lasthash( + struct xfs_buf *bp, + int *count) { xfs_attr_leafblock_t *leaf; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); if (count) *count = be16_to_cpu(leaf->hdr.count); @@ -2337,7 +2421,9 @@ xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local) * Copy out attribute list entries for attr_list(), for leaf attribute lists. */ int -xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) +xfs_attr_leaf_list_int( + struct xfs_buf *bp, + xfs_attr_list_context_t *context) { attrlist_cursor_kern_t *cursor; xfs_attr_leafblock_t *leaf; @@ -2345,7 +2431,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) int retval, i; ASSERT(bp != NULL); - leaf = bp->data; + leaf = bp->b_addr; cursor = context->cursor; cursor->initted = 1; @@ -2463,7 +2549,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) xfs_attr_leafblock_t *leaf; xfs_attr_leaf_entry_t *entry; xfs_attr_leaf_name_remote_t *name_rmt; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; #ifdef DEBUG xfs_attr_leaf_name_local_t *name_loc; @@ -2475,15 +2561,11 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) /* * Set up the operation. */ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); - if (error) { + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) return(error); - } - ASSERT(bp != NULL); - leaf = bp->data; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + leaf = bp->b_addr; ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); ASSERT(args->index >= 0); entry = &leaf->entries[ args->index ]; @@ -2505,7 +2587,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) #endif /* DEBUG */ entry->flags &= ~XFS_ATTR_INCOMPLETE; - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); if (args->rmtblkno) { @@ -2513,10 +2595,9 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); name_rmt->valuelen = cpu_to_be32(args->valuelen); - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } - xfs_da_buf_done(bp); /* * Commit the flag value change and start the next trans in series. @@ -2533,7 +2614,7 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) xfs_attr_leafblock_t *leaf; xfs_attr_leaf_entry_t *entry; xfs_attr_leaf_name_remote_t *name_rmt; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; trace_xfs_attr_leaf_setflag(args); @@ -2541,31 +2622,26 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) /* * Set up the operation. */ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); - if (error) { + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) return(error); - } - ASSERT(bp != NULL); - leaf = bp->data; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + leaf = bp->b_addr; ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); ASSERT(args->index >= 0); entry = &leaf->entries[ args->index ]; ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0); entry->flags |= XFS_ATTR_INCOMPLETE; - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); if ((entry->flags & XFS_ATTR_LOCAL) == 0) { name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); name_rmt->valueblk = 0; name_rmt->valuelen = 0; - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } - xfs_da_buf_done(bp); /* * Commit the flag value change and start the next trans in series. @@ -2586,7 +2662,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) xfs_attr_leafblock_t *leaf1, *leaf2; xfs_attr_leaf_entry_t *entry1, *entry2; xfs_attr_leaf_name_remote_t *name_rmt; - xfs_dabuf_t *bp1, *bp2; + struct xfs_buf *bp1, *bp2; int error; #ifdef DEBUG xfs_attr_leaf_name_local_t *name_loc; @@ -2599,35 +2675,28 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) /* * Read the block containing the "old" attr */ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1, - XFS_ATTR_FORK); - if (error) { - return(error); - } - ASSERT(bp1 != NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1); + if (error) + return error; /* * Read the block containing the "new" attr, if it is different */ if (args->blkno2 != args->blkno) { - error = xfs_da_read_buf(args->trans, args->dp, args->blkno2, - -1, &bp2, XFS_ATTR_FORK); - if (error) { - return(error); - } - ASSERT(bp2 != NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno2, + -1, &bp2); + if (error) + return error; } else { bp2 = bp1; } - leaf1 = bp1->data; - ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + leaf1 = bp1->b_addr; ASSERT(args->index < be16_to_cpu(leaf1->hdr.count)); ASSERT(args->index >= 0); entry1 = &leaf1->entries[ args->index ]; - leaf2 = bp2->data; - ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + leaf2 = bp2->b_addr; ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count)); ASSERT(args->index2 >= 0); entry2 = &leaf2->entries[ args->index2 ]; @@ -2660,30 +2729,27 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0); entry1->flags &= ~XFS_ATTR_INCOMPLETE; - xfs_da_log_buf(args->trans, bp1, + xfs_trans_log_buf(args->trans, bp1, XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1))); if (args->rmtblkno) { ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0); name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); name_rmt->valuelen = cpu_to_be32(args->valuelen); - xfs_da_log_buf(args->trans, bp1, + xfs_trans_log_buf(args->trans, bp1, XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt))); } entry2->flags |= XFS_ATTR_INCOMPLETE; - xfs_da_log_buf(args->trans, bp2, + xfs_trans_log_buf(args->trans, bp2, XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2))); if ((entry2->flags & XFS_ATTR_LOCAL) == 0) { name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2); name_rmt->valueblk = 0; name_rmt->valuelen = 0; - xfs_da_log_buf(args->trans, bp2, + xfs_trans_log_buf(args->trans, bp2, XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt))); } - xfs_da_buf_done(bp1); - if (bp1 != bp2) - xfs_da_buf_done(bp2); /* * Commit the flag value change and start the next trans in series. @@ -2706,7 +2772,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) { xfs_da_blkinfo_t *info; xfs_daddr_t blkno; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; /* @@ -2715,23 +2781,23 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) * the extents in reverse order the extent containing * block 0 must still be there. */ - error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); + error = xfs_da_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); if (error) return(error); - blkno = xfs_da_blkno(bp); + blkno = XFS_BUF_ADDR(bp); /* * Invalidate the tree, even if the "tree" is only a single leaf block. * This is a depth-first traversal! */ - info = bp->data; + info = bp->b_addr; if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { error = xfs_attr_node_inactive(trans, dp, bp, 1); } else if (info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) { error = xfs_attr_leaf_inactive(trans, dp, bp); } else { error = XFS_ERROR(EIO); - xfs_da_brelse(*trans, bp); + xfs_trans_brelse(*trans, bp); } if (error) return(error); @@ -2742,7 +2808,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK); if (error) return(error); - xfs_da_binval(*trans, bp); /* remove from cache */ + xfs_trans_binval(*trans, bp); /* remove from cache */ /* * Commit the invalidate and start the next transaction. */ @@ -2756,34 +2822,37 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) * We're doing a depth-first traversal in order to invalidate everything. */ STATIC int -xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, - int level) +xfs_attr_node_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp, + int level) { xfs_da_blkinfo_t *info; xfs_da_intnode_t *node; xfs_dablk_t child_fsb; xfs_daddr_t parent_blkno, child_blkno; int error, count, i; - xfs_dabuf_t *child_bp; + struct xfs_buf *child_bp; /* * Since this code is recursive (gasp!) we must protect ourselves. */ if (level > XFS_DA_NODE_MAXDEPTH) { - xfs_da_brelse(*trans, bp); /* no locks for later trans */ + xfs_trans_brelse(*trans, bp); /* no locks for later trans */ return(XFS_ERROR(EIO)); } - node = bp->data; + node = bp->b_addr; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - parent_blkno = xfs_da_blkno(bp); /* save for re-read later */ + parent_blkno = XFS_BUF_ADDR(bp); /* save for re-read later */ count = be16_to_cpu(node->hdr.count); if (!count) { - xfs_da_brelse(*trans, bp); + xfs_trans_brelse(*trans, bp); return(0); } child_fsb = be32_to_cpu(node->btree[0].before); - xfs_da_brelse(*trans, bp); /* no locks for later trans */ + xfs_trans_brelse(*trans, bp); /* no locks for later trans */ /* * If this is the node level just above the leaves, simply loop @@ -2797,18 +2866,18 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, * traversal of the tree so we may deal with many blocks * before we come back to this one. */ - error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp, + error = xfs_da_node_read(*trans, dp, child_fsb, -2, &child_bp, XFS_ATTR_FORK); if (error) return(error); if (child_bp) { /* save for re-read later */ - child_blkno = xfs_da_blkno(child_bp); + child_blkno = XFS_BUF_ADDR(child_bp); /* * Invalidate the subtree, however we have to. */ - info = child_bp->data; + info = child_bp->b_addr; if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { error = xfs_attr_node_inactive(trans, dp, child_bp, level+1); @@ -2817,7 +2886,7 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, child_bp); } else { error = XFS_ERROR(EIO); - xfs_da_brelse(*trans, child_bp); + xfs_trans_brelse(*trans, child_bp); } if (error) return(error); @@ -2830,7 +2899,7 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, &child_bp, XFS_ATTR_FORK); if (error) return(error); - xfs_da_binval(*trans, child_bp); + xfs_trans_binval(*trans, child_bp); } /* @@ -2838,12 +2907,12 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, * child block number. */ if ((i+1) < count) { - error = xfs_da_read_buf(*trans, dp, 0, parent_blkno, - &bp, XFS_ATTR_FORK); + error = xfs_da_node_read(*trans, dp, 0, parent_blkno, + &bp, XFS_ATTR_FORK); if (error) return(error); child_fsb = be32_to_cpu(node->btree[i+1].before); - xfs_da_brelse(*trans, bp); + xfs_trans_brelse(*trans, bp); } /* * Atomically commit the whole invalidate stuff. @@ -2863,7 +2932,10 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, * caught holding something that the logging code wants to flush to disk. */ STATIC int -xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp) +xfs_attr_leaf_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_entry_t *entry; @@ -2871,7 +2943,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp) xfs_attr_inactive_list_t *list, *lp; int error, count, size, tmp, i; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); /* @@ -2892,7 +2964,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp) * If there are no "remote" values, we're done. */ if (count == 0) { - xfs_da_brelse(*trans, bp); + xfs_trans_brelse(*trans, bp); return(0); } @@ -2919,7 +2991,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp) } } } - xfs_da_brelse(*trans, bp); /* unlock for trans. in freextent() */ + xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */ /* * Invalidate each of the "remote" value extents. diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index 9c7d22fdcf4..77de139a58f 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -31,7 +31,6 @@ struct attrlist; struct attrlist_cursor_kern; struct xfs_attr_list_context; -struct xfs_dabuf; struct xfs_da_args; struct xfs_da_state; struct xfs_da_state_blk; @@ -215,7 +214,7 @@ int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_shortform_remove(struct xfs_da_args *args); int xfs_attr_shortform_list(struct xfs_attr_list_context *context); -int xfs_attr_shortform_allfit(struct xfs_dabuf *bp, struct xfs_inode *dp); +int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); @@ -223,7 +222,7 @@ int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); * Internal routines when attribute fork size == XFS_LBSIZE(mp). */ int xfs_attr_leaf_to_node(struct xfs_da_args *args); -int xfs_attr_leaf_to_shortform(struct xfs_dabuf *bp, +int xfs_attr_leaf_to_shortform(struct xfs_buf *bp, struct xfs_da_args *args, int forkoff); int xfs_attr_leaf_clearflag(struct xfs_da_args *args); int xfs_attr_leaf_setflag(struct xfs_da_args *args); @@ -235,14 +234,14 @@ int xfs_attr_leaf_flipflags(xfs_da_args_t *args); int xfs_attr_leaf_split(struct xfs_da_state *state, struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk); -int xfs_attr_leaf_lookup_int(struct xfs_dabuf *leaf, +int xfs_attr_leaf_lookup_int(struct xfs_buf *leaf, struct xfs_da_args *args); -int xfs_attr_leaf_getvalue(struct xfs_dabuf *bp, struct xfs_da_args *args); -int xfs_attr_leaf_add(struct xfs_dabuf *leaf_buffer, +int xfs_attr_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args); +int xfs_attr_leaf_add(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -int xfs_attr_leaf_remove(struct xfs_dabuf *leaf_buffer, +int xfs_attr_leaf_remove(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -int xfs_attr_leaf_list_int(struct xfs_dabuf *bp, +int xfs_attr_leaf_list_int(struct xfs_buf *bp, struct xfs_attr_list_context *context); /* @@ -257,9 +256,15 @@ int xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); /* * Utility routines. */ -xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_dabuf *bp, int *count); -int xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp, - struct xfs_dabuf *leaf2_bp); +xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_buf *bp, int *count); +int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp, + struct xfs_buf *leaf2_bp); int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local); +int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + struct xfs_buf **bpp); + +extern const struct xfs_buf_ops xfs_attr_leaf_buf_ops; + #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 58b815ec8c9..cdb2d334858 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2437,6 +2437,7 @@ xfs_bmap_btalloc( * Normal allocation, done through xfs_alloc_vextent. */ tryagain = isaligned = 0; + memset(&args, 0, sizeof(args)); args.tp = ap->tp; args.mp = mp; args.fsbno = ap->blkno; @@ -2661,8 +2662,9 @@ xfs_bmap_btree_to_extents( if ((error = xfs_btree_check_lptr(cur, cbno, 1))) return error; #endif - if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, - XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) return error; cblock = XFS_BUF_TO_BLOCK(cbp); if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) @@ -3082,6 +3084,7 @@ xfs_bmap_extents_to_btree( * Convert to a btree with two levels, one record in root. */ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); + memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = mp; args.firstblock = *firstblock; @@ -3121,6 +3124,7 @@ xfs_bmap_extents_to_btree( /* * Fill in the child block. */ + abp->b_ops = &xfs_bmbt_buf_ops; ablock = XFS_BUF_TO_BLOCK(abp); ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); ablock->bb_level = 0; @@ -3237,6 +3241,7 @@ xfs_bmap_local_to_extents( xfs_buf_t *bp; /* buffer for extent block */ xfs_bmbt_rec_host_t *ep;/* extent record pointer */ + memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = ip->i_mount; args.firstblock = *firstblock; @@ -3266,6 +3271,7 @@ xfs_bmap_local_to_extents( ASSERT(args.len == 1); *firstblock = args.fsbno; bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); + bp->b_ops = &xfs_bmbt_buf_ops; memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); xfs_bmap_forkoff_reset(args.mp, ip, whichfork); @@ -4075,8 +4081,9 @@ xfs_bmap_read_extents( * pointer (leftmost) at each level. */ while (level-- > 0) { - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); + if (error) return error; block = XFS_BUF_TO_BLOCK(bp); XFS_WANT_CORRUPTED_GOTO( @@ -4121,7 +4128,8 @@ xfs_bmap_read_extents( */ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); if (nextbno != NULLFSBLOCK) - xfs_btree_reada_bufl(mp, nextbno, 1); + xfs_btree_reada_bufl(mp, nextbno, 1, + &xfs_bmbt_buf_ops); /* * Copy records into the extent records. */ @@ -4153,8 +4161,9 @@ xfs_bmap_read_extents( */ if (bno == NULLFSBLOCK) break; - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); + if (error) return error; block = XFS_BUF_TO_BLOCK(bp); } @@ -4616,12 +4625,11 @@ xfs_bmapi_delay( STATIC int -xfs_bmapi_allocate( - struct xfs_bmalloca *bma, - int flags) +__xfs_bmapi_allocate( + struct xfs_bmalloca *bma) { struct xfs_mount *mp = bma->ip->i_mount; - int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); int tmp_logflags = 0; @@ -4654,19 +4662,19 @@ xfs_bmapi_allocate( * Indicate if this is the first user data in the file, or just any * user data. */ - if (!(flags & XFS_BMAPI_METADATA)) { + if (!(bma->flags & XFS_BMAPI_METADATA)) { bma->userdata = (bma->offset == 0) ? XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; } - bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1; + bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1; /* * Only want to do the alignment at the eof if it is userdata and * allocation length is larger than a stripe unit. */ if (mp->m_dalign && bma->length >= mp->m_dalign && - !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { + !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { error = xfs_bmap_isaeof(bma, whichfork); if (error) return error; @@ -4706,7 +4714,7 @@ xfs_bmapi_allocate( * A wasdelay extent has been initialized, so shouldn't be flagged * as unwritten. */ - if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) && + if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) && xfs_sb_version_hasextflgbit(&mp->m_sb)) bma->got.br_state = XFS_EXT_UNWRITTEN; @@ -4734,6 +4742,45 @@ xfs_bmapi_allocate( return 0; } +static void +xfs_bmapi_allocate_worker( + struct work_struct *work) +{ + struct xfs_bmalloca *args = container_of(work, + struct xfs_bmalloca, work); + unsigned long pflags; + + /* we are in a transaction context here */ + current_set_flags_nested(&pflags, PF_FSTRANS); + + args->result = __xfs_bmapi_allocate(args); + complete(args->done); + + current_restore_flags_nested(&pflags, PF_FSTRANS); +} + +/* + * Some allocation requests often come in with little stack to work on. Push + * them off to a worker thread so there is lots of stack to use. Otherwise just + * call directly to avoid the context switch overhead here. + */ +int +xfs_bmapi_allocate( + struct xfs_bmalloca *args) +{ + DECLARE_COMPLETION_ONSTACK(done); + + if (!args->stack_switch) + return __xfs_bmapi_allocate(args); + + + args->done = &done; + INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker); + queue_work(xfs_alloc_wq, &args->work); + wait_for_completion(&done); + return args->result; +} + STATIC int xfs_bmapi_convert_unwritten( struct xfs_bmalloca *bma, @@ -4906,6 +4953,9 @@ xfs_bmapi_write( bma.flist = flist; bma.firstblock = firstblock; + if (flags & XFS_BMAPI_STACK_SWITCH) + bma.stack_switch = 1; + while (bno < end && n < *nmap) { inhole = eof || bma.got.br_startoff > bno; wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); @@ -4919,6 +4969,7 @@ xfs_bmapi_write( bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; bma.offset = bno; + bma.flags = flags; /* * There's a 32/64 bit type mismatch between the @@ -4934,7 +4985,7 @@ xfs_bmapi_write( ASSERT(len > 0); ASSERT(bma.length > 0); - error = xfs_bmapi_allocate(&bma, flags); + error = xfs_bmapi_allocate(&bma); if (error) goto error0; if (bma.blkno == NULLFSBLOCK) @@ -5517,7 +5568,7 @@ xfs_getbmap( if (xfs_get_extsz_hint(ip) || ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ prealloced = 1; - fixlen = XFS_MAXIOFFSET(mp); + fixlen = mp->m_super->s_maxbytes; } else { prealloced = 0; fixlen = XFS_ISIZE(ip); @@ -5554,7 +5605,7 @@ xfs_getbmap( xfs_ilock(ip, XFS_IOLOCK_SHARED); if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { - error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); + error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); if (error) goto out_unlock_iolock; } @@ -5823,15 +5874,16 @@ xfs_bmap_check_leaf_extents( */ while (level-- > 0) { /* See if buf is in cur first */ + bp_release = 0; bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); - if (bp) { - bp_release = 0; - } else { + if (!bp) { bp_release = 1; + error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + goto error_norelse; } - if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) - goto error_norelse; block = XFS_BUF_TO_BLOCK(bp); XFS_WANT_CORRUPTED_GOTO( xfs_bmap_sanity_check(mp, bp, level), @@ -5908,15 +5960,16 @@ xfs_bmap_check_leaf_extents( if (bno == NULLFSBLOCK) break; + bp_release = 0; bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); - if (bp) { - bp_release = 0; - } else { + if (!bp) { bp_release = 1; + error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + goto error_norelse; } - if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) - goto error_norelse; block = XFS_BUF_TO_BLOCK(bp); } if (bp_release) { @@ -6007,7 +6060,9 @@ xfs_bmap_count_tree( struct xfs_btree_block *block, *nextblock; int numrecs; - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) return error; *count += 1; block = XFS_BUF_TO_BLOCK(bp); @@ -6016,8 +6071,10 @@ xfs_bmap_count_tree( /* Not at node above leaves, count this level of nodes */ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); while (nextbno != NULLFSBLOCK) { - if ((error = xfs_btree_read_bufl(mp, tp, nextbno, - 0, &nbp, XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) return error; *count += 1; nextblock = XFS_BUF_TO_BLOCK(nbp); @@ -6046,8 +6103,10 @@ xfs_bmap_count_tree( if (nextbno == NULLFSBLOCK) break; bno = nextbno; - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) return error; *count += 1; block = XFS_BUF_TO_BLOCK(bp); diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 803b56d7ce1..5f469c3516e 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -77,6 +77,7 @@ typedef struct xfs_bmap_free * from written to unwritten, otherwise convert from unwritten to written. */ #define XFS_BMAPI_CONVERT 0x040 +#define XFS_BMAPI_STACK_SWITCH 0x080 #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ @@ -85,7 +86,8 @@ typedef struct xfs_bmap_free { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ - { XFS_BMAPI_CONVERT, "CONVERT" } + { XFS_BMAPI_CONVERT, "CONVERT" }, \ + { XFS_BMAPI_STACK_SWITCH, "STACK_SWITCH" } static inline int xfs_bmapi_aflag(int w) @@ -133,6 +135,11 @@ typedef struct xfs_bmalloca { char userdata;/* set if is user data */ char aeof; /* allocated space at eof */ char conv; /* overwriting unwritten extents */ + char stack_switch; + int flags; + struct completion *done; + struct work_struct work; + int result; } xfs_bmalloca_t; /* diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 862084a47a7..061b45cbe61 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -36,6 +36,7 @@ #include "xfs_bmap.h" #include "xfs_error.h" #include "xfs_quota.h" +#include "xfs_trace.h" /* * Determine the extent state. @@ -707,6 +708,67 @@ xfs_bmbt_key_diff( cur->bc_rec.b.br_startoff; } +static void +xfs_bmbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + unsigned int level; + int lblock_ok; /* block passes checks */ + + /* magic number and level verification. + * + * We don't know waht fork we belong to, so just verify that the level + * is less than the maximum of the two. Later checks will be more + * precise. + */ + level = be16_to_cpu(block->bb_level); + lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) && + level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]); + + /* numrecs verification */ + lblock_ok = lblock_ok && + be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0]; + + /* sibling pointer verification */ + lblock_ok = lblock_ok && + block->bb_u.l.bb_leftsib && + (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) || + XFS_FSB_SANITY_CHECK(mp, + be64_to_cpu(block->bb_u.l.bb_leftsib))) && + block->bb_u.l.bb_rightsib && + (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) || + XFS_FSB_SANITY_CHECK(mp, + be64_to_cpu(block->bb_u.l.bb_rightsib))); + + if (!lblock_ok) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +xfs_bmbt_read_verify( + struct xfs_buf *bp) +{ + xfs_bmbt_verify(bp); +} + +static void +xfs_bmbt_write_verify( + struct xfs_buf *bp) +{ + xfs_bmbt_verify(bp); +} + +const struct xfs_buf_ops xfs_bmbt_buf_ops = { + .verify_read = xfs_bmbt_read_verify, + .verify_write = xfs_bmbt_write_verify, +}; + + #ifdef DEBUG STATIC int xfs_bmbt_keys_inorder( @@ -746,6 +808,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, .key_diff = xfs_bmbt_key_diff, + .buf_ops = &xfs_bmbt_buf_ops, #ifdef DEBUG .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 0e66c4ea0f8..88469ca0869 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -236,5 +236,6 @@ extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); +extern const struct xfs_buf_ops xfs_bmbt_buf_ops; #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index e53e317b158..db010408d70 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -266,9 +266,13 @@ xfs_btree_dup_cursor( for (i = 0; i < new->bc_nlevels; i++) { new->bc_ptrs[i] = cur->bc_ptrs[i]; new->bc_ra[i] = cur->bc_ra[i]; - if ((bp = cur->bc_bufs[i])) { - if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) { + bp = cur->bc_bufs[i]; + if (bp) { + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_BUF_ADDR(bp), mp->m_bsize, + 0, &bp, + cur->bc_ops->buf_ops); + if (error) { xfs_btree_del_cursor(new, error); *ncur = NULL; return error; @@ -609,25 +613,26 @@ xfs_btree_offsets( * Get a buffer for the block, return it read in. * Long-form addressing. */ -int /* error */ +int xfs_btree_read_bufl( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_fsblock_t fsbno, /* file system block number */ - uint lock, /* lock flags for read_buf */ - xfs_buf_t **bpp, /* buffer for fsbno */ - int refval) /* ref count value for buffer */ -{ - xfs_buf_t *bp; /* return value */ + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock, /* lock flags for read_buf */ + struct xfs_buf **bpp, /* buffer for fsbno */ + int refval, /* ref count value for buffer */ + const struct xfs_buf_ops *ops) +{ + struct xfs_buf *bp; /* return value */ xfs_daddr_t d; /* real disk block address */ - int error; + int error; ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, - mp->m_bsize, lock, &bp))) { + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + mp->m_bsize, lock, &bp, ops); + if (error) return error; - } ASSERT(!xfs_buf_geterror(bp)); if (bp) xfs_buf_set_ref(bp, refval); @@ -642,15 +647,16 @@ xfs_btree_read_bufl( /* ARGSUSED */ void xfs_btree_reada_bufl( - xfs_mount_t *mp, /* file system mount point */ - xfs_fsblock_t fsbno, /* file system block number */ - xfs_extlen_t count) /* count of filesystem blocks */ + struct xfs_mount *mp, /* file system mount point */ + xfs_fsblock_t fsbno, /* file system block number */ + xfs_extlen_t count, /* count of filesystem blocks */ + const struct xfs_buf_ops *ops) { xfs_daddr_t d; ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); } /* @@ -660,17 +666,18 @@ xfs_btree_reada_bufl( /* ARGSUSED */ void xfs_btree_reada_bufs( - xfs_mount_t *mp, /* file system mount point */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* allocation group block number */ - xfs_extlen_t count) /* count of filesystem blocks */ + struct xfs_mount *mp, /* file system mount point */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + xfs_extlen_t count, /* count of filesystem blocks */ + const struct xfs_buf_ops *ops) { xfs_daddr_t d; ASSERT(agno != NULLAGNUMBER); ASSERT(agbno != NULLAGBLOCK); d = XFS_AGB_TO_DADDR(mp, agno, agbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); } STATIC int @@ -684,12 +691,14 @@ xfs_btree_readahead_lblock( xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) { - xfs_btree_reada_bufl(cur->bc_mp, left, 1); + xfs_btree_reada_bufl(cur->bc_mp, left, 1, + cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) { - xfs_btree_reada_bufl(cur->bc_mp, right, 1); + xfs_btree_reada_bufl(cur->bc_mp, right, 1, + cur->bc_ops->buf_ops); rval++; } @@ -709,13 +718,13 @@ xfs_btree_readahead_sblock( if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, - left, 1); + left, 1, cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, - right, 1); + right, 1, cur->bc_ops->buf_ops); rval++; } @@ -853,18 +862,22 @@ xfs_btree_set_sibling( } } -STATIC void +void xfs_btree_init_block( - struct xfs_btree_cur *cur, - int level, - int numrecs, - struct xfs_btree_block *new) /* new block */ + struct xfs_mount *mp, + struct xfs_buf *bp, + __u32 magic, + __u16 level, + __u16 numrecs, + unsigned int flags) { - new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); + struct xfs_btree_block *new = XFS_BUF_TO_BLOCK(bp); + + new->bb_magic = cpu_to_be32(magic); new->bb_level = cpu_to_be16(level); new->bb_numrecs = cpu_to_be16(numrecs); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (flags & XFS_BTREE_LONG_PTRS) { new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); } else { @@ -873,6 +886,17 @@ xfs_btree_init_block( } } +STATIC void +xfs_btree_init_block_cur( + struct xfs_btree_cur *cur, + int level, + int numrecs, + struct xfs_buf *bp) +{ + xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum], + level, numrecs, cur->bc_flags); +} + /* * Return true if ptr is the last record in the btree and * we need to track updateÑ• to this record. The decision @@ -972,6 +996,7 @@ xfs_btree_get_buf_block( if (!*bpp) return ENOMEM; + (*bpp)->b_ops = cur->bc_ops->buf_ops; *block = XFS_BUF_TO_BLOCK(*bpp); return 0; } @@ -998,19 +1023,15 @@ xfs_btree_read_buf_block( d = xfs_btree_ptr_to_daddr(cur, ptr); error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, - mp->m_bsize, flags, bpp); + mp->m_bsize, flags, bpp, + cur->bc_ops->buf_ops); if (error) return error; ASSERT(!xfs_buf_geterror(*bpp)); - xfs_btree_set_refs(cur, *bpp); *block = XFS_BUF_TO_BLOCK(*bpp); - - error = xfs_btree_check_block(cur, *block, level, *bpp); - if (error) - xfs_trans_brelse(cur->bc_tp, *bpp); - return error; + return 0; } /* @@ -2183,7 +2204,7 @@ xfs_btree_split( goto error0; /* Fill in the btree header for the new right block. */ - xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right); + xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp); /* * Split the entries between the old and the new block evenly. @@ -2492,7 +2513,7 @@ xfs_btree_new_root( nptr = 2; } /* Fill in the new block's btree header and log it. */ - xfs_btree_init_block(cur, cur->bc_nlevels, 2, new); + xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp); xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && !xfs_btree_ptr_is_null(cur, &rptr)); diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 5b240de104c..f932897194e 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -188,6 +188,8 @@ struct xfs_btree_ops { __int64_t (*key_diff)(struct xfs_btree_cur *cur, union xfs_btree_key *key); + const struct xfs_buf_ops *buf_ops; + #ifdef DEBUG /* check that k1 is lower than k2 */ int (*keys_inorder)(struct xfs_btree_cur *cur, @@ -355,7 +357,8 @@ xfs_btree_read_bufl( xfs_fsblock_t fsbno, /* file system block number */ uint lock, /* lock flags for read_buf */ struct xfs_buf **bpp, /* buffer for fsbno */ - int refval);/* ref count value for buffer */ + int refval, /* ref count value for buffer */ + const struct xfs_buf_ops *ops); /* * Read-ahead the block, don't wait for it, don't return a buffer. @@ -365,7 +368,8 @@ void /* error */ xfs_btree_reada_bufl( struct xfs_mount *mp, /* file system mount point */ xfs_fsblock_t fsbno, /* file system block number */ - xfs_extlen_t count); /* count of filesystem blocks */ + xfs_extlen_t count, /* count of filesystem blocks */ + const struct xfs_buf_ops *ops); /* * Read-ahead the block, don't wait for it, don't return a buffer. @@ -376,8 +380,20 @@ xfs_btree_reada_bufs( struct xfs_mount *mp, /* file system mount point */ xfs_agnumber_t agno, /* allocation group number */ xfs_agblock_t agbno, /* allocation group block number */ - xfs_extlen_t count); /* count of filesystem blocks */ + xfs_extlen_t count, /* count of filesystem blocks */ + const struct xfs_buf_ops *ops); +/* + * Initialise a new btree block header + */ +void +xfs_btree_init_block( + struct xfs_mount *mp, + struct xfs_buf *bp, + __u32 magic, + __u16 level, + __u16 numrecs, + unsigned int flags); /* * Common btree core entry points. diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 269b35c084d..fbbb9eb92e3 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -96,6 +96,7 @@ xfs_buf_lru_add( atomic_inc(&bp->b_hold); list_add_tail(&bp->b_lru, &btp->bt_lru); btp->bt_lru_nr++; + bp->b_lru_flags &= ~_XBF_LRU_DISPOSE; } spin_unlock(&btp->bt_lru_lock); } @@ -154,7 +155,8 @@ xfs_buf_stale( struct xfs_buftarg *btp = bp->b_target; spin_lock(&btp->bt_lru_lock); - if (!list_empty(&bp->b_lru)) { + if (!list_empty(&bp->b_lru) && + !(bp->b_lru_flags & _XBF_LRU_DISPOSE)) { list_del_init(&bp->b_lru); btp->bt_lru_nr--; atomic_dec(&bp->b_hold); @@ -164,14 +166,49 @@ xfs_buf_stale( ASSERT(atomic_read(&bp->b_hold) >= 1); } +static int +xfs_buf_get_maps( + struct xfs_buf *bp, + int map_count) +{ + ASSERT(bp->b_maps == NULL); + bp->b_map_count = map_count; + + if (map_count == 1) { + bp->b_maps = &bp->__b_map; + return 0; + } + + bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map), + KM_NOFS); + if (!bp->b_maps) + return ENOMEM; + return 0; +} + +/* + * Frees b_pages if it was allocated. + */ +static void +xfs_buf_free_maps( + struct xfs_buf *bp) +{ + if (bp->b_maps != &bp->__b_map) { + kmem_free(bp->b_maps); + bp->b_maps = NULL; + } +} + struct xfs_buf * -xfs_buf_alloc( +_xfs_buf_alloc( struct xfs_buftarg *target, - xfs_daddr_t blkno, - size_t numblks, + struct xfs_buf_map *map, + int nmaps, xfs_buf_flags_t flags) { struct xfs_buf *bp; + int error; + int i; bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS); if (unlikely(!bp)) @@ -192,16 +229,28 @@ xfs_buf_alloc( sema_init(&bp->b_sema, 0); /* held, no waiters */ XB_SET_OWNER(bp); bp->b_target = target; + bp->b_flags = flags; /* * Set length and io_length to the same value initially. * I/O routines should use io_length, which will be the same in * most cases but may be reset (e.g. XFS recovery). */ - bp->b_length = numblks; - bp->b_io_length = numblks; - bp->b_flags = flags; - bp->b_bn = blkno; + error = xfs_buf_get_maps(bp, nmaps); + if (error) { + kmem_zone_free(xfs_buf_zone, bp); + return NULL; + } + + bp->b_bn = map[0].bm_bn; + bp->b_length = 0; + for (i = 0; i < nmaps; i++) { + bp->b_maps[i].bm_bn = map[i].bm_bn; + bp->b_maps[i].bm_len = map[i].bm_len; + bp->b_length += map[i].bm_len; + } + bp->b_io_length = bp->b_length; + atomic_set(&bp->b_pin_count, 0); init_waitqueue_head(&bp->b_waiters); @@ -280,6 +329,7 @@ xfs_buf_free( } else if (bp->b_flags & _XBF_KMEM) kmem_free(bp->b_addr); _xfs_buf_free_pages(bp); + xfs_buf_free_maps(bp); kmem_zone_free(xfs_buf_zone, bp); } @@ -327,8 +377,9 @@ xfs_buf_allocate_memory( } use_alloc_page: - start = BBTOB(bp->b_bn) >> PAGE_SHIFT; - end = (BBTOB(bp->b_bn + bp->b_length) + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT; + end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1) + >> PAGE_SHIFT; page_count = end - start; error = _xfs_buf_get_pages(bp, page_count, flags); if (unlikely(error)) @@ -425,8 +476,8 @@ _xfs_buf_map_pages( xfs_buf_t * _xfs_buf_find( struct xfs_buftarg *btp, - xfs_daddr_t blkno, - size_t numblks, + struct xfs_buf_map *map, + int nmaps, xfs_buf_flags_t flags, xfs_buf_t *new_bp) { @@ -435,13 +486,36 @@ _xfs_buf_find( struct rb_node **rbp; struct rb_node *parent; xfs_buf_t *bp; + xfs_daddr_t blkno = map[0].bm_bn; + xfs_daddr_t eofs; + int numblks = 0; + int i; + for (i = 0; i < nmaps; i++) + numblks += map[i].bm_len; numbytes = BBTOB(numblks); /* Check for IOs smaller than the sector size / not sector aligned */ ASSERT(!(numbytes < (1 << btp->bt_sshift))); ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); + /* + * Corrupted block numbers can get through to here, unfortunately, so we + * have to check that the buffer falls within the filesystem bounds. + */ + eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); + if (blkno >= eofs) { + /* + * XXX (dgc): we should really be returning EFSCORRUPTED here, + * but none of the higher level infrastructure supports + * returning a specific error on buffer lookup failures. + */ + xfs_alert(btp->bt_mount, + "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", + __func__, blkno, eofs); + return NULL; + } + /* get tree root */ pag = xfs_perag_get(btp->bt_mount, xfs_daddr_to_agno(btp->bt_mount, blkno)); @@ -513,7 +587,9 @@ found: */ if (bp->b_flags & XBF_STALE) { ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); + ASSERT(bp->b_iodone == NULL); bp->b_flags &= _XBF_KMEM | _XBF_PAGES; + bp->b_ops = NULL; } trace_xfs_buf_find(bp, flags, _RET_IP_); @@ -527,31 +603,31 @@ found: * more hits than misses. */ struct xfs_buf * -xfs_buf_get( - xfs_buftarg_t *target, - xfs_daddr_t blkno, - size_t numblks, +xfs_buf_get_map( + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, xfs_buf_flags_t flags) { struct xfs_buf *bp; struct xfs_buf *new_bp; int error = 0; - bp = _xfs_buf_find(target, blkno, numblks, flags, NULL); + bp = _xfs_buf_find(target, map, nmaps, flags, NULL); if (likely(bp)) goto found; - new_bp = xfs_buf_alloc(target, blkno, numblks, flags); + new_bp = _xfs_buf_alloc(target, map, nmaps, flags); if (unlikely(!new_bp)) return NULL; error = xfs_buf_allocate_memory(new_bp, flags); if (error) { - kmem_zone_free(xfs_buf_zone, new_bp); + xfs_buf_free(new_bp); return NULL; } - bp = _xfs_buf_find(target, blkno, numblks, flags, new_bp); + bp = _xfs_buf_find(target, map, nmaps, flags, new_bp); if (!bp) { xfs_buf_free(new_bp); return NULL; @@ -560,8 +636,6 @@ xfs_buf_get( if (bp != new_bp) xfs_buf_free(new_bp); - bp->b_io_length = bp->b_length; - found: if (!bp->b_addr) { error = _xfs_buf_map_pages(bp, flags); @@ -584,7 +658,7 @@ _xfs_buf_read( xfs_buf_flags_t flags) { ASSERT(!(flags & XBF_WRITE)); - ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); + ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); @@ -596,22 +670,24 @@ _xfs_buf_read( } xfs_buf_t * -xfs_buf_read( - xfs_buftarg_t *target, - xfs_daddr_t blkno, - size_t numblks, - xfs_buf_flags_t flags) +xfs_buf_read_map( + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags, + const struct xfs_buf_ops *ops) { - xfs_buf_t *bp; + struct xfs_buf *bp; flags |= XBF_READ; - bp = xfs_buf_get(target, blkno, numblks, flags); + bp = xfs_buf_get_map(target, map, nmaps, flags); if (bp) { trace_xfs_buf_read(bp, flags, _RET_IP_); if (!XFS_BUF_ISDONE(bp)) { XFS_STATS_INC(xb_get_read); + bp->b_ops = ops; _xfs_buf_read(bp, flags); } else if (flags & XBF_ASYNC) { /* @@ -634,16 +710,17 @@ xfs_buf_read( * safe manner. */ void -xfs_buf_readahead( - xfs_buftarg_t *target, - xfs_daddr_t blkno, - size_t numblks) +xfs_buf_readahead_map( + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + const struct xfs_buf_ops *ops) { if (bdi_read_congested(target->bt_bdi)) return; - xfs_buf_read(target, blkno, numblks, - XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); + xfs_buf_read_map(target, map, nmaps, + XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops); } /* @@ -655,25 +732,24 @@ xfs_buf_read_uncached( struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, - int flags) + int flags, + const struct xfs_buf_ops *ops) { - xfs_buf_t *bp; - int error; + struct xfs_buf *bp; bp = xfs_buf_get_uncached(target, numblks, flags); if (!bp) return NULL; /* set up the buffer for a read IO */ - XFS_BUF_SET_ADDR(bp, daddr); - XFS_BUF_READ(bp); + ASSERT(bp->b_map_count == 1); + bp->b_bn = daddr; + bp->b_maps[0].bm_bn = daddr; + bp->b_flags |= XBF_READ; + bp->b_ops = ops; xfsbdstrat(target->bt_mount, bp); - error = xfs_buf_iowait(bp); - if (error) { - xfs_buf_relse(bp); - return NULL; - } + xfs_buf_iowait(bp); return bp; } @@ -694,7 +770,11 @@ xfs_buf_set_empty( bp->b_addr = NULL; bp->b_length = numblks; bp->b_io_length = numblks; + + ASSERT(bp->b_map_count == 1); bp->b_bn = XFS_BUF_DADDR_NULL; + bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL; + bp->b_maps[0].bm_len = bp->b_length; } static inline struct page * @@ -758,9 +838,10 @@ xfs_buf_get_uncached( { unsigned long page_count; int error, i; - xfs_buf_t *bp; + struct xfs_buf *bp; + DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); - bp = xfs_buf_alloc(target, XFS_BUF_DADDR_NULL, numblks, 0); + bp = _xfs_buf_alloc(target, &map, 1, 0); if (unlikely(bp == NULL)) goto fail; @@ -791,6 +872,7 @@ xfs_buf_get_uncached( __free_page(bp->b_pages[i]); _xfs_buf_free_pages(bp); fail_free_buf: + xfs_buf_free_maps(bp); kmem_zone_free(xfs_buf_zone, bp); fail: return NULL; @@ -937,27 +1019,37 @@ STATIC void xfs_buf_iodone_work( struct work_struct *work) { - xfs_buf_t *bp = + struct xfs_buf *bp = container_of(work, xfs_buf_t, b_iodone_work); + bool read = !!(bp->b_flags & XBF_READ); + + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); + if (read && bp->b_ops) + bp->b_ops->verify_read(bp); if (bp->b_iodone) (*(bp->b_iodone))(bp); else if (bp->b_flags & XBF_ASYNC) xfs_buf_relse(bp); + else { + ASSERT(read && bp->b_ops); + complete(&bp->b_iowait); + } } void xfs_buf_ioend( - xfs_buf_t *bp, - int schedule) + struct xfs_buf *bp, + int schedule) { + bool read = !!(bp->b_flags & XBF_READ); + trace_xfs_buf_iodone(bp, _RET_IP_); - bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); if (bp->b_error == 0) bp->b_flags |= XBF_DONE; - if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { + if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) { if (schedule) { INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); queue_work(xfslogd_workqueue, &bp->b_iodone_work); @@ -965,6 +1057,7 @@ xfs_buf_ioend( xfs_buf_iodone_work(&bp->b_iodone_work); } } else { + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); complete(&bp->b_iowait); } } @@ -1135,45 +1228,53 @@ xfs_buf_bio_end_io( { xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; - xfs_buf_ioerror(bp, -error); + /* + * don't overwrite existing errors - otherwise we can lose errors on + * buffers that require multiple bios to complete. + */ + if (!bp->b_error) + xfs_buf_ioerror(bp, -error); - if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) + if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); _xfs_buf_ioend(bp, 1); bio_put(bio); } -STATIC void -_xfs_buf_ioapply( - xfs_buf_t *bp) +static void +xfs_buf_ioapply_map( + struct xfs_buf *bp, + int map, + int *buf_offset, + int *count, + int rw) { - int rw, map_i, total_nr_pages, nr_pages; - struct bio *bio; - int offset = bp->b_offset; - int size = BBTOB(bp->b_io_length); - sector_t sector = bp->b_bn; + int page_index; + int total_nr_pages = bp->b_page_count; + int nr_pages; + struct bio *bio; + sector_t sector = bp->b_maps[map].bm_bn; + int size; + int offset; total_nr_pages = bp->b_page_count; - map_i = 0; - if (bp->b_flags & XBF_WRITE) { - if (bp->b_flags & XBF_SYNCIO) - rw = WRITE_SYNC; - else - rw = WRITE; - if (bp->b_flags & XBF_FUA) - rw |= REQ_FUA; - if (bp->b_flags & XBF_FLUSH) - rw |= REQ_FLUSH; - } else if (bp->b_flags & XBF_READ_AHEAD) { - rw = READA; - } else { - rw = READ; + /* skip the pages in the buffer before the start offset */ + page_index = 0; + offset = *buf_offset; + while (offset >= PAGE_SIZE) { + page_index++; + offset -= PAGE_SIZE; } - /* we only use the buffer cache for meta-data */ - rw |= REQ_META; + /* + * Limit the IO size to the length of the current vector, and update the + * remaining IO count for the next time around. + */ + size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count); + *count -= size; + *buf_offset += size; next_chunk: atomic_inc(&bp->b_io_remaining); @@ -1188,13 +1289,14 @@ next_chunk: bio->bi_private = bp; - for (; size && nr_pages; nr_pages--, map_i++) { + for (; size && nr_pages; nr_pages--, page_index++) { int rbytes, nbytes = PAGE_SIZE - offset; if (nbytes > size) nbytes = size; - rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset); + rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes, + offset); if (rbytes < nbytes) break; @@ -1213,9 +1315,76 @@ next_chunk: if (size) goto next_chunk; } else { + /* + * This is guaranteed not to be the last io reference count + * because the caller (xfs_buf_iorequest) holds a count itself. + */ + atomic_dec(&bp->b_io_remaining); xfs_buf_ioerror(bp, EIO); bio_put(bio); } + +} + +STATIC void +_xfs_buf_ioapply( + struct xfs_buf *bp) +{ + struct blk_plug plug; + int rw; + int offset; + int size; + int i; + + if (bp->b_flags & XBF_WRITE) { + if (bp->b_flags & XBF_SYNCIO) + rw = WRITE_SYNC; + else + rw = WRITE; + if (bp->b_flags & XBF_FUA) + rw |= REQ_FUA; + if (bp->b_flags & XBF_FLUSH) + rw |= REQ_FLUSH; + + /* + * Run the write verifier callback function if it exists. If + * this function fails it will mark the buffer with an error and + * the IO should not be dispatched. + */ + if (bp->b_ops) { + bp->b_ops->verify_write(bp); + if (bp->b_error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_CORRUPT_INCORE); + return; + } + } + } else if (bp->b_flags & XBF_READ_AHEAD) { + rw = READA; + } else { + rw = READ; + } + + /* we only use the buffer cache for meta-data */ + rw |= REQ_META; + + /* + * Walk all the vectors issuing IO on them. Set up the initial offset + * into the buffer and the desired IO size before we start - + * _xfs_buf_ioapply_vec() will modify them appropriately for each + * subsequent call. + */ + offset = bp->b_offset; + size = BBTOB(bp->b_io_length); + blk_start_plug(&plug); + for (i = 0; i < bp->b_map_count; i++) { + xfs_buf_ioapply_map(bp, i, &offset, &size, rw); + if (bp->b_error) + break; + if (size <= 0) + break; /* all done */ + } + blk_finish_plug(&plug); } void @@ -1336,6 +1505,8 @@ restart: while (!list_empty(&btp->bt_lru)) { bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); if (atomic_read(&bp->b_hold) > 1) { + trace_xfs_buf_wait_buftarg(bp, _RET_IP_); + list_move_tail(&bp->b_lru, &btp->bt_lru); spin_unlock(&btp->bt_lru_lock); delay(100); goto restart; @@ -1389,6 +1560,7 @@ xfs_buftarg_shrink( */ list_move(&bp->b_lru, &dispose); btp->bt_lru_nr--; + bp->b_lru_flags |= _XBF_LRU_DISPOSE; } spin_unlock(&btp->bt_lru_lock); @@ -1557,7 +1729,7 @@ xfs_buf_cmp( struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); xfs_daddr_t diff; - diff = ap->b_bn - bp->b_bn; + diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn; if (diff < 0) return -1; if (diff > 0) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 79344c48008..433a12ed7b1 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -38,26 +38,28 @@ typedef enum { XBRW_ZERO = 3, /* Zero target memory */ } xfs_buf_rw_t; -#define XBF_READ (1 << 0) /* buffer intended for reading from device */ -#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ -#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ -#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ -#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ -#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ +#define XBF_READ (1 << 0) /* buffer intended for reading from device */ +#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ +#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ +#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ +#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ +#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ /* I/O hints for the BIO layer */ -#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ -#define XBF_FUA (1 << 11)/* force cache write through mode */ -#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */ +#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ +#define XBF_FUA (1 << 11)/* force cache write through mode */ +#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */ /* flags used only as arguments to access routines */ -#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ -#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */ +#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ +#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */ /* flags used only internally */ -#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ -#define _XBF_KMEM (1 << 21)/* backed by heap memory */ -#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ +#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ +#define _XBF_KMEM (1 << 21)/* backed by heap memory */ +#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ +#define _XBF_COMPOUND (1 << 23)/* compound buffer */ +#define _XBF_LRU_DISPOSE (1 << 24)/* buffer being discarded */ typedef unsigned int xfs_buf_flags_t; @@ -71,11 +73,13 @@ typedef unsigned int xfs_buf_flags_t; { XBF_SYNCIO, "SYNCIO" }, \ { XBF_FUA, "FUA" }, \ { XBF_FLUSH, "FLUSH" }, \ - { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\ + { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\ { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\ { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ - { _XBF_DELWRI_Q, "DELWRI_Q" } + { _XBF_DELWRI_Q, "DELWRI_Q" }, \ + { _XBF_COMPOUND, "COMPOUND" }, \ + { _XBF_LRU_DISPOSE, "LRU_DISPOSE" } typedef struct xfs_buftarg { dev_t bt_dev; @@ -96,8 +100,22 @@ typedef struct xfs_buftarg { struct xfs_buf; typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); + #define XB_PAGES 2 +struct xfs_buf_map { + xfs_daddr_t bm_bn; /* block number for I/O */ + int bm_len; /* size of I/O */ +}; + +#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \ + struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; + +struct xfs_buf_ops { + void (*verify_read)(struct xfs_buf *); + void (*verify_write)(struct xfs_buf *); +}; + typedef struct xfs_buf { /* * first cacheline holds all the fields needed for an uncontended cache @@ -107,14 +125,19 @@ typedef struct xfs_buf { * fast-path on locking. */ struct rb_node b_rbnode; /* rbtree node */ - xfs_daddr_t b_bn; /* block number for I/O */ + xfs_daddr_t b_bn; /* block number of buffer */ int b_length; /* size of buffer in BBs */ atomic_t b_hold; /* reference count */ atomic_t b_lru_ref; /* lru reclaim ref count */ xfs_buf_flags_t b_flags; /* status flags */ struct semaphore b_sema; /* semaphore for lockables */ + /* + * concurrent access to b_lru and b_lru_flags are protected by + * bt_lru_lock and not by b_sema + */ struct list_head b_lru; /* lru list */ + xfs_buf_flags_t b_lru_flags; /* internal lru status flags */ wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ @@ -127,42 +150,107 @@ typedef struct xfs_buf { struct xfs_trans *b_transp; struct page **b_pages; /* array of page pointers */ struct page *b_page_array[XB_PAGES]; /* inline pages */ + struct xfs_buf_map *b_maps; /* compound buffer map */ + struct xfs_buf_map __b_map; /* inline compound buffer map */ + int b_map_count; int b_io_length; /* IO size in BBs */ atomic_t b_pin_count; /* pin count */ atomic_t b_io_remaining; /* #outstanding I/O requests */ unsigned int b_page_count; /* size of page array */ unsigned int b_offset; /* page offset in first page */ unsigned short b_error; /* error code on I/O */ + const struct xfs_buf_ops *b_ops; + #ifdef XFS_BUF_LOCK_TRACKING int b_last_holder; #endif } xfs_buf_t; - /* Finding and Reading Buffers */ -struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks, xfs_buf_flags_t flags, - struct xfs_buf *new_bp); -#define xfs_incore(buftarg,blkno,len,lockit) \ - _xfs_buf_find(buftarg, blkno ,len, lockit, NULL) - -struct xfs_buf *xfs_buf_get(struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks, xfs_buf_flags_t flags); -struct xfs_buf *xfs_buf_read(struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks, xfs_buf_flags_t flags); -void xfs_buf_readahead(struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks); +struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags, struct xfs_buf *new_bp); + +static inline struct xfs_buf * +xfs_incore( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return _xfs_buf_find(target, &map, 1, flags, NULL); +} + +struct xfs_buf *_xfs_buf_alloc(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags); + +static inline struct xfs_buf * +xfs_buf_alloc( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return _xfs_buf_alloc(target, &map, 1, flags); +} + +struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags); +struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags, + const struct xfs_buf_ops *ops); +void xfs_buf_readahead_map(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + const struct xfs_buf_ops *ops); + +static inline struct xfs_buf * +xfs_buf_get( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_buf_get_map(target, &map, 1, flags); +} + +static inline struct xfs_buf * +xfs_buf_read( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags, + const struct xfs_buf_ops *ops) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_buf_read_map(target, &map, 1, flags, ops); +} + +static inline void +xfs_buf_readahead( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + const struct xfs_buf_ops *ops) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_buf_readahead_map(target, &map, 1, ops); +} struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); -struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks, xfs_buf_flags_t flags); void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks); int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length); struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags); struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, - xfs_daddr_t daddr, size_t numblks, int flags); + xfs_daddr_t daddr, size_t numblks, int flags, + const struct xfs_buf_ops *ops); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ @@ -232,8 +320,18 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) #define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) -#define XFS_BUF_ADDR(bp) ((bp)->b_bn) -#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) +/* + * These macros use the IO block map rather than b_bn. b_bn is now really + * just for the buffer cache index for cached buffers. As IO does not use b_bn + * anymore, uncached buffers do not use b_bn at all and hence must modify the IO + * map directly. Uncached buffers are not allowed to be discontiguous, so this + * is safe to do. + * + * In future, uncached buffers will pass the block number directly to the io + * request function and hence these macros will go away at that point. + */ +#define XFS_BUF_ADDR(bp) ((bp)->b_maps[0].bm_bn) +#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno)) static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) { diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index d9e451115f9..3f9949fee39 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -71,7 +71,7 @@ xfs_buf_item_log_debug( chunk_num = byte >> XFS_BLF_SHIFT; word_num = chunk_num >> BIT_TO_WORD_SHIFT; bit_num = chunk_num & (NBWORD - 1); - wordp = &(bip->bli_format.blf_data_map[word_num]); + wordp = &(bip->__bli_format.blf_data_map[word_num]); bit_set = *wordp & (1 << bit_num); ASSERT(bit_set); byte++; @@ -153,33 +153,25 @@ STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); * If the XFS_BLI_STALE flag has been set, then log nothing. */ STATIC uint -xfs_buf_item_size( - struct xfs_log_item *lip) +xfs_buf_item_size_segment( + struct xfs_buf_log_item *bip, + struct xfs_buf_log_format *blfp) { - struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; uint nvecs; int next_bit; int last_bit; - ASSERT(atomic_read(&bip->bli_refcount) > 0); - if (bip->bli_flags & XFS_BLI_STALE) { - /* - * The buffer is stale, so all we need to log - * is the buf log format structure with the - * cancel flag in it. - */ - trace_xfs_buf_item_size_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); - return 1; - } + last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); + if (last_bit == -1) + return 0; + + /* + * initial count for a dirty buffer is 2 vectors - the format structure + * and the first dirty region. + */ + nvecs = 2; - ASSERT(bip->bli_flags & XFS_BLI_LOGGED); - nvecs = 1; - last_bit = xfs_next_bit(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size, 0); - ASSERT(last_bit != -1); - nvecs++; while (last_bit != -1) { /* * This takes the bit number to start looking from and @@ -187,16 +179,15 @@ xfs_buf_item_size( * if there are no more bits set or the start bit is * beyond the end of the bitmap. */ - next_bit = xfs_next_bit(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size, - last_bit + 1); + next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, + last_bit + 1); /* * If we run out of bits, leave the loop, * else if we find a new set of bits bump the number of vecs, * else keep scanning the current set of bits. */ if (next_bit == -1) { - last_bit = -1; + break; } else if (next_bit != last_bit + 1) { last_bit = next_bit; nvecs++; @@ -210,22 +201,73 @@ xfs_buf_item_size( } } - trace_xfs_buf_item_size(bip); return nvecs; } /* - * This is called to fill in the vector of log iovecs for the - * given log buf item. It fills the first entry with a buf log - * format structure, and the rest point to contiguous chunks - * within the buffer. + * This returns the number of log iovecs needed to log the given buf log item. + * + * It calculates this as 1 iovec for the buf log format structure and 1 for each + * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged + * in a single iovec. + * + * Discontiguous buffers need a format structure per region that that is being + * logged. This makes the changes in the buffer appear to log recovery as though + * they came from separate buffers, just like would occur if multiple buffers + * were used instead of a single discontiguous buffer. This enables + * discontiguous buffers to be in-memory constructs, completely transparent to + * what ends up on disk. + * + * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log + * format structures. */ -STATIC void -xfs_buf_item_format( - struct xfs_log_item *lip, - struct xfs_log_iovec *vecp) +STATIC uint +xfs_buf_item_size( + struct xfs_log_item *lip) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); + uint nvecs; + int i; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + if (bip->bli_flags & XFS_BLI_STALE) { + /* + * The buffer is stale, so all we need to log + * is the buf log format structure with the + * cancel flag in it. + */ + trace_xfs_buf_item_size_stale(bip); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); + return bip->bli_format_count; + } + + ASSERT(bip->bli_flags & XFS_BLI_LOGGED); + + /* + * the vector count is based on the number of buffer vectors we have + * dirty bits in. This will only be greater than one when we have a + * compound buffer with more than one segment dirty. Hence for compound + * buffers we need to track which segment the dirty bits correspond to, + * and when we move from one segment to the next increment the vector + * count for the extra buf log format structure that will need to be + * written. + */ + nvecs = 0; + for (i = 0; i < bip->bli_format_count; i++) { + nvecs += xfs_buf_item_size_segment(bip, &bip->bli_formats[i]); + } + + trace_xfs_buf_item_size(bip); + return nvecs; +} + +static struct xfs_log_iovec * +xfs_buf_item_format_segment( + struct xfs_buf_log_item *bip, + struct xfs_log_iovec *vecp, + uint offset, + struct xfs_buf_log_format *blfp) +{ struct xfs_buf *bp = bip->bli_buf; uint base_size; uint nvecs; @@ -235,40 +277,33 @@ xfs_buf_item_format( uint nbits; uint buffer_offset; - ASSERT(atomic_read(&bip->bli_refcount) > 0); - ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || - (bip->bli_flags & XFS_BLI_STALE)); + /* copy the flags across from the base format item */ + blfp->blf_flags = bip->__bli_format.blf_flags; /* - * The size of the base structure is the size of the - * declared structure plus the space for the extra words - * of the bitmap. We subtract one from the map size, because - * the first element of the bitmap is accounted for in the - * size of the base structure. + * Base size is the actual size of the ondisk structure - it reflects + * the actual size of the dirty bitmap rather than the size of the in + * memory structure. */ - base_size = - (uint)(sizeof(xfs_buf_log_format_t) + - ((bip->bli_format.blf_map_size - 1) * sizeof(uint))); - vecp->i_addr = &bip->bli_format; + base_size = offsetof(struct xfs_buf_log_format, blf_data_map) + + (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); + + nvecs = 0; + first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); + if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) { + /* + * If the map is not be dirty in the transaction, mark + * the size as zero and do not advance the vector pointer. + */ + goto out; + } + + vecp->i_addr = blfp; vecp->i_len = base_size; vecp->i_type = XLOG_REG_TYPE_BFORMAT; vecp++; nvecs = 1; - /* - * If it is an inode buffer, transfer the in-memory state to the - * format flags and clear the in-memory state. We do not transfer - * this state if the inode buffer allocation has not yet been committed - * to the log as setting the XFS_BLI_INODE_BUF flag will prevent - * correct replay of the inode allocation. - */ - if (bip->bli_flags & XFS_BLI_INODE_BUF) { - if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && - xfs_log_item_in_current_chkpt(lip))) - bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; - bip->bli_flags &= ~XFS_BLI_INODE_BUF; - } - if (bip->bli_flags & XFS_BLI_STALE) { /* * The buffer is stale, so all we need to log @@ -276,17 +311,14 @@ xfs_buf_item_format( * cancel flag in it. */ trace_xfs_buf_item_format_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); - bip->bli_format.blf_size = nvecs; - return; + ASSERT(blfp->blf_flags & XFS_BLF_CANCEL); + goto out; } /* * Fill in an iovec for each set of contiguous chunks. */ - first_bit = xfs_next_bit(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size, 0); - ASSERT(first_bit != -1); + last_bit = first_bit; nbits = 1; for (;;) { @@ -296,9 +328,8 @@ xfs_buf_item_format( * if there are no more bits set or the start bit is * beyond the end of the bitmap. */ - next_bit = xfs_next_bit(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size, - (uint)last_bit + 1); + next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, + (uint)last_bit + 1); /* * If we run out of bits fill in the last iovec and get * out of the loop. @@ -309,14 +340,14 @@ xfs_buf_item_format( * keep counting and scanning. */ if (next_bit == -1) { - buffer_offset = first_bit * XFS_BLF_CHUNK; + buffer_offset = offset + first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; nvecs++; break; } else if (next_bit != last_bit + 1) { - buffer_offset = first_bit * XFS_BLF_CHUNK; + buffer_offset = offset + first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; @@ -325,14 +356,17 @@ xfs_buf_item_format( first_bit = next_bit; last_bit = next_bit; nbits = 1; - } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) != - (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) + + } else if (xfs_buf_offset(bp, offset + + (next_bit << XFS_BLF_SHIFT)) != + (xfs_buf_offset(bp, offset + + (last_bit << XFS_BLF_SHIFT)) + XFS_BLF_CHUNK)) { - buffer_offset = first_bit * XFS_BLF_CHUNK; + buffer_offset = offset + first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; -/* You would think we need to bump the nvecs here too, but we do not +/* + * You would think we need to bump the nvecs here too, but we do not * this number is used by recovery, and it gets confused by the boundary * split here * nvecs++; @@ -346,7 +380,50 @@ xfs_buf_item_format( nbits++; } } - bip->bli_format.blf_size = nvecs; +out: + blfp->blf_size = nvecs; + return vecp; +} + +/* + * This is called to fill in the vector of log iovecs for the + * given log buf item. It fills the first entry with a buf log + * format structure, and the rest point to contiguous chunks + * within the buffer. + */ +STATIC void +xfs_buf_item_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *vecp) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + uint offset = 0; + int i; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || + (bip->bli_flags & XFS_BLI_STALE)); + + /* + * If it is an inode buffer, transfer the in-memory state to the + * format flags and clear the in-memory state. We do not transfer + * this state if the inode buffer allocation has not yet been committed + * to the log as setting the XFS_BLI_INODE_BUF flag will prevent + * correct replay of the inode allocation. + */ + if (bip->bli_flags & XFS_BLI_INODE_BUF) { + if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && + xfs_log_item_in_current_chkpt(lip))) + bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; + bip->bli_flags &= ~XFS_BLI_INODE_BUF; + } + + for (i = 0; i < bip->bli_format_count; i++) { + vecp = xfs_buf_item_format_segment(bip, vecp, offset, + &bip->bli_formats[i]); + offset += bp->b_maps[i].bm_len; + } /* * Check to make sure everything is consistent. @@ -418,7 +495,7 @@ xfs_buf_item_unpin( ASSERT(bip->bli_flags & XFS_BLI_STALE); ASSERT(xfs_buf_islocked(bp)); ASSERT(XFS_BUF_ISSTALE(bp)); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); trace_xfs_buf_item_unpin_stale(bip); @@ -459,7 +536,25 @@ xfs_buf_item_unpin( } xfs_buf_relse(bp); } else if (freed && remove) { + /* + * There are currently two references to the buffer - the active + * LRU reference and the buf log item. What we are about to do + * here - simulate a failed IO completion - requires 3 + * references. + * + * The LRU reference is removed by the xfs_buf_stale() call. The + * buf item reference is removed by the xfs_buf_iodone() + * callback that is run by xfs_buf_do_callbacks() during ioend + * processing (via the bp->b_iodone callback), and then finally + * the ioend processing will drop the IO reference if the buffer + * is marked XBF_ASYNC. + * + * Hence we need to take an additional reference here so that IO + * completion processing doesn't free the buffer prematurely. + */ xfs_buf_lock(bp); + xfs_buf_hold(bp); + bp->b_flags |= XBF_ASYNC; xfs_buf_ioerror(bp, EIO); XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); @@ -516,7 +611,7 @@ xfs_buf_item_unlock( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - int aborted; + int aborted, clean, i; uint hold; /* Clear the buffer's association with this transaction. */ @@ -546,7 +641,7 @@ xfs_buf_item_unlock( */ if (bip->bli_flags & XFS_BLI_STALE) { trace_xfs_buf_item_unlock_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); if (!aborted) { atomic_dec(&bip->bli_refcount); return; @@ -557,12 +652,27 @@ xfs_buf_item_unlock( /* * If the buf item isn't tracking any data, free it, otherwise drop the - * reference we hold to it. + * reference we hold to it. If we are aborting the transaction, this may + * be the only reference to the buf item, so we free it anyway + * regardless of whether it is dirty or not. A dirty abort implies a + * shutdown, anyway. */ - if (xfs_bitmap_empty(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size)) + clean = 1; + for (i = 0; i < bip->bli_format_count; i++) { + if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, + bip->bli_formats[i].blf_map_size)) { + clean = 0; + break; + } + } + if (clean) xfs_buf_item_relse(bp); - else + else if (aborted) { + if (atomic_dec_and_test(&bip->bli_refcount)) { + ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); + xfs_buf_item_relse(bp); + } + } else atomic_dec(&bip->bli_refcount); if (!hold) @@ -622,6 +732,35 @@ static const struct xfs_item_ops xfs_buf_item_ops = { .iop_committing = xfs_buf_item_committing }; +STATIC int +xfs_buf_item_get_format( + struct xfs_buf_log_item *bip, + int count) +{ + ASSERT(bip->bli_formats == NULL); + bip->bli_format_count = count; + + if (count == 1) { + bip->bli_formats = &bip->__bli_format; + return 0; + } + + bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), + KM_SLEEP); + if (!bip->bli_formats) + return ENOMEM; + return 0; +} + +STATIC void +xfs_buf_item_free_format( + struct xfs_buf_log_item *bip) +{ + if (bip->bli_formats != &bip->__bli_format) { + kmem_free(bip->bli_formats); + bip->bli_formats = NULL; + } +} /* * Allocate a new buf log item to go with the given buffer. @@ -639,6 +778,8 @@ xfs_buf_item_init( xfs_buf_log_item_t *bip; int chunks; int map_size; + int error; + int i; /* * Check to see if there is already a buf log item for @@ -650,25 +791,33 @@ xfs_buf_item_init( if (lip != NULL && lip->li_type == XFS_LI_BUF) return; - /* - * chunks is the number of XFS_BLF_CHUNK size pieces - * the buffer can be divided into. Make sure not to - * truncate any pieces. map_size is the size of the - * bitmap needed to describe the chunks of the buffer. - */ - chunks = (int)((BBTOB(bp->b_length) + (XFS_BLF_CHUNK - 1)) >> - XFS_BLF_SHIFT); - map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); - - bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, - KM_SLEEP); + bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); bip->bli_buf = bp; xfs_buf_hold(bp); - bip->bli_format.blf_type = XFS_LI_BUF; - bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); - bip->bli_format.blf_len = (ushort)bp->b_length; - bip->bli_format.blf_map_size = map_size; + + /* + * chunks is the number of XFS_BLF_CHUNK size pieces the buffer + * can be divided into. Make sure not to truncate any pieces. + * map_size is the size of the bitmap needed to describe the + * chunks of the buffer. + * + * Discontiguous buffer support follows the layout of the underlying + * buffer. This makes the implementation as simple as possible. + */ + error = xfs_buf_item_get_format(bip, bp->b_map_count); + ASSERT(error == 0); + + for (i = 0; i < bip->bli_format_count; i++) { + chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len), + XFS_BLF_CHUNK); + map_size = DIV_ROUND_UP(chunks, NBWORD); + + bip->bli_formats[i].blf_type = XFS_LI_BUF; + bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn; + bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len; + bip->bli_formats[i].blf_map_size = map_size; + } #ifdef XFS_TRANS_DEBUG /* @@ -699,10 +848,11 @@ xfs_buf_item_init( * item's bitmap. */ void -xfs_buf_item_log( - xfs_buf_log_item_t *bip, +xfs_buf_item_log_segment( + struct xfs_buf_log_item *bip, uint first, - uint last) + uint last, + uint *map) { uint first_bit; uint last_bit; @@ -715,12 +865,6 @@ xfs_buf_item_log( uint mask; /* - * Mark the item as having some dirty data for - * quick reference in xfs_buf_item_dirty. - */ - bip->bli_flags |= XFS_BLI_DIRTY; - - /* * Convert byte offsets to bit numbers. */ first_bit = first >> XFS_BLF_SHIFT; @@ -736,7 +880,7 @@ xfs_buf_item_log( * to set a bit in. */ word_num = first_bit >> BIT_TO_WORD_SHIFT; - wordp = &(bip->bli_format.blf_data_map[word_num]); + wordp = &map[word_num]; /* * Calculate the starting bit in the first word. @@ -783,6 +927,51 @@ xfs_buf_item_log( xfs_buf_item_log_debug(bip, first, last); } +/* + * Mark bytes first through last inclusive as dirty in the buf + * item's bitmap. + */ +void +xfs_buf_item_log( + xfs_buf_log_item_t *bip, + uint first, + uint last) +{ + int i; + uint start; + uint end; + struct xfs_buf *bp = bip->bli_buf; + + /* + * Mark the item as having some dirty data for + * quick reference in xfs_buf_item_dirty. + */ + bip->bli_flags |= XFS_BLI_DIRTY; + + /* + * walk each buffer segment and mark them dirty appropriately. + */ + start = 0; + for (i = 0; i < bip->bli_format_count; i++) { + if (start > last) + break; + end = start + BBTOB(bp->b_maps[i].bm_len); + if (first > end) { + start += BBTOB(bp->b_maps[i].bm_len); + continue; + } + if (first < start) + first = start; + if (end > last) + end = last; + + xfs_buf_item_log_segment(bip, first, end, + &bip->bli_formats[i].blf_data_map[0]); + + start += bp->b_maps[i].bm_len; + } +} + /* * Return 1 if the buffer has some data that has been logged (at any @@ -804,6 +993,7 @@ xfs_buf_item_free( kmem_free(bip->bli_logged); #endif /* XFS_TRANS_DEBUG */ + xfs_buf_item_free_format(bip); kmem_zone_free(xfs_buf_item_zone, bip); } diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index b6ecd2061e7..16def435944 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -21,23 +21,6 @@ extern kmem_zone_t *xfs_buf_item_zone; /* - * This is the structure used to lay out a buf log item in the - * log. The data map describes which 128 byte chunks of the buffer - * have been logged. - * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything. - */ -typedef struct xfs_buf_log_format { - unsigned short blf_type; /* buf log item type indicator */ - unsigned short blf_size; /* size of this item */ - ushort blf_flags; /* misc state */ - ushort blf_len; /* number of blocks in this buf */ - __int64_t blf_blkno; /* starting blkno of this buf */ - unsigned int blf_map_size; /* size of data bitmap in words */ - unsigned int blf_data_map[1];/* variable size bitmap of */ - /* regions of buffer in this item */ -} xfs_buf_log_format_t; - -/* * This flag indicates that the buffer contains on disk inodes * and requires special recovery handling. */ @@ -61,6 +44,23 @@ typedef struct xfs_buf_log_format { #define NBWORD (NBBY * sizeof(unsigned int)) /* + * This is the structure used to lay out a buf log item in the + * log. The data map describes which 128 byte chunks of the buffer + * have been logged. + */ +#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) + +typedef struct xfs_buf_log_format { + unsigned short blf_type; /* buf log item type indicator */ + unsigned short blf_size; /* size of this item */ + ushort blf_flags; /* misc state */ + ushort blf_len; /* number of blocks in this buf */ + __int64_t blf_blkno; /* starting blkno of this buf */ + unsigned int blf_map_size; /* used size of data bitmap in words */ + unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ +} xfs_buf_log_format_t; + +/* * buf log item flags */ #define XFS_BLI_HOLD 0x01 @@ -102,7 +102,9 @@ typedef struct xfs_buf_log_item { char *bli_orig; /* original buffer copy */ char *bli_logged; /* bytes logged (bitmap) */ #endif - xfs_buf_log_format_t bli_format; /* in-log header */ + int bli_format_count; /* count of headers */ + struct xfs_buf_log_format *bli_formats; /* array of in-log header ptrs */ + struct xfs_buf_log_format __bli_format; /* embedded in-log header */ } xfs_buf_log_item_t; void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); diff --git a/fs/xfs/xfs_cksum.h b/fs/xfs/xfs_cksum.h new file mode 100644 index 00000000000..fad1676ad8c --- /dev/null +++ b/fs/xfs/xfs_cksum.h @@ -0,0 +1,63 @@ +#ifndef _XFS_CKSUM_H +#define _XFS_CKSUM_H 1 + +#define XFS_CRC_SEED (~(__uint32_t)0) + +/* + * Calculate the intermediate checksum for a buffer that has the CRC field + * inside it. The offset of the 32bit crc fields is passed as the + * cksum_offset parameter. + */ +static inline __uint32_t +xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t zero = 0; + __uint32_t crc; + + /* Calculate CRC up to the checksum. */ + crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset); + + /* Skip checksum field */ + crc = crc32c(crc, &zero, sizeof(__u32)); + + /* Calculate the rest of the CRC. */ + return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)], + length - (cksum_offset + sizeof(__be32))); +} + +/* + * Convert the intermediate checksum to the final ondisk format. + * + * The CRC32c calculation uses LE format even on BE machines, but returns the + * result in host endian format. Hence we need to byte swap it back to LE format + * so that it is consistent on disk. + */ +static inline __le32 +xfs_end_cksum(__uint32_t crc) +{ + return ~cpu_to_le32(crc); +} + +/* + * Helper to generate the checksum for a buffer. + */ +static inline void +xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + + *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc); +} + +/* + * Helper to verify the checksum for a buffer. + */ +static inline int +xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + + return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc); +} + +#endif /* _XFS_CKSUM_H */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 015b946c580..4d7696a0241 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -83,14 +83,92 @@ STATIC void xfs_da_node_unbalance(xfs_da_state_t *state, /* * Utility routines. */ -STATIC uint xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count); -STATIC int xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp); -STATIC xfs_dabuf_t *xfs_da_buf_make(int nbuf, xfs_buf_t **bps); +STATIC uint xfs_da_node_lasthash(struct xfs_buf *bp, int *count); +STATIC int xfs_da_node_order(struct xfs_buf *node1_bp, + struct xfs_buf *node2_bp); STATIC int xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, xfs_da_state_blk_t *save_blk); STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); +static void +xfs_da_node_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_da_node_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC); + block_ok = block_ok && + be16_to_cpu(hdr->level) > 0 && + be16_to_cpu(hdr->count) > 0 ; + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + +} + +static void +xfs_da_node_write_verify( + struct xfs_buf *bp) +{ + xfs_da_node_verify(bp); +} + +/* + * leaf/node format detection on trees is sketchy, so a node read can be done on + * leaf level blocks when detection identifies the tree as a node format tree + * incorrectly. In this case, we need to swap the verifier to match the correct + * format of the block being read. + */ +static void +xfs_da_node_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_da_blkinfo *info = bp->b_addr; + + switch (be16_to_cpu(info->magic)) { + case XFS_DA_NODE_MAGIC: + xfs_da_node_verify(bp); + break; + case XFS_ATTR_LEAF_MAGIC: + bp->b_ops = &xfs_attr_leaf_buf_ops; + bp->b_ops->verify_read(bp); + return; + case XFS_DIR2_LEAFN_MAGIC: + bp->b_ops = &xfs_dir2_leafn_buf_ops; + bp->b_ops->verify_read(bp); + return; + default: + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + mp, info); + xfs_buf_ioerror(bp, EFSCORRUPTED); + break; + } +} + +const struct xfs_buf_ops xfs_da_node_buf_ops = { + .verify_read = xfs_da_node_read_verify, + .verify_write = xfs_da_node_write_verify, +}; + + +int +xfs_da_node_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp, + int which_fork) +{ + return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + which_fork, &xfs_da_node_buf_ops); +} + /*======================================================================== * Routines used for growing the Btree. *========================================================================*/ @@ -100,10 +178,10 @@ STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); */ int xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, - xfs_dabuf_t **bpp, int whichfork) + struct xfs_buf **bpp, int whichfork) { xfs_da_intnode_t *node; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; xfs_trans_t *tp; @@ -114,7 +192,7 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, if (error) return(error); ASSERT(bp != NULL); - node = bp->data; + node = bp->b_addr; node->hdr.info.forw = 0; node->hdr.info.back = 0; node->hdr.info.magic = cpu_to_be16(XFS_DA_NODE_MAGIC); @@ -122,9 +200,10 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, node->hdr.count = 0; node->hdr.level = cpu_to_be16(level); - xfs_da_log_buf(tp, bp, + xfs_trans_log_buf(tp, bp, XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + bp->b_ops = &xfs_da_node_buf_ops; *bpp = bp; return(0); } @@ -138,7 +217,7 @@ xfs_da_split(xfs_da_state_t *state) { xfs_da_state_blk_t *oldblk, *newblk, *addblk; xfs_da_intnode_t *node; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int max, action, error, i; trace_xfs_da_split(state->args); @@ -203,7 +282,6 @@ xfs_da_split(xfs_da_state_t *state) case XFS_DA_NODE_MAGIC: error = xfs_da_node_split(state, oldblk, newblk, addblk, max - i, &action); - xfs_da_buf_done(addblk->bp); addblk->bp = NULL; if (error) return(error); /* GROT: dir is inconsistent */ @@ -221,13 +299,6 @@ xfs_da_split(xfs_da_state_t *state) * Update the btree to show the new hashval for this child. */ xfs_da_fixhashpath(state, &state->path); - /* - * If we won't need this block again, it's getting dropped - * from the active path by the loop control, so we need - * to mark it done now. - */ - if (i > 0 || !addblk) - xfs_da_buf_done(oldblk->bp); } if (!addblk) return(0); @@ -239,8 +310,6 @@ xfs_da_split(xfs_da_state_t *state) oldblk = &state->path.blk[0]; error = xfs_da_root_split(state, oldblk, addblk); if (error) { - xfs_da_buf_done(oldblk->bp); - xfs_da_buf_done(addblk->bp); addblk->bp = NULL; return(error); /* GROT: dir is inconsistent */ } @@ -252,7 +321,7 @@ xfs_da_split(xfs_da_state_t *state) * and the original block 0 could be at any position in the list. */ - node = oldblk->bp->data; + node = oldblk->bp->b_addr; if (node->hdr.info.forw) { if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) { bp = addblk->bp; @@ -260,13 +329,13 @@ xfs_da_split(xfs_da_state_t *state) ASSERT(state->extravalid); bp = state->extrablk.bp; } - node = bp->data; + node = bp->b_addr; node->hdr.info.back = cpu_to_be32(oldblk->blkno); - xfs_da_log_buf(state->args->trans, bp, + xfs_trans_log_buf(state->args->trans, bp, XFS_DA_LOGRANGE(node, &node->hdr.info, sizeof(node->hdr.info))); } - node = oldblk->bp->data; + node = oldblk->bp->b_addr; if (node->hdr.info.back) { if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) { bp = addblk->bp; @@ -274,14 +343,12 @@ xfs_da_split(xfs_da_state_t *state) ASSERT(state->extravalid); bp = state->extrablk.bp; } - node = bp->data; + node = bp->b_addr; node->hdr.info.forw = cpu_to_be32(oldblk->blkno); - xfs_da_log_buf(state->args->trans, bp, + xfs_trans_log_buf(state->args->trans, bp, XFS_DA_LOGRANGE(node, &node->hdr.info, sizeof(node->hdr.info))); } - xfs_da_buf_done(oldblk->bp); - xfs_da_buf_done(addblk->bp); addblk->bp = NULL; return(0); } @@ -298,7 +365,7 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_intnode_t *node, *oldroot; xfs_da_args_t *args; xfs_dablk_t blkno; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error, size; xfs_inode_t *dp; xfs_trans_t *tp; @@ -323,8 +390,8 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, if (error) return(error); ASSERT(bp != NULL); - node = bp->data; - oldroot = blk1->bp->data; + node = bp->b_addr; + oldroot = blk1->bp->b_addr; if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { size = (int)((char *)&oldroot->btree[be16_to_cpu(oldroot->hdr.count)] - (char *)oldroot); @@ -335,8 +402,9 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, (char *)leaf); } memcpy(node, oldroot, size); - xfs_da_log_buf(tp, bp, 0, size - 1); - xfs_da_buf_done(blk1->bp); + xfs_trans_log_buf(tp, bp, 0, size - 1); + + bp->b_ops = blk1->bp->b_ops; blk1->bp = bp; blk1->blkno = blkno; @@ -348,7 +416,7 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, be16_to_cpu(node->hdr.level) + 1, &bp, args->whichfork); if (error) return(error); - node = bp->data; + node = bp->b_addr; node->btree[0].hashval = cpu_to_be32(blk1->hashval); node->btree[0].before = cpu_to_be32(blk1->blkno); node->btree[1].hashval = cpu_to_be32(blk2->hashval); @@ -365,10 +433,9 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, #endif /* Header is already logged by xfs_da_node_create */ - xfs_da_log_buf(tp, bp, + xfs_trans_log_buf(tp, bp, XFS_DA_LOGRANGE(node, node->btree, sizeof(xfs_da_node_entry_t) * 2)); - xfs_da_buf_done(bp); return(0); } @@ -389,7 +456,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, trace_xfs_da_node_split(state->args); - node = oldblk->bp->data; + node = oldblk->bp->b_addr; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); /* @@ -436,7 +503,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * * If we had double-split op below us, then add the extra block too. */ - node = oldblk->bp->data; + node = oldblk->bp->b_addr; if (oldblk->index <= be16_to_cpu(node->hdr.count)) { oldblk->index++; xfs_da_node_add(state, oldblk, addblk); @@ -477,8 +544,8 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, trace_xfs_da_node_rebalance(state->args); - node1 = blk1->bp->data; - node2 = blk2->bp->data; + node1 = blk1->bp->b_addr; + node2 = blk2->bp->b_addr; /* * Figure out how many entries need to move, and in which direction. * Swap the nodes around if that makes it simpler. @@ -532,7 +599,7 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, btree_d = &node1->btree[be16_to_cpu(node1->hdr.count)]; memcpy(btree_d, btree_s, tmp); be16_add_cpu(&node1->hdr.count, count); - xfs_da_log_buf(tp, blk1->bp, + xfs_trans_log_buf(tp, blk1->bp, XFS_DA_LOGRANGE(node1, btree_d, tmp)); /* @@ -549,9 +616,9 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, /* * Log header of node 1 and all current bits of node 2. */ - xfs_da_log_buf(tp, blk1->bp, + xfs_trans_log_buf(tp, blk1->bp, XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr))); - xfs_da_log_buf(tp, blk2->bp, + xfs_trans_log_buf(tp, blk2->bp, XFS_DA_LOGRANGE(node2, &node2->hdr, sizeof(node2->hdr) + sizeof(node2->btree[0]) * be16_to_cpu(node2->hdr.count))); @@ -560,8 +627,8 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * Record the last hashval from each block for upward propagation. * (note: don't use the swapped node pointers) */ - node1 = blk1->bp->data; - node2 = blk2->bp->data; + node1 = blk1->bp->b_addr; + node2 = blk2->bp->b_addr; blk1->hashval = be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval); blk2->hashval = be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval); @@ -587,7 +654,7 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, trace_xfs_da_node_add(state->args); - node = oldblk->bp->data; + node = oldblk->bp->b_addr; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count))); ASSERT(newblk->blkno != 0); @@ -606,10 +673,10 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, } btree->hashval = cpu_to_be32(newblk->hashval); btree->before = cpu_to_be32(newblk->blkno); - xfs_da_log_buf(state->args->trans, oldblk->bp, + xfs_trans_log_buf(state->args->trans, oldblk->bp, XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree))); be16_add_cpu(&node->hdr.count, 1); - xfs_da_log_buf(state->args->trans, oldblk->bp, + xfs_trans_log_buf(state->args->trans, oldblk->bp, XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); /* @@ -735,7 +802,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) xfs_da_intnode_t *oldroot; xfs_da_args_t *args; xfs_dablk_t child; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; trace_xfs_da_root_join(state->args); @@ -743,7 +810,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) args = state->args; ASSERT(args != NULL); ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC); - oldroot = root_blk->bp->data; + oldroot = root_blk->bp->b_addr; ASSERT(oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); ASSERT(!oldroot->hdr.info.forw); ASSERT(!oldroot->hdr.info.back); @@ -760,16 +827,23 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) */ child = be32_to_cpu(oldroot->btree[0].before); ASSERT(child != 0); - error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp, + error = xfs_da_node_read(args->trans, args->dp, child, -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); - xfs_da_blkinfo_onlychild_validate(bp->data, + xfs_da_blkinfo_onlychild_validate(bp->b_addr, be16_to_cpu(oldroot->hdr.level)); - memcpy(root_blk->bp->data, bp->data, state->blocksize); - xfs_da_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); + /* + * This could be copying a leaf back into the root block in the case of + * there only being a single leaf block left in the tree. Hence we have + * to update the b_ops pointer as well to match the buffer type change + * that could occur. + */ + memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize); + root_blk->bp->b_ops = bp->b_ops; + xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); error = xfs_da_shrink_inode(args, child, bp); return(error); } @@ -791,7 +865,9 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) xfs_da_blkinfo_t *info; int count, forward, error, retval, i; xfs_dablk_t blkno; - xfs_dabuf_t *bp; + struct xfs_buf *bp; + + trace_xfs_da_node_toosmall(state->args); /* * Check for the degenerate case of the block being over 50% full. @@ -799,7 +875,7 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) * to coalesce with a sibling. */ blk = &state->path.blk[ state->path.active-1 ]; - info = blk->bp->data; + info = blk->bp->b_addr; ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); node = (xfs_da_intnode_t *)info; count = be16_to_cpu(node->hdr.count); @@ -849,7 +925,7 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) blkno = be32_to_cpu(info->back); if (blkno == 0) continue; - error = xfs_da_read_buf(state->args->trans, state->args->dp, + error = xfs_da_node_read(state->args->trans, state->args->dp, blkno, -1, &bp, state->args->whichfork); if (error) return(error); @@ -859,10 +935,10 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) count = state->node_ents; count -= state->node_ents >> 2; count -= be16_to_cpu(node->hdr.count); - node = bp->data; + node = bp->b_addr; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); count -= be16_to_cpu(node->hdr.count); - xfs_da_brelse(state->args->trans, bp); + xfs_trans_brelse(state->args->trans, bp); if (count >= 0) break; /* fits with at least 25% to spare */ } @@ -914,6 +990,8 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) xfs_dahash_t lasthash=0; int level, count; + trace_xfs_da_fixhashpath(state->args); + level = path->active-1; blk = &path->blk[ level ]; switch (blk->magic) { @@ -934,14 +1012,14 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) break; } for (blk--, level--; level >= 0; blk--, level--) { - node = blk->bp->data; + node = blk->bp->b_addr; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); btree = &node->btree[ blk->index ]; if (be32_to_cpu(btree->hashval) == lasthash) break; blk->hashval = lasthash; btree->hashval = cpu_to_be32(lasthash); - xfs_da_log_buf(state->args->trans, blk->bp, + xfs_trans_log_buf(state->args->trans, blk->bp, XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); lasthash = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); @@ -960,7 +1038,7 @@ xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk) trace_xfs_da_node_remove(state->args); - node = drop_blk->bp->data; + node = drop_blk->bp->b_addr; ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count)); ASSERT(drop_blk->index >= 0); @@ -972,15 +1050,15 @@ xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk) tmp = be16_to_cpu(node->hdr.count) - drop_blk->index - 1; tmp *= (uint)sizeof(xfs_da_node_entry_t); memmove(btree, btree + 1, tmp); - xfs_da_log_buf(state->args->trans, drop_blk->bp, + xfs_trans_log_buf(state->args->trans, drop_blk->bp, XFS_DA_LOGRANGE(node, btree, tmp)); btree = &node->btree[be16_to_cpu(node->hdr.count)-1]; } memset((char *)btree, 0, sizeof(xfs_da_node_entry_t)); - xfs_da_log_buf(state->args->trans, drop_blk->bp, + xfs_trans_log_buf(state->args->trans, drop_blk->bp, XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); be16_add_cpu(&node->hdr.count, -1); - xfs_da_log_buf(state->args->trans, drop_blk->bp, + xfs_trans_log_buf(state->args->trans, drop_blk->bp, XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); /* @@ -1005,8 +1083,8 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, trace_xfs_da_node_unbalance(state->args); - drop_node = drop_blk->bp->data; - save_node = save_blk->bp->data; + drop_node = drop_blk->bp->b_addr; + save_node = save_blk->bp->b_addr; ASSERT(drop_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); ASSERT(save_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); tp = state->args->trans; @@ -1023,13 +1101,13 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, tmp = be16_to_cpu(save_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t); memmove(btree, &save_node->btree[0], tmp); btree = &save_node->btree[0]; - xfs_da_log_buf(tp, save_blk->bp, + xfs_trans_log_buf(tp, save_blk->bp, XFS_DA_LOGRANGE(save_node, btree, (be16_to_cpu(save_node->hdr.count) + be16_to_cpu(drop_node->hdr.count)) * sizeof(xfs_da_node_entry_t))); } else { btree = &save_node->btree[be16_to_cpu(save_node->hdr.count)]; - xfs_da_log_buf(tp, save_blk->bp, + xfs_trans_log_buf(tp, save_blk->bp, XFS_DA_LOGRANGE(save_node, btree, be16_to_cpu(drop_node->hdr.count) * sizeof(xfs_da_node_entry_t))); @@ -1042,7 +1120,7 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, memcpy(btree, &drop_node->btree[0], tmp); be16_add_cpu(&save_node->hdr.count, be16_to_cpu(drop_node->hdr.count)); - xfs_da_log_buf(tp, save_blk->bp, + xfs_trans_log_buf(tp, save_blk->bp, XFS_DA_LOGRANGE(save_node, &save_node->hdr, sizeof(save_node->hdr))); @@ -1093,14 +1171,14 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) * Read the next node down in the tree. */ blk->blkno = blkno; - error = xfs_da_read_buf(args->trans, args->dp, blkno, + error = xfs_da_node_read(args->trans, args->dp, blkno, -1, &blk->bp, args->whichfork); if (error) { blk->blkno = 0; state->path.active--; return(error); } - curr = blk->bp->data; + curr = blk->bp->b_addr; blk->magic = be16_to_cpu(curr->magic); ASSERT(blk->magic == XFS_DA_NODE_MAGIC || blk->magic == XFS_DIR2_LEAFN_MAGIC || @@ -1110,7 +1188,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) * Search an intermediate node for a match. */ if (blk->magic == XFS_DA_NODE_MAGIC) { - node = blk->bp->data; + node = blk->bp->b_addr; max = be16_to_cpu(node->hdr.count); blk->hashval = be32_to_cpu(node->btree[max-1].hashval); @@ -1216,15 +1294,15 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, xfs_da_blkinfo_t *old_info, *new_info, *tmp_info; xfs_da_args_t *args; int before=0, error; - xfs_dabuf_t *bp; + struct xfs_buf *bp; /* * Set up environment. */ args = state->args; ASSERT(args != NULL); - old_info = old_blk->bp->data; - new_info = new_blk->bp->data; + old_info = old_blk->bp->b_addr; + new_info = new_blk->bp->b_addr; ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC || old_blk->magic == XFS_DIR2_LEAFN_MAGIC || old_blk->magic == XFS_ATTR_LEAF_MAGIC); @@ -1255,18 +1333,17 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, new_info->forw = cpu_to_be32(old_blk->blkno); new_info->back = old_info->back; if (old_info->back) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da_node_read(args->trans, args->dp, be32_to_cpu(old_info->back), -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); - tmp_info = bp->data; + tmp_info = bp->b_addr; ASSERT(be16_to_cpu(tmp_info->magic) == be16_to_cpu(old_info->magic)); ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno); tmp_info->forw = cpu_to_be32(new_blk->blkno); - xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); - xfs_da_buf_done(bp); + xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); } old_info->back = cpu_to_be32(new_blk->blkno); } else { @@ -1277,24 +1354,23 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, new_info->forw = old_info->forw; new_info->back = cpu_to_be32(old_blk->blkno); if (old_info->forw) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da_node_read(args->trans, args->dp, be32_to_cpu(old_info->forw), -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); - tmp_info = bp->data; + tmp_info = bp->b_addr; ASSERT(tmp_info->magic == old_info->magic); ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno); tmp_info->back = cpu_to_be32(new_blk->blkno); - xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); - xfs_da_buf_done(bp); + xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); } old_info->forw = cpu_to_be32(new_blk->blkno); } - xfs_da_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1); - xfs_da_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1); + xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1); + xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1); return(0); } @@ -1302,12 +1378,14 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, * Compare two intermediate nodes for "order". */ STATIC int -xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp) +xfs_da_node_order( + struct xfs_buf *node1_bp, + struct xfs_buf *node2_bp) { xfs_da_intnode_t *node1, *node2; - node1 = node1_bp->data; - node2 = node2_bp->data; + node1 = node1_bp->b_addr; + node2 = node2_bp->b_addr; ASSERT(node1->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) && node2->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) && @@ -1324,11 +1402,13 @@ xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp) * Pick up the last hashvalue from an intermediate node. */ STATIC uint -xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count) +xfs_da_node_lasthash( + struct xfs_buf *bp, + int *count) { xfs_da_intnode_t *node; - node = bp->data; + node = bp->b_addr; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); if (count) *count = be16_to_cpu(node->hdr.count); @@ -1346,7 +1426,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, { xfs_da_blkinfo_t *drop_info, *save_info, *tmp_info; xfs_da_args_t *args; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; /* @@ -1354,8 +1434,8 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, */ args = state->args; ASSERT(args != NULL); - save_info = save_blk->bp->data; - drop_info = drop_blk->bp->data; + save_info = save_blk->bp->b_addr; + drop_info = drop_blk->bp->b_addr; ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC || save_blk->magic == XFS_DIR2_LEAFN_MAGIC || save_blk->magic == XFS_ATTR_LEAF_MAGIC); @@ -1374,41 +1454,39 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, trace_xfs_da_unlink_back(args); save_info->back = drop_info->back; if (drop_info->back) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da_node_read(args->trans, args->dp, be32_to_cpu(drop_info->back), -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); - tmp_info = bp->data; + tmp_info = bp->b_addr; ASSERT(tmp_info->magic == save_info->magic); ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno); tmp_info->forw = cpu_to_be32(save_blk->blkno); - xfs_da_log_buf(args->trans, bp, 0, + xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info) - 1); - xfs_da_buf_done(bp); } } else { trace_xfs_da_unlink_forward(args); save_info->forw = drop_info->forw; if (drop_info->forw) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da_node_read(args->trans, args->dp, be32_to_cpu(drop_info->forw), -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); - tmp_info = bp->data; + tmp_info = bp->b_addr; ASSERT(tmp_info->magic == save_info->magic); ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno); tmp_info->back = cpu_to_be32(save_blk->blkno); - xfs_da_log_buf(args->trans, bp, 0, + xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info) - 1); - xfs_da_buf_done(bp); } } - xfs_da_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1); + xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1); return(0); } @@ -1431,6 +1509,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, xfs_dablk_t blkno=0; int level, error; + trace_xfs_da_path_shift(state->args); + /* * Roll up the Btree looking for the first block where our * current index is not at the edge of the block. Note that @@ -1443,7 +1523,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, level = (path->active-1) - 1; /* skip bottom layer in path */ for (blk = &path->blk[level]; level >= 0; blk--, level--) { ASSERT(blk->bp != NULL); - node = blk->bp->data; + node = blk->bp->b_addr; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); if (forward && (blk->index < be16_to_cpu(node->hdr.count)-1)) { blk->index++; @@ -1471,18 +1551,18 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, * (if it's dirty, trans won't actually let go) */ if (release) - xfs_da_brelse(args->trans, blk->bp); + xfs_trans_brelse(args->trans, blk->bp); /* * Read the next child block. */ blk->blkno = blkno; - error = xfs_da_read_buf(args->trans, args->dp, blkno, -1, - &blk->bp, args->whichfork); + error = xfs_da_node_read(args->trans, args->dp, blkno, -1, + &blk->bp, args->whichfork); if (error) return(error); ASSERT(blk->bp != NULL); - info = blk->bp->data; + info = blk->bp->b_addr; ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); @@ -1702,11 +1782,13 @@ xfs_da_grow_inode( * a bmap btree split to do that. */ STATIC int -xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, - xfs_dabuf_t **dead_bufp) +xfs_da_swap_lastblock( + xfs_da_args_t *args, + xfs_dablk_t *dead_blknop, + struct xfs_buf **dead_bufp) { xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno; - xfs_dabuf_t *dead_buf, *last_buf, *sib_buf, *par_buf; + struct xfs_buf *dead_buf, *last_buf, *sib_buf, *par_buf; xfs_fileoff_t lastoff; xfs_inode_t *ip; xfs_trans_t *tp; @@ -1739,14 +1821,15 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, * Read the last block in the btree space. */ last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; - if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w))) + error = xfs_da_node_read(tp, ip, last_blkno, -1, &last_buf, w); + if (error) return error; /* * Copy the last block into the dead buffer and log it. */ - memcpy(dead_buf->data, last_buf->data, mp->m_dirblksize); - xfs_da_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1); - dead_info = dead_buf->data; + memcpy(dead_buf->b_addr, last_buf->b_addr, mp->m_dirblksize); + xfs_trans_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1); + dead_info = dead_buf->b_addr; /* * Get values from the moved block. */ @@ -1765,9 +1848,10 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, * If the moved block has a left sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->back))) { - if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) + error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); + if (error) goto done; - sib_info = sib_buf->data; + sib_info = sib_buf->b_addr; if (unlikely( be32_to_cpu(sib_info->forw) != last_blkno || sib_info->magic != dead_info->magic)) { @@ -1777,19 +1861,19 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, goto done; } sib_info->forw = cpu_to_be32(dead_blkno); - xfs_da_log_buf(tp, sib_buf, + xfs_trans_log_buf(tp, sib_buf, XFS_DA_LOGRANGE(sib_info, &sib_info->forw, sizeof(sib_info->forw))); - xfs_da_buf_done(sib_buf); sib_buf = NULL; } /* * If the moved block has a right sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->forw))) { - if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) + error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); + if (error) goto done; - sib_info = sib_buf->data; + sib_info = sib_buf->b_addr; if (unlikely( be32_to_cpu(sib_info->back) != last_blkno || sib_info->magic != dead_info->magic)) { @@ -1799,10 +1883,9 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, goto done; } sib_info->back = cpu_to_be32(dead_blkno); - xfs_da_log_buf(tp, sib_buf, + xfs_trans_log_buf(tp, sib_buf, XFS_DA_LOGRANGE(sib_info, &sib_info->back, sizeof(sib_info->back))); - xfs_da_buf_done(sib_buf); sib_buf = NULL; } par_blkno = mp->m_dirleafblk; @@ -1811,9 +1894,10 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, * Walk down the tree looking for the parent of the moved block. */ for (;;) { - if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) + error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w); + if (error) goto done; - par_node = par_buf->data; + par_node = par_buf->b_addr; if (unlikely(par_node->hdr.info.magic != cpu_to_be16(XFS_DA_NODE_MAGIC) || (level >= 0 && level != be16_to_cpu(par_node->hdr.level) + 1))) { @@ -1837,7 +1921,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, par_blkno = be32_to_cpu(par_node->btree[entno].before); if (level == dead_level + 1) break; - xfs_da_brelse(tp, par_buf); + xfs_trans_brelse(tp, par_buf); par_buf = NULL; } /* @@ -1853,7 +1937,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, if (entno < be16_to_cpu(par_node->hdr.count)) break; par_blkno = be32_to_cpu(par_node->hdr.info.forw); - xfs_da_brelse(tp, par_buf); + xfs_trans_brelse(tp, par_buf); par_buf = NULL; if (unlikely(par_blkno == 0)) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)", @@ -1861,9 +1945,10 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, error = XFS_ERROR(EFSCORRUPTED); goto done; } - if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) + error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w); + if (error) goto done; - par_node = par_buf->data; + par_node = par_buf->b_addr; if (unlikely( be16_to_cpu(par_node->hdr.level) != level || par_node->hdr.info.magic != cpu_to_be16(XFS_DA_NODE_MAGIC))) { @@ -1878,20 +1963,18 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, * Update the parent entry pointing to the moved block. */ par_node->btree[entno].before = cpu_to_be32(dead_blkno); - xfs_da_log_buf(tp, par_buf, + xfs_trans_log_buf(tp, par_buf, XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before, sizeof(par_node->btree[entno].before))); - xfs_da_buf_done(par_buf); - xfs_da_buf_done(dead_buf); *dead_blknop = last_blkno; *dead_bufp = last_buf; return 0; done: if (par_buf) - xfs_da_brelse(tp, par_buf); + xfs_trans_brelse(tp, par_buf); if (sib_buf) - xfs_da_brelse(tp, sib_buf); - xfs_da_brelse(tp, last_buf); + xfs_trans_brelse(tp, sib_buf); + xfs_trans_brelse(tp, last_buf); return error; } @@ -1899,8 +1982,10 @@ done: * Remove a btree block from a directory or attribute. */ int -xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, - xfs_dabuf_t *dead_buf) +xfs_da_shrink_inode( + xfs_da_args_t *args, + xfs_dablk_t dead_blkno, + struct xfs_buf *dead_buf) { xfs_inode_t *dp; int done, error, w, count; @@ -1935,7 +2020,7 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, break; } } - xfs_da_binval(tp, dead_buf); + xfs_trans_binval(tp, dead_buf); return error; } @@ -1967,35 +2052,75 @@ xfs_da_map_covers_blocks( } /* - * Make a dabuf. - * Used for get_buf, read_buf, read_bufr, and reada_buf. + * Convert a struct xfs_bmbt_irec to a struct xfs_buf_map. + * + * For the single map case, it is assumed that the caller has provided a pointer + * to a valid xfs_buf_map. For the multiple map case, this function will + * allocate the xfs_buf_map to hold all the maps and replace the caller's single + * map pointer with the allocated map. */ -STATIC int -xfs_da_do_buf( - xfs_trans_t *trans, - xfs_inode_t *dp, - xfs_dablk_t bno, - xfs_daddr_t *mappedbnop, - xfs_dabuf_t **bpp, - int whichfork, - int caller) +static int +xfs_buf_map_from_irec( + struct xfs_mount *mp, + struct xfs_buf_map **mapp, + unsigned int *nmaps, + struct xfs_bmbt_irec *irecs, + unsigned int nirecs) { - xfs_buf_t *bp = NULL; - xfs_buf_t **bplist; - int error=0; - int i; - xfs_bmbt_irec_t map; - xfs_bmbt_irec_t *mapp; - xfs_daddr_t mappedbno; - xfs_mount_t *mp; - int nbplist=0; - int nfsb; - int nmap; - xfs_dabuf_t *rbp; + struct xfs_buf_map *map; + int i; + + ASSERT(*nmaps == 1); + ASSERT(nirecs >= 1); + + if (nirecs > 1) { + map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_SLEEP); + if (!map) + return ENOMEM; + *mapp = map; + } + + *nmaps = nirecs; + map = *mapp; + for (i = 0; i < *nmaps; i++) { + ASSERT(irecs[i].br_startblock != DELAYSTARTBLOCK && + irecs[i].br_startblock != HOLESTARTBLOCK); + map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock); + map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount); + } + return 0; +} + +/* + * Map the block we are given ready for reading. There are three possible return + * values: + * -1 - will be returned if we land in a hole and mappedbno == -2 so the + * caller knows not to execute a subsequent read. + * 0 - if we mapped the block successfully + * >0 - positive error number if there was an error. + */ +static int +xfs_dabuf_map( + struct xfs_trans *trans, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + int whichfork, + struct xfs_buf_map **map, + int *nmaps) +{ + struct xfs_mount *mp = dp->i_mount; + int nfsb; + int error = 0; + struct xfs_bmbt_irec irec; + struct xfs_bmbt_irec *irecs = &irec; + int nirecs; + + ASSERT(map && *map); + ASSERT(*nmaps == 1); - mp = dp->i_mount; nfsb = (whichfork == XFS_DATA_FORK) ? mp->m_dirblkfsbs : 1; - mappedbno = *mappedbnop; + /* * Caller doesn't have a mapping. -2 means don't complain * if we land in a hole. @@ -2004,112 +2129,151 @@ xfs_da_do_buf( /* * Optimize the one-block case. */ - if (nfsb == 1) - mapp = ↦ - else - mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP); + if (nfsb != 1) + irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_SLEEP); - nmap = nfsb; - error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, mapp, - &nmap, xfs_bmapi_aflag(whichfork)); + nirecs = nfsb; + error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs, + &nirecs, xfs_bmapi_aflag(whichfork)); if (error) - goto exit0; + goto out; } else { - map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); - map.br_startoff = (xfs_fileoff_t)bno; - map.br_blockcount = nfsb; - mapp = ↦ - nmap = 1; + irecs->br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); + irecs->br_startoff = (xfs_fileoff_t)bno; + irecs->br_blockcount = nfsb; + irecs->br_state = 0; + nirecs = 1; } - if (!xfs_da_map_covers_blocks(nmap, mapp, bno, nfsb)) { - error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED); + + if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) { + error = mappedbno == -2 ? -1 : XFS_ERROR(EFSCORRUPTED); if (unlikely(error == EFSCORRUPTED)) { if (xfs_error_level >= XFS_ERRLEVEL_LOW) { + int i; xfs_alert(mp, "%s: bno %lld dir: inode %lld", __func__, (long long)bno, (long long)dp->i_ino); - for (i = 0; i < nmap; i++) { + for (i = 0; i < *nmaps; i++) { xfs_alert(mp, "[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d", i, - (long long)mapp[i].br_startoff, - (long long)mapp[i].br_startblock, - (long long)mapp[i].br_blockcount, - mapp[i].br_state); + (long long)irecs[i].br_startoff, + (long long)irecs[i].br_startblock, + (long long)irecs[i].br_blockcount, + irecs[i].br_state); } } XFS_ERROR_REPORT("xfs_da_do_buf(1)", XFS_ERRLEVEL_LOW, mp); } - goto exit0; + goto out; } - if (caller != 3 && nmap > 1) { - bplist = kmem_alloc(sizeof(*bplist) * nmap, KM_SLEEP); - nbplist = 0; - } else - bplist = NULL; - /* - * Turn the mapping(s) into buffer(s). - */ - for (i = 0; i < nmap; i++) { - int nmapped; - - mappedbno = XFS_FSB_TO_DADDR(mp, mapp[i].br_startblock); - if (i == 0) - *mappedbnop = mappedbno; - nmapped = (int)XFS_FSB_TO_BB(mp, mapp[i].br_blockcount); - switch (caller) { - case 0: - bp = xfs_trans_get_buf(trans, mp->m_ddev_targp, - mappedbno, nmapped, 0); - error = bp ? bp->b_error : XFS_ERROR(EIO); - break; - case 1: - case 2: - bp = NULL; - error = xfs_trans_read_buf(mp, trans, mp->m_ddev_targp, - mappedbno, nmapped, 0, &bp); - break; - case 3: - xfs_buf_readahead(mp->m_ddev_targp, mappedbno, nmapped); + error = xfs_buf_map_from_irec(mp, map, nmaps, irecs, nirecs); +out: + if (irecs != &irec) + kmem_free(irecs); + return error; +} + +/* + * Get a buffer for the dir/attr block. + */ +int +xfs_da_get_buf( + struct xfs_trans *trans, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp, + int whichfork) +{ + struct xfs_buf *bp; + struct xfs_buf_map map; + struct xfs_buf_map *mapp; + int nmap; + int error; + + *bpp = NULL; + mapp = ↦ + nmap = 1; + error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, + &mapp, &nmap); + if (error) { + /* mapping a hole is not an error, but we don't continue */ + if (error == -1) error = 0; - bp = NULL; - break; - } - if (error) { - if (bp) - xfs_trans_brelse(trans, bp); - goto exit1; - } - if (!bp) - continue; - if (caller == 1) { - if (whichfork == XFS_ATTR_FORK) - xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF); - else - xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF); - } - if (bplist) { - bplist[nbplist++] = bp; - } + goto out_free; } - /* - * Build a dabuf structure. - */ - if (bplist) { - rbp = xfs_da_buf_make(nbplist, bplist); - } else if (bp) - rbp = xfs_da_buf_make(1, &bp); + + bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp, + mapp, nmap, 0); + error = bp ? bp->b_error : XFS_ERROR(EIO); + if (error) { + xfs_trans_brelse(trans, bp); + goto out_free; + } + + *bpp = bp; + +out_free: + if (mapp != &map) + kmem_free(mapp); + + return error; +} + +/* + * Get a buffer for the dir/attr block, fill in the contents. + */ +int +xfs_da_read_buf( + struct xfs_trans *trans, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp, + int whichfork, + const struct xfs_buf_ops *ops) +{ + struct xfs_buf *bp; + struct xfs_buf_map map; + struct xfs_buf_map *mapp; + int nmap; + int error; + + *bpp = NULL; + mapp = ↦ + nmap = 1; + error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, + &mapp, &nmap); + if (error) { + /* mapping a hole is not an error, but we don't continue */ + if (error == -1) + error = 0; + goto out_free; + } + + error = xfs_trans_read_buf_map(dp->i_mount, trans, + dp->i_mount->m_ddev_targp, + mapp, nmap, 0, &bp, ops); + if (error) + goto out_free; + + if (whichfork == XFS_ATTR_FORK) + xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF); else - rbp = NULL; + xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF); + /* - * For read_buf, check the magic number. + * This verification code will be moved to a CRC verification callback + * function so just leave it here unchanged until then. */ - if (caller == 1) { - xfs_dir2_data_hdr_t *hdr = rbp->data; - xfs_dir2_free_t *free = rbp->data; - xfs_da_blkinfo_t *info = rbp->data; + { + xfs_dir2_data_hdr_t *hdr = bp->b_addr; + xfs_dir2_free_t *free = bp->b_addr; + xfs_da_blkinfo_t *info = bp->b_addr; uint magic, magic1; + struct xfs_mount *mp = dp->i_mount; magic = be16_to_cpu(info->magic); magic1 = be32_to_cpu(hdr->magic); @@ -2123,66 +2287,20 @@ xfs_da_do_buf( (free->hdr.magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)), mp, XFS_ERRTAG_DA_READ_BUF, XFS_RANDOM_DA_READ_BUF))) { - trace_xfs_da_btree_corrupt(rbp->bps[0], _RET_IP_); + trace_xfs_da_btree_corrupt(bp, _RET_IP_); XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)", XFS_ERRLEVEL_LOW, mp, info); error = XFS_ERROR(EFSCORRUPTED); - xfs_da_brelse(trans, rbp); - nbplist = 0; - goto exit1; + xfs_trans_brelse(trans, bp); + goto out_free; } } - if (bplist) { - kmem_free(bplist); - } - if (mapp != &map) { - kmem_free(mapp); - } - if (bpp) - *bpp = rbp; - return 0; -exit1: - if (bplist) { - for (i = 0; i < nbplist; i++) - xfs_trans_brelse(trans, bplist[i]); - kmem_free(bplist); - } -exit0: + *bpp = bp; +out_free: if (mapp != &map) kmem_free(mapp); - if (bpp) - *bpp = NULL; - return error; -} - -/* - * Get a buffer for the dir/attr block. - */ -int -xfs_da_get_buf( - xfs_trans_t *trans, - xfs_inode_t *dp, - xfs_dablk_t bno, - xfs_daddr_t mappedbno, - xfs_dabuf_t **bpp, - int whichfork) -{ - return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 0); -} -/* - * Get a buffer for the dir/attr block, fill in the contents. - */ -int -xfs_da_read_buf( - xfs_trans_t *trans, - xfs_inode_t *dp, - xfs_dablk_t bno, - xfs_daddr_t mappedbno, - xfs_dabuf_t **bpp, - int whichfork) -{ - return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 1); + return error; } /* @@ -2190,22 +2308,42 @@ xfs_da_read_buf( */ xfs_daddr_t xfs_da_reada_buf( - xfs_trans_t *trans, - xfs_inode_t *dp, - xfs_dablk_t bno, - int whichfork) + struct xfs_trans *trans, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + int whichfork, + const struct xfs_buf_ops *ops) { - xfs_daddr_t rval; + struct xfs_buf_map map; + struct xfs_buf_map *mapp; + int nmap; + int error; + + mapp = ↦ + nmap = 1; + error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, + &mapp, &nmap); + if (error) { + /* mapping a hole is not an error, but we don't continue */ + if (error == -1) + error = 0; + goto out_free; + } + + mappedbno = mapp[0].bm_bn; + xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops); + +out_free: + if (mapp != &map) + kmem_free(mapp); - rval = -1; - if (xfs_da_do_buf(trans, dp, bno, &rval, NULL, whichfork, 3)) + if (error) return -1; - else - return rval; + return mappedbno; } kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ -kmem_zone_t *xfs_dabuf_zone; /* dabuf zone */ /* * Allocate a dir-state structure. @@ -2225,13 +2363,8 @@ xfs_da_state_kill_altpath(xfs_da_state_t *state) { int i; - for (i = 0; i < state->altpath.active; i++) { - if (state->altpath.blk[i].bp) { - if (state->altpath.blk[i].bp != state->path.blk[i].bp) - xfs_da_buf_done(state->altpath.blk[i].bp); - state->altpath.blk[i].bp = NULL; - } - } + for (i = 0; i < state->altpath.active; i++) + state->altpath.blk[i].bp = NULL; state->altpath.active = 0; } @@ -2241,204 +2374,9 @@ xfs_da_state_kill_altpath(xfs_da_state_t *state) void xfs_da_state_free(xfs_da_state_t *state) { - int i; - xfs_da_state_kill_altpath(state); - for (i = 0; i < state->path.active; i++) { - if (state->path.blk[i].bp) - xfs_da_buf_done(state->path.blk[i].bp); - } - if (state->extravalid && state->extrablk.bp) - xfs_da_buf_done(state->extrablk.bp); #ifdef DEBUG memset((char *)state, 0, sizeof(*state)); #endif /* DEBUG */ kmem_zone_free(xfs_da_state_zone, state); } - -/* - * Create a dabuf. - */ -/* ARGSUSED */ -STATIC xfs_dabuf_t * -xfs_da_buf_make(int nbuf, xfs_buf_t **bps) -{ - xfs_buf_t *bp; - xfs_dabuf_t *dabuf; - int i; - int off; - - if (nbuf == 1) - dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_NOFS); - else - dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_NOFS); - dabuf->dirty = 0; - if (nbuf == 1) { - dabuf->nbuf = 1; - bp = bps[0]; - dabuf->bbcount = bp->b_length; - dabuf->data = bp->b_addr; - dabuf->bps[0] = bp; - } else { - dabuf->nbuf = nbuf; - for (i = 0, dabuf->bbcount = 0; i < nbuf; i++) { - dabuf->bps[i] = bp = bps[i]; - dabuf->bbcount += bp->b_length; - } - dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP); - for (i = off = 0; i < nbuf; i++, off += BBTOB(bp->b_length)) { - bp = bps[i]; - memcpy((char *)dabuf->data + off, bp->b_addr, - BBTOB(bp->b_length)); - } - } - return dabuf; -} - -/* - * Un-dirty a dabuf. - */ -STATIC void -xfs_da_buf_clean(xfs_dabuf_t *dabuf) -{ - xfs_buf_t *bp; - int i; - int off; - - if (dabuf->dirty) { - ASSERT(dabuf->nbuf > 1); - dabuf->dirty = 0; - for (i = off = 0; i < dabuf->nbuf; - i++, off += BBTOB(bp->b_length)) { - bp = dabuf->bps[i]; - memcpy(bp->b_addr, dabuf->data + off, - BBTOB(bp->b_length)); - } - } -} - -/* - * Release a dabuf. - */ -void -xfs_da_buf_done(xfs_dabuf_t *dabuf) -{ - ASSERT(dabuf); - ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); - if (dabuf->dirty) - xfs_da_buf_clean(dabuf); - if (dabuf->nbuf > 1) { - kmem_free(dabuf->data); - kmem_free(dabuf); - } else { - kmem_zone_free(xfs_dabuf_zone, dabuf); - } -} - -/* - * Log transaction from a dabuf. - */ -void -xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last) -{ - xfs_buf_t *bp; - uint f; - int i; - uint l; - int off; - - ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); - if (dabuf->nbuf == 1) { - ASSERT(dabuf->data == dabuf->bps[0]->b_addr); - xfs_trans_log_buf(tp, dabuf->bps[0], first, last); - return; - } - dabuf->dirty = 1; - ASSERT(first <= last); - for (i = off = 0; i < dabuf->nbuf; i++, off += BBTOB(bp->b_length)) { - bp = dabuf->bps[i]; - f = off; - l = f + BBTOB(bp->b_length) - 1; - if (f < first) - f = first; - if (l > last) - l = last; - if (f <= l) - xfs_trans_log_buf(tp, bp, f - off, l - off); - /* - * B_DONE is set by xfs_trans_log buf. - * If we don't set it on a new buffer (get not read) - * then if we don't put anything in the buffer it won't - * be set, and at commit it it released into the cache, - * and then a read will fail. - */ - else if (!(XFS_BUF_ISDONE(bp))) - XFS_BUF_DONE(bp); - } - ASSERT(last < off); -} - -/* - * Release dabuf from a transaction. - * Have to free up the dabuf before the buffers are released, - * since the synchronization on the dabuf is really the lock on the buffer. - */ -void -xfs_da_brelse(xfs_trans_t *tp, xfs_dabuf_t *dabuf) -{ - xfs_buf_t *bp; - xfs_buf_t **bplist; - int i; - int nbuf; - - ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); - if ((nbuf = dabuf->nbuf) == 1) { - bplist = &bp; - bp = dabuf->bps[0]; - } else { - bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP); - memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist)); - } - xfs_da_buf_done(dabuf); - for (i = 0; i < nbuf; i++) - xfs_trans_brelse(tp, bplist[i]); - if (bplist != &bp) - kmem_free(bplist); -} - -/* - * Invalidate dabuf from a transaction. - */ -void -xfs_da_binval(xfs_trans_t *tp, xfs_dabuf_t *dabuf) -{ - xfs_buf_t *bp; - xfs_buf_t **bplist; - int i; - int nbuf; - - ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); - if ((nbuf = dabuf->nbuf) == 1) { - bplist = &bp; - bp = dabuf->bps[0]; - } else { - bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP); - memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist)); - } - xfs_da_buf_done(dabuf); - for (i = 0; i < nbuf; i++) - xfs_trans_binval(tp, bplist[i]); - if (bplist != &bp) - kmem_free(bplist); -} - -/* - * Get the first daddr from a dabuf. - */ -xfs_daddr_t -xfs_da_blkno(xfs_dabuf_t *dabuf) -{ - ASSERT(dabuf->nbuf); - ASSERT(dabuf->data); - return XFS_BUF_ADDR(dabuf->bps[0]); -} diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index dbf7c074ae7..ee5170c46ae 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -18,7 +18,6 @@ #ifndef __XFS_DA_BTREE_H__ #define __XFS_DA_BTREE_H__ -struct xfs_buf; struct xfs_bmap_free; struct xfs_inode; struct xfs_mount; @@ -32,7 +31,7 @@ struct zone; /* * This structure is common to both leaf nodes and non-leaf nodes in the Btree. * - * Is is used to manage a doubly linked list of all blocks at the same + * It is used to manage a doubly linked list of all blocks at the same * level in the Btree, and to identify which type of block this is. */ #define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */ @@ -133,24 +132,6 @@ typedef struct xfs_da_args { { XFS_DA_OP_CILOOKUP, "CILOOKUP" } /* - * Structure to describe buffer(s) for a block. - * This is needed in the directory version 2 format case, when - * multiple non-contiguous fsblocks might be needed to cover one - * logical directory block. - * If the buffer count is 1 then the data pointer points to the - * same place as the b_addr field for the buffer, else to kmem_alloced memory. - */ -typedef struct xfs_dabuf { - int nbuf; /* number of buffer pointers present */ - short dirty; /* data needs to be copied back */ - short bbcount; /* how large is data in bbs */ - void *data; /* pointer for buffers' data */ - struct xfs_buf *bps[1]; /* actually nbuf of these */ -} xfs_dabuf_t; -#define XFS_DA_BUF_SIZE(n) \ - (sizeof(xfs_dabuf_t) + sizeof(struct xfs_buf *) * ((n) - 1)) - -/* * Storage for holding state during Btree searches and split/join ops. * * Only need space for 5 intermediate nodes. With a minimum of 62-way @@ -158,7 +139,7 @@ typedef struct xfs_dabuf { * which is slightly more than enough. */ typedef struct xfs_da_state_blk { - xfs_dabuf_t *bp; /* buffer containing block */ + struct xfs_buf *bp; /* buffer containing block */ xfs_dablk_t blkno; /* filesystem blkno of buffer */ xfs_daddr_t disk_blkno; /* on-disk blkno (in BBs) of buffer */ int index; /* relevant index into block */ @@ -211,7 +192,7 @@ struct xfs_nameops { * Routines used for growing the Btree. */ int xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, - xfs_dabuf_t **bpp, int whichfork); + struct xfs_buf **bpp, int whichfork); int xfs_da_split(xfs_da_state_t *state); /* @@ -232,6 +213,9 @@ int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, */ int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, xfs_da_state_blk_t *new_blk); +int xfs_da_node_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + struct xfs_buf **bpp, int which_fork); /* * Utility routines. @@ -241,14 +225,16 @@ int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno, int count); int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, - xfs_dabuf_t **bp, int whichfork); + struct xfs_buf **bp, int whichfork); int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, - xfs_dabuf_t **bpp, int whichfork); + struct xfs_buf **bpp, int whichfork, + const struct xfs_buf_ops *ops); xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, - xfs_dablk_t bno, int whichfork); + xfs_dablk_t bno, xfs_daddr_t mapped_bno, + int whichfork, const struct xfs_buf_ops *ops); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, - xfs_dabuf_t *dead_buf); + struct xfs_buf *dead_buf); uint xfs_da_hashname(const __uint8_t *name_string, int name_length); enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, @@ -258,15 +244,7 @@ enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, xfs_da_state_t *xfs_da_state_alloc(void); void xfs_da_state_free(xfs_da_state_t *state); -void xfs_da_buf_done(xfs_dabuf_t *dabuf); -void xfs_da_log_buf(struct xfs_trans *tp, xfs_dabuf_t *dabuf, uint first, - uint last); -void xfs_da_brelse(struct xfs_trans *tp, xfs_dabuf_t *dabuf); -void xfs_da_binval(struct xfs_trans *tp, xfs_dabuf_t *dabuf); -xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf); - extern struct kmem_zone *xfs_da_state_zone; -extern struct kmem_zone *xfs_dabuf_zone; extern const struct xfs_nameops xfs_default_nameops; #endif /* __XFS_DA_BTREE_H__ */ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index e00de08dc8a..a8bd26b82ec 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -48,44 +48,44 @@ xfs_swapext( xfs_swapext_t *sxp) { xfs_inode_t *ip, *tip; - struct file *file, *tmp_file; + struct fd f, tmp; int error = 0; /* Pull information for the target fd */ - file = fget((int)sxp->sx_fdtarget); - if (!file) { + f = fdget((int)sxp->sx_fdtarget); + if (!f.file) { error = XFS_ERROR(EINVAL); goto out; } - if (!(file->f_mode & FMODE_WRITE) || - !(file->f_mode & FMODE_READ) || - (file->f_flags & O_APPEND)) { + if (!(f.file->f_mode & FMODE_WRITE) || + !(f.file->f_mode & FMODE_READ) || + (f.file->f_flags & O_APPEND)) { error = XFS_ERROR(EBADF); goto out_put_file; } - tmp_file = fget((int)sxp->sx_fdtmp); - if (!tmp_file) { + tmp = fdget((int)sxp->sx_fdtmp); + if (!tmp.file) { error = XFS_ERROR(EINVAL); goto out_put_file; } - if (!(tmp_file->f_mode & FMODE_WRITE) || - !(tmp_file->f_mode & FMODE_READ) || - (tmp_file->f_flags & O_APPEND)) { + if (!(tmp.file->f_mode & FMODE_WRITE) || + !(tmp.file->f_mode & FMODE_READ) || + (tmp.file->f_flags & O_APPEND)) { error = XFS_ERROR(EBADF); goto out_put_tmp_file; } - if (IS_SWAPFILE(file->f_path.dentry->d_inode) || - IS_SWAPFILE(tmp_file->f_path.dentry->d_inode)) { + if (IS_SWAPFILE(f.file->f_path.dentry->d_inode) || + IS_SWAPFILE(tmp.file->f_path.dentry->d_inode)) { error = XFS_ERROR(EINVAL); goto out_put_tmp_file; } - ip = XFS_I(file->f_path.dentry->d_inode); - tip = XFS_I(tmp_file->f_path.dentry->d_inode); + ip = XFS_I(f.file->f_path.dentry->d_inode); + tip = XFS_I(tmp.file->f_path.dentry->d_inode); if (ip->i_mount != tip->i_mount) { error = XFS_ERROR(EINVAL); @@ -105,9 +105,9 @@ xfs_swapext( error = xfs_swap_extents(ip, tip, sxp); out_put_tmp_file: - fput(tmp_file); + fdput(tmp); out_put_file: - fput(file); + fdput(f); out: return error; } @@ -246,12 +246,10 @@ xfs_swap_extents( goto out_unlock; } - if (VN_CACHED(VFS_I(tip)) != 0) { - error = xfs_flushinval_pages(tip, 0, -1, - FI_REMAPF_LOCKED); - if (error) - goto out_unlock; - } + error = -filemap_write_and_wait(VFS_I(tip)->i_mapping); + if (error) + goto out_unlock; + truncate_pagecache_range(VFS_I(tip), 0, -1); /* Verify O_DIRECT for ftmp */ if (VN_CACHED(VFS_I(tip)) != 0) { @@ -315,8 +313,7 @@ xfs_swap_extents( * are safe. We don't really care if non-io related * fields change. */ - - xfs_tosspages(ip, 0, -1, FI_REMAPF); + truncate_pagecache_range(VFS_I(ip), 0, -1); tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); if ((error = xfs_trans_reserve(tp, 0, diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index a3721633abc..1d9643b3dce 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -33,7 +33,7 @@ typedef struct xfs_timestamp { * variable size the leftover area split into a data and an attribute fork. * The format of the data and attribute fork depends on the format of the * inode as indicated by di_format and di_aformat. To access the data and - * attribute use the XFS_DFORK_PTR, XFS_DFORK_DPTR, and XFS_DFORK_PTR macros + * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros * below. * * There is a very similar struct icdinode in xfs_inode which matches the diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index 67a250c36d4..b26a50f9921 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -592,7 +592,7 @@ int xfs_dir2_shrink_inode( xfs_da_args_t *args, xfs_dir2_db_t db, - xfs_dabuf_t *bp) + struct xfs_buf *bp) { xfs_fileoff_t bno; /* directory file offset */ xfs_dablk_t da; /* directory file offset */ @@ -634,7 +634,7 @@ xfs_dir2_shrink_inode( /* * Invalidate the buffer from the transaction. */ - xfs_da_binval(tp, bp); + xfs_trans_binval(tp, bp); /* * If it's not a data block, we're done. */ diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 586732f2d80..12afe07a91d 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -37,10 +37,10 @@ /* * Local function prototypes. */ -static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, xfs_dabuf_t *bp, int first, - int last); -static void xfs_dir2_block_log_tail(xfs_trans_t *tp, xfs_dabuf_t *bp); -static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **bpp, +static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, struct xfs_buf *bp, + int first, int last); +static void xfs_dir2_block_log_tail(xfs_trans_t *tp, struct xfs_buf *bp); +static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, struct xfs_buf **bpp, int *entno); static int xfs_dir2_block_sort(const void *a, const void *b); @@ -56,6 +56,214 @@ xfs_dir_startup(void) xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); } +static void +xfs_dir2_block_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_data_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); + block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0; + + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +xfs_dir2_block_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_block_verify(bp); +} + +static void +xfs_dir2_block_write_verify( + struct xfs_buf *bp) +{ + xfs_dir2_block_verify(bp); +} + +const struct xfs_buf_ops xfs_dir2_block_buf_ops = { + .verify_read = xfs_dir2_block_read_verify, + .verify_write = xfs_dir2_block_write_verify, +}; + +static int +xfs_dir2_block_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = dp->i_mount; + + return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, + XFS_DATA_FORK, &xfs_dir2_block_buf_ops); +} + +static void +xfs_dir2_block_need_space( + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_block_tail *btp, + struct xfs_dir2_leaf_entry *blp, + __be16 **tagpp, + struct xfs_dir2_data_unused **dupp, + struct xfs_dir2_data_unused **enddupp, + int *compact, + int len) +{ + struct xfs_dir2_data_free *bf; + __be16 *tagp = NULL; + struct xfs_dir2_data_unused *dup = NULL; + struct xfs_dir2_data_unused *enddup = NULL; + + *compact = 0; + bf = hdr->bestfree; + + /* + * If there are stale entries we'll use one for the leaf. + */ + if (btp->stale) { + if (be16_to_cpu(bf[0].length) >= len) { + /* + * The biggest entry enough to avoid compaction. + */ + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[0].offset)); + goto out; + } + + /* + * Will need to compact to make this work. + * Tag just before the first leaf entry. + */ + *compact = 1; + tagp = (__be16 *)blp - 1; + + /* Data object just before the first leaf entry. */ + dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); + + /* + * If it's not free then the data will go where the + * leaf data starts now, if it works at all. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) * + (uint)sizeof(*blp) < len) + dup = NULL; + } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len) + dup = NULL; + else + dup = (xfs_dir2_data_unused_t *)blp; + goto out; + } + + /* + * no stale entries, so just use free space. + * Tag just before the first leaf entry. + */ + tagp = (__be16 *)blp - 1; + + /* Data object just before the first leaf entry. */ + enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); + + /* + * If it's not free then can't do this add without cleaning up: + * the space before the first leaf entry needs to be free so it + * can be expanded to hold the pointer to the new entry. + */ + if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + /* + * Check out the biggest freespace and see if it's the same one. + */ + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[0].offset)); + if (dup != enddup) { + /* + * Not the same free entry, just check its length. + */ + if (be16_to_cpu(dup->length) < len) + dup = NULL; + goto out; + } + + /* + * It is the biggest freespace, can it hold the leaf too? + */ + if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) { + /* + * Yes, use the second-largest entry instead if it works. + */ + if (be16_to_cpu(bf[1].length) >= len) + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[1].offset)); + else + dup = NULL; + } + } +out: + *tagpp = tagp; + *dupp = dup; + *enddupp = enddup; +} + +/* + * compact the leaf entries. + * Leave the highest-numbered stale entry stale. + * XXX should be the one closest to mid but mid is not yet computed. + */ +static void +xfs_dir2_block_compact( + struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_block_tail *btp, + struct xfs_dir2_leaf_entry *blp, + int *needlog, + int *lfloghigh, + int *lfloglow) +{ + int fromidx; /* source leaf index */ + int toidx; /* target leaf index */ + int needscan = 0; + int highstale; /* high stale index */ + + fromidx = toidx = be32_to_cpu(btp->count) - 1; + highstale = *lfloghigh = -1; + for (; fromidx >= 0; fromidx--) { + if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { + if (highstale == -1) + highstale = toidx; + else { + if (*lfloghigh == -1) + *lfloghigh = toidx; + continue; + } + } + if (fromidx < toidx) + blp[toidx] = blp[fromidx]; + toidx--; + } + *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1); + *lfloghigh -= be32_to_cpu(btp->stale) - 1; + be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1)); + xfs_dir2_data_make_free(tp, bp, + (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), + (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), + needlog, &needscan); + blp += be32_to_cpu(btp->stale) - 1; + btp->stale = cpu_to_be32(1); + /* + * If we now need to rebuild the bestfree map, do so. + * This needs to happen before the next call to use_free. + */ + if (needscan) + xfs_dir2_data_freescan(tp->t_mountp, hdr, needlog); +} + /* * Add an entry to a block directory. */ @@ -63,10 +271,9 @@ int /* error */ xfs_dir2_block_addname( xfs_da_args_t *args) /* directory op arguments */ { - xfs_dir2_data_free_t *bf; /* bestfree table in block */ xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ - xfs_dabuf_t *bp; /* buffer for block */ + struct xfs_buf *bp; /* buffer for block */ xfs_dir2_block_tail_t *btp; /* block tail */ int compact; /* need to compact leaf ents */ xfs_dir2_data_entry_t *dep; /* block data entry */ @@ -94,204 +301,74 @@ xfs_dir2_block_addname( dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the (one and only) directory block into dabuf bp. - */ - if ((error = - xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) { + + /* Read the (one and only) directory block into bp. */ + error = xfs_dir2_block_read(tp, dp, &bp); + if (error) return error; - } - ASSERT(bp != NULL); - hdr = bp->data; - /* - * Check the magic number, corrupted if wrong. - */ - if (unlikely(hdr->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))) { - XFS_CORRUPTION_ERROR("xfs_dir2_block_addname", - XFS_ERRLEVEL_LOW, mp, hdr); - xfs_da_brelse(tp, bp); - return XFS_ERROR(EFSCORRUPTED); - } + len = xfs_dir2_data_entsize(args->namelen); + /* * Set up pointers to parts of the block. */ - bf = hdr->bestfree; + hdr = bp->b_addr; btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); + /* - * No stale entries? Need space for entry and new leaf. - */ - if (!btp->stale) { - /* - * Tag just before the first leaf entry. - */ - tagp = (__be16 *)blp - 1; - /* - * Data object just before the first leaf entry. - */ - enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); - /* - * If it's not free then can't do this add without cleaning up: - * the space before the first leaf entry needs to be free so it - * can be expanded to hold the pointer to the new entry. - */ - if (be16_to_cpu(enddup->freetag) != XFS_DIR2_DATA_FREE_TAG) - dup = enddup = NULL; - /* - * Check out the biggest freespace and see if it's the same one. - */ - else { - dup = (xfs_dir2_data_unused_t *) - ((char *)hdr + be16_to_cpu(bf[0].offset)); - if (dup == enddup) { - /* - * It is the biggest freespace, is it too small - * to hold the new leaf too? - */ - if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) { - /* - * Yes, we use the second-largest - * entry instead if it works. - */ - if (be16_to_cpu(bf[1].length) >= len) - dup = (xfs_dir2_data_unused_t *) - ((char *)hdr + - be16_to_cpu(bf[1].offset)); - else - dup = NULL; - } - } else { - /* - * Not the same free entry, - * just check its length. - */ - if (be16_to_cpu(dup->length) < len) { - dup = NULL; - } - } - } - compact = 0; - } - /* - * If there are stale entries we'll use one for the leaf. - * Is the biggest entry enough to avoid compaction? + * Find out if we can reuse stale entries or whether we need extra + * space for entry and new leaf. */ - else if (be16_to_cpu(bf[0].length) >= len) { - dup = (xfs_dir2_data_unused_t *) - ((char *)hdr + be16_to_cpu(bf[0].offset)); - compact = 0; - } + xfs_dir2_block_need_space(hdr, btp, blp, &tagp, &dup, + &enddup, &compact, len); + /* - * Will need to compact to make this work. + * Done everything we need for a space check now. */ - else { - /* - * Tag just before the first leaf entry. - */ - tagp = (__be16 *)blp - 1; - /* - * Data object just before the first leaf entry. - */ - dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); - /* - * If it's not free then the data will go where the - * leaf data starts now, if it works at all. - */ - if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) * - (uint)sizeof(*blp) < len) - dup = NULL; - } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len) - dup = NULL; - else - dup = (xfs_dir2_data_unused_t *)blp; - compact = 1; + if (args->op_flags & XFS_DA_OP_JUSTCHECK) { + xfs_trans_brelse(tp, bp); + if (!dup) + return XFS_ERROR(ENOSPC); + return 0; } - /* - * If this isn't a real add, we're done with the buffer. - */ - if (args->op_flags & XFS_DA_OP_JUSTCHECK) - xfs_da_brelse(tp, bp); + /* * If we don't have space for the new entry & leaf ... */ if (!dup) { - /* - * Not trying to actually do anything, or don't have - * a space reservation: return no-space. - */ - if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) + /* Don't have a space reservation: return no-space. */ + if (args->total == 0) return XFS_ERROR(ENOSPC); /* * Convert to the next larger format. * Then add the new entry in that format. */ error = xfs_dir2_block_to_leaf(args, bp); - xfs_da_buf_done(bp); if (error) return error; return xfs_dir2_leaf_addname(args); } - /* - * Just checking, and it would work, so say so. - */ - if (args->op_flags & XFS_DA_OP_JUSTCHECK) - return 0; + needlog = needscan = 0; + /* * If need to compact the leaf entries, do it now. - * Leave the highest-numbered stale entry stale. - * XXX should be the one closest to mid but mid is not yet computed. */ if (compact) { - int fromidx; /* source leaf index */ - int toidx; /* target leaf index */ - - for (fromidx = toidx = be32_to_cpu(btp->count) - 1, - highstale = lfloghigh = -1; - fromidx >= 0; - fromidx--) { - if (blp[fromidx].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { - if (highstale == -1) - highstale = toidx; - else { - if (lfloghigh == -1) - lfloghigh = toidx; - continue; - } - } - if (fromidx < toidx) - blp[toidx] = blp[fromidx]; - toidx--; - } - lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1); - lfloghigh -= be32_to_cpu(btp->stale) - 1; - be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1)); - xfs_dir2_data_make_free(tp, bp, - (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), - (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), - &needlog, &needscan); - blp += be32_to_cpu(btp->stale) - 1; - btp->stale = cpu_to_be32(1); + xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog, + &lfloghigh, &lfloglow); + /* recalculate blp post-compaction */ + blp = xfs_dir2_block_leaf_p(btp); + } else if (btp->stale) { /* - * If we now need to rebuild the bestfree map, do so. - * This needs to happen before the next call to use_free. + * Set leaf logging boundaries to impossible state. + * For the no-stale case they're set explicitly. */ - if (needscan) { - xfs_dir2_data_freescan(mp, hdr, &needlog); - needscan = 0; - } - } - /* - * Set leaf logging boundaries to impossible state. - * For the no-stale case they're set explicitly. - */ - else if (btp->stale) { lfloglow = be32_to_cpu(btp->count); lfloghigh = -1; } + /* * Find the slot that's first lower than our hash value, -1 if none. */ @@ -422,7 +499,6 @@ xfs_dir2_block_addname( xfs_dir2_block_log_tail(tp, bp); xfs_dir2_data_log_entry(tp, bp, dep); xfs_dir2_data_check(dp, bp); - xfs_da_buf_done(bp); return 0; } @@ -437,7 +513,7 @@ xfs_dir2_block_getdents( filldir_t filldir) { xfs_dir2_data_hdr_t *hdr; /* block header */ - xfs_dabuf_t *bp; /* buffer for block */ + struct xfs_buf *bp; /* buffer for block */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_dir2_data_unused_t *dup; /* block unused entry */ @@ -452,24 +528,19 @@ xfs_dir2_block_getdents( /* * If the block number in the offset is out of range, we're done. */ - if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) { + if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) return 0; - } - /* - * Can't read the block, give up, else get dabuf in bp. - */ - error = xfs_da_read_buf(NULL, dp, mp->m_dirdatablk, -1, - &bp, XFS_DATA_FORK); + + error = xfs_dir2_block_read(NULL, dp, &bp); if (error) return error; - ASSERT(bp != NULL); /* * Extract the byte offset we start at from the seek pointer. * We'll skip entries before this. */ wantoff = xfs_dir2_dataptr_to_off(mp, *offset); - hdr = bp->data; + hdr = bp->b_addr; xfs_dir2_data_check(dp, bp); /* * Set up values for the loop. @@ -514,7 +585,7 @@ xfs_dir2_block_getdents( cook & 0x7fffffff, be64_to_cpu(dep->inumber), DT_UNKNOWN)) { *offset = cook & 0x7fffffff; - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return 0; } } @@ -525,7 +596,7 @@ xfs_dir2_block_getdents( */ *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & 0x7fffffff; - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return 0; } @@ -535,17 +606,17 @@ xfs_dir2_block_getdents( static void xfs_dir2_block_log_leaf( xfs_trans_t *tp, /* transaction structure */ - xfs_dabuf_t *bp, /* block buffer */ + struct xfs_buf *bp, /* block buffer */ int first, /* index of first logged leaf */ int last) /* index of last logged leaf */ { - xfs_dir2_data_hdr_t *hdr = bp->data; + xfs_dir2_data_hdr_t *hdr = bp->b_addr; xfs_dir2_leaf_entry_t *blp; xfs_dir2_block_tail_t *btp; btp = xfs_dir2_block_tail_p(tp->t_mountp, hdr); blp = xfs_dir2_block_leaf_p(btp); - xfs_da_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr), + xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr), (uint)((char *)&blp[last + 1] - (char *)hdr - 1)); } @@ -555,13 +626,13 @@ xfs_dir2_block_log_leaf( static void xfs_dir2_block_log_tail( xfs_trans_t *tp, /* transaction structure */ - xfs_dabuf_t *bp) /* block buffer */ + struct xfs_buf *bp) /* block buffer */ { - xfs_dir2_data_hdr_t *hdr = bp->data; + xfs_dir2_data_hdr_t *hdr = bp->b_addr; xfs_dir2_block_tail_t *btp; btp = xfs_dir2_block_tail_p(tp->t_mountp, hdr); - xfs_da_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr), + xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr), (uint)((char *)(btp + 1) - (char *)hdr - 1)); } @@ -575,7 +646,7 @@ xfs_dir2_block_lookup( { xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ - xfs_dabuf_t *bp; /* block buffer */ + struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* incore inode */ @@ -593,7 +664,7 @@ xfs_dir2_block_lookup( return error; dp = args->dp; mp = dp->i_mount; - hdr = bp->data; + hdr = bp->b_addr; xfs_dir2_data_check(dp, bp); btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); @@ -607,7 +678,7 @@ xfs_dir2_block_lookup( */ args->inumber = be64_to_cpu(dep->inumber); error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); - xfs_da_brelse(args->trans, bp); + xfs_trans_brelse(args->trans, bp); return XFS_ERROR(error); } @@ -617,13 +688,13 @@ xfs_dir2_block_lookup( static int /* error */ xfs_dir2_block_lookup_int( xfs_da_args_t *args, /* dir lookup arguments */ - xfs_dabuf_t **bpp, /* returned block buffer */ + struct xfs_buf **bpp, /* returned block buffer */ int *entno) /* returned entry number */ { xfs_dir2_dataptr_t addr; /* data entry address */ xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ - xfs_dabuf_t *bp; /* block buffer */ + struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* incore inode */ @@ -639,15 +710,12 @@ xfs_dir2_block_lookup_int( dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the buffer, return error if we can't get it. - */ - if ((error = - xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) { + + error = xfs_dir2_block_read(tp, dp, &bp); + if (error) return error; - } - ASSERT(bp != NULL); - hdr = bp->data; + + hdr = bp->b_addr; xfs_dir2_data_check(dp, bp); btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); @@ -666,7 +734,7 @@ xfs_dir2_block_lookup_int( high = mid - 1; if (low > high) { ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); - xfs_da_brelse(tp, bp); + xfs_trans_brelse(tp, bp); return XFS_ERROR(ENOENT); } } @@ -714,7 +782,7 @@ xfs_dir2_block_lookup_int( /* * No match, release the buffer and return ENOENT. */ - xfs_da_brelse(tp, bp); + xfs_trans_brelse(tp, bp); return XFS_ERROR(ENOENT); } @@ -728,7 +796,7 @@ xfs_dir2_block_removename( { xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf pointer */ - xfs_dabuf_t *bp; /* block buffer */ + struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* incore inode */ @@ -753,7 +821,7 @@ xfs_dir2_block_removename( dp = args->dp; tp = args->trans; mp = dp->i_mount; - hdr = bp->data; + hdr = bp->b_addr; btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); /* @@ -790,10 +858,9 @@ xfs_dir2_block_removename( * See if the size as a shortform is good enough. */ size = xfs_dir2_block_sfsize(dp, hdr, &sfh); - if (size > XFS_IFORK_DSIZE(dp)) { - xfs_da_buf_done(bp); + if (size > XFS_IFORK_DSIZE(dp)) return 0; - } + /* * If it works, do the conversion. */ @@ -810,7 +877,7 @@ xfs_dir2_block_replace( { xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ - xfs_dabuf_t *bp; /* block buffer */ + struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* incore inode */ @@ -829,7 +896,7 @@ xfs_dir2_block_replace( } dp = args->dp; mp = dp->i_mount; - hdr = bp->data; + hdr = bp->b_addr; btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); /* @@ -844,7 +911,6 @@ xfs_dir2_block_replace( dep->inumber = cpu_to_be64(args->inumber); xfs_dir2_data_log_entry(args->trans, bp, dep); xfs_dir2_data_check(dp, bp); - xfs_da_buf_done(bp); return 0; } @@ -871,8 +937,8 @@ xfs_dir2_block_sort( int /* error */ xfs_dir2_leaf_to_block( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *lbp, /* leaf buffer */ - xfs_dabuf_t *dbp) /* data buffer */ + struct xfs_buf *lbp, /* leaf buffer */ + struct xfs_buf *dbp) /* data buffer */ { __be16 *bestsp; /* leaf bests table */ xfs_dir2_data_hdr_t *hdr; /* block header */ @@ -898,7 +964,7 @@ xfs_dir2_leaf_to_block( dp = args->dp; tp = args->trans; mp = dp->i_mount; - leaf = lbp->data; + leaf = lbp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); ltp = xfs_dir2_leaf_tail_p(mp, leaf); /* @@ -914,21 +980,19 @@ xfs_dir2_leaf_to_block( if ((error = xfs_dir2_leaf_trim_data(args, lbp, (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1)))) - goto out; - } else { - error = 0; - goto out; - } + return error; + } else + return 0; } /* * Read the data block if we don't already have it, give up if it fails. */ - if (dbp == NULL && - (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp, - XFS_DATA_FORK))) { - goto out; + if (!dbp) { + error = xfs_dir2_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp); + if (error) + return error; } - hdr = dbp->data; + hdr = dbp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); /* * Size of the "leaf" area in the block. @@ -944,13 +1008,13 @@ xfs_dir2_leaf_to_block( * If it's not free or is too short we can't do it. */ if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG || - be16_to_cpu(dup->length) < size) { - error = 0; - goto out; - } + be16_to_cpu(dup->length) < size) + return 0; + /* * Start converting it to block form. */ + dbp->b_ops = &xfs_dir2_block_buf_ops; hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); needlog = 1; needscan = 0; @@ -989,25 +1053,17 @@ xfs_dir2_leaf_to_block( * Pitch the old leaf block. */ error = xfs_da_shrink_inode(args, mp->m_dirleafblk, lbp); - lbp = NULL; - if (error) { - goto out; - } + if (error) + return error; + /* * Now see if the resulting block can be shrunken to shortform. */ size = xfs_dir2_block_sfsize(dp, hdr, &sfh); - if (size > XFS_IFORK_DSIZE(dp)) { - error = 0; - goto out; - } + if (size > XFS_IFORK_DSIZE(dp)) + return 0; + return xfs_dir2_block_to_sf(args, dbp, size, &sfh); -out: - if (lbp) - xfs_da_buf_done(lbp); - if (dbp) - xfs_da_buf_done(dbp); - return error; } /* @@ -1020,7 +1076,7 @@ xfs_dir2_sf_to_block( xfs_dir2_db_t blkno; /* dir-relative block # (0) */ xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ - xfs_dabuf_t *bp; /* block buffer */ + struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail pointer */ xfs_dir2_data_entry_t *dep; /* data entry pointer */ xfs_inode_t *dp; /* incore directory inode */ @@ -1088,7 +1144,8 @@ xfs_dir2_sf_to_block( kmem_free(sfp); return error; } - hdr = bp->data; + bp->b_ops = &xfs_dir2_block_buf_ops; + hdr = bp->b_addr; hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); /* * Compute size of block "tail" area. @@ -1217,6 +1274,5 @@ xfs_dir2_sf_to_block( xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1); xfs_dir2_block_log_tail(tp, bp); xfs_dir2_data_check(dp, bp); - xfs_da_buf_done(bp); return 0; } diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index 2046988e9eb..ffcf1774152 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -34,16 +34,15 @@ STATIC xfs_dir2_data_free_t * xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); -#ifdef DEBUG /* * Check the consistency of the data block. * The input can also be a block-format directory. - * Pop an assert if we find anything bad. + * Return 0 is the buffer is good, otherwise an error. */ -void -xfs_dir2_data_check( - xfs_inode_t *dp, /* incore inode pointer */ - xfs_dabuf_t *bp) /* data block's buffer */ +int +__xfs_dir2_data_check( + struct xfs_inode *dp, /* incore inode pointer */ + struct xfs_buf *bp) /* data block's buffer */ { xfs_dir2_dataptr_t addr; /* addr for leaf lookup */ xfs_dir2_data_free_t *bf; /* bestfree table */ @@ -64,18 +63,23 @@ xfs_dir2_data_check( int stale; /* count of stale leaves */ struct xfs_name name; - mp = dp->i_mount; - hdr = bp->data; + mp = bp->b_target->bt_mount; + hdr = bp->b_addr; bf = hdr->bestfree; p = (char *)(hdr + 1); - if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { + switch (hdr->magic) { + case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): btp = xfs_dir2_block_tail_p(mp, hdr); lep = xfs_dir2_block_leaf_p(btp); endp = (char *)lep; - } else { - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); + break; + case cpu_to_be32(XFS_DIR2_DATA_MAGIC): endp = (char *)hdr + mp->m_dirblksize; + break; + default: + XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp); + return EFSCORRUPTED; } count = lastfree = freeseen = 0; @@ -83,19 +87,22 @@ xfs_dir2_data_check( * Account for zero bestfree entries. */ if (!bf[0].length) { - ASSERT(!bf[0].offset); + XFS_WANT_CORRUPTED_RETURN(!bf[0].offset); freeseen |= 1 << 0; } if (!bf[1].length) { - ASSERT(!bf[1].offset); + XFS_WANT_CORRUPTED_RETURN(!bf[1].offset); freeseen |= 1 << 1; } if (!bf[2].length) { - ASSERT(!bf[2].offset); + XFS_WANT_CORRUPTED_RETURN(!bf[2].offset); freeseen |= 1 << 2; } - ASSERT(be16_to_cpu(bf[0].length) >= be16_to_cpu(bf[1].length)); - ASSERT(be16_to_cpu(bf[1].length) >= be16_to_cpu(bf[2].length)); + + XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >= + be16_to_cpu(bf[1].length)); + XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >= + be16_to_cpu(bf[2].length)); /* * Loop over the data/unused entries. */ @@ -107,17 +114,20 @@ xfs_dir2_data_check( * doesn't need to be there. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - ASSERT(lastfree == 0); - ASSERT(be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == - (char *)dup - (char *)hdr); + XFS_WANT_CORRUPTED_RETURN(lastfree == 0); + XFS_WANT_CORRUPTED_RETURN( + be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == + (char *)dup - (char *)hdr); dfp = xfs_dir2_data_freefind(hdr, dup); if (dfp) { i = (int)(dfp - bf); - ASSERT((freeseen & (1 << i)) == 0); + XFS_WANT_CORRUPTED_RETURN( + (freeseen & (1 << i)) == 0); freeseen |= 1 << i; } else { - ASSERT(be16_to_cpu(dup->length) <= - be16_to_cpu(bf[2].length)); + XFS_WANT_CORRUPTED_RETURN( + be16_to_cpu(dup->length) <= + be16_to_cpu(bf[2].length)); } p += be16_to_cpu(dup->length); lastfree = 1; @@ -130,10 +140,12 @@ xfs_dir2_data_check( * The linear search is crude but this is DEBUG code. */ dep = (xfs_dir2_data_entry_t *)p; - ASSERT(dep->namelen != 0); - ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0); - ASSERT(be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == - (char *)dep - (char *)hdr); + XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0); + XFS_WANT_CORRUPTED_RETURN( + !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); + XFS_WANT_CORRUPTED_RETURN( + be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == + (char *)dep - (char *)hdr); count++; lastfree = 0; if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { @@ -148,27 +160,122 @@ xfs_dir2_data_check( be32_to_cpu(lep[i].hashval) == hash) break; } - ASSERT(i < be32_to_cpu(btp->count)); + XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count)); } p += xfs_dir2_data_entsize(dep->namelen); } /* * Need to have seen all the entries and all the bestfree slots. */ - ASSERT(freeseen == 7); + XFS_WANT_CORRUPTED_RETURN(freeseen == 7); if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { if (lep[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; if (i > 0) - ASSERT(be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval)); + XFS_WANT_CORRUPTED_RETURN( + be32_to_cpu(lep[i].hashval) >= + be32_to_cpu(lep[i - 1].hashval)); } - ASSERT(count == be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); - ASSERT(stale == be32_to_cpu(btp->stale)); + XFS_WANT_CORRUPTED_RETURN(count == + be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); + XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale)); } + return 0; +} + +static void +xfs_dir2_data_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_data_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC); + block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0; + + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +/* + * Readahead of the first block of the directory when it is opened is completely + * oblivious to the format of the directory. Hence we can either get a block + * format buffer or a data format buffer on readahead. + */ +static void +xfs_dir2_data_reada_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_data_hdr *hdr = bp->b_addr; + + switch (hdr->magic) { + case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): + bp->b_ops = &xfs_dir2_block_buf_ops; + bp->b_ops->verify_read(bp); + return; + case cpu_to_be32(XFS_DIR2_DATA_MAGIC): + xfs_dir2_data_verify(bp); + return; + default: + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + break; + } +} + +static void +xfs_dir2_data_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_data_verify(bp); +} + +static void +xfs_dir2_data_write_verify( + struct xfs_buf *bp) +{ + xfs_dir2_data_verify(bp); +} + +const struct xfs_buf_ops xfs_dir2_data_buf_ops = { + .verify_read = xfs_dir2_data_read_verify, + .verify_write = xfs_dir2_data_write_verify, +}; + +static const struct xfs_buf_ops xfs_dir2_data_reada_buf_ops = { + .verify_read = xfs_dir2_data_reada_verify, + .verify_write = xfs_dir2_data_write_verify, +}; + + +int +xfs_dir2_data_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mapped_bno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, + XFS_DATA_FORK, &xfs_dir2_data_buf_ops); +} + +int +xfs_dir2_data_readahead( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mapped_bno) +{ + return xfs_da_reada_buf(tp, dp, bno, mapped_bno, + XFS_DATA_FORK, &xfs_dir2_data_reada_buf_ops); } -#endif /* * Given a data block and an unused entry from that block, @@ -389,9 +496,9 @@ int /* error */ xfs_dir2_data_init( xfs_da_args_t *args, /* directory operation args */ xfs_dir2_db_t blkno, /* logical dir block number */ - xfs_dabuf_t **bpp) /* output block buffer */ + struct xfs_buf **bpp) /* output block buffer */ { - xfs_dabuf_t *bp; /* block buffer */ + struct xfs_buf *bp; /* block buffer */ xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_data_unused_t *dup; /* unused entry pointer */ @@ -409,15 +516,14 @@ xfs_dir2_data_init( */ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp, XFS_DATA_FORK); - if (error) { + if (error) return error; - } - ASSERT(bp != NULL); + bp->b_ops = &xfs_dir2_data_buf_ops; /* * Initialize the header. */ - hdr = bp->data; + hdr = bp->b_addr; hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); hdr->bestfree[0].offset = cpu_to_be16(sizeof(*hdr)); for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) { @@ -449,16 +555,16 @@ xfs_dir2_data_init( */ void xfs_dir2_data_log_entry( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* block buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp, xfs_dir2_data_entry_t *dep) /* data entry pointer */ { - xfs_dir2_data_hdr_t *hdr = bp->data; + xfs_dir2_data_hdr_t *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); - xfs_da_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr), + xfs_trans_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr), (uint)((char *)(xfs_dir2_data_entry_tag_p(dep) + 1) - (char *)hdr - 1)); } @@ -468,15 +574,15 @@ xfs_dir2_data_log_entry( */ void xfs_dir2_data_log_header( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp) /* block buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp) { - xfs_dir2_data_hdr_t *hdr = bp->data; + xfs_dir2_data_hdr_t *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); - xfs_da_log_buf(tp, bp, 0, sizeof(*hdr) - 1); + xfs_trans_log_buf(tp, bp, 0, sizeof(*hdr) - 1); } /* @@ -484,11 +590,11 @@ xfs_dir2_data_log_header( */ void xfs_dir2_data_log_unused( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* block buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp, xfs_dir2_data_unused_t *dup) /* data unused pointer */ { - xfs_dir2_data_hdr_t *hdr = bp->data; + xfs_dir2_data_hdr_t *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); @@ -496,13 +602,13 @@ xfs_dir2_data_log_unused( /* * Log the first part of the unused entry. */ - xfs_da_log_buf(tp, bp, (uint)((char *)dup - (char *)hdr), + xfs_trans_log_buf(tp, bp, (uint)((char *)dup - (char *)hdr), (uint)((char *)&dup->length + sizeof(dup->length) - 1 - (char *)hdr)); /* * Log the end (tag) of the unused entry. */ - xfs_da_log_buf(tp, bp, + xfs_trans_log_buf(tp, bp, (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr), (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr + sizeof(xfs_dir2_data_off_t) - 1)); @@ -514,8 +620,8 @@ xfs_dir2_data_log_unused( */ void xfs_dir2_data_make_free( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* block buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp, xfs_dir2_data_aoff_t offset, /* starting byte offset */ xfs_dir2_data_aoff_t len, /* length in bytes */ int *needlogp, /* out: log header */ @@ -531,7 +637,7 @@ xfs_dir2_data_make_free( xfs_dir2_data_unused_t *prevdup; /* unused entry before us */ mp = tp->t_mountp; - hdr = bp->data; + hdr = bp->b_addr; /* * Figure out where the end of the data area is. @@ -696,8 +802,8 @@ xfs_dir2_data_make_free( */ void xfs_dir2_data_use_free( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* data block buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp, xfs_dir2_data_unused_t *dup, /* unused entry */ xfs_dir2_data_aoff_t offset, /* starting offset to use */ xfs_dir2_data_aoff_t len, /* length to use */ @@ -713,7 +819,7 @@ xfs_dir2_data_use_free( xfs_dir2_data_unused_t *newdup2; /* another new unused entry */ int oldlen; /* old unused entry's length */ - hdr = bp->data; + hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG); diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 397ffbcbab1..60cd2fa4e04 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -38,16 +38,93 @@ * Local function declarations. */ #ifdef DEBUG -static void xfs_dir2_leaf_check(xfs_inode_t *dp, xfs_dabuf_t *bp); +static void xfs_dir2_leaf_check(struct xfs_inode *dp, struct xfs_buf *bp); #else #define xfs_dir2_leaf_check(dp, bp) #endif -static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **lbpp, - int *indexp, xfs_dabuf_t **dbpp); -static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp, +static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp, + int *indexp, struct xfs_buf **dbpp); +static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp, int first, int last); -static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_dabuf *bp); +static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp); +static void +xfs_dir2_leaf_verify( + struct xfs_buf *bp, + __be16 magic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_leaf_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->info.magic == magic; + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +xfs_dir2_leaf1_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); +} + +static void +xfs_dir2_leaf1_write_verify( + struct xfs_buf *bp) +{ + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); +} + +void +xfs_dir2_leafn_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); +} + +void +xfs_dir2_leafn_write_verify( + struct xfs_buf *bp) +{ + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); +} + +static const struct xfs_buf_ops xfs_dir2_leaf1_buf_ops = { + .verify_read = xfs_dir2_leaf1_read_verify, + .verify_write = xfs_dir2_leaf1_write_verify, +}; + +const struct xfs_buf_ops xfs_dir2_leafn_buf_ops = { + .verify_read = xfs_dir2_leafn_read_verify, + .verify_write = xfs_dir2_leafn_write_verify, +}; + +static int +xfs_dir2_leaf_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir2_leaf1_buf_ops); +} + +int +xfs_dir2_leafn_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir2_leafn_buf_ops); +} /* * Convert a block form directory to a leaf form directory. @@ -55,7 +132,7 @@ static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_dabuf *bp); int /* error */ xfs_dir2_block_to_leaf( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *dbp) /* input block's buffer */ + struct xfs_buf *dbp) /* input block's buffer */ { __be16 *bestsp; /* leaf's bestsp entries */ xfs_dablk_t blkno; /* leaf block's bno */ @@ -64,7 +141,7 @@ xfs_dir2_block_to_leaf( xfs_dir2_block_tail_t *btp; /* block's tail */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ - xfs_dabuf_t *lbp; /* leaf block's buffer */ + struct xfs_buf *lbp; /* leaf block's buffer */ xfs_dir2_db_t ldb; /* leaf block's bno */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_tail_t *ltp; /* leaf's tail */ @@ -95,8 +172,8 @@ xfs_dir2_block_to_leaf( return error; } ASSERT(lbp != NULL); - leaf = lbp->data; - hdr = dbp->data; + leaf = lbp->b_addr; + hdr = dbp->b_addr; xfs_dir2_data_check(dp, dbp); btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); @@ -125,6 +202,7 @@ xfs_dir2_block_to_leaf( /* * Fix up the block header, make it a data block. */ + dbp->b_ops = &xfs_dir2_data_buf_ops; hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); if (needscan) xfs_dir2_data_freescan(mp, hdr, &needlog); @@ -143,7 +221,6 @@ xfs_dir2_block_to_leaf( xfs_dir2_leaf_check(dp, lbp); xfs_dir2_data_check(dp, dbp); xfs_dir2_leaf_log_bests(tp, lbp, 0, 0); - xfs_da_buf_done(lbp); return 0; } @@ -282,7 +359,7 @@ xfs_dir2_leaf_addname( __be16 *bestsp; /* freespace table in leaf */ int compact; /* need to compact leaves */ xfs_dir2_data_hdr_t *hdr; /* data block header */ - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data block entry */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_data_unused_t *dup; /* data unused entry */ @@ -291,7 +368,7 @@ xfs_dir2_leaf_addname( int highstale; /* index of next stale leaf */ int i; /* temporary, index */ int index; /* leaf table position */ - xfs_dabuf_t *lbp; /* leaf's buffer */ + struct xfs_buf *lbp; /* leaf's buffer */ xfs_dir2_leaf_t *leaf; /* leaf structure */ int length; /* length of new entry */ xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */ @@ -312,15 +389,11 @@ xfs_dir2_leaf_addname( dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the leaf block. - */ - error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, - XFS_DATA_FORK); - if (error) { + + error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); + if (error) return error; - } - ASSERT(lbp != NULL); + /* * Look up the entry by hash value and name. * We know it's not there, our caller has already done a lookup. @@ -328,7 +401,7 @@ xfs_dir2_leaf_addname( * But if there are dup hash values the index is of the first of those. */ index = xfs_dir2_leaf_search_hash(args, lbp); - leaf = lbp->data; + leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(mp, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); length = xfs_dir2_data_entsize(args->namelen); @@ -402,14 +475,13 @@ xfs_dir2_leaf_addname( */ if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return XFS_ERROR(ENOSPC); } /* * Convert to node form. */ error = xfs_dir2_leaf_to_node(args, lbp); - xfs_da_buf_done(lbp); if (error) return error; /* @@ -427,7 +499,7 @@ xfs_dir2_leaf_addname( * a new data block. */ if (args->op_flags & XFS_DA_OP_JUSTCHECK) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return use_block == -1 ? XFS_ERROR(ENOSPC) : 0; } /* @@ -435,7 +507,7 @@ xfs_dir2_leaf_addname( * changed anything. */ if (args->total == 0 && use_block == -1) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return XFS_ERROR(ENOSPC); } /* @@ -466,14 +538,14 @@ xfs_dir2_leaf_addname( */ if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &use_block))) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return error; } /* * Initialize the block. */ if ((error = xfs_dir2_data_init(args, use_block, &dbp))) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return error; } /* @@ -493,25 +565,24 @@ xfs_dir2_leaf_addname( */ else xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block); - hdr = dbp->data; + hdr = dbp->b_addr; bestsp[use_block] = hdr->bestfree[0].length; grown = 1; - } - /* - * Already had space in some data block. - * Just read that one in. - */ - else { - if ((error = - xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block), - -1, &dbp, XFS_DATA_FORK))) { - xfs_da_brelse(tp, lbp); + } else { + /* + * Already had space in some data block. + * Just read that one in. + */ + error = xfs_dir2_data_read(tp, dp, + xfs_dir2_db_to_da(mp, use_block), + -1, &dbp); + if (error) { + xfs_trans_brelse(tp, lbp); return error; } - hdr = dbp->data; + hdr = dbp->b_addr; grown = 0; } - xfs_dir2_data_check(dp, dbp); /* * Point to the biggest freespace in our data block. */ @@ -570,9 +641,7 @@ xfs_dir2_leaf_addname( xfs_dir2_leaf_log_header(tp, lbp); xfs_dir2_leaf_log_ents(tp, lbp, lfloglow, lfloghigh); xfs_dir2_leaf_check(dp, lbp); - xfs_da_buf_done(lbp); xfs_dir2_data_check(dp, dbp); - xfs_da_buf_done(dbp); return 0; } @@ -583,8 +652,8 @@ xfs_dir2_leaf_addname( */ STATIC void xfs_dir2_leaf_check( - xfs_inode_t *dp, /* incore directory inode */ - xfs_dabuf_t *bp) /* leaf's buffer */ + struct xfs_inode *dp, /* incore directory inode */ + struct xfs_buf *bp) /* leaf's buffer */ { int i; /* leaf index */ xfs_dir2_leaf_t *leaf; /* leaf structure */ @@ -592,7 +661,7 @@ xfs_dir2_leaf_check( xfs_mount_t *mp; /* filesystem mount point */ int stale; /* count of stale leaves */ - leaf = bp->data; + leaf = bp->b_addr; mp = dp->i_mount; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); /* @@ -628,14 +697,14 @@ xfs_dir2_leaf_check( void xfs_dir2_leaf_compact( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *bp) /* leaf buffer */ + struct xfs_buf *bp) /* leaf buffer */ { int from; /* source leaf index */ xfs_dir2_leaf_t *leaf; /* leaf structure */ int loglow; /* first leaf entry to log */ int to; /* target leaf index */ - leaf = bp->data; + leaf = bp->b_addr; if (!leaf->hdr.stale) { return; } @@ -677,7 +746,7 @@ xfs_dir2_leaf_compact( */ void xfs_dir2_leaf_compact_x1( - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ int *indexp, /* insertion index */ int *lowstalep, /* out: stale entry before us */ int *highstalep, /* out: stale entry after us */ @@ -693,7 +762,7 @@ xfs_dir2_leaf_compact_x1( int newindex=0; /* new insertion index */ int to; /* destination copy index */ - leaf = bp->data; + leaf = bp->b_addr; ASSERT(be16_to_cpu(leaf->hdr.stale) > 1); index = *indexp; @@ -763,6 +832,216 @@ xfs_dir2_leaf_compact_x1( *highstalep = highstale; } +struct xfs_dir2_leaf_map_info { + xfs_extlen_t map_blocks; /* number of fsbs in map */ + xfs_dablk_t map_off; /* last mapped file offset */ + int map_size; /* total entries in *map */ + int map_valid; /* valid entries in *map */ + int nmap; /* mappings to ask xfs_bmapi */ + xfs_dir2_db_t curdb; /* db for current block */ + int ra_current; /* number of read-ahead blks */ + int ra_index; /* *map index for read-ahead */ + int ra_offset; /* map entry offset for ra */ + int ra_want; /* readahead count wanted */ + struct xfs_bmbt_irec map[]; /* map vector for blocks */ +}; + +STATIC int +xfs_dir2_leaf_readbuf( + struct xfs_inode *dp, + size_t bufsize, + struct xfs_dir2_leaf_map_info *mip, + xfs_dir2_off_t *curoff, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp = *bpp; + struct xfs_bmbt_irec *map = mip->map; + int error = 0; + int length; + int i; + int j; + + /* + * If we have a buffer, we need to release it and + * take it out of the mapping. + */ + + if (bp) { + xfs_trans_brelse(NULL, bp); + bp = NULL; + mip->map_blocks -= mp->m_dirblkfsbs; + /* + * Loop to get rid of the extents for the + * directory block. + */ + for (i = mp->m_dirblkfsbs; i > 0; ) { + j = min_t(int, map->br_blockcount, i); + map->br_blockcount -= j; + map->br_startblock += j; + map->br_startoff += j; + /* + * If mapping is done, pitch it from + * the table. + */ + if (!map->br_blockcount && --mip->map_valid) + memmove(&map[0], &map[1], + sizeof(map[0]) * mip->map_valid); + i -= j; + } + } + + /* + * Recalculate the readahead blocks wanted. + */ + mip->ra_want = howmany(bufsize + mp->m_dirblksize, + mp->m_sb.sb_blocksize) - 1; + ASSERT(mip->ra_want >= 0); + + /* + * If we don't have as many as we want, and we haven't + * run out of data blocks, get some more mappings. + */ + if (1 + mip->ra_want > mip->map_blocks && + mip->map_off < xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) { + /* + * Get more bmaps, fill in after the ones + * we already have in the table. + */ + mip->nmap = mip->map_size - mip->map_valid; + error = xfs_bmapi_read(dp, mip->map_off, + xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) - + mip->map_off, + &map[mip->map_valid], &mip->nmap, 0); + + /* + * Don't know if we should ignore this or try to return an + * error. The trouble with returning errors is that readdir + * will just stop without actually passing the error through. + */ + if (error) + goto out; /* XXX */ + + /* + * If we got all the mappings we asked for, set the final map + * offset based on the last bmap value received. Otherwise, + * we've reached the end. + */ + if (mip->nmap == mip->map_size - mip->map_valid) { + i = mip->map_valid + mip->nmap - 1; + mip->map_off = map[i].br_startoff + map[i].br_blockcount; + } else + mip->map_off = xfs_dir2_byte_to_da(mp, + XFS_DIR2_LEAF_OFFSET); + + /* + * Look for holes in the mapping, and eliminate them. Count up + * the valid blocks. + */ + for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) { + if (map[i].br_startblock == HOLESTARTBLOCK) { + mip->nmap--; + length = mip->map_valid + mip->nmap - i; + if (length) + memmove(&map[i], &map[i + 1], + sizeof(map[i]) * length); + } else { + mip->map_blocks += map[i].br_blockcount; + i++; + } + } + mip->map_valid += mip->nmap; + } + + /* + * No valid mappings, so no more data blocks. + */ + if (!mip->map_valid) { + *curoff = xfs_dir2_da_to_byte(mp, mip->map_off); + goto out; + } + + /* + * Read the directory block starting at the first mapping. + */ + mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff); + error = xfs_dir2_data_read(NULL, dp, map->br_startoff, + map->br_blockcount >= mp->m_dirblkfsbs ? + XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp); + + /* + * Should just skip over the data block instead of giving up. + */ + if (error) + goto out; /* XXX */ + + /* + * Adjust the current amount of read-ahead: we just read a block that + * was previously ra. + */ + if (mip->ra_current) + mip->ra_current -= mp->m_dirblkfsbs; + + /* + * Do we need more readahead? + */ + for (mip->ra_index = mip->ra_offset = i = 0; + mip->ra_want > mip->ra_current && i < mip->map_blocks; + i += mp->m_dirblkfsbs) { + ASSERT(mip->ra_index < mip->map_valid); + /* + * Read-ahead a contiguous directory block. + */ + if (i > mip->ra_current && + map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { + xfs_dir2_data_readahead(NULL, dp, + map[mip->ra_index].br_startoff + mip->ra_offset, + XFS_FSB_TO_DADDR(mp, + map[mip->ra_index].br_startblock + + mip->ra_offset)); + mip->ra_current = i; + } + + /* + * Read-ahead a non-contiguous directory block. This doesn't + * use our mapping, but this is a very rare case. + */ + else if (i > mip->ra_current) { + xfs_dir2_data_readahead(NULL, dp, + map[mip->ra_index].br_startoff + + mip->ra_offset, -1); + mip->ra_current = i; + } + + /* + * Advance offset through the mapping table. + */ + for (j = 0; j < mp->m_dirblkfsbs; j++) { + /* + * The rest of this extent but not more than a dir + * block. + */ + length = min_t(int, mp->m_dirblkfsbs, + map[mip->ra_index].br_blockcount - + mip->ra_offset); + j += length; + mip->ra_offset += length; + + /* + * Advance to the next mapping if this one is used up. + */ + if (mip->ra_offset == map[mip->ra_index].br_blockcount) { + mip->ra_offset = 0; + mip->ra_index++; + } + } + } + +out: + *bpp = bp; + return error; +} + /* * Getdents (readdir) for leaf and node directories. * This reads the data blocks only, so is the same for both forms. @@ -775,30 +1054,18 @@ xfs_dir2_leaf_getdents( xfs_off_t *offset, filldir_t filldir) { - xfs_dabuf_t *bp; /* data block buffer */ - int byteoff; /* offset in current block */ - xfs_dir2_db_t curdb; /* db for current block */ - xfs_dir2_off_t curoff; /* current overall offset */ + struct xfs_buf *bp = NULL; /* data block buffer */ xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_data_entry_t *dep; /* data entry */ xfs_dir2_data_unused_t *dup; /* unused entry */ int error = 0; /* error return value */ - int i; /* temporary loop index */ - int j; /* temporary loop index */ int length; /* temporary length value */ - xfs_bmbt_irec_t *map; /* map vector for blocks */ - xfs_extlen_t map_blocks; /* number of fsbs in map */ - xfs_dablk_t map_off; /* last mapped file offset */ - int map_size; /* total entries in *map */ - int map_valid; /* valid entries in *map */ xfs_mount_t *mp; /* filesystem mount point */ + int byteoff; /* offset in current block */ + xfs_dir2_off_t curoff; /* current overall offset */ xfs_dir2_off_t newoff; /* new curoff after new blk */ - int nmap; /* mappings to ask xfs_bmapi */ char *ptr = NULL; /* pointer to current data */ - int ra_current; /* number of read-ahead blks */ - int ra_index; /* *map index for read-ahead */ - int ra_offset; /* map entry offset for ra */ - int ra_want; /* readahead count wanted */ + struct xfs_dir2_leaf_map_info *map_info; /* * If the offset is at or past the largest allowed value, @@ -814,10 +1081,12 @@ xfs_dir2_leaf_getdents( * buffer size, the directory block size, and the filesystem * block size. */ - map_size = howmany(bufsize + mp->m_dirblksize, mp->m_sb.sb_blocksize); - map = kmem_alloc(map_size * sizeof(*map), KM_SLEEP); - map_valid = ra_index = ra_offset = ra_current = map_blocks = 0; - bp = NULL; + length = howmany(bufsize + mp->m_dirblksize, + mp->m_sb.sb_blocksize); + map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) + + (length * sizeof(struct xfs_bmbt_irec)), + KM_SLEEP); + map_info->map_size = length; /* * Inside the loop we keep the main offset value as a byte offset @@ -829,7 +1098,9 @@ xfs_dir2_leaf_getdents( * Force this conversion through db so we truncate the offset * down to get the start of the data block. */ - map_off = xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, curoff)); + map_info->map_off = xfs_dir2_db_to_da(mp, + xfs_dir2_byte_to_db(mp, curoff)); + /* * Loop over directory entries until we reach the end offset. * Get more blocks and readahead as necessary. @@ -839,191 +1110,17 @@ xfs_dir2_leaf_getdents( * If we have no buffer, or we're off the end of the * current buffer, need to get another one. */ - if (!bp || ptr >= (char *)bp->data + mp->m_dirblksize) { - /* - * If we have a buffer, we need to release it and - * take it out of the mapping. - */ - if (bp) { - xfs_da_brelse(NULL, bp); - bp = NULL; - map_blocks -= mp->m_dirblkfsbs; - /* - * Loop to get rid of the extents for the - * directory block. - */ - for (i = mp->m_dirblkfsbs; i > 0; ) { - j = MIN((int)map->br_blockcount, i); - map->br_blockcount -= j; - map->br_startblock += j; - map->br_startoff += j; - /* - * If mapping is done, pitch it from - * the table. - */ - if (!map->br_blockcount && --map_valid) - memmove(&map[0], &map[1], - sizeof(map[0]) * - map_valid); - i -= j; - } - } - /* - * Recalculate the readahead blocks wanted. - */ - ra_want = howmany(bufsize + mp->m_dirblksize, - mp->m_sb.sb_blocksize) - 1; - ASSERT(ra_want >= 0); + if (!bp || ptr >= (char *)bp->b_addr + mp->m_dirblksize) { - /* - * If we don't have as many as we want, and we haven't - * run out of data blocks, get some more mappings. - */ - if (1 + ra_want > map_blocks && - map_off < - xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) { - /* - * Get more bmaps, fill in after the ones - * we already have in the table. - */ - nmap = map_size - map_valid; - error = xfs_bmapi_read(dp, map_off, - xfs_dir2_byte_to_da(mp, - XFS_DIR2_LEAF_OFFSET) - map_off, - &map[map_valid], &nmap, 0); - /* - * Don't know if we should ignore this or - * try to return an error. - * The trouble with returning errors - * is that readdir will just stop without - * actually passing the error through. - */ - if (error) - break; /* XXX */ - /* - * If we got all the mappings we asked for, - * set the final map offset based on the - * last bmap value received. - * Otherwise, we've reached the end. - */ - if (nmap == map_size - map_valid) - map_off = - map[map_valid + nmap - 1].br_startoff + - map[map_valid + nmap - 1].br_blockcount; - else - map_off = - xfs_dir2_byte_to_da(mp, - XFS_DIR2_LEAF_OFFSET); - /* - * Look for holes in the mapping, and - * eliminate them. Count up the valid blocks. - */ - for (i = map_valid; i < map_valid + nmap; ) { - if (map[i].br_startblock == - HOLESTARTBLOCK) { - nmap--; - length = map_valid + nmap - i; - if (length) - memmove(&map[i], - &map[i + 1], - sizeof(map[i]) * - length); - } else { - map_blocks += - map[i].br_blockcount; - i++; - } - } - map_valid += nmap; - } - /* - * No valid mappings, so no more data blocks. - */ - if (!map_valid) { - curoff = xfs_dir2_da_to_byte(mp, map_off); + error = xfs_dir2_leaf_readbuf(dp, bufsize, map_info, + &curoff, &bp); + if (error || !map_info->map_valid) break; - } - /* - * Read the directory block starting at the first - * mapping. - */ - curdb = xfs_dir2_da_to_db(mp, map->br_startoff); - error = xfs_da_read_buf(NULL, dp, map->br_startoff, - map->br_blockcount >= mp->m_dirblkfsbs ? - XFS_FSB_TO_DADDR(mp, map->br_startblock) : - -1, - &bp, XFS_DATA_FORK); - /* - * Should just skip over the data block instead - * of giving up. - */ - if (error) - break; /* XXX */ - /* - * Adjust the current amount of read-ahead: we just - * read a block that was previously ra. - */ - if (ra_current) - ra_current -= mp->m_dirblkfsbs; - /* - * Do we need more readahead? - */ - for (ra_index = ra_offset = i = 0; - ra_want > ra_current && i < map_blocks; - i += mp->m_dirblkfsbs) { - ASSERT(ra_index < map_valid); - /* - * Read-ahead a contiguous directory block. - */ - if (i > ra_current && - map[ra_index].br_blockcount >= - mp->m_dirblkfsbs) { - xfs_buf_readahead(mp->m_ddev_targp, - XFS_FSB_TO_DADDR(mp, - map[ra_index].br_startblock + - ra_offset), - (int)BTOBB(mp->m_dirblksize)); - ra_current = i; - } - /* - * Read-ahead a non-contiguous directory block. - * This doesn't use our mapping, but this - * is a very rare case. - */ - else if (i > ra_current) { - (void)xfs_da_reada_buf(NULL, dp, - map[ra_index].br_startoff + - ra_offset, XFS_DATA_FORK); - ra_current = i; - } - /* - * Advance offset through the mapping table. - */ - for (j = 0; j < mp->m_dirblkfsbs; j++) { - /* - * The rest of this extent but not - * more than a dir block. - */ - length = MIN(mp->m_dirblkfsbs, - (int)(map[ra_index].br_blockcount - - ra_offset)); - j += length; - ra_offset += length; - /* - * Advance to the next mapping if - * this one is used up. - */ - if (ra_offset == - map[ra_index].br_blockcount) { - ra_offset = 0; - ra_index++; - } - } - } + /* * Having done a read, we need to set a new offset. */ - newoff = xfs_dir2_db_off_to_byte(mp, curdb, 0); + newoff = xfs_dir2_db_off_to_byte(mp, map_info->curdb, 0); /* * Start of the current block. */ @@ -1034,8 +1131,8 @@ xfs_dir2_leaf_getdents( */ else if (curoff > newoff) ASSERT(xfs_dir2_byte_to_db(mp, curoff) == - curdb); - hdr = bp->data; + map_info->curdb); + hdr = bp->b_addr; xfs_dir2_data_check(dp, bp); /* * Find our position in the block. @@ -1117,9 +1214,9 @@ xfs_dir2_leaf_getdents( *offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; else *offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; - kmem_free(map); + kmem_free(map_info); if (bp) - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return error; } @@ -1130,10 +1227,10 @@ int xfs_dir2_leaf_init( xfs_da_args_t *args, /* operation arguments */ xfs_dir2_db_t bno, /* directory block number */ - xfs_dabuf_t **bpp, /* out: leaf buffer */ + struct xfs_buf **bpp, /* out: leaf buffer */ int magic) /* magic number for block */ { - xfs_dabuf_t *bp; /* leaf buffer */ + struct xfs_buf *bp; /* leaf buffer */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ xfs_dir2_leaf_t *leaf; /* leaf structure */ @@ -1151,15 +1248,14 @@ xfs_dir2_leaf_init( * Get the buffer for the block. */ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, - XFS_DATA_FORK); - if (error) { + XFS_DATA_FORK); + if (error) return error; - } - ASSERT(bp != NULL); - leaf = bp->data; + /* * Initialize the header. */ + leaf = bp->b_addr; leaf->hdr.info.magic = cpu_to_be16(magic); leaf->hdr.info.forw = 0; leaf->hdr.info.back = 0; @@ -1172,10 +1268,12 @@ xfs_dir2_leaf_init( * the block. */ if (magic == XFS_DIR2_LEAF1_MAGIC) { + bp->b_ops = &xfs_dir2_leaf1_buf_ops; ltp = xfs_dir2_leaf_tail_p(mp, leaf); ltp->bestcount = 0; xfs_dir2_leaf_log_tail(tp, bp); - } + } else + bp->b_ops = &xfs_dir2_leafn_buf_ops; *bpp = bp; return 0; } @@ -1186,7 +1284,7 @@ xfs_dir2_leaf_init( static void xfs_dir2_leaf_log_bests( xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ int first, /* first entry to log */ int last) /* last entry to log */ { @@ -1195,12 +1293,12 @@ xfs_dir2_leaf_log_bests( xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); ltp = xfs_dir2_leaf_tail_p(tp->t_mountp, leaf); firstb = xfs_dir2_leaf_bests_p(ltp) + first; lastb = xfs_dir2_leaf_bests_p(ltp) + last; - xfs_da_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf), + xfs_trans_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf), (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1)); } @@ -1210,7 +1308,7 @@ xfs_dir2_leaf_log_bests( void xfs_dir2_leaf_log_ents( xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ int first, /* first entry to log */ int last) /* last entry to log */ { @@ -1218,12 +1316,12 @@ xfs_dir2_leaf_log_ents( xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */ xfs_dir2_leaf_t *leaf; /* leaf structure */ - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); firstlep = &leaf->ents[first]; lastlep = &leaf->ents[last]; - xfs_da_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf), + xfs_trans_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf), (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1)); } @@ -1232,15 +1330,15 @@ xfs_dir2_leaf_log_ents( */ void xfs_dir2_leaf_log_header( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp) /* leaf buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp) { xfs_dir2_leaf_t *leaf; /* leaf structure */ - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - xfs_da_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf), + xfs_trans_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf), (uint)(sizeof(leaf->hdr) - 1)); } @@ -1249,18 +1347,18 @@ xfs_dir2_leaf_log_header( */ STATIC void xfs_dir2_leaf_log_tail( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp) /* leaf buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp) { xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ xfs_mount_t *mp; /* filesystem mount point */ mp = tp->t_mountp; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); ltp = xfs_dir2_leaf_tail_p(mp, leaf); - xfs_da_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf), + xfs_trans_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf), (uint)(mp->m_dirblksize - 1)); } @@ -1273,12 +1371,12 @@ int xfs_dir2_leaf_lookup( xfs_da_args_t *args) /* operation arguments */ { - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data block entry */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ int index; /* found entry index */ - xfs_dabuf_t *lbp; /* leaf buffer */ + struct xfs_buf *lbp; /* leaf buffer */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_trans_t *tp; /* transaction pointer */ @@ -1294,7 +1392,7 @@ xfs_dir2_leaf_lookup( tp = args->trans; dp = args->dp; xfs_dir2_leaf_check(dp, lbp); - leaf = lbp->data; + leaf = lbp->b_addr; /* * Get to the leaf entry and contained data entry address. */ @@ -1303,15 +1401,15 @@ xfs_dir2_leaf_lookup( * Point to the data entry. */ dep = (xfs_dir2_data_entry_t *) - ((char *)dbp->data + + ((char *)dbp->b_addr + xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address))); /* * Return the found inode number & CI name if appropriate */ args->inumber = be64_to_cpu(dep->inumber); error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); - xfs_da_brelse(tp, dbp); - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, dbp); + xfs_trans_brelse(tp, lbp); return XFS_ERROR(error); } @@ -1324,17 +1422,17 @@ xfs_dir2_leaf_lookup( static int /* error */ xfs_dir2_leaf_lookup_int( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t **lbpp, /* out: leaf buffer */ + struct xfs_buf **lbpp, /* out: leaf buffer */ int *indexp, /* out: index in leaf block */ - xfs_dabuf_t **dbpp) /* out: data buffer */ + struct xfs_buf **dbpp) /* out: data buffer */ { xfs_dir2_db_t curdb = -1; /* current data block number */ - xfs_dabuf_t *dbp = NULL; /* data buffer */ + struct xfs_buf *dbp = NULL; /* data buffer */ xfs_dir2_data_entry_t *dep; /* data entry */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ int index; /* index in leaf block */ - xfs_dabuf_t *lbp; /* leaf buffer */ + struct xfs_buf *lbp; /* leaf buffer */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_mount_t *mp; /* filesystem mount point */ @@ -1346,15 +1444,13 @@ xfs_dir2_leaf_lookup_int( dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the leaf block into the buffer. - */ - error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, - XFS_DATA_FORK); + + error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); if (error) return error; + *lbpp = lbp; - leaf = lbp->data; + leaf = lbp->b_addr; xfs_dir2_leaf_check(dp, lbp); /* * Look for the first leaf entry with our hash value. @@ -1382,21 +1478,20 @@ xfs_dir2_leaf_lookup_int( */ if (newdb != curdb) { if (dbp) - xfs_da_brelse(tp, dbp); - error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, newdb), - -1, &dbp, XFS_DATA_FORK); + xfs_trans_brelse(tp, dbp); + error = xfs_dir2_data_read(tp, dp, + xfs_dir2_db_to_da(mp, newdb), + -1, &dbp); if (error) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return error; } - xfs_dir2_data_check(dp, dbp); curdb = newdb; } /* * Point to the data entry. */ - dep = (xfs_dir2_data_entry_t *)((char *)dbp->data + + dep = (xfs_dir2_data_entry_t *)((char *)dbp->b_addr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); /* * Compare name and if it's an exact match, return the index @@ -1424,12 +1519,12 @@ xfs_dir2_leaf_lookup_int( if (args->cmpresult == XFS_CMP_CASE) { ASSERT(cidb != -1); if (cidb != curdb) { - xfs_da_brelse(tp, dbp); - error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, cidb), - -1, &dbp, XFS_DATA_FORK); + xfs_trans_brelse(tp, dbp); + error = xfs_dir2_data_read(tp, dp, + xfs_dir2_db_to_da(mp, cidb), + -1, &dbp); if (error) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return error; } } @@ -1441,8 +1536,8 @@ xfs_dir2_leaf_lookup_int( */ ASSERT(cidb == -1); if (dbp) - xfs_da_brelse(tp, dbp); - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, dbp); + xfs_trans_brelse(tp, lbp); return XFS_ERROR(ENOENT); } @@ -1456,13 +1551,13 @@ xfs_dir2_leaf_removename( __be16 *bestsp; /* leaf block best freespace */ xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_db_t db; /* data block number */ - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data entry structure */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ xfs_dir2_db_t i; /* temporary data block # */ int index; /* index into leaf entries */ - xfs_dabuf_t *lbp; /* leaf buffer */ + struct xfs_buf *lbp; /* leaf buffer */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ @@ -1483,8 +1578,8 @@ xfs_dir2_leaf_removename( dp = args->dp; tp = args->trans; mp = dp->i_mount; - leaf = lbp->data; - hdr = dbp->data; + leaf = lbp->b_addr; + hdr = dbp->b_addr; xfs_dir2_data_check(dp, dbp); /* * Point to the leaf entry, use that to point to the data entry. @@ -1541,12 +1636,9 @@ xfs_dir2_leaf_removename( * Just go on, returning success, leaving the * empty block in place. */ - if (error == ENOSPC && args->total == 0) { - xfs_da_buf_done(dbp); + if (error == ENOSPC && args->total == 0) error = 0; - } xfs_dir2_leaf_check(dp, lbp); - xfs_da_buf_done(lbp); return error; } dbp = NULL; @@ -1577,10 +1669,9 @@ xfs_dir2_leaf_removename( /* * If the data block was not the first one, drop it. */ - else if (db != mp->m_dirdatablk && dbp != NULL) { - xfs_da_buf_done(dbp); + else if (db != mp->m_dirdatablk) dbp = NULL; - } + xfs_dir2_leaf_check(dp, lbp); /* * See if we can convert to block form. @@ -1595,12 +1686,12 @@ int /* error */ xfs_dir2_leaf_replace( xfs_da_args_t *args) /* operation arguments */ { - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data block entry */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ int index; /* index of leaf entry */ - xfs_dabuf_t *lbp; /* leaf buffer */ + struct xfs_buf *lbp; /* leaf buffer */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_trans_t *tp; /* transaction pointer */ @@ -1614,7 +1705,7 @@ xfs_dir2_leaf_replace( return error; } dp = args->dp; - leaf = lbp->data; + leaf = lbp->b_addr; /* * Point to the leaf entry, get data address from it. */ @@ -1623,7 +1714,7 @@ xfs_dir2_leaf_replace( * Point to the data entry. */ dep = (xfs_dir2_data_entry_t *) - ((char *)dbp->data + + ((char *)dbp->b_addr + xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address))); ASSERT(args->inumber != be64_to_cpu(dep->inumber)); /* @@ -1632,9 +1723,8 @@ xfs_dir2_leaf_replace( dep->inumber = cpu_to_be64(args->inumber); tp = args->trans; xfs_dir2_data_log_entry(tp, dbp, dep); - xfs_da_buf_done(dbp); xfs_dir2_leaf_check(dp, lbp); - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return 0; } @@ -1646,7 +1736,7 @@ xfs_dir2_leaf_replace( int /* index value */ xfs_dir2_leaf_search_hash( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *lbp) /* leaf buffer */ + struct xfs_buf *lbp) /* leaf buffer */ { xfs_dahash_t hash=0; /* hash from this entry */ xfs_dahash_t hashwant; /* hash value looking for */ @@ -1656,7 +1746,7 @@ xfs_dir2_leaf_search_hash( xfs_dir2_leaf_entry_t *lep; /* leaf entry */ int mid=0; /* current leaf index */ - leaf = lbp->data; + leaf = lbp->b_addr; #ifndef __KERNEL__ if (!leaf->hdr.count) return 0; @@ -1699,11 +1789,11 @@ xfs_dir2_leaf_search_hash( int /* error */ xfs_dir2_leaf_trim_data( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *lbp, /* leaf buffer */ + struct xfs_buf *lbp, /* leaf buffer */ xfs_dir2_db_t db) /* data block number */ { __be16 *bestsp; /* leaf bests table */ - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return value */ xfs_dir2_leaf_t *leaf; /* leaf structure */ @@ -1717,17 +1807,16 @@ xfs_dir2_leaf_trim_data( /* * Read the offending data block. We need its buffer. */ - if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp, - XFS_DATA_FORK))) { + error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp); + if (error) return error; - } - leaf = lbp->data; + leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(mp, leaf); #ifdef DEBUG { - struct xfs_dir2_data_hdr *hdr = dbp->data; + struct xfs_dir2_data_hdr *hdr = dbp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); ASSERT(be16_to_cpu(hdr->bestfree[0].length) == @@ -1741,7 +1830,7 @@ xfs_dir2_leaf_trim_data( */ if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { ASSERT(error != ENOSPC); - xfs_da_brelse(tp, dbp); + xfs_trans_brelse(tp, dbp); return error; } /* @@ -1781,10 +1870,10 @@ xfs_dir2_node_to_leaf( xfs_da_args_t *args; /* operation arguments */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ - xfs_dabuf_t *fbp; /* buffer for freespace block */ + struct xfs_buf *fbp; /* buffer for freespace block */ xfs_fileoff_t fo; /* freespace file offset */ xfs_dir2_free_t *free; /* freespace structure */ - xfs_dabuf_t *lbp; /* buffer for leaf block */ + struct xfs_buf *lbp; /* buffer for leaf block */ xfs_dir2_leaf_tail_t *ltp; /* tail of leaf structure */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_mount_t *mp; /* filesystem mount point */ @@ -1838,16 +1927,15 @@ xfs_dir2_node_to_leaf( if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + mp->m_dirblksize) return 0; lbp = state->path.blk[0].bp; - leaf = lbp->data; + leaf = lbp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); /* * Read the freespace block. */ - if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp, - XFS_DATA_FORK))) { + error = xfs_dir2_free_read(tp, dp, mp->m_dirfreeblk, &fbp); + if (error) return error; - } - free = fbp->data; + free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); ASSERT(!free->hdr.firstdb); @@ -1857,7 +1945,7 @@ xfs_dir2_node_to_leaf( */ if (xfs_dir2_leaf_size(&leaf->hdr, be32_to_cpu(free->hdr.nvalid)) > mp->m_dirblksize) { - xfs_da_brelse(tp, fbp); + xfs_trans_brelse(tp, fbp); return 0; } @@ -1869,7 +1957,10 @@ xfs_dir2_node_to_leaf( xfs_dir2_leaf_compact(args, lbp); else xfs_dir2_leaf_log_header(tp, lbp); + + lbp->b_ops = &xfs_dir2_leaf1_buf_ops; leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC); + /* * Set up the leaf tail from the freespace block. */ diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index b0f26780449..5980f9b7fa9 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -36,40 +36,108 @@ /* * Function declarations. */ -static void xfs_dir2_free_log_header(xfs_trans_t *tp, xfs_dabuf_t *bp); -static int xfs_dir2_leafn_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index); +static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args, + int index); #ifdef DEBUG -static void xfs_dir2_leafn_check(xfs_inode_t *dp, xfs_dabuf_t *bp); +static void xfs_dir2_leafn_check(struct xfs_inode *dp, struct xfs_buf *bp); #else #define xfs_dir2_leafn_check(dp, bp) #endif -static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, xfs_dabuf_t *bp_s, - int start_s, xfs_dabuf_t *bp_d, int start_d, - int count); +static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, struct xfs_buf *bp_s, + int start_s, struct xfs_buf *bp_d, + int start_d, int count); static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_state_blk_t *blk2); -static int xfs_dir2_leafn_remove(xfs_da_args_t *args, xfs_dabuf_t *bp, +static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, int index, xfs_da_state_blk_t *dblk, int *rval); static int xfs_dir2_node_addname_int(xfs_da_args_t *args, xfs_da_state_blk_t *fblk); +static void +xfs_dir2_free_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_free_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC); + if (!block_ok) { + XFS_CORRUPTION_ERROR("xfs_dir2_free_verify magic", + XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +xfs_dir2_free_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_free_verify(bp); +} + +static void +xfs_dir2_free_write_verify( + struct xfs_buf *bp) +{ + xfs_dir2_free_verify(bp); +} + +static const struct xfs_buf_ops xfs_dir2_free_buf_ops = { + .verify_read = xfs_dir2_free_read_verify, + .verify_write = xfs_dir2_free_write_verify, +}; + + +static int +__xfs_dir2_free_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir2_free_buf_ops); +} + +int +xfs_dir2_free_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + struct xfs_buf **bpp) +{ + return __xfs_dir2_free_read(tp, dp, fbno, -1, bpp); +} + +static int +xfs_dir2_free_try_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + struct xfs_buf **bpp) +{ + return __xfs_dir2_free_read(tp, dp, fbno, -2, bpp); +} + /* * Log entries from a freespace block. */ STATIC void xfs_dir2_free_log_bests( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* freespace buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp, int first, /* first entry to log */ int last) /* last entry to log */ { xfs_dir2_free_t *free; /* freespace structure */ - free = bp->data; + free = bp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); - xfs_da_log_buf(tp, bp, + xfs_trans_log_buf(tp, bp, (uint)((char *)&free->bests[first] - (char *)free), (uint)((char *)&free->bests[last] - (char *)free + sizeof(free->bests[0]) - 1)); @@ -80,14 +148,14 @@ xfs_dir2_free_log_bests( */ static void xfs_dir2_free_log_header( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp) /* freespace buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp) { xfs_dir2_free_t *free; /* freespace structure */ - free = bp->data; + free = bp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); - xfs_da_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free), + xfs_trans_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free), (uint)(sizeof(xfs_dir2_free_hdr_t) - 1)); } @@ -99,11 +167,11 @@ xfs_dir2_free_log_header( int /* error */ xfs_dir2_leaf_to_node( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *lbp) /* leaf buffer */ + struct xfs_buf *lbp) /* leaf buffer */ { xfs_inode_t *dp; /* incore directory inode */ int error; /* error return value */ - xfs_dabuf_t *fbp; /* freespace buffer */ + struct xfs_buf *fbp; /* freespace buffer */ xfs_dir2_db_t fdb; /* freespace block number */ xfs_dir2_free_t *free; /* freespace structure */ __be16 *from; /* pointer to freespace entry */ @@ -131,13 +199,14 @@ xfs_dir2_leaf_to_node( /* * Get the buffer for the new freespace block. */ - if ((error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, - XFS_DATA_FORK))) { + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, + XFS_DATA_FORK); + if (error) return error; - } - ASSERT(fbp != NULL); - free = fbp->data; - leaf = lbp->data; + fbp->b_ops = &xfs_dir2_free_buf_ops; + + free = fbp->b_addr; + leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(mp, leaf); /* * Initialize the freespace block header. @@ -157,14 +226,16 @@ xfs_dir2_leaf_to_node( *to = cpu_to_be16(off); } free->hdr.nused = cpu_to_be32(n); + + lbp->b_ops = &xfs_dir2_leafn_buf_ops; leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); + /* * Log everything. */ xfs_dir2_leaf_log_header(tp, lbp); xfs_dir2_free_log_header(tp, fbp); xfs_dir2_free_log_bests(tp, fbp, 0, be32_to_cpu(free->hdr.nvalid) - 1); - xfs_da_buf_done(fbp); xfs_dir2_leafn_check(dp, lbp); return 0; } @@ -175,7 +246,7 @@ xfs_dir2_leaf_to_node( */ static int /* error */ xfs_dir2_leafn_add( - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ xfs_da_args_t *args, /* operation arguments */ int index) /* insertion pt for new entry */ { @@ -195,7 +266,7 @@ xfs_dir2_leafn_add( dp = args->dp; mp = dp->i_mount; tp = args->trans; - leaf = bp->data; + leaf = bp->b_addr; /* * Quick check just to make sure we are not going to index @@ -261,15 +332,15 @@ xfs_dir2_leafn_add( */ void xfs_dir2_leafn_check( - xfs_inode_t *dp, /* incore directory inode */ - xfs_dabuf_t *bp) /* leaf buffer */ + struct xfs_inode *dp, + struct xfs_buf *bp) { int i; /* leaf index */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_mount_t *mp; /* filesystem mount point */ int stale; /* count of stale leaves */ - leaf = bp->data; + leaf = bp->b_addr; mp = dp->i_mount; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp)); @@ -291,12 +362,12 @@ xfs_dir2_leafn_check( */ xfs_dahash_t /* hash value */ xfs_dir2_leafn_lasthash( - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ int *count) /* count of entries in leaf */ { xfs_dir2_leaf_t *leaf; /* leaf structure */ - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); if (count) *count = be16_to_cpu(leaf->hdr.count); @@ -311,12 +382,12 @@ xfs_dir2_leafn_lasthash( */ STATIC int xfs_dir2_leafn_lookup_for_addname( - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ xfs_da_args_t *args, /* operation arguments */ int *indexp, /* out: leaf entry index */ xfs_da_state_t *state) /* state to fill in */ { - xfs_dabuf_t *curbp = NULL; /* current data/free buffer */ + struct xfs_buf *curbp = NULL; /* current data/free buffer */ xfs_dir2_db_t curdb = -1; /* current data block number */ xfs_dir2_db_t curfdb = -1; /* current free block number */ xfs_inode_t *dp; /* incore directory inode */ @@ -335,7 +406,7 @@ xfs_dir2_leafn_lookup_for_addname( dp = args->dp; tp = args->trans; mp = dp->i_mount; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); #ifdef __KERNEL__ ASSERT(be16_to_cpu(leaf->hdr.count) > 0); @@ -352,7 +423,7 @@ xfs_dir2_leafn_lookup_for_addname( /* If so, it's a free block buffer, get the block number. */ curbp = state->extrablk.bp; curfdb = state->extrablk.blkno; - free = curbp->data; + free = curbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); } length = xfs_dir2_data_entsize(args->namelen); @@ -394,16 +465,14 @@ xfs_dir2_leafn_lookup_for_addname( * If we had one before, drop it. */ if (curbp) - xfs_da_brelse(tp, curbp); - /* - * Read the free block. - */ - error = xfs_da_read_buf(tp, dp, + xfs_trans_brelse(tp, curbp); + + error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, newfdb), - -1, &curbp, XFS_DATA_FORK); + &curbp); if (error) return error; - free = curbp->data; + free = curbp->b_addr; ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); ASSERT((be32_to_cpu(free->hdr.firstdb) % @@ -424,7 +493,7 @@ xfs_dir2_leafn_lookup_for_addname( XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int", XFS_ERRLEVEL_LOW, mp); if (curfdb != newfdb) - xfs_da_brelse(tp, curbp); + xfs_trans_brelse(tp, curbp); return XFS_ERROR(EFSCORRUPTED); } curfdb = newfdb; @@ -459,12 +528,12 @@ out: */ STATIC int xfs_dir2_leafn_lookup_for_entry( - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ xfs_da_args_t *args, /* operation arguments */ int *indexp, /* out: leaf entry index */ xfs_da_state_t *state) /* state to fill in */ { - xfs_dabuf_t *curbp = NULL; /* current data/free buffer */ + struct xfs_buf *curbp = NULL; /* current data/free buffer */ xfs_dir2_db_t curdb = -1; /* current data block number */ xfs_dir2_data_entry_t *dep; /* data block entry */ xfs_inode_t *dp; /* incore directory inode */ @@ -480,7 +549,7 @@ xfs_dir2_leafn_lookup_for_entry( dp = args->dp; tp = args->trans; mp = dp->i_mount; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); #ifdef __KERNEL__ ASSERT(be16_to_cpu(leaf->hdr.count) > 0); @@ -525,7 +594,7 @@ xfs_dir2_leafn_lookup_for_entry( */ if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT || curdb != state->extrablk.blkno)) - xfs_da_brelse(tp, curbp); + xfs_trans_brelse(tp, curbp); /* * If needing the block that is saved with a CI match, * use it otherwise read in the new data block. @@ -535,9 +604,9 @@ xfs_dir2_leafn_lookup_for_entry( ASSERT(state->extravalid); curbp = state->extrablk.bp; } else { - error = xfs_da_read_buf(tp, dp, + error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, newdb), - -1, &curbp, XFS_DATA_FORK); + -1, &curbp); if (error) return error; } @@ -547,7 +616,7 @@ xfs_dir2_leafn_lookup_for_entry( /* * Point to the data entry. */ - dep = (xfs_dir2_data_entry_t *)((char *)curbp->data + + dep = (xfs_dir2_data_entry_t *)((char *)curbp->b_addr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); /* * Compare the entry and if it's an exact match, return @@ -559,7 +628,7 @@ xfs_dir2_leafn_lookup_for_entry( /* If there is a CI match block, drop it */ if (args->cmpresult != XFS_CMP_DIFFERENT && curdb != state->extrablk.blkno) - xfs_da_brelse(tp, state->extrablk.bp); + xfs_trans_brelse(tp, state->extrablk.bp); args->cmpresult = cmp; args->inumber = be64_to_cpu(dep->inumber); *indexp = index; @@ -567,8 +636,9 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.bp = curbp; state->extrablk.blkno = curdb; state->extrablk.index = (int)((char *)dep - - (char *)curbp->data); + (char *)curbp->b_addr); state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + curbp->b_ops = &xfs_dir2_data_buf_ops; if (cmp == XFS_CMP_EXACT) return XFS_ERROR(EEXIST); } @@ -583,10 +653,11 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.index = -1; state->extrablk.blkno = curdb; state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + curbp->b_ops = &xfs_dir2_data_buf_ops; } else { /* If the curbp is not the CI match block, drop it */ if (state->extrablk.bp != curbp) - xfs_da_brelse(tp, curbp); + xfs_trans_brelse(tp, curbp); } } else { state->extravalid = 0; @@ -602,7 +673,7 @@ xfs_dir2_leafn_lookup_for_entry( */ int xfs_dir2_leafn_lookup_int( - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ xfs_da_args_t *args, /* operation arguments */ int *indexp, /* out: leaf entry index */ xfs_da_state_t *state) /* state to fill in */ @@ -620,9 +691,9 @@ xfs_dir2_leafn_lookup_int( static void xfs_dir2_leafn_moveents( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *bp_s, /* source leaf buffer */ + struct xfs_buf *bp_s, /* source leaf buffer */ int start_s, /* source leaf index */ - xfs_dabuf_t *bp_d, /* destination leaf buffer */ + struct xfs_buf *bp_d, /* destination leaf buffer */ int start_d, /* destination leaf index */ int count) /* count of leaves to copy */ { @@ -640,8 +711,8 @@ xfs_dir2_leafn_moveents( return; } tp = args->trans; - leaf_s = bp_s->data; - leaf_d = bp_d->data; + leaf_s = bp_s->b_addr; + leaf_d = bp_d->b_addr; /* * If the destination index is not the end of the current * destination leaf entries, open up a hole in the destination @@ -702,14 +773,14 @@ xfs_dir2_leafn_moveents( */ int /* sort order */ xfs_dir2_leafn_order( - xfs_dabuf_t *leaf1_bp, /* leaf1 buffer */ - xfs_dabuf_t *leaf2_bp) /* leaf2 buffer */ + struct xfs_buf *leaf1_bp, /* leaf1 buffer */ + struct xfs_buf *leaf2_bp) /* leaf2 buffer */ { xfs_dir2_leaf_t *leaf1; /* leaf1 structure */ xfs_dir2_leaf_t *leaf2; /* leaf2 structure */ - leaf1 = leaf1_bp->data; - leaf2 = leaf2_bp->data; + leaf1 = leaf1_bp->b_addr; + leaf2 = leaf2_bp->b_addr; ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); if (be16_to_cpu(leaf1->hdr.count) > 0 && @@ -757,8 +828,8 @@ xfs_dir2_leafn_rebalance( blk1 = blk2; blk2 = tmp; } - leaf1 = blk1->bp->data; - leaf2 = blk2->bp->data; + leaf1 = blk1->bp->b_addr; + leaf2 = blk2->bp->b_addr; oldsum = be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count); #ifdef DEBUG oldstale = be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale); @@ -826,6 +897,77 @@ xfs_dir2_leafn_rebalance( } } +static int +xfs_dir2_data_block_free( + xfs_da_args_t *args, + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_free *free, + xfs_dir2_db_t fdb, + int findex, + struct xfs_buf *fbp, + int longest) +{ + struct xfs_trans *tp = args->trans; + int logfree = 0; + + if (!hdr) { + /* One less used entry in the free table. */ + be32_add_cpu(&free->hdr.nused, -1); + xfs_dir2_free_log_header(tp, fbp); + + /* + * If this was the last entry in the table, we can trim the + * table size back. There might be other entries at the end + * referring to non-existent data blocks, get those too. + */ + if (findex == be32_to_cpu(free->hdr.nvalid) - 1) { + int i; /* free entry index */ + + for (i = findex - 1; i >= 0; i--) { + if (free->bests[i] != cpu_to_be16(NULLDATAOFF)) + break; + } + free->hdr.nvalid = cpu_to_be32(i + 1); + logfree = 0; + } else { + /* Not the last entry, just punch it out. */ + free->bests[findex] = cpu_to_be16(NULLDATAOFF); + logfree = 1; + } + /* + * If there are no useful entries left in the block, + * get rid of the block if we can. + */ + if (!free->hdr.nused) { + int error; + + error = xfs_dir2_shrink_inode(args, fdb, fbp); + if (error == 0) { + fbp = NULL; + logfree = 0; + } else if (error != ENOSPC || args->total != 0) + return error; + /* + * It's possible to get ENOSPC if there is no + * space reservation. In this case some one + * else will eventually get rid of this block. + */ + } + } else { + /* + * Data block is not empty, just set the free entry to the new + * value. + */ + free->bests[findex] = cpu_to_be16(longest); + logfree = 1; + } + + /* Log the free entry that changed, unless we got rid of it. */ + if (logfree) + xfs_dir2_free_log_bests(tp, fbp, findex, findex); + return 0; +} + /* * Remove an entry from a node directory. * This removes the leaf entry and the data entry, @@ -834,14 +976,14 @@ xfs_dir2_leafn_rebalance( static int /* error */ xfs_dir2_leafn_remove( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ int index, /* leaf entry index */ xfs_da_state_blk_t *dblk, /* data block */ int *rval) /* resulting block needs join */ { xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_db_t db; /* data block number */ - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data block entry */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_leaf_t *leaf; /* leaf structure */ @@ -858,7 +1000,7 @@ xfs_dir2_leafn_remove( dp = args->dp; tp = args->trans; mp = dp->i_mount; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); /* * Point to the entry we're removing. @@ -884,7 +1026,7 @@ xfs_dir2_leafn_remove( * in the data block in case it changes. */ dbp = dblk->bp; - hdr = dbp->data; + hdr = dbp->b_addr; dep = (xfs_dir2_data_entry_t *)((char *)hdr + off); longest = be16_to_cpu(hdr->bestfree[0].length); needlog = needscan = 0; @@ -905,22 +1047,21 @@ xfs_dir2_leafn_remove( */ if (longest < be16_to_cpu(hdr->bestfree[0].length)) { int error; /* error return value */ - xfs_dabuf_t *fbp; /* freeblock buffer */ + struct xfs_buf *fbp; /* freeblock buffer */ xfs_dir2_db_t fdb; /* freeblock block number */ int findex; /* index in freeblock entries */ xfs_dir2_free_t *free; /* freeblock structure */ - int logfree; /* need to log free entry */ /* * Convert the data block number to a free block, * read in the free block. */ fdb = xfs_dir2_db_to_fdb(mp, db); - if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), - -1, &fbp, XFS_DATA_FORK))) { + error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, fdb), + &fbp); + if (error) return error; - } - free = fbp->data; + free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); ASSERT(be32_to_cpu(free->hdr.firstdb) == xfs_dir2_free_max_bests(mp) * @@ -948,82 +1089,19 @@ xfs_dir2_leafn_remove( * In this case just drop the buffer and some one else * will eventually get rid of the empty block. */ - else if (error == ENOSPC && args->total == 0) - xfs_da_buf_done(dbp); - else + else if (!(error == ENOSPC && args->total == 0)) return error; } /* * If we got rid of the data block, we can eliminate that entry * in the free block. */ - if (hdr == NULL) { - /* - * One less used entry in the free table. - */ - be32_add_cpu(&free->hdr.nused, -1); - xfs_dir2_free_log_header(tp, fbp); - /* - * If this was the last entry in the table, we can - * trim the table size back. There might be other - * entries at the end referring to non-existent - * data blocks, get those too. - */ - if (findex == be32_to_cpu(free->hdr.nvalid) - 1) { - int i; /* free entry index */ - - for (i = findex - 1; - i >= 0 && - free->bests[i] == cpu_to_be16(NULLDATAOFF); - i--) - continue; - free->hdr.nvalid = cpu_to_be32(i + 1); - logfree = 0; - } - /* - * Not the last entry, just punch it out. - */ - else { - free->bests[findex] = cpu_to_be16(NULLDATAOFF); - logfree = 1; - } - /* - * If there are no useful entries left in the block, - * get rid of the block if we can. - */ - if (!free->hdr.nused) { - error = xfs_dir2_shrink_inode(args, fdb, fbp); - if (error == 0) { - fbp = NULL; - logfree = 0; - } else if (error != ENOSPC || args->total != 0) - return error; - /* - * It's possible to get ENOSPC if there is no - * space reservation. In this case some one - * else will eventually get rid of this block. - */ - } - } - /* - * Data block is not empty, just set the free entry to - * the new value. - */ - else { - free->bests[findex] = cpu_to_be16(longest); - logfree = 1; - } - /* - * Log the free entry that changed, unless we got rid of it. - */ - if (logfree) - xfs_dir2_free_log_bests(tp, fbp, findex, findex); - /* - * Drop the buffer if we still have it. - */ - if (fbp) - xfs_da_buf_done(fbp); + error = xfs_dir2_data_block_free(args, hdr, free, + fdb, findex, fbp, longest); + if (error) + return error; } + xfs_dir2_leafn_check(dp, bp); /* * Return indication of whether this leaf block is empty enough @@ -1114,7 +1192,7 @@ xfs_dir2_leafn_toosmall( { xfs_da_state_blk_t *blk; /* leaf block */ xfs_dablk_t blkno; /* leaf block number */ - xfs_dabuf_t *bp; /* leaf buffer */ + struct xfs_buf *bp; /* leaf buffer */ int bytes; /* bytes in use */ int count; /* leaf live entry count */ int error; /* error return value */ @@ -1130,7 +1208,7 @@ xfs_dir2_leafn_toosmall( * to coalesce with a sibling. */ blk = &state->path.blk[state->path.active - 1]; - info = blk->bp->data; + info = blk->bp->b_addr; ASSERT(info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); leaf = (xfs_dir2_leaf_t *)info; count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); @@ -1177,19 +1255,18 @@ xfs_dir2_leafn_toosmall( /* * Read the sibling leaf block. */ - if ((error = - xfs_da_read_buf(state->args->trans, state->args->dp, blkno, - -1, &bp, XFS_DATA_FORK))) { + error = xfs_dir2_leafn_read(state->args->trans, state->args->dp, + blkno, -1, &bp); + if (error) return error; - } - ASSERT(bp != NULL); + /* * Count bytes in the two blocks combined. */ leaf = (xfs_dir2_leaf_t *)info; count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); bytes = state->blocksize - (state->blocksize >> 2); - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); count += be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); bytes -= count * (uint)sizeof(leaf->ents[0]); @@ -1198,7 +1275,7 @@ xfs_dir2_leafn_toosmall( */ if (bytes >= 0) break; - xfs_da_brelse(state->args->trans, bp); + xfs_trans_brelse(state->args->trans, bp); } /* * Didn't like either block, give up. @@ -1207,11 +1284,7 @@ xfs_dir2_leafn_toosmall( *action = 0; return 0; } - /* - * Done with the sibling leaf block here, drop the dabuf - * so path_shift can get it. - */ - xfs_da_buf_done(bp); + /* * Make altpath point to the block we want to keep (the lower * numbered block) and path point to the block we want to drop. @@ -1247,8 +1320,8 @@ xfs_dir2_leafn_unbalance( args = state->args; ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC); ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC); - drop_leaf = drop_blk->bp->data; - save_leaf = save_blk->bp->data; + drop_leaf = drop_blk->bp->b_addr; + save_leaf = save_blk->bp->b_addr; ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); /* @@ -1356,13 +1429,13 @@ xfs_dir2_node_addname_int( { xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_db_t dbno; /* data block number */ - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data entry pointer */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_data_unused_t *dup; /* data unused entry pointer */ int error; /* error return value */ xfs_dir2_db_t fbno; /* freespace block number */ - xfs_dabuf_t *fbp; /* freespace buffer */ + struct xfs_buf *fbp; /* freespace buffer */ int findex; /* freespace entry index */ xfs_dir2_free_t *free=NULL; /* freespace block structure */ xfs_dir2_db_t ifbno; /* initial freespace block no */ @@ -1390,7 +1463,7 @@ xfs_dir2_node_addname_int( * Remember initial freespace block number. */ ifbno = fblk->blkno; - free = fbp->data; + free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); findex = fblk->index; /* @@ -1466,15 +1539,14 @@ xfs_dir2_node_addname_int( * This should be really rare, so there's no reason * to avoid it. */ - if ((error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, fbno), -2, &fbp, - XFS_DATA_FORK))) { + error = xfs_dir2_free_try_read(tp, dp, + xfs_dir2_db_to_da(mp, fbno), + &fbp); + if (error) return error; - } - if (unlikely(fbp == NULL)) { + if (!fbp) continue; - } - free = fbp->data; + free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); findex = 0; } @@ -1492,7 +1564,7 @@ xfs_dir2_node_addname_int( /* * Drop the block. */ - xfs_da_brelse(tp, fbp); + xfs_trans_brelse(tp, fbp); fbp = NULL; if (fblk && fblk->bp) fblk->bp = NULL; @@ -1507,36 +1579,23 @@ xfs_dir2_node_addname_int( /* * Not allowed to allocate, return failure. */ - if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || - args->total == 0) { - /* - * Drop the freespace buffer unless it came from our - * caller. - */ - if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) - xfs_da_buf_done(fbp); + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) return XFS_ERROR(ENOSPC); - } + /* * Allocate and initialize the new data block. */ if (unlikely((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &dbno)) || - (error = xfs_dir2_data_init(args, dbno, &dbp)))) { - /* - * Drop the freespace buffer unless it came from our - * caller. - */ - if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) - xfs_da_buf_done(fbp); + (error = xfs_dir2_data_init(args, dbno, &dbp)))) return error; - } + /* * If (somehow) we have a freespace block, get rid of it. */ if (fbp) - xfs_da_brelse(tp, fbp); + xfs_trans_brelse(tp, fbp); if (fblk && fblk->bp) fblk->bp = NULL; @@ -1545,12 +1604,12 @@ xfs_dir2_node_addname_int( * that was just allocated. */ fbno = xfs_dir2_db_to_fdb(mp, dbno); - if (unlikely(error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, fbno), -2, &fbp, - XFS_DATA_FORK))) { - xfs_da_buf_done(dbp); + error = xfs_dir2_free_try_read(tp, dp, + xfs_dir2_db_to_da(mp, fbno), + &fbp); + if (error) return error; - } + /* * If there wasn't a freespace block, the read will * return a NULL fbp. Allocate and initialize a new one. @@ -1587,18 +1646,18 @@ xfs_dir2_node_addname_int( /* * Get a buffer for the new block. */ - if ((error = xfs_da_get_buf(tp, dp, - xfs_dir2_db_to_da(mp, fbno), - -1, &fbp, XFS_DATA_FORK))) { + error = xfs_da_get_buf(tp, dp, + xfs_dir2_db_to_da(mp, fbno), + -1, &fbp, XFS_DATA_FORK); + if (error) return error; - } - ASSERT(fbp != NULL); + fbp->b_ops = &xfs_dir2_free_buf_ops; /* * Initialize the new block to be empty, and remember * its first slot as our empty slot. */ - free = fbp->data; + free = fbp->b_addr; free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC); free->hdr.firstdb = cpu_to_be32( (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) * @@ -1606,7 +1665,7 @@ xfs_dir2_node_addname_int( free->hdr.nvalid = 0; free->hdr.nused = 0; } else { - free = fbp->data; + free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); } @@ -1639,7 +1698,7 @@ xfs_dir2_node_addname_int( * We haven't allocated the data entry yet so this will * change again. */ - hdr = dbp->data; + hdr = dbp->b_addr; free->bests[findex] = hdr->bestfree[0].length; logfree = 1; } @@ -1650,22 +1709,17 @@ xfs_dir2_node_addname_int( /* * If just checking, we succeeded. */ - if (args->op_flags & XFS_DA_OP_JUSTCHECK) { - if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) - xfs_da_buf_done(fbp); + if (args->op_flags & XFS_DA_OP_JUSTCHECK) return 0; - } + /* * Read the data block in. */ - if (unlikely( - error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno), - -1, &dbp, XFS_DATA_FORK))) { - if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) - xfs_da_buf_done(fbp); + error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno), + -1, &dbp); + if (error) return error; - } - hdr = dbp->data; + hdr = dbp->b_addr; logfree = 0; } ASSERT(be16_to_cpu(hdr->bestfree[0].length) >= length); @@ -1714,16 +1768,10 @@ xfs_dir2_node_addname_int( if (logfree) xfs_dir2_free_log_bests(tp, fbp, findex, findex); /* - * If the caller didn't hand us the freespace block, drop it. - */ - if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) - xfs_da_buf_done(fbp); - /* * Return the data block and offset in args, then drop the data block. */ args->blkno = (xfs_dablk_t)dbno; args->index = be16_to_cpu(*tagp); - xfs_da_buf_done(dbp); return 0; } @@ -1761,22 +1809,23 @@ xfs_dir2_node_lookup( /* If a CI match, dup the actual name and return EEXIST */ xfs_dir2_data_entry_t *dep; - dep = (xfs_dir2_data_entry_t *)((char *)state->extrablk.bp-> - data + state->extrablk.index); + dep = (xfs_dir2_data_entry_t *) + ((char *)state->extrablk.bp->b_addr + + state->extrablk.index); rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen); } /* * Release the btree blocks and leaf block. */ for (i = 0; i < state->path.active; i++) { - xfs_da_brelse(args->trans, state->path.blk[i].bp); + xfs_trans_brelse(args->trans, state->path.blk[i].bp); state->path.blk[i].bp = NULL; } /* * Release the data block if we have it. */ if (state->extravalid && state->extrablk.bp) { - xfs_da_brelse(args->trans, state->extrablk.bp); + xfs_trans_brelse(args->trans, state->extrablk.bp); state->extrablk.bp = NULL; } xfs_da_state_free(state); @@ -1893,13 +1942,13 @@ xfs_dir2_node_replace( */ blk = &state->path.blk[state->path.active - 1]; ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); - leaf = blk->bp->data; + leaf = blk->bp->b_addr; lep = &leaf->ents[blk->index]; ASSERT(state->extravalid); /* * Point to the data entry. */ - hdr = state->extrablk.bp->data; + hdr = state->extrablk.bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); dep = (xfs_dir2_data_entry_t *) ((char *)hdr + @@ -1916,14 +1965,14 @@ xfs_dir2_node_replace( * Didn't find it, and we're holding a data block. Drop it. */ else if (state->extravalid) { - xfs_da_brelse(args->trans, state->extrablk.bp); + xfs_trans_brelse(args->trans, state->extrablk.bp); state->extrablk.bp = NULL; } /* * Release all the buffers in the cursor. */ for (i = 0; i < state->path.active; i++) { - xfs_da_brelse(args->trans, state->path.blk[i].bp); + xfs_trans_brelse(args->trans, state->path.blk[i].bp); state->path.blk[i].bp = NULL; } xfs_da_state_free(state); @@ -1940,7 +1989,7 @@ xfs_dir2_node_trim_free( xfs_fileoff_t fo, /* free block number */ int *rvalp) /* out: did something */ { - xfs_dabuf_t *bp; /* freespace buffer */ + struct xfs_buf *bp; /* freespace buffer */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ xfs_dir2_free_t *free; /* freespace structure */ @@ -1953,25 +2002,22 @@ xfs_dir2_node_trim_free( /* * Read the freespace block. */ - if (unlikely(error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp, - XFS_DATA_FORK))) { + error = xfs_dir2_free_try_read(tp, dp, fo, &bp); + if (error) return error; - } - /* * There can be holes in freespace. If fo is a hole, there's * nothing to do. */ - if (bp == NULL) { + if (!bp) return 0; - } - free = bp->data; + free = bp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); /* * If there are used entries, there's nothing to do. */ if (be32_to_cpu(free->hdr.nused) > 0) { - xfs_da_brelse(tp, bp); + xfs_trans_brelse(tp, bp); *rvalp = 0; return 0; } @@ -1987,7 +2033,7 @@ xfs_dir2_node_trim_free( * pieces. This is the last block of an extent. */ ASSERT(error != ENOSPC); - xfs_da_brelse(tp, bp); + xfs_trans_brelse(tp, bp); return error; } /* diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 067f403ecf8..7da79f6515f 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -25,11 +25,13 @@ extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r); extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, xfs_dir2_db_t *dbp); extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, - struct xfs_dabuf *bp); + struct xfs_buf *bp); extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const unsigned char *name, int len); /* xfs_dir2_block.c */ +extern const struct xfs_buf_ops xfs_dir2_block_buf_ops; + extern int xfs_dir2_block_addname(struct xfs_da_args *args); extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, xfs_off_t *offset, filldir_t filldir); @@ -37,57 +39,70 @@ extern int xfs_dir2_block_lookup(struct xfs_da_args *args); extern int xfs_dir2_block_removename(struct xfs_da_args *args); extern int xfs_dir2_block_replace(struct xfs_da_args *args); extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, - struct xfs_dabuf *lbp, struct xfs_dabuf *dbp); + struct xfs_buf *lbp, struct xfs_buf *dbp); /* xfs_dir2_data.c */ #ifdef DEBUG -extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_dabuf *bp); +#define xfs_dir2_data_check(dp,bp) __xfs_dir2_data_check(dp, bp); #else #define xfs_dir2_data_check(dp,bp) #endif + +extern const struct xfs_buf_ops xfs_dir2_data_buf_ops; + +extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); +extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); +extern int xfs_dir2_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mapped_bno); + extern struct xfs_dir2_data_free * xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_unused *dup, int *loghead); extern void xfs_dir2_data_freescan(struct xfs_mount *mp, struct xfs_dir2_data_hdr *hdr, int *loghead); extern int xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, - struct xfs_dabuf **bpp); -extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_dabuf *bp, + struct xfs_buf **bpp); +extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_dir2_data_entry *dep); extern void xfs_dir2_data_log_header(struct xfs_trans *tp, - struct xfs_dabuf *bp); -extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_dabuf *bp, + struct xfs_buf *bp); +extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_dir2_data_unused *dup); -extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_dabuf *bp, +extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_buf *bp, xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); -extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_dabuf *bp, +extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); /* xfs_dir2_leaf.c */ +extern const struct xfs_buf_ops xfs_dir2_leafn_buf_ops; + +extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, - struct xfs_dabuf *dbp); + struct xfs_buf *dbp); extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); extern void xfs_dir2_leaf_compact(struct xfs_da_args *args, - struct xfs_dabuf *bp); -extern void xfs_dir2_leaf_compact_x1(struct xfs_dabuf *bp, int *indexp, + struct xfs_buf *bp); +extern void xfs_dir2_leaf_compact_x1(struct xfs_buf *bp, int *indexp, int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent, size_t bufsize, xfs_off_t *offset, filldir_t filldir); extern int xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno, - struct xfs_dabuf **bpp, int magic); -extern void xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_dabuf *bp, + struct xfs_buf **bpp, int magic); +extern void xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp, int first, int last); extern void xfs_dir2_leaf_log_header(struct xfs_trans *tp, - struct xfs_dabuf *bp); + struct xfs_buf *bp); extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args); extern int xfs_dir2_leaf_removename(struct xfs_da_args *args); extern int xfs_dir2_leaf_replace(struct xfs_da_args *args); extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args, - struct xfs_dabuf *lbp); + struct xfs_buf *lbp); extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args, - struct xfs_dabuf *lbp, xfs_dir2_db_t db); + struct xfs_buf *lbp, xfs_dir2_db_t db); extern struct xfs_dir2_leaf_entry * xfs_dir2_leaf_find_entry(struct xfs_dir2_leaf *leaf, int index, int compact, int lowstale, int highstale, @@ -96,13 +111,13 @@ extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state); /* xfs_dir2_node.c */ extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, - struct xfs_dabuf *lbp); -extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count); -extern int xfs_dir2_leafn_lookup_int(struct xfs_dabuf *bp, + struct xfs_buf *lbp); +extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_buf *bp, int *count); +extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp, struct xfs_da_args *args, int *indexp, struct xfs_da_state *state); -extern int xfs_dir2_leafn_order(struct xfs_dabuf *leaf1_bp, - struct xfs_dabuf *leaf2_bp); +extern int xfs_dir2_leafn_order(struct xfs_buf *leaf1_bp, + struct xfs_buf *leaf2_bp); extern int xfs_dir2_leafn_split(struct xfs_da_state *state, struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk); extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action); @@ -115,6 +130,8 @@ extern int xfs_dir2_node_removename(struct xfs_da_args *args); extern int xfs_dir2_node_replace(struct xfs_da_args *args); extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo, int *rvalp); +extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t fbno, struct xfs_buf **bpp); /* xfs_dir2_sf.c */ extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp); @@ -122,7 +139,7 @@ extern xfs_ino_t xfs_dir2_sfe_get_ino(struct xfs_dir2_sf_hdr *sfp, struct xfs_dir2_sf_entry *sfep); extern int xfs_dir2_block_sfsize(struct xfs_inode *dp, struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp); -extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_dabuf *bp, +extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp, int size, xfs_dir2_sf_hdr_t *sfhp); extern int xfs_dir2_sf_addname(struct xfs_da_args *args); extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 19bf0c5e38f..1b9fc3ec7e4 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -222,7 +222,7 @@ xfs_dir2_block_sfsize( int /* error */ xfs_dir2_block_to_sf( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *bp, /* block buffer */ + struct xfs_buf *bp, int size, /* shortform directory size */ xfs_dir2_sf_hdr_t *sfhp) /* shortform directory hdr */ { @@ -249,7 +249,7 @@ xfs_dir2_block_to_sf( * and add local data. */ hdr = kmem_alloc(mp->m_dirblksize, KM_SLEEP); - memcpy(hdr, bp->data, mp->m_dirblksize); + memcpy(hdr, bp->b_addr, mp->m_dirblksize); logflags = XFS_ILOG_CORE; if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) { ASSERT(error != ENOSPC); diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index f9c3fe304a1..69cf4fcde03 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -179,12 +179,14 @@ xfs_ioc_trim( * used by the fstrim application. In the end it really doesn't * matter as trimming blocks is an advisory interface. */ + if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) || + range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp))) + return -XFS_ERROR(EINVAL); + start = BTOBB(range.start); end = start + BTOBBT(range.len) - 1; minlen = BTOBB(max_t(u64, granularity, range.minlen)); - if (XFS_BB_TO_FSB(mp, start) >= mp->m_sb.sb_dblocks) - return -XFS_ERROR(EINVAL); if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1) end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index bf27fcca484..9e1bf5294c9 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -248,7 +248,59 @@ xfs_qm_init_dquot_blk( xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } +static void +xfs_dquot_buf_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + struct xfs_disk_dquot *ddq; + xfs_dqid_t id = 0; + int i; + + /* + * On the first read of the buffer, verify that each dquot is valid. + * We don't know what the id of the dquot is supposed to be, just that + * they should be increasing monotonically within the buffer. If the + * first id is corrupt, then it will fail on the second dquot in the + * buffer so corruptions could point to the wrong dquot in this case. + */ + for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { + int error; + + ddq = &d[i].dd_diskdq; + + if (i == 0) + id = be32_to_cpu(ddq->d_id); + + error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, + "xfs_dquot_read_verify"); + if (error) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d); + xfs_buf_ioerror(bp, EFSCORRUPTED); + break; + } + } +} + +static void +xfs_dquot_buf_read_verify( + struct xfs_buf *bp) +{ + xfs_dquot_buf_verify(bp); +} + +void +xfs_dquot_buf_write_verify( + struct xfs_buf *bp) +{ + xfs_dquot_buf_verify(bp); +} +const struct xfs_buf_ops xfs_dquot_buf_ops = { + .verify_read = xfs_dquot_buf_read_verify, + .verify_write = xfs_dquot_buf_write_verify, +}; /* * Allocate a block and fill it with dquots. @@ -315,6 +367,7 @@ xfs_qm_dqalloc( error = xfs_buf_geterror(bp); if (error) goto error1; + bp->b_ops = &xfs_dquot_buf_ops; /* * Make a chunk of dquots out of this buffer and log @@ -359,6 +412,51 @@ xfs_qm_dqalloc( return (error); } +STATIC int +xfs_qm_dqrepair( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_dquot *dqp, + xfs_dqid_t firstid, + struct xfs_buf **bpp) +{ + int error; + struct xfs_disk_dquot *ddq; + struct xfs_dqblk *d; + int i; + + /* + * Read the buffer without verification so we get the corrupted + * buffer returned to us. make sure we verify it on write, though. + */ + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, + 0, bpp, NULL); + + if (error) { + ASSERT(*bpp == NULL); + return XFS_ERROR(error); + } + (*bpp)->b_ops = &xfs_dquot_buf_ops; + + ASSERT(xfs_buf_islocked(*bpp)); + d = (struct xfs_dqblk *)(*bpp)->b_addr; + + /* Do the actual repair of dquots in this buffer */ + for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { + ddq = &d[i].dd_diskdq; + error = xfs_qm_dqcheck(mp, ddq, firstid + i, + dqp->dq_flags & XFS_DQ_ALLTYPES, + XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair"); + if (error) { + /* repair failed, we're screwed */ + xfs_trans_brelse(tp, *bpp); + return XFS_ERROR(EIO); + } + } + + return 0; +} /* * Maps a dquot to the buffer containing its on-disk version. @@ -378,7 +476,6 @@ xfs_qm_dqtobp( xfs_buf_t *bp; xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp); xfs_mount_t *mp = dqp->q_mount; - xfs_disk_dquot_t *ddq; xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); xfs_trans_t *tp = (tpp ? *tpp : NULL); @@ -439,33 +536,24 @@ xfs_qm_dqtobp( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, - 0, &bp); - if (error || !bp) - return XFS_ERROR(error); - } - - ASSERT(xfs_buf_islocked(bp)); + 0, &bp, &xfs_dquot_buf_ops); - /* - * calculate the location of the dquot inside the buffer. - */ - ddq = bp->b_addr + dqp->q_bufoffset; + if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) { + xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff * + mp->m_quotainfo->qi_dqperchunk; + ASSERT(bp == NULL); + error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp); + } - /* - * A simple sanity check in case we got a corrupted dquot... - */ - error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES, - flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN), - "dqtobp"); - if (error) { - if (!(flags & XFS_QMOPT_DQREPAIR)) { - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EIO); + if (error) { + ASSERT(bp == NULL); + return XFS_ERROR(error); } } + ASSERT(xfs_buf_islocked(bp)); *O_bpp = bp; - *O_ddpp = ddq; + *O_ddpp = bp->b_addr + dqp->q_bufoffset; return (0); } @@ -920,7 +1008,7 @@ xfs_qm_dqflush( * Get the buffer containing the on-disk dquot */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0, &bp); + mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 7d20af27346..c694a8469c4 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -161,4 +161,6 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) return dqp; } +extern const struct xfs_buf_ops xfs_dquot_buf_ops; + #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 42679223a0f..a83611849ce 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -29,6 +29,7 @@ #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_trace.h" +#include "xfs_icache.h" /* * Note that we only accept fileids which are long enough rather than allow @@ -189,6 +190,9 @@ xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid, struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid; struct inode *inode = NULL; + if (fh_len < xfs_fileid_length(fileid_type)) + return NULL; + switch (fileid_type) { case FILEID_INO32_GEN_PARENT: inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 9f7ec15a652..67284edb84d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -31,11 +31,14 @@ #include "xfs_error.h" #include "xfs_vnodeops.h" #include "xfs_da_btree.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2_priv.h" #include "xfs_ioctl.h" #include "xfs_trace.h" #include <linux/dcache.h> #include <linux/falloc.h> +#include <linux/pagevec.h> static const struct vm_operations_struct xfs_file_vm_ops; @@ -83,7 +86,7 @@ xfs_rw_ilock_demote( * valid before the operation, it will be read from disk before * being partially zeroed. */ -STATIC int +int xfs_iozero( struct xfs_inode *ip, /* inode */ loff_t pos, /* offset in file */ @@ -236,7 +239,6 @@ xfs_file_aio_read( ssize_t ret = 0; int ioflags = 0; xfs_fsize_t n; - unsigned long seg; XFS_STATS_INC(xs_read_calls); @@ -247,33 +249,22 @@ xfs_file_aio_read( if (file->f_mode & FMODE_NOCMTIME) ioflags |= IO_INVIS; - /* START copy & waste from filemap.c */ - for (seg = 0; seg < nr_segs; seg++) { - const struct iovec *iv = &iovp[seg]; - - /* - * If any segment has a negative length, or the cumulative - * length ever wraps negative then return -EINVAL. - */ - size += iv->iov_len; - if (unlikely((ssize_t)(size|iv->iov_len) < 0)) - return XFS_ERROR(-EINVAL); - } - /* END copy & waste from filemap.c */ + ret = generic_segment_checks(iovp, &nr_segs, &size, VERIFY_WRITE); + if (ret < 0) + return ret; if (unlikely(ioflags & IO_ISDIRECT)) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - if ((iocb->ki_pos & target->bt_smask) || - (size & target->bt_smask)) { - if (iocb->ki_pos == i_size_read(inode)) + if ((pos & target->bt_smask) || (size & target->bt_smask)) { + if (pos == i_size_read(inode)) return 0; return -XFS_ERROR(EINVAL); } } - n = XFS_MAXIOFFSET(mp) - iocb->ki_pos; + n = mp->m_super->s_maxbytes - pos; if (n <= 0 || size == 0) return 0; @@ -299,20 +290,21 @@ xfs_file_aio_read( xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); if (inode->i_mapping->nrpages) { - ret = -xfs_flushinval_pages(ip, - (iocb->ki_pos & PAGE_CACHE_MASK), - -1, FI_REMAPF_LOCKED); + ret = -filemap_write_and_wait_range( + VFS_I(ip)->i_mapping, + pos, -1); if (ret) { xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } + truncate_pagecache_range(VFS_I(ip), pos, -1); } xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); } - trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); + trace_xfs_file_read(ip, size, pos, ioflags); - ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos); + ret = generic_file_aio_read(iocb, iovp, nr_segs, pos); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -680,10 +672,11 @@ xfs_file_dio_aio_write( goto out; if (mapping->nrpages) { - ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, - FI_REMAPF_LOCKED); + ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + pos, -1); if (ret) goto out; + truncate_pagecache_range(VFS_I(ip), pos, -1); } /* @@ -738,16 +731,17 @@ xfs_file_buffered_aio_write( write_retry: trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); ret = generic_file_buffered_write(iocb, iovp, nr_segs, - pos, &iocb->ki_pos, count, ret); + pos, &iocb->ki_pos, count, 0); + /* - * if we just got an ENOSPC, flush the inode now we aren't holding any - * page locks and retry *once* + * If we just got an ENOSPC, try to write back all dirty inodes to + * convert delalloc space to free up some of the excess reserved + * metadata space. */ if (ret == -ENOSPC && !enospc) { enospc = 1; - ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); - if (!ret) - goto write_retry; + xfs_flush_inodes(ip->i_mount); + goto write_retry; } current->backing_dev_info = NULL; @@ -781,10 +775,12 @@ xfs_file_aio_write( if (ocount == 0) return 0; - xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE); + sb_start_write(inode->i_sb); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + ret = -EIO; + goto out; + } if (unlikely(file->f_flags & O_DIRECT)) ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount); @@ -803,6 +799,8 @@ xfs_file_aio_write( ret = err; } +out: + sb_end_write(inode->i_sb); return ret; } @@ -895,7 +893,7 @@ xfs_dir_open( */ mode = xfs_ilock_map_shared(ip); if (ip->i_d.di_nextents > 0) - xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK); + xfs_dir2_data_readahead(NULL, ip, 0, -1); xfs_iunlock(ip, mode); return 0; } @@ -946,7 +944,6 @@ xfs_file_mmap( struct vm_area_struct *vma) { vma->vm_ops = &xfs_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; file_accessed(filp); return 0; @@ -966,17 +963,232 @@ xfs_vm_page_mkwrite( return block_page_mkwrite(vma, vmf, xfs_get_blocks); } +/* + * This type is designed to indicate the type of offset we would like + * to search from page cache for either xfs_seek_data() or xfs_seek_hole(). + */ +enum { + HOLE_OFF = 0, + DATA_OFF, +}; + +/* + * Lookup the desired type of offset from the given page. + * + * On success, return true and the offset argument will point to the + * start of the region that was found. Otherwise this function will + * return false and keep the offset argument unchanged. + */ +STATIC bool +xfs_lookup_buffer_offset( + struct page *page, + loff_t *offset, + unsigned int type) +{ + loff_t lastoff = page_offset(page); + bool found = false; + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + /* + * Unwritten extents that have data in the page + * cache covering them can be identified by the + * BH_Unwritten state flag. Pages with multiple + * buffers might have a mix of holes, data and + * unwritten extents - any buffer with valid + * data in it should have BH_Uptodate flag set + * on it. + */ + if (buffer_unwritten(bh) || + buffer_uptodate(bh)) { + if (type == DATA_OFF) + found = true; + } else { + if (type == HOLE_OFF) + found = true; + } + + if (found) { + *offset = lastoff; + break; + } + lastoff += bh->b_size; + } while ((bh = bh->b_this_page) != head); + + return found; +} + +/* + * This routine is called to find out and return a data or hole offset + * from the page cache for unwritten extents according to the desired + * type for xfs_seek_data() or xfs_seek_hole(). + * + * The argument offset is used to tell where we start to search from the + * page cache. Map is used to figure out the end points of the range to + * lookup pages. + * + * Return true if the desired type of offset was found, and the argument + * offset is filled with that address. Otherwise, return false and keep + * offset unchanged. + */ +STATIC bool +xfs_find_get_desired_pgoff( + struct inode *inode, + struct xfs_bmbt_irec *map, + unsigned int type, + loff_t *offset) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct pagevec pvec; + pgoff_t index; + pgoff_t end; + loff_t endoff; + loff_t startoff = *offset; + loff_t lastoff = startoff; + bool found = false; + + pagevec_init(&pvec, 0); + + index = startoff >> PAGE_CACHE_SHIFT; + endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount); + end = endoff >> PAGE_CACHE_SHIFT; + do { + int want; + unsigned nr_pages; + unsigned int i; + + want = min_t(pgoff_t, end - index, PAGEVEC_SIZE); + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, + want); + /* + * No page mapped into given range. If we are searching holes + * and if this is the first time we got into the loop, it means + * that the given offset is landed in a hole, return it. + * + * If we have already stepped through some block buffers to find + * holes but they all contains data. In this case, the last + * offset is already updated and pointed to the end of the last + * mapped page, if it does not reach the endpoint to search, + * that means there should be a hole between them. + */ + if (nr_pages == 0) { + /* Data search found nothing */ + if (type == DATA_OFF) + break; + + ASSERT(type == HOLE_OFF); + if (lastoff == startoff || lastoff < endoff) { + found = true; + *offset = lastoff; + } + break; + } + + /* + * At lease we found one page. If this is the first time we + * step into the loop, and if the first page index offset is + * greater than the given search offset, a hole was found. + */ + if (type == HOLE_OFF && lastoff == startoff && + lastoff < page_offset(pvec.pages[0])) { + found = true; + break; + } + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + loff_t b_offset; + + /* + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), + * or even swizzled back from swapper_space to tmpfs + * file mapping. However, page->index will not change + * because we have a reference on the page. + * + * Searching done if the page index is out of range. + * If the current offset is not reaches the end of + * the specified search range, there should be a hole + * between them. + */ + if (page->index > end) { + if (type == HOLE_OFF && lastoff < endoff) { + *offset = lastoff; + found = true; + } + goto out; + } + + lock_page(page); + /* + * Page truncated or invalidated(page->mapping == NULL). + * We can freely skip it and proceed to check the next + * page. + */ + if (unlikely(page->mapping != inode->i_mapping)) { + unlock_page(page); + continue; + } + + if (!page_has_buffers(page)) { + unlock_page(page); + continue; + } + + found = xfs_lookup_buffer_offset(page, &b_offset, type); + if (found) { + /* + * The found offset may be less than the start + * point to search if this is the first time to + * come here. + */ + *offset = max_t(loff_t, startoff, b_offset); + unlock_page(page); + goto out; + } + + /* + * We either searching data but nothing was found, or + * searching hole but found a data buffer. In either + * case, probably the next page contains the desired + * things, update the last offset to it so. + */ + lastoff = page_offset(page) + PAGE_SIZE; + unlock_page(page); + } + + /* + * The number of returned pages less than our desired, search + * done. In this case, nothing was found for searching data, + * but we found a hole behind the last offset. + */ + if (nr_pages < want) { + if (type == HOLE_OFF) { + *offset = lastoff; + found = true; + } + break; + } + + index = pvec.pages[i - 1]->index + 1; + pagevec_release(&pvec); + } while (index <= end); + +out: + pagevec_release(&pvec); + return found; +} + STATIC loff_t xfs_seek_data( struct file *file, - loff_t start, - u32 type) + loff_t start) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - struct xfs_bmbt_irec map[2]; - int nmap = 2; loff_t uninitialized_var(offset); xfs_fsize_t isize; xfs_fileoff_t fsbno; @@ -992,36 +1204,74 @@ xfs_seek_data( goto out_unlock; } - fsbno = XFS_B_TO_FSBT(mp, start); - /* * Try to read extents from the first block indicated * by fsbno to the end block of the file. */ + fsbno = XFS_B_TO_FSBT(mp, start); end = XFS_B_TO_FSB(mp, isize); + for (;;) { + struct xfs_bmbt_irec map[2]; + int nmap = 2; + unsigned int i; - error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap, - XFS_BMAPI_ENTIRE); - if (error) - goto out_unlock; + error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap, + XFS_BMAPI_ENTIRE); + if (error) + goto out_unlock; - /* - * Treat unwritten extent as data extent since it might - * contains dirty data in page cache. - */ - if (map[0].br_startblock != HOLESTARTBLOCK) { - offset = max_t(loff_t, start, - XFS_FSB_TO_B(mp, map[0].br_startoff)); - } else { + /* No extents at given offset, must be beyond EOF */ + if (nmap == 0) { + error = ENXIO; + goto out_unlock; + } + + for (i = 0; i < nmap; i++) { + offset = max_t(loff_t, start, + XFS_FSB_TO_B(mp, map[i].br_startoff)); + + /* Landed in a data extent */ + if (map[i].br_startblock == DELAYSTARTBLOCK || + (map[i].br_state == XFS_EXT_NORM && + !isnullstartblock(map[i].br_startblock))) + goto out; + + /* + * Landed in an unwritten extent, try to search data + * from page cache. + */ + if (map[i].br_state == XFS_EXT_UNWRITTEN) { + if (xfs_find_get_desired_pgoff(inode, &map[i], + DATA_OFF, &offset)) + goto out; + } + } + + /* + * map[0] is hole or its an unwritten extent but + * without data in page cache. Probably means that + * we are reading after EOF if nothing in map[1]. + */ if (nmap == 1) { error = ENXIO; goto out_unlock; } - offset = max_t(loff_t, start, - XFS_FSB_TO_B(mp, map[1].br_startoff)); + ASSERT(i > 1); + + /* + * Nothing was found, proceed to the next round of search + * if reading offset not beyond or hit EOF. + */ + fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount; + start = XFS_FSB_TO_B(mp, fsbno); + if (start >= isize) { + error = ENXIO; + goto out_unlock; + } } +out: if (offset != file->f_pos) file->f_pos = offset; @@ -1036,16 +1286,15 @@ out_unlock: STATIC loff_t xfs_seek_hole( struct file *file, - loff_t start, - u32 type) + loff_t start) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; loff_t uninitialized_var(offset); - loff_t holeoff; xfs_fsize_t isize; xfs_fileoff_t fsbno; + xfs_filblks_t end; uint lock; int error; @@ -1061,21 +1310,77 @@ xfs_seek_hole( } fsbno = XFS_B_TO_FSBT(mp, start); - error = xfs_bmap_first_unused(NULL, ip, 1, &fsbno, XFS_DATA_FORK); - if (error) - goto out_unlock; + end = XFS_B_TO_FSB(mp, isize); + + for (;;) { + struct xfs_bmbt_irec map[2]; + int nmap = 2; + unsigned int i; + + error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap, + XFS_BMAPI_ENTIRE); + if (error) + goto out_unlock; + + /* No extents at given offset, must be beyond EOF */ + if (nmap == 0) { + error = ENXIO; + goto out_unlock; + } + + for (i = 0; i < nmap; i++) { + offset = max_t(loff_t, start, + XFS_FSB_TO_B(mp, map[i].br_startoff)); + + /* Landed in a hole */ + if (map[i].br_startblock == HOLESTARTBLOCK) + goto out; + + /* + * Landed in an unwritten extent, try to search hole + * from page cache. + */ + if (map[i].br_state == XFS_EXT_UNWRITTEN) { + if (xfs_find_get_desired_pgoff(inode, &map[i], + HOLE_OFF, &offset)) + goto out; + } + } - holeoff = XFS_FSB_TO_B(mp, fsbno); - if (holeoff <= start) - offset = start; - else { /* - * xfs_bmap_first_unused() could return a value bigger than - * isize if there are no more holes past the supplied offset. + * map[0] contains data or its unwritten but contains + * data in page cache, probably means that we are + * reading after EOF. We should fix offset to point + * to the end of the file(i.e., there is an implicit + * hole at the end of any file). */ - offset = min_t(loff_t, holeoff, isize); + if (nmap == 1) { + offset = isize; + break; + } + + ASSERT(i > 1); + + /* + * Both mappings contains data, proceed to the next round of + * search if the current reading offset not beyond or hit EOF. + */ + fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount; + start = XFS_FSB_TO_B(mp, fsbno); + if (start >= isize) { + offset = isize; + break; + } } +out: + /* + * At this point, we must have found a hole. However, the returned + * offset may be bigger than the file size as it may be aligned to + * page boundary for unwritten extents, we need to deal with this + * situation in particular. + */ + offset = min_t(loff_t, offset, isize); if (offset != file->f_pos) file->f_pos = offset; @@ -1099,9 +1404,9 @@ xfs_file_llseek( case SEEK_SET: return generic_file_llseek(file, offset, origin); case SEEK_DATA: - return xfs_seek_data(file, offset, origin); + return xfs_seek_data(file, offset); case SEEK_HOLE: - return xfs_seek_hole(file, offset, origin); + return xfs_seek_hole(file, offset); default: return -EINVAL; } @@ -1141,4 +1446,5 @@ const struct file_operations xfs_dir_file_operations = { static const struct vm_operations_struct xfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = xfs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index c13fed8c394..6dda3f949b0 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -233,7 +233,8 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ #define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */ #define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ -#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ +#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */ +#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ #define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ @@ -339,6 +340,35 @@ typedef struct xfs_error_injection { /* + * Speculative preallocation trimming. + */ +#define XFS_EOFBLOCKS_VERSION 1 +struct xfs_eofblocks { + __u32 eof_version; + __u32 eof_flags; + uid_t eof_uid; + gid_t eof_gid; + prid_t eof_prid; + __u32 pad32; + __u64 eof_min_file_size; + __u64 pad64[12]; +}; + +/* eof_flags values */ +#define XFS_EOF_FLAGS_SYNC (1 << 0) /* sync/wait mode scan */ +#define XFS_EOF_FLAGS_UID (1 << 1) /* filter by uid */ +#define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */ +#define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */ +#define XFS_EOF_FLAGS_MINFILESIZE (1 << 4) /* filter by min file size */ +#define XFS_EOF_FLAGS_VALID \ + (XFS_EOF_FLAGS_SYNC | \ + XFS_EOF_FLAGS_UID | \ + XFS_EOF_FLAGS_GID | \ + XFS_EOF_FLAGS_PRID | \ + XFS_EOF_FLAGS_MINFILESIZE) + + +/* * The user-level Handle Request interface structure. */ typedef struct xfs_fsop_handlereq { @@ -456,6 +486,7 @@ typedef struct xfs_handle { /* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ #define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) #define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) +#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_eofblocks) /* * ioctl commands that replace IRIX syssgi()'s diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c deleted file mode 100644 index 652b875a9d4..00000000000 --- a/fs/xfs/xfs_fs_subr.c +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_vnodeops.h" -#include "xfs_bmap_btree.h" -#include "xfs_inode.h" -#include "xfs_trace.h" - -/* - * note: all filemap functions return negative error codes. These - * need to be inverted before returning to the xfs core functions. - */ -void -xfs_tosspages( - xfs_inode_t *ip, - xfs_off_t first, - xfs_off_t last, - int fiopt) -{ - /* can't toss partial tail pages, so mask them out */ - last &= ~(PAGE_SIZE - 1); - truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1); -} - -int -xfs_flushinval_pages( - xfs_inode_t *ip, - xfs_off_t first, - xfs_off_t last, - int fiopt) -{ - struct address_space *mapping = VFS_I(ip)->i_mapping; - int ret = 0; - - trace_xfs_pagecache_inval(ip, first, last); - - xfs_iflags_clear(ip, XFS_ITRUNCATED); - ret = filemap_write_and_wait_range(mapping, first, - last == -1 ? LLONG_MAX : last); - if (!ret) - truncate_inode_pages_range(mapping, first, last); - return -ret; -} - -int -xfs_flush_pages( - xfs_inode_t *ip, - xfs_off_t first, - xfs_off_t last, - uint64_t flags, - int fiopt) -{ - struct address_space *mapping = VFS_I(ip)->i_mapping; - int ret = 0; - int ret2; - - xfs_iflags_clear(ip, XFS_ITRUNCATED); - ret = -filemap_fdatawrite_range(mapping, first, - last == -1 ? LLONG_MAX : last); - if (flags & XBF_ASYNC) - return ret; - ret2 = xfs_wait_on_pages(ip, first, last); - if (!ret) - ret = ret2; - return ret; -} - -int -xfs_wait_on_pages( - xfs_inode_t *ip, - xfs_off_t first, - xfs_off_t last) -{ - struct address_space *mapping = VFS_I(ip)->i_mapping; - - if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { - return -filemap_fdatawait_range(mapping, first, - last == -1 ? XFS_ISIZE(ip) - 1 : last); - } - return 0; -} diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index c25b094efbf..94eaeedc549 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -97,7 +97,9 @@ xfs_fs_geometry( (xfs_sb_version_haslazysbcount(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) | (xfs_sb_version_hasattr2(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_ATTR2 : 0); + XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) | + (xfs_sb_version_hasprojid32bit(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_PROJID32 : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; @@ -112,18 +114,40 @@ xfs_fs_geometry( return 0; } +static struct xfs_buf * +xfs_growfs_get_hdr_buf( + struct xfs_mount *mp, + xfs_daddr_t blkno, + size_t numblks, + int flags, + const struct xfs_buf_ops *ops) +{ + struct xfs_buf *bp; + + bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags); + if (!bp) + return NULL; + + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + bp->b_bn = blkno; + bp->b_maps[0].bm_bn = blkno; + bp->b_ops = ops; + + return bp; +} + static int xfs_growfs_data_private( xfs_mount_t *mp, /* mount point for filesystem */ xfs_growfs_data_t *in) /* growfs data input struct */ { xfs_agf_t *agf; + struct xfs_agfl *agfl; xfs_agi_t *agi; xfs_agnumber_t agno; xfs_extlen_t agsize; xfs_extlen_t tmpsize; xfs_alloc_rec_t *arec; - struct xfs_btree_block *block; xfs_buf_t *bp; int bucket; int dpct; @@ -146,9 +170,14 @@ xfs_growfs_data_private( dpct = pct - mp->m_sb.sb_imax_pct; bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_FSS_TO_BB(mp, 1), 0, NULL); if (!bp) return EIO; + if (bp->b_error) { + int error = bp->b_error; + xfs_buf_relse(bp); + return error; + } xfs_buf_relse(bp); new = nb; /* use new as a temporary here */ @@ -186,17 +215,18 @@ xfs_growfs_data_private( nfree = 0; for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { /* - * AG freelist header block + * AG freespace header block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, + &xfs_agf_buf_ops); if (!bp) { error = ENOMEM; goto error0; } + agf = XFS_BUF_TO_AGF(bp); - memset(agf, 0, mp->m_sb.sb_sectsize); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); agf->agf_seqno = cpu_to_be32(agno); @@ -223,17 +253,39 @@ xfs_growfs_data_private( goto error0; /* + * AG freelist header block + */ + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, + &xfs_agfl_buf_ops); + if (!bp) { + error = ENOMEM; + goto error0; + } + + agfl = XFS_BUF_TO_AGFL(bp); + for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) + agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + + /* * AG inode header block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, + &xfs_agi_buf_ops); if (!bp) { error = ENOMEM; goto error0; } + agi = XFS_BUF_TO_AGI(bp); - memset(agi, 0, mp->m_sb.sb_sectsize); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); agi->agi_seqno = cpu_to_be32(agno); @@ -254,24 +306,22 @@ xfs_growfs_data_private( /* * BNO btree root block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_allocbt_buf_ops); + if (!bp) { error = ENOMEM; goto error0; } - block = XFS_BUF_TO_BLOCK(bp); - memset(block, 0, mp->m_sb.sb_blocksize); - block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); - block->bb_level = 0; - block->bb_numrecs = cpu_to_be16(1); - block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); - block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); - arec = XFS_ALLOC_REC_ADDR(mp, block, 1); + + xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0); + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -280,25 +330,22 @@ xfs_growfs_data_private( /* * CNT btree root block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_allocbt_buf_ops); if (!bp) { error = ENOMEM; goto error0; } - block = XFS_BUF_TO_BLOCK(bp); - memset(block, 0, mp->m_sb.sb_blocksize); - block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); - block->bb_level = 0; - block->bb_numrecs = cpu_to_be16(1); - block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); - block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); - arec = XFS_ALLOC_REC_ADDR(mp, block, 1); + + xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0); + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); nfree += be32_to_cpu(arec->ar_blockcount); + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -307,20 +354,17 @@ xfs_growfs_data_private( /* * INO btree root block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_inobt_buf_ops); if (!bp) { error = ENOMEM; goto error0; } - block = XFS_BUF_TO_BLOCK(bp); - memset(block, 0, mp->m_sb.sb_blocksize); - block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); - block->bb_level = 0; - block->bb_numrecs = 0; - block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); - block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + + xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0); + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -399,9 +443,28 @@ xfs_growfs_data_private( /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + error = 0; + /* + * new secondary superblocks need to be zeroed, not read from + * disk as the contents of the new area we are growing into is + * completely unknown. + */ + if (agno < oagcount) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), + XFS_FSS_TO_BB(mp, 1), 0, &bp, + &xfs_sb_buf_ops); + } else { + bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp); + XFS_FSS_TO_BB(mp, 1), 0); + if (bp) { + bp->b_ops = &xfs_sb_buf_ops; + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + } else + error = ENOMEM; + } + if (error) { xfs_warn(mp, "error %d reading secondary superblock for ag %d", @@ -409,6 +472,7 @@ xfs_growfs_data_private( break; } xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS); + /* * If we get an error writing out the alternate superblocks, * just issue a warning and continue. The real work is @@ -423,7 +487,7 @@ xfs_growfs_data_private( break; /* no point in continuing */ } } - return 0; + return error; error0: xfs_trans_cancel(tp, XFS_TRANS_ABORT); diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 76e81cff70b..5399ef222dd 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -21,7 +21,8 @@ /* * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, * other XFS code uses these values. Times are measured in centisecs (i.e. - * 100ths of a second). + * 100ths of a second) with the exception of eofb_timer, which is measured in + * seconds. */ xfs_param_t xfs_params = { /* MIN DFLT MAX */ @@ -40,4 +41,5 @@ xfs_param_t xfs_params = { .rotorstep = { 1, 1, 255 }, .inherit_nodfrg = { 0, 1, 1 }, .fstrm_timer = { 1, 30*100, 3600*100}, + .eofb_timer = { 1, 300, 3600*24}, }; diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 177a21a7ac4..a815412eab8 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -200,7 +200,8 @@ xfs_ialloc_inode_init( */ d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - mp->m_bsize * blks_per_cluster, 0); + mp->m_bsize * blks_per_cluster, + XBF_UNMAPPED); if (!fbuf) return ENOMEM; /* @@ -210,6 +211,7 @@ xfs_ialloc_inode_init( * to log a whole cluster of inodes instead of all the * individual transactions causing a lot of log traffic. */ + fbuf->b_ops = &xfs_inode_buf_ops; xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); for (i = 0; i < ninodes; i++) { int ioffset = i << mp->m_sb.sb_inodelog; @@ -250,6 +252,7 @@ xfs_ialloc_ag_alloc( /* boundary */ struct xfs_perag *pag; + memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = tp->t_mountp; @@ -431,7 +434,7 @@ xfs_ialloc_next_ag( spin_lock(&mp->m_agirotor_lock); agno = mp->m_agirotor; - if (++mp->m_agirotor == mp->m_maxagi) + if (++mp->m_agirotor >= mp->m_maxagi) mp->m_agirotor = 0; spin_unlock(&mp->m_agirotor_lock); @@ -442,14 +445,13 @@ xfs_ialloc_next_ag( * Select an allocation group to look for a free inode in, based on the parent * inode and then mode. Return the allocation group buffer. */ -STATIC xfs_buf_t * /* allocation group buffer */ +STATIC xfs_agnumber_t xfs_ialloc_ag_select( xfs_trans_t *tp, /* transaction pointer */ xfs_ino_t parent, /* parent directory inode number */ umode_t mode, /* bits set to indicate file type */ int okalloc) /* ok to allocate more space */ { - xfs_buf_t *agbp; /* allocation group header buffer */ xfs_agnumber_t agcount; /* number of ag's in the filesystem */ xfs_agnumber_t agno; /* current ag number */ int flags; /* alloc buffer locking flags */ @@ -459,6 +461,7 @@ xfs_ialloc_ag_select( int needspace; /* file mode implies space allocated */ xfs_perag_t *pag; /* per allocation group data */ xfs_agnumber_t pagno; /* parent (starting) ag number */ + int error; /* * Files of these types need at least one block if length > 0 @@ -474,7 +477,9 @@ xfs_ialloc_ag_select( if (pagno >= agcount) pagno = 0; } + ASSERT(pagno < agcount); + /* * Loop through allocation groups, looking for one with a little * free space in it. Note we don't look for free inodes, exactly. @@ -486,51 +491,45 @@ xfs_ialloc_ag_select( flags = XFS_ALLOC_FLAG_TRYLOCK; for (;;) { pag = xfs_perag_get(mp, agno); + if (!pag->pagi_inodeok) { + xfs_ialloc_next_ag(mp); + goto nextag; + } + if (!pag->pagi_init) { - if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { - agbp = NULL; + error = xfs_ialloc_pagi_init(mp, tp, agno); + if (error) goto nextag; - } - } else - agbp = NULL; + } - if (!pag->pagi_inodeok) { - xfs_ialloc_next_ag(mp); - goto unlock_nextag; + if (pag->pagi_freecount) { + xfs_perag_put(pag); + return agno; } - /* - * Is there enough free space for the file plus a block - * of inodes (if we need to allocate some)? - */ - ineed = pag->pagi_freecount ? 0 : XFS_IALLOC_BLOCKS(mp); - if (ineed && !pag->pagf_init) { - if (agbp == NULL && - xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { - agbp = NULL; + if (!okalloc) + goto nextag; + + if (!pag->pagf_init) { + error = xfs_alloc_pagf_init(mp, tp, agno, flags); + if (error) goto nextag; - } - (void)xfs_alloc_pagf_init(mp, tp, agno, flags); } - if (!ineed || pag->pagf_init) { - if (ineed && !(longest = pag->pagf_longest)) - longest = pag->pagf_flcount > 0; - if (!ineed || - (pag->pagf_freeblks >= needspace + ineed && - longest >= ineed && - okalloc)) { - if (agbp == NULL && - xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { - agbp = NULL; - goto nextag; - } - xfs_perag_put(pag); - return agbp; - } + + /* + * Is there enough free space for the file plus a block of + * inodes? (if we need to allocate some)? + */ + ineed = XFS_IALLOC_BLOCKS(mp); + longest = pag->pagf_longest; + if (!longest) + longest = pag->pagf_flcount > 0; + + if (pag->pagf_freeblks >= needspace + ineed && + longest >= ineed) { + xfs_perag_put(pag); + return agno; } -unlock_nextag: - if (agbp) - xfs_trans_brelse(tp, agbp); nextag: xfs_perag_put(pag); /* @@ -538,13 +537,13 @@ nextag: * down. */ if (XFS_FORCED_SHUTDOWN(mp)) - return NULL; + return NULLAGNUMBER; agno++; if (agno >= agcount) agno = 0; if (agno == pagno) { if (flags == 0) - return NULL; + return NULLAGNUMBER; flags = 0; } } @@ -607,195 +606,39 @@ xfs_ialloc_get_rec( } /* - * Visible inode allocation functions. - */ -/* - * Find a free (set) bit in the inode bitmask. - */ -static inline int xfs_ialloc_find_free(xfs_inofree_t *fp) -{ - return xfs_lowbit64(*fp); -} - -/* - * Allocate an inode on disk. - * Mode is used to tell whether the new inode will need space, and whether - * it is a directory. + * Allocate an inode. * - * The arguments IO_agbp and alloc_done are defined to work within - * the constraint of one allocation per transaction. - * xfs_dialloc() is designed to be called twice if it has to do an - * allocation to make more free inodes. On the first call, - * IO_agbp should be set to NULL. If an inode is available, - * i.e., xfs_dialloc() did not need to do an allocation, an inode - * number is returned. In this case, IO_agbp would be set to the - * current ag_buf and alloc_done set to false. - * If an allocation needed to be done, xfs_dialloc would return - * the current ag_buf in IO_agbp and set alloc_done to true. - * The caller should then commit the current transaction, allocate a new - * transaction, and call xfs_dialloc() again, passing in the previous - * value of IO_agbp. IO_agbp should be held across the transactions. - * Since the agbp is locked across the two calls, the second call is - * guaranteed to have a free inode available. - * - * Once we successfully pick an inode its number is returned and the - * on-disk data structures are updated. The inode itself is not read - * in, since doing so would break ordering constraints with xfs_reclaim. + * The caller selected an AG for us, and made sure that free inodes are + * available. */ -int -xfs_dialloc( - xfs_trans_t *tp, /* transaction pointer */ - xfs_ino_t parent, /* parent inode (directory) */ - umode_t mode, /* mode bits for new inode */ - int okalloc, /* ok to allocate more space */ - xfs_buf_t **IO_agbp, /* in/out ag header's buffer */ - boolean_t *alloc_done, /* true if we needed to replenish - inode freelist */ - xfs_ino_t *inop) /* inode number allocated */ +STATIC int +xfs_dialloc_ag( + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_ino_t parent, + xfs_ino_t *inop) { - xfs_agnumber_t agcount; /* number of allocation groups */ - xfs_buf_t *agbp; /* allocation group header's buffer */ - xfs_agnumber_t agno; /* allocation group number */ - xfs_agi_t *agi; /* allocation group header structure */ - xfs_btree_cur_t *cur; /* inode allocation btree cursor */ - int error; /* error return value */ - int i; /* result code */ - int ialloced; /* inode allocation status */ - int noroom = 0; /* no space for inode blk allocation */ - xfs_ino_t ino; /* fs-relative inode to be returned */ - /* REFERENCED */ - int j; /* result code */ - xfs_mount_t *mp; /* file system mount structure */ - int offset; /* index of inode in chunk */ - xfs_agino_t pagino; /* parent's AG relative inode # */ - xfs_agnumber_t pagno; /* parent's AG number */ - xfs_inobt_rec_incore_t rec; /* inode allocation record */ - xfs_agnumber_t tagno; /* testing allocation group number */ - xfs_btree_cur_t *tcur; /* temp cursor */ - xfs_inobt_rec_incore_t trec; /* temp inode allocation record */ - struct xfs_perag *pag; - - - if (*IO_agbp == NULL) { - /* - * We do not have an agbp, so select an initial allocation - * group for inode allocation. - */ - agbp = xfs_ialloc_ag_select(tp, parent, mode, okalloc); - /* - * Couldn't find an allocation group satisfying the - * criteria, give up. - */ - if (!agbp) { - *inop = NULLFSINO; - return 0; - } - agi = XFS_BUF_TO_AGI(agbp); - ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); - } else { - /* - * Continue where we left off before. In this case, we - * know that the allocation group has free inodes. - */ - agbp = *IO_agbp; - agi = XFS_BUF_TO_AGI(agbp); - ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); - ASSERT(be32_to_cpu(agi->agi_freecount) > 0); - } - mp = tp->t_mountp; - agcount = mp->m_sb.sb_agcount; - agno = be32_to_cpu(agi->agi_seqno); - tagno = agno; - pagno = XFS_INO_TO_AGNO(mp, parent); - pagino = XFS_INO_TO_AGINO(mp, parent); - - /* - * If we have already hit the ceiling of inode blocks then clear - * okalloc so we scan all available agi structures for a free - * inode. - */ - - if (mp->m_maxicount && - mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) { - noroom = 1; - okalloc = 0; - } + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); + xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); + struct xfs_perag *pag; + struct xfs_btree_cur *cur, *tcur; + struct xfs_inobt_rec_incore rec, trec; + xfs_ino_t ino; + int error; + int offset; + int i, j; - /* - * Loop until we find an allocation group that either has free inodes - * or in which we can allocate some inodes. Iterate through the - * allocation groups upward, wrapping at the end. - */ - *alloc_done = B_FALSE; - while (!agi->agi_freecount) { - /* - * Don't do anything if we're not supposed to allocate - * any blocks, just go on to the next ag. - */ - if (okalloc) { - /* - * Try to allocate some new inodes in the allocation - * group. - */ - if ((error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced))) { - xfs_trans_brelse(tp, agbp); - if (error == ENOSPC) { - *inop = NULLFSINO; - return 0; - } else - return error; - } - if (ialloced) { - /* - * We successfully allocated some inodes, return - * the current context to the caller so that it - * can commit the current transaction and call - * us again where we left off. - */ - ASSERT(be32_to_cpu(agi->agi_freecount) > 0); - *alloc_done = B_TRUE; - *IO_agbp = agbp; - *inop = NULLFSINO; - return 0; - } - } - /* - * If it failed, give up on this ag. - */ - xfs_trans_brelse(tp, agbp); - /* - * Go on to the next ag: get its ag header. - */ -nextag: - if (++tagno == agcount) - tagno = 0; - if (tagno == agno) { - *inop = NULLFSINO; - return noroom ? ENOSPC : 0; - } - pag = xfs_perag_get(mp, tagno); - if (pag->pagi_inodeok == 0) { - xfs_perag_put(pag); - goto nextag; - } - error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp); - xfs_perag_put(pag); - if (error) - goto nextag; - agi = XFS_BUF_TO_AGI(agbp); - ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); - } - /* - * Here with an allocation group that has a free inode. - * Reset agno since we may have chosen a new ag in the - * loop above. - */ - agno = tagno; - *IO_agbp = NULL; pag = xfs_perag_get(mp, agno); + ASSERT(pag->pagi_init); + ASSERT(pag->pagi_inodeok); + ASSERT(pag->pagi_freecount > 0); + restart_pagno: - cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno)); + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. @@ -995,7 +838,7 @@ newino: } alloc_inode: - offset = xfs_ialloc_find_free(&rec.ir_free); + offset = xfs_lowbit64(rec.ir_free); ASSERT(offset >= 0); ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % @@ -1028,6 +871,165 @@ error0: } /* + * Allocate an inode on disk. + * + * Mode is used to tell whether the new inode will need space, and whether it + * is a directory. + * + * This function is designed to be called twice if it has to do an allocation + * to make more free inodes. On the first call, *IO_agbp should be set to NULL. + * If an inode is available without having to performn an allocation, an inode + * number is returned. In this case, *IO_agbp is set to NULL. If an allocation + * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp. + * The caller should then commit the current transaction, allocate a + * new transaction, and call xfs_dialloc() again, passing in the previous value + * of *IO_agbp. IO_agbp should be held across the transactions. Since the AGI + * buffer is locked across the two calls, the second call is guaranteed to have + * a free inode available. + * + * Once we successfully pick an inode its number is returned and the on-disk + * data structures are updated. The inode itself is not read in, since doing so + * would break ordering constraints with xfs_reclaim. + */ +int +xfs_dialloc( + struct xfs_trans *tp, + xfs_ino_t parent, + umode_t mode, + int okalloc, + struct xfs_buf **IO_agbp, + xfs_ino_t *inop) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *agbp; + xfs_agnumber_t agno; + int error; + int ialloced; + int noroom = 0; + xfs_agnumber_t start_agno; + struct xfs_perag *pag; + + if (*IO_agbp) { + /* + * If the caller passes in a pointer to the AGI buffer, + * continue where we left off before. In this case, we + * know that the allocation group has free inodes. + */ + agbp = *IO_agbp; + goto out_alloc; + } + + /* + * We do not have an agbp, so select an initial allocation + * group for inode allocation. + */ + start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc); + if (start_agno == NULLAGNUMBER) { + *inop = NULLFSINO; + return 0; + } + + /* + * If we have already hit the ceiling of inode blocks then clear + * okalloc so we scan all available agi structures for a free + * inode. + */ + if (mp->m_maxicount && + mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) { + noroom = 1; + okalloc = 0; + } + + /* + * Loop until we find an allocation group that either has free inodes + * or in which we can allocate some inodes. Iterate through the + * allocation groups upward, wrapping at the end. + */ + agno = start_agno; + for (;;) { + pag = xfs_perag_get(mp, agno); + if (!pag->pagi_inodeok) { + xfs_ialloc_next_ag(mp); + goto nextag; + } + + if (!pag->pagi_init) { + error = xfs_ialloc_pagi_init(mp, tp, agno); + if (error) + goto out_error; + } + + /* + * Do a first racy fast path check if this AG is usable. + */ + if (!pag->pagi_freecount && !okalloc) + goto nextag; + + /* + * Then read in the AGI buffer and recheck with the AGI buffer + * lock held. + */ + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + if (error) + goto out_error; + + if (pag->pagi_freecount) { + xfs_perag_put(pag); + goto out_alloc; + } + + if (!okalloc) + goto nextag_relse_buffer; + + + error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced); + if (error) { + xfs_trans_brelse(tp, agbp); + + if (error != ENOSPC) + goto out_error; + + xfs_perag_put(pag); + *inop = NULLFSINO; + return 0; + } + + if (ialloced) { + /* + * We successfully allocated some inodes, return + * the current context to the caller so that it + * can commit the current transaction and call + * us again where we left off. + */ + ASSERT(pag->pagi_freecount > 0); + xfs_perag_put(pag); + + *IO_agbp = agbp; + *inop = NULLFSINO; + return 0; + } + +nextag_relse_buffer: + xfs_trans_brelse(tp, agbp); +nextag: + xfs_perag_put(pag); + if (++agno == mp->m_sb.sb_agcount) + agno = 0; + if (agno == start_agno) { + *inop = NULLFSINO; + return noroom ? ENOSPC : 0; + } + } + +out_alloc: + *IO_agbp = NULL; + return xfs_dialloc_ag(tp, agbp, parent, inop); +out_error: + xfs_perag_put(pag); + return XFS_ERROR(error); +} + +/* * Free disk inode. Carefully avoids touching the incore inode, all * manipulations incore are the caller's responsibility. * The on-disk inode is not changed by this operation, only the @@ -1472,6 +1474,57 @@ xfs_check_agi_unlinked( #define xfs_check_agi_unlinked(agi) #endif +static void +xfs_agi_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); + int agi_ok; + + /* + * Validate the magic number of the agi block. + */ + agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) && + XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)); + + /* + * during growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. + */ + if (bp->b_pag) + agi_ok = agi_ok && be32_to_cpu(agi->agi_seqno) == + bp->b_pag->pag_agno; + + if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, + XFS_RANDOM_IALLOC_READ_AGI))) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + xfs_check_agi_unlinked(agi); +} + +static void +xfs_agi_read_verify( + struct xfs_buf *bp) +{ + xfs_agi_verify(bp); +} + +static void +xfs_agi_write_verify( + struct xfs_buf *bp) +{ + xfs_agi_verify(bp); +} + +const struct xfs_buf_ops xfs_agi_buf_ops = { + .verify_read = xfs_agi_read_verify, + .verify_write = xfs_agi_write_verify, +}; + /* * Read in the allocation group header (inode allocation section) */ @@ -1482,38 +1535,18 @@ xfs_read_agi( xfs_agnumber_t agno, /* allocation group number */ struct xfs_buf **bpp) /* allocation group hdr buf */ { - struct xfs_agi *agi; /* allocation group header */ - int agi_ok; /* agi is consistent */ int error; ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, bpp); + XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops); if (error) return error; ASSERT(!xfs_buf_geterror(*bpp)); - agi = XFS_BUF_TO_AGI(*bpp); - - /* - * Validate the magic number of the agi block. - */ - agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) && - XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) && - be32_to_cpu(agi->agi_seqno) == agno; - if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, - XFS_RANDOM_IALLOC_READ_AGI))) { - XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW, - mp, agi); - xfs_trans_brelse(tp, *bpp); - return XFS_ERROR(EFSCORRUPTED); - } - xfs_buf_set_ref(*bpp, XFS_AGI_REF); - - xfs_check_agi_unlinked(agi); return 0; } diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 65ac57c8063..c8da3df271e 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -75,8 +75,6 @@ xfs_dialloc( umode_t mode, /* mode bits for new inode */ int okalloc, /* ok to allocate more space */ struct xfs_buf **agbp, /* buf for a.g. inode header */ - boolean_t *alloc_done, /* an allocation was done to replenish - the free inodes */ xfs_ino_t *inop); /* inode number allocated */ /* @@ -149,7 +147,9 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, /* * Get the data from the pointed-to record. */ -extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, +int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_inobt_rec_incore_t *rec, int *stat); +extern const struct xfs_buf_ops xfs_agi_buf_ops; + #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 2b8b7a37aa1..bec344b3650 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -33,6 +33,7 @@ #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_error.h" +#include "xfs_trace.h" STATIC int @@ -181,6 +182,59 @@ xfs_inobt_key_diff( cur->bc_rec.i.ir_startino; } +void +xfs_inobt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + unsigned int level; + int sblock_ok; /* block passes checks */ + + /* magic number and level verification */ + level = be16_to_cpu(block->bb_level); + sblock_ok = block->bb_magic == cpu_to_be32(XFS_IBT_MAGIC) && + level < mp->m_in_maxlevels; + + /* numrecs verification */ + sblock_ok = sblock_ok && + be16_to_cpu(block->bb_numrecs) <= mp->m_inobt_mxr[level != 0]; + + /* sibling pointer verification */ + sblock_ok = sblock_ok && + (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) && + block->bb_u.s.bb_leftsib && + (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) && + block->bb_u.s.bb_rightsib; + + if (!sblock_ok) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +xfs_inobt_read_verify( + struct xfs_buf *bp) +{ + xfs_inobt_verify(bp); +} + +static void +xfs_inobt_write_verify( + struct xfs_buf *bp) +{ + xfs_inobt_verify(bp); +} + +const struct xfs_buf_ops xfs_inobt_buf_ops = { + .verify_read = xfs_inobt_read_verify, + .verify_write = xfs_inobt_write_verify, +}; + #ifdef DEBUG STATIC int xfs_inobt_keys_inorder( @@ -218,6 +272,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, + .buf_ops = &xfs_inobt_buf_ops, #ifdef DEBUG .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index f782ad0c476..25c0239a8ea 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h @@ -109,4 +109,6 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); +extern const struct xfs_buf_ops xfs_inobt_buf_ops; + #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_icache.c index 1e9ee064dbb..96e344e3e92 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_icache.c @@ -19,6 +19,7 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" @@ -35,11 +36,425 @@ #include "xfs_quota.h" #include "xfs_trace.h" #include "xfs_fsops.h" +#include "xfs_icache.h" #include <linux/kthread.h> #include <linux/freezer.h> -struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ +STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_inode *ip); + +/* + * Allocate and initialise an xfs_inode. + */ +STATIC struct xfs_inode * +xfs_inode_alloc( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + struct xfs_inode *ip; + + /* + * if this didn't occur in transactions, we could use + * KM_MAYFAIL and return NULL here on ENOMEM. Set the + * code up to do this anyway. + */ + ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); + if (!ip) + return NULL; + if (inode_init_always(mp->m_super, VFS_I(ip))) { + kmem_zone_free(xfs_inode_zone, ip); + return NULL; + } + + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(!xfs_isiflocked(ip)); + ASSERT(ip->i_ino == 0); + + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + + /* initialise the xfs inode */ + ip->i_ino = ino; + ip->i_mount = mp; + memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); + ip->i_afp = NULL; + memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); + ip->i_flags = 0; + ip->i_delayed_blks = 0; + memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); + + return ip; +} + +STATIC void +xfs_inode_free_callback( + struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct xfs_inode *ip = XFS_I(inode); + + kmem_zone_free(xfs_inode_zone, ip); +} + +STATIC void +xfs_inode_free( + struct xfs_inode *ip) +{ + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + xfs_idestroy_fork(ip, XFS_DATA_FORK); + break; + } + + if (ip->i_afp) + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + + if (ip->i_itemp) { + ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); + xfs_inode_item_destroy(ip); + ip->i_itemp = NULL; + } + + /* asserts to verify all state is correct here */ + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(!xfs_isiflocked(ip)); + + /* + * Because we use RCU freeing we need to ensure the inode always + * appears to be reclaimed with an invalid inode number when in the + * free state. The ip->i_flags_lock provides the barrier against lookup + * races. + */ + spin_lock(&ip->i_flags_lock); + ip->i_flags = XFS_IRECLAIM; + ip->i_ino = 0; + spin_unlock(&ip->i_flags_lock); + + call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); +} + +/* + * Check the validity of the inode we just found it the cache + */ +static int +xfs_iget_cache_hit( + struct xfs_perag *pag, + struct xfs_inode *ip, + xfs_ino_t ino, + int flags, + int lock_flags) __releases(RCU) +{ + struct inode *inode = VFS_I(ip); + struct xfs_mount *mp = ip->i_mount; + int error; + + /* + * check for re-use of an inode within an RCU grace period due to the + * radix tree nodes not being updated yet. We monitor for this by + * setting the inode number to zero before freeing the inode structure. + * If the inode has been reallocated and set up, then the inode number + * will not match, so check for that, too. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != ino) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; + goto out_error; + } + + + /* + * If we are racing with another cache hit that is currently + * instantiating this inode or currently recycling it out of + * reclaimabe state, wait for the initialisation to complete + * before continuing. + * + * XXX(hch): eventually we should do something equivalent to + * wait_on_inode to wait for these flags to be cleared + * instead of polling for it. + */ + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; + goto out_error; + } + + /* + * If lookup is racing with unlink return an error immediately. + */ + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { + error = ENOENT; + goto out_error; + } + + /* + * If IRECLAIMABLE is set, we've torn down the VFS inode already. + * Need to carefully get it back into useable state. + */ + if (ip->i_flags & XFS_IRECLAIMABLE) { + trace_xfs_iget_reclaim(ip); + + /* + * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode + * from stomping over us while we recycle the inode. We can't + * clear the radix tree reclaimable tag yet as it requires + * pag_ici_lock to be held exclusive. + */ + ip->i_flags |= XFS_IRECLAIM; + + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + + error = -inode_init_always(mp->m_super, inode); + if (error) { + /* + * Re-initializing the inode failed, and we are in deep + * trouble. Try to re-add it to the reclaim list. + */ + rcu_read_lock(); + spin_lock(&ip->i_flags_lock); + + ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); + ASSERT(ip->i_flags & XFS_IRECLAIMABLE); + trace_xfs_iget_reclaim_fail(ip); + goto out_error; + } + + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + /* + * Clear the per-lifetime state in the inode as we are now + * effectively a new inode and need to return to the initial + * state before reuse occurs. + */ + ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; + ip->i_flags |= XFS_INEW; + __xfs_inode_clear_reclaim_tag(mp, pag, ip); + inode->i_state = I_NEW; + + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + } else { + /* If the VFS inode is being torn down, pause and try again. */ + if (!igrab(inode)) { + trace_xfs_iget_skip(ip); + error = EAGAIN; + goto out_error; + } + + /* We've got a live one. */ + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + trace_xfs_iget_hit(ip); + } + + if (lock_flags != 0) + xfs_ilock(ip, lock_flags); + + xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); + XFS_STATS_INC(xs_ig_found); + + return 0; + +out_error: + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + return error; +} + + +static int +xfs_iget_cache_miss( + struct xfs_mount *mp, + struct xfs_perag *pag, + xfs_trans_t *tp, + xfs_ino_t ino, + struct xfs_inode **ipp, + int flags, + int lock_flags) +{ + struct xfs_inode *ip; + int error; + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); + int iflags; + + ip = xfs_inode_alloc(mp, ino); + if (!ip) + return ENOMEM; + + error = xfs_iread(mp, tp, ip, flags); + if (error) + goto out_destroy; + + trace_xfs_iget_miss(ip); + + if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { + error = ENOENT; + goto out_destroy; + } + + /* + * Preload the radix tree so we can insert safely under the + * write spinlock. Note that we cannot sleep inside the preload + * region. Since we can be called from transaction context, don't + * recurse into the file system. + */ + if (radix_tree_preload(GFP_NOFS)) { + error = EAGAIN; + goto out_destroy; + } + + /* + * Because the inode hasn't been added to the radix-tree yet it can't + * be found by another thread, so we can do the non-sleeping lock here. + */ + if (lock_flags) { + if (!xfs_ilock_nowait(ip, lock_flags)) + BUG(); + } + + /* + * These values must be set before inserting the inode into the radix + * tree as the moment it is inserted a concurrent lookup (allowed by the + * RCU locking mechanism) can find it and that lookup must see that this + * is an inode currently under construction (i.e. that XFS_INEW is set). + * The ip->i_flags_lock that protects the XFS_INEW flag forms the + * memory barrier that ensures this detection works correctly at lookup + * time. + */ + iflags = XFS_INEW; + if (flags & XFS_IGET_DONTCACHE) + iflags |= XFS_IDONTCACHE; + ip->i_udquot = ip->i_gdquot = NULL; + xfs_iflags_set(ip, iflags); + + /* insert the new inode */ + spin_lock(&pag->pag_ici_lock); + error = radix_tree_insert(&pag->pag_ici_root, agino, ip); + if (unlikely(error)) { + WARN_ON(error != -EEXIST); + XFS_STATS_INC(xs_ig_dup); + error = EAGAIN; + goto out_preload_end; + } + spin_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); + + *ipp = ip; + return 0; + +out_preload_end: + spin_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); + if (lock_flags) + xfs_iunlock(ip, lock_flags); +out_destroy: + __destroy_inode(VFS_I(ip)); + xfs_inode_free(ip); + return error; +} + +/* + * Look up an inode by number in the given file system. + * The inode is looked up in the cache held in each AG. + * If the inode is found in the cache, initialise the vfs inode + * if necessary. + * + * If it is not in core, read it in from the file system's device, + * add it to the cache and initialise the vfs inode. + * + * The inode is locked according to the value of the lock_flags parameter. + * This flag parameter indicates how and if the inode's IO lock and inode lock + * should be taken. + * + * mp -- the mount point structure for the current file system. It points + * to the inode hash table. + * tp -- a pointer to the current transaction if there is one. This is + * simply passed through to the xfs_iread() call. + * ino -- the number of the inode desired. This is the unique identifier + * within the file system for the inode being requested. + * lock_flags -- flags indicating how to lock the inode. See the comment + * for xfs_ilock() for a list of valid values. + */ +int +xfs_iget( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_ino_t ino, + uint flags, + uint lock_flags, + xfs_inode_t **ipp) +{ + xfs_inode_t *ip; + int error; + xfs_perag_t *pag; + xfs_agino_t agino; + + /* + * xfs_reclaim_inode() uses the ILOCK to ensure an inode + * doesn't get freed while it's being referenced during a + * radix tree traversal here. It assumes this function + * aqcuires only the ILOCK (and therefore it has no need to + * involve the IOLOCK in this synchronization). + */ + ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); + + /* reject inode numbers outside existing AGs */ + if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) + return EINVAL; + + /* get the perag structure and ensure that it's inode capable */ + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); + agino = XFS_INO_TO_AGINO(mp, ino); + +again: + error = 0; + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, agino); + + if (ip) { + error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); + if (error) + goto out_error_or_again; + } else { + rcu_read_unlock(); + XFS_STATS_INC(xs_ig_missed); + + error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, + flags, lock_flags); + if (error) + goto out_error_or_again; + } + xfs_perag_put(pag); + + *ipp = ip; + + /* + * If we have a real type for an on-disk inode, we can set ops(&unlock) + * now. If it's a new inode being created, xfs_ialloc will handle it. + */ + if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) + xfs_setup_inode(ip); + return 0; + +out_error_or_again: + if (error == EAGAIN) { + delay(1); + goto again; + } + xfs_perag_put(pag); + return error; +} /* * The inode lookup is done in batches to keep the amount of lock traffic and @@ -101,8 +516,11 @@ xfs_inode_ag_walk( struct xfs_mount *mp, struct xfs_perag *pag, int (*execute)(struct xfs_inode *ip, - struct xfs_perag *pag, int flags), - int flags) + struct xfs_perag *pag, int flags, + void *args), + int flags, + void *args, + int tag) { uint32_t first_index; int last_error = 0; @@ -121,9 +539,17 @@ restart: int i; rcu_read_lock(); - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + + if (tag == -1) + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void **)batch, first_index, XFS_LOOKUP_BATCH); + else + nr_found = radix_tree_gang_lookup_tag( + &pag->pag_ici_root, + (void **) batch, first_index, + XFS_LOOKUP_BATCH, tag); + if (!nr_found) { rcu_read_unlock(); break; @@ -164,7 +590,7 @@ restart: for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; - error = execute(batch[i], pag, flags); + error = execute(batch[i], pag, flags, args); IRELE(batch[i]); if (error == EAGAIN) { skipped++; @@ -189,12 +615,40 @@ restart: return last_error; } +/* + * Background scanning to trim post-EOF preallocated space. This is queued + * based on the 'background_prealloc_discard_period' tunable (5m by default). + */ +STATIC void +xfs_queue_eofblocks( + struct xfs_mount *mp) +{ + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG)) + queue_delayed_work(mp->m_eofblocks_workqueue, + &mp->m_eofblocks_work, + msecs_to_jiffies(xfs_eofb_secs * 1000)); + rcu_read_unlock(); +} + +void +xfs_eofblocks_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_eofblocks_work); + xfs_icache_free_eofblocks(mp, NULL); + xfs_queue_eofblocks(mp); +} + int xfs_inode_ag_iterator( struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, - struct xfs_perag *pag, int flags), - int flags) + struct xfs_perag *pag, int flags, + void *args), + int flags, + void *args) { struct xfs_perag *pag; int error = 0; @@ -204,7 +658,7 @@ xfs_inode_ag_iterator( ag = 0; while ((pag = xfs_perag_get(mp, ag))) { ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags); + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1); xfs_perag_put(pag); if (error) { last_error = error; @@ -215,215 +669,50 @@ xfs_inode_ag_iterator( return XFS_ERROR(last_error); } -STATIC int -xfs_sync_inode_data( - struct xfs_inode *ip, - struct xfs_perag *pag, - int flags) -{ - struct inode *inode = VFS_I(ip); - struct address_space *mapping = inode->i_mapping; - int error = 0; - - if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - return 0; - - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { - if (flags & SYNC_TRYLOCK) - return 0; - xfs_ilock(ip, XFS_IOLOCK_SHARED); - } - - error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? - 0 : XBF_ASYNC, FI_NONE); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return error; -} - -/* - * Write out pagecache data for the whole filesystem. - */ -STATIC int -xfs_sync_data( - struct xfs_mount *mp, - int flags) -{ - int error; - - ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); - - error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags); - if (error) - return XFS_ERROR(error); - - xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); - return 0; -} - -STATIC int -xfs_sync_fsdata( - struct xfs_mount *mp) -{ - struct xfs_buf *bp; - int error; - - /* - * If the buffer is pinned then push on the log so we won't get stuck - * waiting in the write for someone, maybe ourselves, to flush the log. - * - * Even though we just pushed the log above, we did not have the - * superblock buffer locked at that point so it can become pinned in - * between there and here. - */ - bp = xfs_getsb(mp, 0); - if (xfs_buf_ispinned(bp)) - xfs_log_force(mp, 0); - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - return error; -} - -/* - * When remounting a filesystem read-only or freezing the filesystem, we have - * two phases to execute. This first phase is syncing the data before we - * quiesce the filesystem, and the second is flushing all the inodes out after - * we've waited for all the transactions created by the first phase to - * complete. The second phase ensures that the inodes are written to their - * location on disk rather than just existing in transactions in the log. This - * means after a quiesce there is no log replay required to write the inodes to - * disk (this is the main difference between a sync and a quiesce). - */ -/* - * First stage of freeze - no writers will make progress now we are here, - * so we flush delwri and delalloc buffers here, then wait for all I/O to - * complete. Data is frozen at that point. Metadata is not frozen, - * transactions can still occur here so don't bother emptying the AIL - * because it'll just get dirty again. - */ int -xfs_quiesce_data( - struct xfs_mount *mp) -{ - int error, error2 = 0; - - /* force out the log */ - xfs_log_force(mp, XFS_LOG_SYNC); - - /* write superblock and hoover up shutdown errors */ - error = xfs_sync_fsdata(mp); - - /* mark the log as covered if needed */ - if (xfs_log_need_covered(mp)) - error2 = xfs_fs_log_dummy(mp); - - return error ? error : error2; -} - -/* - * Second stage of a quiesce. The data is already synced, now we have to take - * care of the metadata. New transactions are already blocked, so we need to - * wait for any remaining transactions to drain out before proceeding. - */ -void -xfs_quiesce_attr( - struct xfs_mount *mp) -{ - int error = 0; - - /* wait for all modifications to complete */ - while (atomic_read(&mp->m_active_trans) > 0) - delay(100); - - /* reclaim inodes to do any IO before the freeze completes */ - xfs_reclaim_inodes(mp, 0); - xfs_reclaim_inodes(mp, SYNC_WAIT); - - /* flush all pending changes from the AIL */ - xfs_ail_push_all_sync(mp->m_ail); - - /* - * Just warn here till VFS can correctly support - * read-only remount without racing. - */ - WARN_ON(atomic_read(&mp->m_active_trans) != 0); - - /* Push the superblock and write an unmount record */ - error = xfs_log_sbcount(mp); - if (error) - xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " - "Frozen image may not be consistent."); - xfs_log_unmount_write(mp); - - /* - * At this point we might have modified the superblock again and thus - * added an item to the AIL, thus flush it again. - */ - xfs_ail_push_all_sync(mp->m_ail); -} - -static void -xfs_syncd_queue_sync( - struct xfs_mount *mp) -{ - queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work, - msecs_to_jiffies(xfs_syncd_centisecs * 10)); -} - -/* - * Every sync period we need to unpin all items, reclaim inodes and sync - * disk quotas. We might need to cover the log to indicate that the - * filesystem is idle and not frozen. - */ -STATIC void -xfs_sync_worker( - struct work_struct *work) +xfs_inode_ag_iterator_tag( + struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, + struct xfs_perag *pag, int flags, + void *args), + int flags, + void *args, + int tag) { - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_sync_work); - int error; - - /* - * We shouldn't write/force the log if we are in the mount/unmount - * process or on a read only filesystem. The workqueue still needs to be - * active in both cases, however, because it is used for inode reclaim - * during these times. Use the MS_ACTIVE flag to avoid doing anything - * during mount. Doing work during unmount is avoided by calling - * cancel_delayed_work_sync on this work queue before tearing down - * the ail and the log in xfs_log_unmount. - */ - if (!(mp->m_super->s_flags & MS_ACTIVE) && - !(mp->m_flags & XFS_MOUNT_RDONLY)) { - /* dgc: errors ignored here */ - if (mp->m_super->s_frozen == SB_UNFROZEN && - xfs_log_need_covered(mp)) - error = xfs_fs_log_dummy(mp); - else - xfs_log_force(mp, 0); + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; - /* start pushing all the metadata that is currently - * dirty */ - xfs_ail_push_all(mp->m_ail); + ag = 0; + while ((pag = xfs_perag_get_tag(mp, ag, tag))) { + ag = pag->pag_agno + 1; + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag); + xfs_perag_put(pag); + if (error) { + last_error = error; + if (error == EFSCORRUPTED) + break; + } } - - /* queue us up again */ - xfs_syncd_queue_sync(mp); + return XFS_ERROR(last_error); } /* * Queue a new inode reclaim pass if there are reclaimable inodes and there * isn't a reclaim pass already in progress. By default it runs every 5s based - * on the xfs syncd work default of 30s. Perhaps this should have it's own + * on the xfs periodic sync default of 30s. Perhaps this should have it's own * tunable, but that can be done if this method proves to be ineffective or too * aggressive. */ static void -xfs_syncd_queue_reclaim( +xfs_reclaim_work_queue( struct xfs_mount *mp) { rcu_read_lock(); if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { - queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, + queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); } rcu_read_unlock(); @@ -436,7 +725,7 @@ xfs_syncd_queue_reclaim( * goes low. It scans as quickly as possible avoiding locked inodes or those * already being flushed, and once done schedules a future pass. */ -STATIC void +void xfs_reclaim_worker( struct work_struct *work) { @@ -444,65 +733,10 @@ xfs_reclaim_worker( struct xfs_mount, m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_TRYLOCK); - xfs_syncd_queue_reclaim(mp); + xfs_reclaim_work_queue(mp); } -/* - * Flush delayed allocate data, attempting to free up reserved space - * from existing allocations. At this point a new allocation attempt - * has failed with ENOSPC and we are in the process of scratching our - * heads, looking about for more room. - * - * Queue a new data flush if there isn't one already in progress and - * wait for completion of the flush. This means that we only ever have one - * inode flush in progress no matter how many ENOSPC events are occurring and - * so will prevent the system from bogging down due to every concurrent - * ENOSPC event scanning all the active inodes in the system for writeback. - */ -void -xfs_flush_inodes( - struct xfs_inode *ip) -{ - struct xfs_mount *mp = ip->i_mount; - - queue_work(xfs_syncd_wq, &mp->m_flush_work); - flush_work_sync(&mp->m_flush_work); -} - -STATIC void -xfs_flush_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(work, - struct xfs_mount, m_flush_work); - - xfs_sync_data(mp, SYNC_TRYLOCK); - xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT); -} - -int -xfs_syncd_init( - struct xfs_mount *mp) -{ - INIT_WORK(&mp->m_flush_work, xfs_flush_worker); - INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); - INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); - - xfs_syncd_queue_sync(mp); - - return 0; -} - -void -xfs_syncd_stop( - struct xfs_mount *mp) -{ - cancel_delayed_work_sync(&mp->m_sync_work); - cancel_delayed_work_sync(&mp->m_reclaim_work); - cancel_work_sync(&mp->m_flush_work); -} - -void +static void __xfs_inode_set_reclaim_tag( struct xfs_perag *pag, struct xfs_inode *ip) @@ -520,7 +754,7 @@ __xfs_inode_set_reclaim_tag( spin_unlock(&ip->i_mount->m_perag_lock); /* schedule periodic background inode reclaim */ - xfs_syncd_queue_reclaim(ip->i_mount); + xfs_reclaim_work_queue(ip->i_mount); trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, -1, _RET_IP_); @@ -568,7 +802,7 @@ __xfs_inode_clear_reclaim( } } -void +STATIC void __xfs_inode_clear_reclaim_tag( xfs_mount_t *mp, xfs_perag_t *pag, @@ -712,8 +946,8 @@ restart: * Note that xfs_iflush will never block on the inode buffer lock, as * xfs_ifree_cluster() can lock the inode buffer before it locks the * ip->i_lock, and we are doing the exact opposite here. As a result, - * doing a blocking xfs_itobp() to get the cluster buffer would result - * in an ABBA deadlock with xfs_ifree_cluster(). + * doing a blocking xfs_imap_to_bp() to get the cluster buffer would + * result in an ABBA deadlock with xfs_ifree_cluster(). * * As xfs_ifree_cluser() must gather all inodes that are active in the * cache to mark them stale, if we hit this case we don't actually want @@ -778,9 +1012,9 @@ out: /* * We could return EAGAIN here to make reclaim rescan the inode tree in * a short while. However, this just burns CPU time scanning the tree - * waiting for IO to complete and xfssyncd never goes back to the idle - * state. Instead, return 0 to let the next scheduled background reclaim - * attempt to reclaim the inode again. + * waiting for IO to complete and the reclaim work never goes back to + * the idle state. Instead, return 0 to let the next scheduled + * background reclaim attempt to reclaim the inode again. */ return 0; } @@ -791,7 +1025,7 @@ out: * then a shut down during filesystem unmount reclaim walk leak all the * unreclaimed inodes. */ -int +STATIC int xfs_reclaim_inodes_ag( struct xfs_mount *mp, int flags, @@ -936,7 +1170,7 @@ xfs_reclaim_inodes_nr( int nr_to_scan) { /* kick background reclaimer and push the AIL */ - xfs_syncd_queue_reclaim(mp); + xfs_reclaim_work_queue(mp); xfs_ail_push_all(mp->m_ail); xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); @@ -962,3 +1196,146 @@ xfs_reclaim_inodes_count( return reclaimable; } +STATIC int +xfs_inode_match_id( + struct xfs_inode *ip, + struct xfs_eofblocks *eofb) +{ + if (eofb->eof_flags & XFS_EOF_FLAGS_UID && + ip->i_d.di_uid != eofb->eof_uid) + return 0; + + if (eofb->eof_flags & XFS_EOF_FLAGS_GID && + ip->i_d.di_gid != eofb->eof_gid) + return 0; + + if (eofb->eof_flags & XFS_EOF_FLAGS_PRID && + xfs_get_projid(ip) != eofb->eof_prid) + return 0; + + return 1; +} + +STATIC int +xfs_inode_free_eofblocks( + struct xfs_inode *ip, + struct xfs_perag *pag, + int flags, + void *args) +{ + int ret; + struct xfs_eofblocks *eofb = args; + + if (!xfs_can_free_eofblocks(ip, false)) { + /* inode could be preallocated or append-only */ + trace_xfs_inode_free_eofblocks_invalid(ip); + xfs_inode_clear_eofblocks_tag(ip); + return 0; + } + + /* + * If the mapping is dirty the operation can block and wait for some + * time. Unless we are waiting, skip it. + */ + if (!(flags & SYNC_WAIT) && + mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) + return 0; + + if (eofb) { + if (!xfs_inode_match_id(ip, eofb)) + return 0; + + /* skip the inode if the file size is too small */ + if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && + XFS_ISIZE(ip) < eofb->eof_min_file_size) + return 0; + } + + ret = xfs_free_eofblocks(ip->i_mount, ip, true); + + /* don't revisit the inode if we're not waiting */ + if (ret == EAGAIN && !(flags & SYNC_WAIT)) + ret = 0; + + return ret; +} + +int +xfs_icache_free_eofblocks( + struct xfs_mount *mp, + struct xfs_eofblocks *eofb) +{ + int flags = SYNC_TRYLOCK; + + if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC)) + flags = SYNC_WAIT; + + return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags, + eofb, XFS_ICI_EOFBLOCKS_TAG); +} + +void +xfs_inode_set_eofblocks_tag( + xfs_inode_t *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + int tagged; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + trace_xfs_inode_set_eofblocks_tag(ip); + + tagged = radix_tree_tagged(&pag->pag_ici_root, + XFS_ICI_EOFBLOCKS_TAG); + radix_tree_tag_set(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + if (!tagged) { + /* propagate the eofblocks tag up into the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_set(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + + /* kick off background trimming */ + xfs_queue_eofblocks(ip->i_mount); + + trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + +void +xfs_inode_clear_eofblocks_tag( + xfs_inode_t *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + trace_xfs_inode_clear_eofblocks_tag(ip); + + radix_tree_tag_clear(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) { + /* clear the eofblocks tag from the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_clear(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_icache.h index 941202e7ac6..e0f138c70a2 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_icache.h @@ -24,28 +24,30 @@ struct xfs_perag; #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ #define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ -extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ +int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, + uint flags, uint lock_flags, xfs_inode_t **ipp); -int xfs_syncd_init(struct xfs_mount *mp); -void xfs_syncd_stop(struct xfs_mount *mp); - -int xfs_quiesce_data(struct xfs_mount *mp); -void xfs_quiesce_attr(struct xfs_mount *mp); - -void xfs_flush_inodes(struct xfs_inode *ip); +void xfs_reclaim_worker(struct work_struct *work); int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); int xfs_reclaim_inodes_count(struct xfs_mount *mp); void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); -void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); -void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, - struct xfs_inode *ip); + +void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); +void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); +int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); +void xfs_eofblocks_worker(struct work_struct *); int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), - int flags); + int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, + int flags, void *args), + int flags, void *args); +int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, + int flags, void *args), + int flags, void *args, int tag); #endif diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c deleted file mode 100644 index 1bb4365e8c2..00000000000 --- a/fs/xfs/xfs_iget.c +++ /dev/null @@ -1,720 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_acl.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_quota.h" -#include "xfs_utils.h" -#include "xfs_trans_priv.h" -#include "xfs_inode_item.h" -#include "xfs_bmap.h" -#include "xfs_trace.h" - - -/* - * Define xfs inode iolock lockdep classes. We need to ensure that all active - * inodes are considered the same for lockdep purposes, including inodes that - * are recycled through the XFS_IRECLAIMABLE state. This is the the only way to - * guarantee the locks are considered the same when there are multiple lock - * initialisation siteÑ•. Also, define a reclaimable inode class so it is - * obvious in lockdep reports which class the report is against. - */ -static struct lock_class_key xfs_iolock_active; -struct lock_class_key xfs_iolock_reclaimable; - -/* - * Allocate and initialise an xfs_inode. - */ -STATIC struct xfs_inode * -xfs_inode_alloc( - struct xfs_mount *mp, - xfs_ino_t ino) -{ - struct xfs_inode *ip; - - /* - * if this didn't occur in transactions, we could use - * KM_MAYFAIL and return NULL here on ENOMEM. Set the - * code up to do this anyway. - */ - ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); - if (!ip) - return NULL; - if (inode_init_always(mp->m_super, VFS_I(ip))) { - kmem_zone_free(xfs_inode_zone, ip); - return NULL; - } - - ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(!xfs_isiflocked(ip)); - ASSERT(ip->i_ino == 0); - - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - lockdep_set_class_and_name(&ip->i_iolock.mr_lock, - &xfs_iolock_active, "xfs_iolock_active"); - - /* initialise the xfs inode */ - ip->i_ino = ino; - ip->i_mount = mp; - memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); - ip->i_afp = NULL; - memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); - ip->i_flags = 0; - ip->i_delayed_blks = 0; - memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); - - return ip; -} - -STATIC void -xfs_inode_free_callback( - struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - struct xfs_inode *ip = XFS_I(inode); - - kmem_zone_free(xfs_inode_zone, ip); -} - -void -xfs_inode_free( - struct xfs_inode *ip) -{ - switch (ip->i_d.di_mode & S_IFMT) { - case S_IFREG: - case S_IFDIR: - case S_IFLNK: - xfs_idestroy_fork(ip, XFS_DATA_FORK); - break; - } - - if (ip->i_afp) - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - - if (ip->i_itemp) { - ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); - xfs_inode_item_destroy(ip); - ip->i_itemp = NULL; - } - - /* asserts to verify all state is correct here */ - ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(!xfs_isiflocked(ip)); - - /* - * Because we use RCU freeing we need to ensure the inode always - * appears to be reclaimed with an invalid inode number when in the - * free state. The ip->i_flags_lock provides the barrier against lookup - * races. - */ - spin_lock(&ip->i_flags_lock); - ip->i_flags = XFS_IRECLAIM; - ip->i_ino = 0; - spin_unlock(&ip->i_flags_lock); - - call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); -} - -/* - * Check the validity of the inode we just found it the cache - */ -static int -xfs_iget_cache_hit( - struct xfs_perag *pag, - struct xfs_inode *ip, - xfs_ino_t ino, - int flags, - int lock_flags) __releases(RCU) -{ - struct inode *inode = VFS_I(ip); - struct xfs_mount *mp = ip->i_mount; - int error; - - /* - * check for re-use of an inode within an RCU grace period due to the - * radix tree nodes not being updated yet. We monitor for this by - * setting the inode number to zero before freeing the inode structure. - * If the inode has been reallocated and set up, then the inode number - * will not match, so check for that, too. - */ - spin_lock(&ip->i_flags_lock); - if (ip->i_ino != ino) { - trace_xfs_iget_skip(ip); - XFS_STATS_INC(xs_ig_frecycle); - error = EAGAIN; - goto out_error; - } - - - /* - * If we are racing with another cache hit that is currently - * instantiating this inode or currently recycling it out of - * reclaimabe state, wait for the initialisation to complete - * before continuing. - * - * XXX(hch): eventually we should do something equivalent to - * wait_on_inode to wait for these flags to be cleared - * instead of polling for it. - */ - if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { - trace_xfs_iget_skip(ip); - XFS_STATS_INC(xs_ig_frecycle); - error = EAGAIN; - goto out_error; - } - - /* - * If lookup is racing with unlink return an error immediately. - */ - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { - error = ENOENT; - goto out_error; - } - - /* - * If IRECLAIMABLE is set, we've torn down the VFS inode already. - * Need to carefully get it back into useable state. - */ - if (ip->i_flags & XFS_IRECLAIMABLE) { - trace_xfs_iget_reclaim(ip); - - /* - * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode - * from stomping over us while we recycle the inode. We can't - * clear the radix tree reclaimable tag yet as it requires - * pag_ici_lock to be held exclusive. - */ - ip->i_flags |= XFS_IRECLAIM; - - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - - error = -inode_init_always(mp->m_super, inode); - if (error) { - /* - * Re-initializing the inode failed, and we are in deep - * trouble. Try to re-add it to the reclaim list. - */ - rcu_read_lock(); - spin_lock(&ip->i_flags_lock); - - ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); - ASSERT(ip->i_flags & XFS_IRECLAIMABLE); - trace_xfs_iget_reclaim_fail(ip); - goto out_error; - } - - spin_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - - /* - * Clear the per-lifetime state in the inode as we are now - * effectively a new inode and need to return to the initial - * state before reuse occurs. - */ - ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; - ip->i_flags |= XFS_INEW; - __xfs_inode_clear_reclaim_tag(mp, pag, ip); - inode->i_state = I_NEW; - - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - lockdep_set_class_and_name(&ip->i_iolock.mr_lock, - &xfs_iolock_active, "xfs_iolock_active"); - - spin_unlock(&ip->i_flags_lock); - spin_unlock(&pag->pag_ici_lock); - } else { - /* If the VFS inode is being torn down, pause and try again. */ - if (!igrab(inode)) { - trace_xfs_iget_skip(ip); - error = EAGAIN; - goto out_error; - } - - /* We've got a live one. */ - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - trace_xfs_iget_hit(ip); - } - - if (lock_flags != 0) - xfs_ilock(ip, lock_flags); - - xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); - XFS_STATS_INC(xs_ig_found); - - return 0; - -out_error: - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - return error; -} - - -static int -xfs_iget_cache_miss( - struct xfs_mount *mp, - struct xfs_perag *pag, - xfs_trans_t *tp, - xfs_ino_t ino, - struct xfs_inode **ipp, - int flags, - int lock_flags) -{ - struct xfs_inode *ip; - int error; - xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); - int iflags; - - ip = xfs_inode_alloc(mp, ino); - if (!ip) - return ENOMEM; - - error = xfs_iread(mp, tp, ip, flags); - if (error) - goto out_destroy; - - trace_xfs_iget_miss(ip); - - if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { - error = ENOENT; - goto out_destroy; - } - - /* - * Preload the radix tree so we can insert safely under the - * write spinlock. Note that we cannot sleep inside the preload - * region. Since we can be called from transaction context, don't - * recurse into the file system. - */ - if (radix_tree_preload(GFP_NOFS)) { - error = EAGAIN; - goto out_destroy; - } - - /* - * Because the inode hasn't been added to the radix-tree yet it can't - * be found by another thread, so we can do the non-sleeping lock here. - */ - if (lock_flags) { - if (!xfs_ilock_nowait(ip, lock_flags)) - BUG(); - } - - /* - * These values must be set before inserting the inode into the radix - * tree as the moment it is inserted a concurrent lookup (allowed by the - * RCU locking mechanism) can find it and that lookup must see that this - * is an inode currently under construction (i.e. that XFS_INEW is set). - * The ip->i_flags_lock that protects the XFS_INEW flag forms the - * memory barrier that ensures this detection works correctly at lookup - * time. - */ - iflags = XFS_INEW; - if (flags & XFS_IGET_DONTCACHE) - iflags |= XFS_IDONTCACHE; - ip->i_udquot = ip->i_gdquot = NULL; - xfs_iflags_set(ip, iflags); - - /* insert the new inode */ - spin_lock(&pag->pag_ici_lock); - error = radix_tree_insert(&pag->pag_ici_root, agino, ip); - if (unlikely(error)) { - WARN_ON(error != -EEXIST); - XFS_STATS_INC(xs_ig_dup); - error = EAGAIN; - goto out_preload_end; - } - spin_unlock(&pag->pag_ici_lock); - radix_tree_preload_end(); - - *ipp = ip; - return 0; - -out_preload_end: - spin_unlock(&pag->pag_ici_lock); - radix_tree_preload_end(); - if (lock_flags) - xfs_iunlock(ip, lock_flags); -out_destroy: - __destroy_inode(VFS_I(ip)); - xfs_inode_free(ip); - return error; -} - -/* - * Look up an inode by number in the given file system. - * The inode is looked up in the cache held in each AG. - * If the inode is found in the cache, initialise the vfs inode - * if necessary. - * - * If it is not in core, read it in from the file system's device, - * add it to the cache and initialise the vfs inode. - * - * The inode is locked according to the value of the lock_flags parameter. - * This flag parameter indicates how and if the inode's IO lock and inode lock - * should be taken. - * - * mp -- the mount point structure for the current file system. It points - * to the inode hash table. - * tp -- a pointer to the current transaction if there is one. This is - * simply passed through to the xfs_iread() call. - * ino -- the number of the inode desired. This is the unique identifier - * within the file system for the inode being requested. - * lock_flags -- flags indicating how to lock the inode. See the comment - * for xfs_ilock() for a list of valid values. - */ -int -xfs_iget( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ino_t ino, - uint flags, - uint lock_flags, - xfs_inode_t **ipp) -{ - xfs_inode_t *ip; - int error; - xfs_perag_t *pag; - xfs_agino_t agino; - - /* - * xfs_reclaim_inode() uses the ILOCK to ensure an inode - * doesn't get freed while it's being referenced during a - * radix tree traversal here. It assumes this function - * aqcuires only the ILOCK (and therefore it has no need to - * involve the IOLOCK in this synchronization). - */ - ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); - - /* reject inode numbers outside existing AGs */ - if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) - return EINVAL; - - /* get the perag structure and ensure that it's inode capable */ - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); - agino = XFS_INO_TO_AGINO(mp, ino); - -again: - error = 0; - rcu_read_lock(); - ip = radix_tree_lookup(&pag->pag_ici_root, agino); - - if (ip) { - error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); - if (error) - goto out_error_or_again; - } else { - rcu_read_unlock(); - XFS_STATS_INC(xs_ig_missed); - - error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, - flags, lock_flags); - if (error) - goto out_error_or_again; - } - xfs_perag_put(pag); - - *ipp = ip; - - /* - * If we have a real type for an on-disk inode, we can set ops(&unlock) - * now. If it's a new inode being created, xfs_ialloc will handle it. - */ - if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) - xfs_setup_inode(ip); - return 0; - -out_error_or_again: - if (error == EAGAIN) { - delay(1); - goto again; - } - xfs_perag_put(pag); - return error; -} - -/* - * This is a wrapper routine around the xfs_ilock() routine - * used to centralize some grungy code. It is used in places - * that wish to lock the inode solely for reading the extents. - * The reason these places can't just call xfs_ilock(SHARED) - * is that the inode lock also guards to bringing in of the - * extents from disk for a file in b-tree format. If the inode - * is in b-tree format, then we need to lock the inode exclusively - * until the extents are read in. Locking it exclusively all - * the time would limit our parallelism unnecessarily, though. - * What we do instead is check to see if the extents have been - * read in yet, and only lock the inode exclusively if they - * have not. - * - * The function returns a value which should be given to the - * corresponding xfs_iunlock_map_shared(). This value is - * the mode in which the lock was actually taken. - */ -uint -xfs_ilock_map_shared( - xfs_inode_t *ip) -{ - uint lock_mode; - - if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && - ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { - lock_mode = XFS_ILOCK_EXCL; - } else { - lock_mode = XFS_ILOCK_SHARED; - } - - xfs_ilock(ip, lock_mode); - - return lock_mode; -} - -/* - * This is simply the unlock routine to go with xfs_ilock_map_shared(). - * All it does is call xfs_iunlock() with the given lock_mode. - */ -void -xfs_iunlock_map_shared( - xfs_inode_t *ip, - unsigned int lock_mode) -{ - xfs_iunlock(ip, lock_mode); -} - -/* - * The xfs inode contains 2 locks: a multi-reader lock called the - * i_iolock and a multi-reader lock called the i_lock. This routine - * allows either or both of the locks to be obtained. - * - * The 2 locks should always be ordered so that the IO lock is - * obtained first in order to prevent deadlock. - * - * ip -- the inode being locked - * lock_flags -- this parameter indicates the inode's locks - * to be locked. It can be: - * XFS_IOLOCK_SHARED, - * XFS_IOLOCK_EXCL, - * XFS_ILOCK_SHARED, - * XFS_ILOCK_EXCL, - * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, - * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, - * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, - * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL - */ -void -xfs_ilock( - xfs_inode_t *ip, - uint lock_flags) -{ - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); - - if (lock_flags & XFS_IOLOCK_EXCL) - mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); - else if (lock_flags & XFS_IOLOCK_SHARED) - mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); - - if (lock_flags & XFS_ILOCK_EXCL) - mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); - else if (lock_flags & XFS_ILOCK_SHARED) - mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); - - trace_xfs_ilock(ip, lock_flags, _RET_IP_); -} - -/* - * This is just like xfs_ilock(), except that the caller - * is guaranteed not to sleep. It returns 1 if it gets - * the requested locks and 0 otherwise. If the IO lock is - * obtained but the inode lock cannot be, then the IO lock - * is dropped before returning. - * - * ip -- the inode being locked - * lock_flags -- this parameter indicates the inode's locks to be - * to be locked. See the comment for xfs_ilock() for a list - * of valid values. - */ -int -xfs_ilock_nowait( - xfs_inode_t *ip, - uint lock_flags) -{ - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); - - if (lock_flags & XFS_IOLOCK_EXCL) { - if (!mrtryupdate(&ip->i_iolock)) - goto out; - } else if (lock_flags & XFS_IOLOCK_SHARED) { - if (!mrtryaccess(&ip->i_iolock)) - goto out; - } - if (lock_flags & XFS_ILOCK_EXCL) { - if (!mrtryupdate(&ip->i_lock)) - goto out_undo_iolock; - } else if (lock_flags & XFS_ILOCK_SHARED) { - if (!mrtryaccess(&ip->i_lock)) - goto out_undo_iolock; - } - trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_); - return 1; - - out_undo_iolock: - if (lock_flags & XFS_IOLOCK_EXCL) - mrunlock_excl(&ip->i_iolock); - else if (lock_flags & XFS_IOLOCK_SHARED) - mrunlock_shared(&ip->i_iolock); - out: - return 0; -} - -/* - * xfs_iunlock() is used to drop the inode locks acquired with - * xfs_ilock() and xfs_ilock_nowait(). The caller must pass - * in the flags given to xfs_ilock() or xfs_ilock_nowait() so - * that we know which locks to drop. - * - * ip -- the inode being unlocked - * lock_flags -- this parameter indicates the inode's locks to be - * to be unlocked. See the comment for xfs_ilock() for a list - * of valid values for this parameter. - * - */ -void -xfs_iunlock( - xfs_inode_t *ip, - uint lock_flags) -{ - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); - ASSERT(lock_flags != 0); - - if (lock_flags & XFS_IOLOCK_EXCL) - mrunlock_excl(&ip->i_iolock); - else if (lock_flags & XFS_IOLOCK_SHARED) - mrunlock_shared(&ip->i_iolock); - - if (lock_flags & XFS_ILOCK_EXCL) - mrunlock_excl(&ip->i_lock); - else if (lock_flags & XFS_ILOCK_SHARED) - mrunlock_shared(&ip->i_lock); - - trace_xfs_iunlock(ip, lock_flags, _RET_IP_); -} - -/* - * give up write locks. the i/o lock cannot be held nested - * if it is being demoted. - */ -void -xfs_ilock_demote( - xfs_inode_t *ip, - uint lock_flags) -{ - ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); - - if (lock_flags & XFS_ILOCK_EXCL) - mrdemote(&ip->i_lock); - if (lock_flags & XFS_IOLOCK_EXCL) - mrdemote(&ip->i_iolock); - - trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); -} - -#ifdef DEBUG -int -xfs_isilocked( - xfs_inode_t *ip, - uint lock_flags) -{ - if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { - if (!(lock_flags & XFS_ILOCK_SHARED)) - return !!ip->i_lock.mr_writer; - return rwsem_is_locked(&ip->i_lock.mr_lock); - } - - if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { - if (!(lock_flags & XFS_IOLOCK_SHARED)) - return !!ip->i_iolock.mr_writer; - return rwsem_is_locked(&ip->i_iolock.mr_lock); - } - - ASSERT(0); - return 0; -} -#endif - -void -__xfs_iflock( - struct xfs_inode *ip) -{ - wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); - DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); - - do { - prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); - if (xfs_isiflocked(ip)) - io_schedule(); - } while (!xfs_iflock_nowait(ip)); - - finish_wait(wq, &wait.wait); -} diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index a59eea09930..66282dcb821 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -45,6 +45,7 @@ #include "xfs_filestream.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" +#include "xfs_icache.h" kmem_zone_t *xfs_ifork_zone; kmem_zone_t *xfs_inode_zone; @@ -74,6 +75,256 @@ xfs_get_extsz_hint( return 0; } +/* + * This is a wrapper routine around the xfs_ilock() routine used to centralize + * some grungy code. It is used in places that wish to lock the inode solely + * for reading the extents. The reason these places can't just call + * xfs_ilock(SHARED) is that the inode lock also guards to bringing in of the + * extents from disk for a file in b-tree format. If the inode is in b-tree + * format, then we need to lock the inode exclusively until the extents are read + * in. Locking it exclusively all the time would limit our parallelism + * unnecessarily, though. What we do instead is check to see if the extents + * have been read in yet, and only lock the inode exclusively if they have not. + * + * The function returns a value which should be given to the corresponding + * xfs_iunlock_map_shared(). This value is the mode in which the lock was + * actually taken. + */ +uint +xfs_ilock_map_shared( + xfs_inode_t *ip) +{ + uint lock_mode; + + if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && + ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { + lock_mode = XFS_ILOCK_EXCL; + } else { + lock_mode = XFS_ILOCK_SHARED; + } + + xfs_ilock(ip, lock_mode); + + return lock_mode; +} + +/* + * This is simply the unlock routine to go with xfs_ilock_map_shared(). + * All it does is call xfs_iunlock() with the given lock_mode. + */ +void +xfs_iunlock_map_shared( + xfs_inode_t *ip, + unsigned int lock_mode) +{ + xfs_iunlock(ip, lock_mode); +} + +/* + * The xfs inode contains 2 locks: a multi-reader lock called the + * i_iolock and a multi-reader lock called the i_lock. This routine + * allows either or both of the locks to be obtained. + * + * The 2 locks should always be ordered so that the IO lock is + * obtained first in order to prevent deadlock. + * + * ip -- the inode being locked + * lock_flags -- this parameter indicates the inode's locks + * to be locked. It can be: + * XFS_IOLOCK_SHARED, + * XFS_IOLOCK_EXCL, + * XFS_ILOCK_SHARED, + * XFS_ILOCK_EXCL, + * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, + * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, + * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, + * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL + */ +void +xfs_ilock( + xfs_inode_t *ip, + uint lock_flags) +{ + trace_xfs_ilock(ip, lock_flags, _RET_IP_); + + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + + if (lock_flags & XFS_IOLOCK_EXCL) + mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + else if (lock_flags & XFS_IOLOCK_SHARED) + mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + + if (lock_flags & XFS_ILOCK_EXCL) + mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); + else if (lock_flags & XFS_ILOCK_SHARED) + mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); +} + +/* + * This is just like xfs_ilock(), except that the caller + * is guaranteed not to sleep. It returns 1 if it gets + * the requested locks and 0 otherwise. If the IO lock is + * obtained but the inode lock cannot be, then the IO lock + * is dropped before returning. + * + * ip -- the inode being locked + * lock_flags -- this parameter indicates the inode's locks to be + * to be locked. See the comment for xfs_ilock() for a list + * of valid values. + */ +int +xfs_ilock_nowait( + xfs_inode_t *ip, + uint lock_flags) +{ + trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_); + + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + + if (lock_flags & XFS_IOLOCK_EXCL) { + if (!mrtryupdate(&ip->i_iolock)) + goto out; + } else if (lock_flags & XFS_IOLOCK_SHARED) { + if (!mrtryaccess(&ip->i_iolock)) + goto out; + } + if (lock_flags & XFS_ILOCK_EXCL) { + if (!mrtryupdate(&ip->i_lock)) + goto out_undo_iolock; + } else if (lock_flags & XFS_ILOCK_SHARED) { + if (!mrtryaccess(&ip->i_lock)) + goto out_undo_iolock; + } + return 1; + + out_undo_iolock: + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); + out: + return 0; +} + +/* + * xfs_iunlock() is used to drop the inode locks acquired with + * xfs_ilock() and xfs_ilock_nowait(). The caller must pass + * in the flags given to xfs_ilock() or xfs_ilock_nowait() so + * that we know which locks to drop. + * + * ip -- the inode being unlocked + * lock_flags -- this parameter indicates the inode's locks to be + * to be unlocked. See the comment for xfs_ilock() for a list + * of valid values for this parameter. + * + */ +void +xfs_iunlock( + xfs_inode_t *ip, + uint lock_flags) +{ + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + ASSERT(lock_flags != 0); + + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); + + if (lock_flags & XFS_ILOCK_EXCL) + mrunlock_excl(&ip->i_lock); + else if (lock_flags & XFS_ILOCK_SHARED) + mrunlock_shared(&ip->i_lock); + + trace_xfs_iunlock(ip, lock_flags, _RET_IP_); +} + +/* + * give up write locks. the i/o lock cannot be held nested + * if it is being demoted. + */ +void +xfs_ilock_demote( + xfs_inode_t *ip, + uint lock_flags) +{ + ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); + + if (lock_flags & XFS_ILOCK_EXCL) + mrdemote(&ip->i_lock); + if (lock_flags & XFS_IOLOCK_EXCL) + mrdemote(&ip->i_iolock); + + trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); +} + +#ifdef DEBUG +int +xfs_isilocked( + xfs_inode_t *ip, + uint lock_flags) +{ + if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { + if (!(lock_flags & XFS_ILOCK_SHARED)) + return !!ip->i_lock.mr_writer; + return rwsem_is_locked(&ip->i_lock.mr_lock); + } + + if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { + if (!(lock_flags & XFS_IOLOCK_SHARED)) + return !!ip->i_iolock.mr_writer; + return rwsem_is_locked(&ip->i_iolock.mr_lock); + } + + ASSERT(0); + return 0; +} +#endif + +void +__xfs_iflock( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); + + do { + prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (xfs_isiflocked(ip)) + io_schedule(); + } while (!xfs_iflock_nowait(ip)); + + finish_wait(wq, &wait.wait); +} + #ifdef DEBUG /* * Make sure that the extents in the given memory buffer @@ -131,169 +382,108 @@ xfs_inobp_check( } #endif -/* - * Find the buffer associated with the given inode map - * We do basic validation checks on the buffer once it has been - * retrieved from disk. - */ -STATIC int -xfs_imap_to_bp( - xfs_mount_t *mp, - xfs_trans_t *tp, - struct xfs_imap *imap, - xfs_buf_t **bpp, - uint buf_flags, - uint iget_flags) +static void +xfs_inode_buf_verify( + struct xfs_buf *bp) { - int error; + struct xfs_mount *mp = bp->b_target->bt_mount; int i; int ni; - xfs_buf_t *bp; - - buf_flags |= XBF_UNMAPPED; - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, - (int)imap->im_len, buf_flags, &bp); - if (error) { - if (error != EAGAIN) { - xfs_warn(mp, - "%s: xfs_trans_read_buf() returned error %d.", - __func__, error); - } else { - ASSERT(buf_flags & XBF_TRYLOCK); - } - return error; - } /* * Validate the magic number and version of every inode in the buffer - * (if DEBUG kernel) or the first inode in the buffer, otherwise. */ -#ifdef DEBUG - ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog; -#else /* usual case */ - ni = 1; -#endif - + ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; for (i = 0; i < ni; i++) { int di_ok; xfs_dinode_t *dip; - dip = (xfs_dinode_t *)xfs_buf_offset(bp, + dip = (struct xfs_dinode *)xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && XFS_DINODE_GOOD_VERSION(dip->di_version); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, XFS_RANDOM_ITOBP_INOTOBP))) { - if (iget_flags & XFS_IGET_UNTRUSTED) { - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EINVAL); - } - XFS_CORRUPTION_ERROR("xfs_imap_to_bp", - XFS_ERRLEVEL_HIGH, mp, dip); + xfs_buf_ioerror(bp, EFSCORRUPTED); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH, + mp, dip); #ifdef DEBUG xfs_emerg(mp, "bad inode magic/vsn daddr %lld #%d (magic=%x)", - (unsigned long long)imap->im_blkno, i, + (unsigned long long)bp->b_bn, i, be16_to_cpu(dip->di_magic)); ASSERT(0); #endif - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EFSCORRUPTED); } } - xfs_inobp_check(mp, bp); - *bpp = bp; - return 0; } -/* - * This routine is called to map an inode number within a file - * system to the buffer containing the on-disk version of the - * inode. It returns a pointer to the buffer containing the - * on-disk inode in the bpp parameter, and in the dip parameter - * it returns a pointer to the on-disk inode within that buffer. - * - * If a non-zero error is returned, then the contents of bpp and - * dipp are undefined. - * - * Use xfs_imap() to determine the size and location of the - * buffer to read from disk. - */ -int -xfs_inotobp( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ino_t ino, - xfs_dinode_t **dipp, - xfs_buf_t **bpp, - int *offset, - uint imap_flags) -{ - struct xfs_imap imap; - xfs_buf_t *bp; - int error; - - imap.im_blkno = 0; - error = xfs_imap(mp, tp, ino, &imap, imap_flags); - if (error) - return error; - error = xfs_imap_to_bp(mp, tp, &imap, &bp, 0, imap_flags); - if (error) - return error; +static void +xfs_inode_buf_read_verify( + struct xfs_buf *bp) +{ + xfs_inode_buf_verify(bp); +} - *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); - *bpp = bp; - *offset = imap.im_boffset; - return 0; +static void +xfs_inode_buf_write_verify( + struct xfs_buf *bp) +{ + xfs_inode_buf_verify(bp); } +const struct xfs_buf_ops xfs_inode_buf_ops = { + .verify_read = xfs_inode_buf_read_verify, + .verify_write = xfs_inode_buf_write_verify, +}; + /* - * This routine is called to map an inode to the buffer containing - * the on-disk version of the inode. It returns a pointer to the - * buffer containing the on-disk inode in the bpp parameter, and in - * the dip parameter it returns a pointer to the on-disk inode within - * that buffer. + * This routine is called to map an inode to the buffer containing the on-disk + * version of the inode. It returns a pointer to the buffer containing the + * on-disk inode in the bpp parameter, and in the dipp parameter it returns a + * pointer to the on-disk inode within that buffer. * - * If a non-zero error is returned, then the contents of bpp and - * dipp are undefined. - * - * The inode is expected to already been mapped to its buffer and read - * in once, thus we can use the mapping information stored in the inode - * rather than calling xfs_imap(). This allows us to avoid the overhead - * of looking at the inode btree for small block file systems - * (see xfs_imap()). + * If a non-zero error is returned, then the contents of bpp and dipp are + * undefined. */ int -xfs_itobp( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_inode_t *ip, - xfs_dinode_t **dipp, - xfs_buf_t **bpp, - uint buf_flags) +xfs_imap_to_bp( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_imap *imap, + struct xfs_dinode **dipp, + struct xfs_buf **bpp, + uint buf_flags, + uint iget_flags) { - xfs_buf_t *bp; - int error; + struct xfs_buf *bp; + int error; + + buf_flags |= XBF_UNMAPPED; + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, + (int)imap->im_len, buf_flags, &bp, + &xfs_inode_buf_ops); + if (error) { + if (error == EAGAIN) { + ASSERT(buf_flags & XBF_TRYLOCK); + return error; + } - ASSERT(ip->i_imap.im_blkno != 0); + if (error == EFSCORRUPTED && + (iget_flags & XFS_IGET_UNTRUSTED)) + return XFS_ERROR(EINVAL); - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, buf_flags, 0); - if (error) + xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", + __func__, error); return error; - - if (!bp) { - ASSERT(buf_flags & XBF_TRYLOCK); - ASSERT(tp == NULL); - *bpp = NULL; - return EAGAIN; } - *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); *bpp = bp; + *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); return 0; } @@ -796,10 +986,9 @@ xfs_iread( /* * Get pointers to the on-disk inode and the buffer containing it. */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, 0, iget_flags); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags); if (error) return error; - dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); /* * If we got something that isn't an inode it means someone @@ -876,7 +1065,7 @@ xfs_iread( /* * Use xfs_trans_brelse() to release the buffer containing the * on-disk inode, because it was acquired with xfs_trans_read_buf() - * in xfs_itobp() above. If tp is NULL, this is just a normal + * in xfs_imap_to_bp() above. If tp is NULL, this is just a normal * brelse(). If we're within a transaction, then xfs_trans_brelse() * will only release the buffer if it is not dirty within the * transaction. It will be OK to release the buffer in this case, @@ -936,16 +1125,16 @@ xfs_iread_extents( * set according to the contents of the given cred structure. * * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc() - * has a free inode available, call xfs_iget() - * to obtain the in-core version of the allocated inode. Finally, - * fill in the inode and log its initial contents. In this case, - * ialloc_context would be set to NULL and call_again set to false. + * has a free inode available, call xfs_iget() to obtain the in-core + * version of the allocated inode. Finally, fill in the inode and + * log its initial contents. In this case, ialloc_context would be + * set to NULL. * - * If xfs_dialloc() does not have an available inode, - * it will replenish its supply by doing an allocation. Since we can - * only do one allocation within a transaction without deadlocks, we - * must commit the current transaction before returning the inode itself. - * In this case, therefore, we will set call_again to true and return. + * If xfs_dialloc() does not have an available inode, it will replenish + * its supply by doing an allocation. Since we can only do one + * allocation within a transaction without deadlocks, we must commit + * the current transaction before returning the inode itself. + * In this case, therefore, we will set ialloc_context and return. * The caller should then commit the current transaction, start a new * transaction, and call xfs_ialloc() again to actually get the inode. * @@ -970,7 +1159,6 @@ xfs_ialloc( prid_t prid, int okalloc, xfs_buf_t **ialloc_context, - boolean_t *call_again, xfs_inode_t **ipp) { xfs_ino_t ino; @@ -985,10 +1173,10 @@ xfs_ialloc( * the on-disk inode to be allocated. */ error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, - ialloc_context, call_again, &ino); + ialloc_context, &ino); if (error) return error; - if (*call_again || ino == NULLFSINO) { + if (*ialloc_context || ino == NULLFSINO) { *ipp = NULL; return 0; } @@ -1207,7 +1395,9 @@ xfs_itruncate_extents( int error = 0; int done = 0; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(!atomic_read(&VFS_I(ip)->i_count) || + xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(new_size <= XFS_ISIZE(ip)); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(ip->i_itemp != NULL); @@ -1226,7 +1416,7 @@ xfs_itruncate_extents( * then there is nothing to do. */ first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); - last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (first_unmap_block == last_block) return 0; @@ -1355,7 +1545,8 @@ xfs_iunlink( * Here we put the head pointer into our next pointer, * and then we fall through to point the head at us. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, + 0, 0); if (error) return error; @@ -1429,16 +1620,16 @@ xfs_iunlink_remove( if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { /* - * We're at the head of the list. Get the inode's - * on-disk buffer to see if there is anyone after us - * on the list. Only modify our next pointer if it - * is not already NULLAGINO. This saves us the overhead - * of dealing with the buffer when there is no need to - * change it. + * We're at the head of the list. Get the inode's on-disk + * buffer to see if there is anyone after us on the list. + * Only modify our next pointer if it is not already NULLAGINO. + * This saves us the overhead of dealing with the buffer when + * there is no need to change it. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, + 0, 0); if (error) { - xfs_warn(mp, "%s: xfs_itobp() returned error %d.", + xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", __func__, error); return error; } @@ -1472,34 +1663,45 @@ xfs_iunlink_remove( next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); last_ibp = NULL; while (next_agino != agino) { - /* - * If the last inode wasn't the one pointing to - * us, then release its buffer since we're not - * going to do anything with it. - */ - if (last_ibp != NULL) { + struct xfs_imap imap; + + if (last_ibp) xfs_trans_brelse(tp, last_ibp); - } + + imap.im_blkno = 0; next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); - error = xfs_inotobp(mp, tp, next_ino, &last_dip, - &last_ibp, &last_offset, 0); + + error = xfs_imap(mp, tp, next_ino, &imap, 0); + if (error) { + xfs_warn(mp, + "%s: xfs_imap returned error %d.", + __func__, error); + return error; + } + + error = xfs_imap_to_bp(mp, tp, &imap, &last_dip, + &last_ibp, 0, 0); if (error) { xfs_warn(mp, - "%s: xfs_inotobp() returned error %d.", + "%s: xfs_imap_to_bp returned error %d.", __func__, error); return error; } + + last_offset = imap.im_boffset; next_agino = be32_to_cpu(last_dip->di_next_unlinked); ASSERT(next_agino != NULLAGINO); ASSERT(next_agino != 0); } + /* - * Now last_ibp points to the buffer previous to us on - * the unlinked list. Pull us from the list. + * Now last_ibp points to the buffer previous to us on the + * unlinked list. Pull us from the list. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, + 0, 0); if (error) { - xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.", + xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.", __func__, error); return error; } @@ -1579,10 +1781,23 @@ xfs_ifree_cluster( * to mark all the active inodes on the buffer stale. */ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * blks_per_cluster, 0); + mp->m_bsize * blks_per_cluster, + XBF_UNMAPPED); if (!bp) return ENOMEM; + + /* + * This buffer may not have been correctly initialised as we + * didn't read it from disk. That's not important because we are + * only using to mark the buffer as stale in the log, and to + * attach stale cached inodes on it. That means it will never be + * dispatched for IO. If it is, we want to know about it, and we + * want it to fail. We can acheive this by adding a write + * verifier to the buffer. + */ + bp->b_ops = &xfs_inode_buf_ops; + /* * Walk the inodes already attached to the buffer and mark them * stale. These will all have the flush locks held, so an @@ -1749,7 +1964,8 @@ xfs_ifree( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0); + error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &dip, &ibp, + 0, 0); if (error) return error; @@ -2428,7 +2644,7 @@ xfs_iflush( /* * For stale inodes we cannot rely on the backing buffer remaining * stale in cache for the remaining life of the stale inode and so - * xfs_itobp() below may give us a buffer that no longer contains + * xfs_imap_to_bp() below may give us a buffer that no longer contains * inodes below. We have to check this after ensuring the inode is * unpinned so that it is safe to reclaim the stale inode after the * flush call. @@ -2454,7 +2670,8 @@ xfs_iflush( /* * Get the buffer containing the on-disk inode. */ - error = xfs_itobp(mp, NULL, ip, &dip, &bp, XBF_TRYLOCK); + error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK, + 0); if (error || !bp) { xfs_ifunlock(ip); return error; @@ -3728,3 +3945,40 @@ xfs_iext_irec_update_extoffs( ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; } } + +/* + * Test whether it is appropriate to check an inode for and free post EOF + * blocks. The 'force' parameter determines whether we should also consider + * regular files that are marked preallocated or append-only. + */ +bool +xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) +{ + /* prealloc/delalloc exists only on regular files */ + if (!S_ISREG(ip->i_d.di_mode)) + return false; + + /* + * Zero sized files with no cached pages and delalloc blocks will not + * have speculative prealloc/delalloc blocks to remove. + */ + if (VFS_I(ip)->i_size == 0 && + VN_CACHED(VFS_I(ip)) == 0 && + ip->i_delayed_blks == 0) + return false; + + /* If we haven't read in the extent list, then don't do it now. */ + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) + return false; + + /* + * Do not free real preallocated or append-only files unless the file + * has delalloc blocks and we are forced to remove them. + */ + if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) + if (!force || ip->i_delayed_blks == 0) + return false; + + return true; +} + diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 1efff36a75b..22baf6ea4fa 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -487,8 +487,6 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) #define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) #define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) -extern struct lock_class_key xfs_iolock_reclaimable; - /* * For multiple groups support: if S_ISGID bit is set in the parent * directory, group of new file is set to that of the parent, and @@ -498,11 +496,10 @@ extern struct lock_class_key xfs_iolock_reclaimable; (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ ((pip)->i_d.di_mode & S_ISGID)) + /* - * xfs_iget.c prototypes. + * xfs_inode.c prototypes. */ -int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, - uint, uint, xfs_inode_t **); void xfs_ilock(xfs_inode_t *, uint); int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); @@ -510,14 +507,9 @@ void xfs_ilock_demote(xfs_inode_t *, uint); int xfs_isilocked(xfs_inode_t *, uint); uint xfs_ilock_map_shared(xfs_inode_t *); void xfs_iunlock_map_shared(xfs_inode_t *, uint); -void xfs_inode_free(struct xfs_inode *ip); - -/* - * xfs_inode.c prototypes. - */ int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, - struct xfs_buf **, boolean_t *, xfs_inode_t **); + struct xfs_buf **, xfs_inode_t **); uint xfs_ip2xflags(struct xfs_inode *); uint xfs_dic2xflags(struct xfs_dinode *); @@ -557,12 +549,9 @@ do { \ #define XFS_IGET_UNTRUSTED 0x2 #define XFS_IGET_DONTCACHE 0x4 -int xfs_inotobp(struct xfs_mount *, struct xfs_trans *, - xfs_ino_t, struct xfs_dinode **, - struct xfs_buf **, int *, uint); -int xfs_itobp(struct xfs_mount *, struct xfs_trans *, - struct xfs_inode *, struct xfs_dinode **, - struct xfs_buf **, uint); +int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, + struct xfs_imap *, struct xfs_dinode **, + struct xfs_buf **, uint, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, uint); void xfs_dinode_to_disk(struct xfs_dinode *, @@ -596,6 +585,7 @@ void xfs_iext_irec_compact(xfs_ifork_t *); void xfs_iext_irec_compact_pages(xfs_ifork_t *); void xfs_iext_irec_compact_full(xfs_ifork_t *); void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int); +bool xfs_can_free_eofblocks(struct xfs_inode *, bool); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) @@ -608,5 +598,6 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); extern struct kmem_zone *xfs_ifork_zone; extern struct kmem_zone *xfs_inode_zone; extern struct kmem_zone *xfs_ili_zone; +extern const struct xfs_buf_ops xfs_inode_buf_ops; #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 3a05a41b5d7..c1c3ef88a26 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -42,6 +42,7 @@ #include "xfs_inode_item.h" #include "xfs_export.h" #include "xfs_trace.h" +#include "xfs_icache.h" #include <linux/capability.h> #include <linux/dcache.h> @@ -70,16 +71,16 @@ xfs_find_handle( int hsize; xfs_handle_t handle; struct inode *inode; - struct file *file = NULL; + struct fd f = {0}; struct path path; int error; struct xfs_inode *ip; if (cmd == XFS_IOC_FD_TO_HANDLE) { - file = fget(hreq->fd); - if (!file) + f = fdget(hreq->fd); + if (!f.file) return -EBADF; - inode = file->f_path.dentry->d_inode; + inode = f.file->f_path.dentry->d_inode; } else { error = user_lpath((const char __user *)hreq->path, &path); if (error) @@ -134,7 +135,7 @@ xfs_find_handle( out_put: if (cmd == XFS_IOC_FD_TO_HANDLE) - fput(file); + fdput(f); else path_put(&path); return error; @@ -208,6 +209,7 @@ xfs_open_by_handle( struct inode *inode; struct dentry *dentry; fmode_t fmode; + struct path path; if (!capable(CAP_SYS_ADMIN)) return -XFS_ERROR(EPERM); @@ -252,8 +254,10 @@ xfs_open_by_handle( goto out_dput; } - filp = dentry_open(dentry, mntget(parfilp->f_path.mnt), - hreq->oflags, cred); + path.mnt = parfilp->f_path.mnt; + path.dentry = dentry; + filp = dentry_open(&path, hreq->oflags, cred); + dput(dentry); if (IS_ERR(filp)) { put_unused_fd(fd); return PTR_ERR(filp); @@ -361,9 +365,15 @@ xfs_fssetdm_by_handle( if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(parfilp); + if (error) + return error; + dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq); - if (IS_ERR(dentry)) + if (IS_ERR(dentry)) { + mnt_drop_write_file(parfilp); return PTR_ERR(dentry); + } if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { error = -XFS_ERROR(EPERM); @@ -379,6 +389,7 @@ xfs_fssetdm_by_handle( fsd.fsd_dmstate); out: + mnt_drop_write_file(parfilp); dput(dentry); return error; } @@ -631,7 +642,11 @@ xfs_ioc_space( if (ioflags & IO_INVIS) attr_flags |= XFS_ATTR_DMI; + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags); + mnt_drop_write_file(filp); return -error; } @@ -1160,6 +1175,7 @@ xfs_ioc_fssetxattr( { struct fsxattr fa; unsigned int mask; + int error; if (copy_from_user(&fa, arg, sizeof(fa))) return -EFAULT; @@ -1168,7 +1184,12 @@ xfs_ioc_fssetxattr( if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) mask |= FSX_NONBLOCK; - return -xfs_ioctl_setattr(ip, &fa, mask); + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_ioctl_setattr(ip, &fa, mask); + mnt_drop_write_file(filp); + return -error; } STATIC int @@ -1193,6 +1214,7 @@ xfs_ioc_setxflags( struct fsxattr fa; unsigned int flags; unsigned int mask; + int error; if (copy_from_user(&flags, arg, sizeof(flags))) return -EFAULT; @@ -1207,7 +1229,12 @@ xfs_ioc_setxflags( mask |= FSX_NONBLOCK; fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); - return -xfs_ioctl_setattr(ip, &fa, mask); + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_ioctl_setattr(ip, &fa, mask); + mnt_drop_write_file(filp); + return -error; } STATIC int @@ -1382,8 +1409,13 @@ xfs_file_ioctl( if (copy_from_user(&dmi, arg, sizeof(dmi))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask, dmi.fsd_dmstate); + mnt_drop_write_file(filp); return -error; } @@ -1431,7 +1463,11 @@ xfs_file_ioctl( if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_swapext(&sxp); + mnt_drop_write_file(filp); return -error; } @@ -1460,9 +1496,14 @@ xfs_file_ioctl( if (copy_from_user(&inout, arg, sizeof(inout))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; + /* input parameter is passed in resblks field of structure */ in = inout.resblks; error = xfs_reserve_blocks(mp, &in, &inout); + mnt_drop_write_file(filp); if (error) return -error; @@ -1493,7 +1534,11 @@ xfs_file_ioctl( if (copy_from_user(&in, arg, sizeof(in))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_data(mp, &in); + mnt_drop_write_file(filp); return -error; } @@ -1503,7 +1548,11 @@ xfs_file_ioctl( if (copy_from_user(&in, arg, sizeof(in))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_log(mp, &in); + mnt_drop_write_file(filp); return -error; } @@ -1513,7 +1562,11 @@ xfs_file_ioctl( if (copy_from_user(&in, arg, sizeof(in))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_rt(mp, &in); + mnt_drop_write_file(filp); return -error; } @@ -1550,6 +1603,26 @@ xfs_file_ioctl( error = xfs_errortag_clearall(mp, 1); return -error; + case XFS_IOC_FREE_EOFBLOCKS: { + struct xfs_eofblocks eofb; + + if (copy_from_user(&eofb, arg, sizeof(eofb))) + return -XFS_ERROR(EFAULT); + + if (eofb.eof_version != XFS_EOFBLOCKS_VERSION) + return -XFS_ERROR(EINVAL); + + if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID) + return -XFS_ERROR(EINVAL); + + if (memchr_inv(&eofb.pad32, 0, sizeof(eofb.pad32)) || + memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64))) + return -XFS_ERROR(EINVAL); + + error = xfs_icache_free_eofblocks(mp, &eofb); + return -error; + } + default: return -ENOTTY; } diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index c4f2da0d2bf..1244274a567 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -600,7 +600,11 @@ xfs_file_compat_ioctl( if (xfs_compat_growfs_data_copyin(&in, arg)) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_data(mp, &in); + mnt_drop_write_file(filp); return -error; } case XFS_IOC_FSGROWFSRT_32: { @@ -608,7 +612,11 @@ xfs_file_compat_ioctl( if (xfs_compat_growfs_rt_copyin(&in, arg)) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_rt(mp, &in); + mnt_drop_write_file(filp); return -error; } #endif @@ -627,7 +635,11 @@ xfs_file_compat_ioctl( offsetof(struct xfs_swapext, sx_stat)) || xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat)) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_swapext(&sxp); + mnt_drop_write_file(filp); return -error; } case XFS_IOC_FSBULKSTAT_32: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index aadfce6681e..364818eef40 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -41,6 +41,7 @@ #include "xfs_utils.h" #include "xfs_iomap.h" #include "xfs_trace.h" +#include "xfs_icache.h" #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ @@ -285,7 +286,7 @@ xfs_iomap_eof_want_preallocate( * do any speculative allocation. */ start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1))); - count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); while (count_fsb > 0) { imaps = nimaps; firstblock = NULLFSBLOCK; @@ -350,6 +351,15 @@ xfs_iomap_prealloc_size( } if (shift) alloc_blocks >>= shift; + + /* + * If we are still trying to allocate more space than is + * available, squash the prealloc hard. This can happen if we + * have a large file on a small filesystem and the above + * lowspace thresholds are smaller than MAXEXTLEN. + */ + while (alloc_blocks >= freesp) + alloc_blocks >>= 4; } if (alloc_blocks < mp->m_writeio_blocks) @@ -373,7 +383,7 @@ xfs_iomap_write_delay( xfs_extlen_t extsz; int nimaps; xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; - int prealloc, flushed = 0; + int prealloc; int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -416,8 +426,8 @@ retry: * Make sure preallocation does not create extents beyond the range we * actually support in this filesystem. */ - if (last_fsb > XFS_B_TO_FSB(mp, mp->m_maxioffset)) - last_fsb = XFS_B_TO_FSB(mp, mp->m_maxioffset); + if (last_fsb > XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)) + last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); ASSERT(last_fsb > offset_fsb); @@ -434,31 +444,29 @@ retry: } /* - * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For - * ENOSPC, * flush all other inodes with delalloc blocks to free up - * some of the excess reserved metadata space. For both cases, retry + * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry * without EOF preallocation. */ if (nimaps == 0) { trace_xfs_delalloc_enospc(ip, offset, count); - if (flushed) - return XFS_ERROR(error ? error : ENOSPC); - - if (error == ENOSPC) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_flush_inodes(ip); - xfs_ilock(ip, XFS_ILOCK_EXCL); + if (prealloc) { + prealloc = 0; + error = 0; + goto retry; } - - flushed = 1; - error = 0; - prealloc = 0; - goto retry; + return XFS_ERROR(error ? error : ENOSPC); } if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) return xfs_alert_fsblock_zero(ip, &imap[0]); + /* + * Tag the inode as speculatively preallocated so we can reclaim this + * space on demand, if necessary. + */ + if (prealloc) + xfs_inode_set_eofblocks_tag(ip); + *ret_imap = imap[0]; return 0; } @@ -584,7 +592,9 @@ xfs_iomap_write_allocate( * pointer that the caller gave to us. */ error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, 0, &first_block, 1, + count_fsb, + XFS_BMAPI_STACK_SWITCH, + &first_block, 1, imap, &nimaps, &free_list); if (error) goto trans_cancel; @@ -680,9 +690,9 @@ xfs_iomap_write_unwritten( * the same inode that we complete here and might deadlock * on the iolock. */ - xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); + sb_start_intwrite(mp->m_super); tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); - tp->t_flags |= XFS_TRANS_RESERVE; + tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT; error = xfs_trans_reserve(tp, resblks, XFS_WRITE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 1a25fd80279..d82efaa2ac7 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -38,6 +38,7 @@ #include "xfs_vnodeops.h" #include "xfs_inode_item.h" #include "xfs_trace.h" +#include "xfs_icache.h" #include <linux/capability.h> #include <linux/xattr.h> @@ -179,7 +180,7 @@ xfs_vn_create( struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool flags) { return xfs_vn_mknod(dir, dentry, mode, 0); } @@ -197,7 +198,7 @@ STATIC struct dentry * xfs_vn_lookup( struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct xfs_inode *cip; struct xfs_name name; @@ -222,7 +223,7 @@ STATIC struct dentry * xfs_vn_ci_lookup( struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct xfs_inode *ip; struct xfs_name xname; @@ -779,8 +780,8 @@ xfs_setattr_size( * care about here. */ if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { - error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0, - FI_NONE); + error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ip->i_d.di_size, newsize); if (error) goto out_unlock; } @@ -854,6 +855,9 @@ xfs_setattr_size( * and do not wait the usual (long) time for writeout. */ xfs_iflags_set(ip, XFS_ITRUNCATED); + + /* A truncate down always removes post-EOF blocks. */ + xfs_inode_clear_eofblocks_tag(ip); } if (mask & ATTR_CTIME) { @@ -897,6 +901,47 @@ xfs_vn_setattr( return -xfs_setattr_nonsize(XFS_I(dentry->d_inode), iattr, 0); } +STATIC int +xfs_vn_update_time( + struct inode *inode, + struct timespec *now, + int flags) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + trace_xfs_update_time(ip); + + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return -error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (flags & S_CTIME) { + inode->i_ctime = *now; + ip->i_d.di_ctime.t_sec = (__int32_t)now->tv_sec; + ip->i_d.di_ctime.t_nsec = (__int32_t)now->tv_nsec; + } + if (flags & S_MTIME) { + inode->i_mtime = *now; + ip->i_d.di_mtime.t_sec = (__int32_t)now->tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)now->tv_nsec; + } + if (flags & S_ATIME) { + inode->i_atime = *now; + ip->i_d.di_atime.t_sec = (__int32_t)now->tv_sec; + ip->i_d.di_atime.t_nsec = (__int32_t)now->tv_nsec; + } + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); + return -xfs_trans_commit(tp, 0); +} + #define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) /* @@ -991,6 +1036,7 @@ static const struct inode_operations xfs_inode_operations = { .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, .fiemap = xfs_vn_fiemap, + .update_time = xfs_vn_update_time, }; static const struct inode_operations xfs_dir_inode_operations = { @@ -1016,6 +1062,7 @@ static const struct inode_operations xfs_dir_inode_operations = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, + .update_time = xfs_vn_update_time, }; static const struct inode_operations xfs_dir_ci_inode_operations = { @@ -1041,6 +1088,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, + .update_time = xfs_vn_update_time, }; static const struct inode_operations xfs_symlink_inode_operations = { @@ -1054,6 +1102,7 @@ static const struct inode_operations xfs_symlink_inode_operations = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, + .update_time = xfs_vn_update_time, }; STATIC void diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index eff577a9b67..2ea7d402188 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -34,6 +34,7 @@ #include "xfs_error.h" #include "xfs_btree.h" #include "xfs_trace.h" +#include "xfs_icache.h" STATIC int xfs_internal_inum( @@ -395,7 +396,8 @@ xfs_bulkstat( if (xfs_inobt_maskn(chunkidx, nicluster) & ~r.ir_free) xfs_btree_reada_bufs(mp, agno, - agbno, nbcluster); + agbno, nbcluster, + &xfs_inode_buf_ops); } irbp->ir_startino = r.ir_startino; irbp->ir_freecount = r.ir_freecount; @@ -555,7 +557,7 @@ xfs_bulkstat_single( /* * note that requesting valid inode numbers which are not allocated - * to inodes will most likely cause xfs_itobp to generate warning + * to inodes will most likely cause xfs_imap_to_bp to generate warning * messages about bad magic numbers. This is ok. The fact that * the inode isn't actually an inode is handled by the * error check below. Done this way to make the usual case faster diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 828662f70d6..fe7e4df85a7 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -44,6 +44,7 @@ #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/slab.h> +#include <linux/crc32c.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/file.h> @@ -118,6 +119,7 @@ #define xfs_rotorstep xfs_params.rotorstep.val #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val #define xfs_fstrm_centisecs xfs_params.fstrm_timer.val +#define xfs_eofb_secs xfs_params.eofb_timer.val #define current_cpu() (raw_smp_processor_id()) #define current_pid() (current->pid) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index d90d4a38860..46bd9d52ab5 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -34,6 +34,8 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_trace.h" +#include "xfs_fsops.h" +#include "xfs_cksum.h" kmem_zone_t *xfs_log_ticket_zone; @@ -45,51 +47,85 @@ xlog_commit_record( struct xlog_in_core **iclog, xfs_lsn_t *commitlsnp); -STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, - xfs_buftarg_t *log_target, - xfs_daddr_t blk_offset, - int num_bblks); +STATIC struct xlog * +xlog_alloc_log( + struct xfs_mount *mp, + struct xfs_buftarg *log_target, + xfs_daddr_t blk_offset, + int num_bblks); STATIC int xlog_space_left( struct xlog *log, atomic64_t *head); -STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); -STATIC void xlog_dealloc_log(xlog_t *log); +STATIC int +xlog_sync( + struct xlog *log, + struct xlog_in_core *iclog); +STATIC void +xlog_dealloc_log( + struct xlog *log); /* local state machine functions */ STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); -STATIC void xlog_state_do_callback(xlog_t *log,int aborted, xlog_in_core_t *iclog); -STATIC int xlog_state_get_iclog_space(xlog_t *log, - int len, - xlog_in_core_t **iclog, - xlog_ticket_t *ticket, - int *continued_write, - int *logoffsetp); -STATIC int xlog_state_release_iclog(xlog_t *log, - xlog_in_core_t *iclog); -STATIC void xlog_state_switch_iclogs(xlog_t *log, - xlog_in_core_t *iclog, - int eventual_size); -STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); +STATIC void +xlog_state_do_callback( + struct xlog *log, + int aborted, + struct xlog_in_core *iclog); +STATIC int +xlog_state_get_iclog_space( + struct xlog *log, + int len, + struct xlog_in_core **iclog, + struct xlog_ticket *ticket, + int *continued_write, + int *logoffsetp); +STATIC int +xlog_state_release_iclog( + struct xlog *log, + struct xlog_in_core *iclog); +STATIC void +xlog_state_switch_iclogs( + struct xlog *log, + struct xlog_in_core *iclog, + int eventual_size); +STATIC void +xlog_state_want_sync( + struct xlog *log, + struct xlog_in_core *iclog); STATIC void xlog_grant_push_ail( - struct xlog *log, - int need_bytes); -STATIC void xlog_regrant_reserve_log_space(xlog_t *log, - xlog_ticket_t *ticket); -STATIC void xlog_ungrant_log_space(xlog_t *log, - xlog_ticket_t *ticket); + struct xlog *log, + int need_bytes); +STATIC void +xlog_regrant_reserve_log_space( + struct xlog *log, + struct xlog_ticket *ticket); +STATIC void +xlog_ungrant_log_space( + struct xlog *log, + struct xlog_ticket *ticket); #if defined(DEBUG) -STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); +STATIC void +xlog_verify_dest_ptr( + struct xlog *log, + char *ptr); STATIC void xlog_verify_grant_tail( - struct xlog *log); -STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, - int count, boolean_t syncing); -STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, - xfs_lsn_t tail_lsn); + struct xlog *log); +STATIC void +xlog_verify_iclog( + struct xlog *log, + struct xlog_in_core *iclog, + int count, + boolean_t syncing); +STATIC void +xlog_verify_tail_lsn( + struct xlog *log, + struct xlog_in_core *iclog, + xfs_lsn_t tail_lsn); #else #define xlog_verify_dest_ptr(a,b) #define xlog_verify_grant_tail(a) @@ -97,7 +133,9 @@ STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, #define xlog_verify_tail_lsn(a,b,c) #endif -STATIC int xlog_iclogs_empty(xlog_t *log); +STATIC int +xlog_iclogs_empty( + struct xlog *log); static void xlog_grant_sub_space( @@ -422,7 +460,8 @@ xfs_log_reserve( tic->t_trans_type = t_type; *ticp = tic; - xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt); + xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt + : tic->t_unit_res); trace_xfs_log_reserve(log, tic); @@ -643,25 +682,29 @@ out: } /* - * Finish the recovery of the file system. This is separate from - * the xfs_log_mount() call, because it depends on the code in - * xfs_mountfs() to read in the root and real-time bitmap inodes - * between calling xfs_log_mount() and here. + * Finish the recovery of the file system. This is separate from the + * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read + * in the root and real-time bitmap inodes between calling xfs_log_mount() and + * here. * - * mp - ubiquitous xfs mount point structure + * If we finish recovery successfully, start the background log work. If we are + * not doing recovery, then we have a RO filesystem and we don't need to start + * it. */ int xfs_log_mount_finish(xfs_mount_t *mp) { - int error; + int error = 0; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { error = xlog_recover_finish(mp->m_log); - else { - error = 0; + if (!error) + xfs_log_work_queue(mp); + } else { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } + return error; } @@ -684,7 +727,7 @@ xfs_log_mount_finish(xfs_mount_t *mp) int xfs_log_unmount_write(xfs_mount_t *mp) { - xlog_t *log = mp->m_log; + struct xlog *log = mp->m_log; xlog_in_core_t *iclog; #ifdef DEBUG xlog_in_core_t *first_iclog; @@ -814,15 +857,49 @@ xfs_log_unmount_write(xfs_mount_t *mp) } /* xfs_log_unmount_write */ /* - * Deallocate log structures for unmount/relocation. + * Empty the log for unmount/freeze. * - * We need to stop the aild from running before we destroy - * and deallocate the log as the aild references the log. + * To do this, we first need to shut down the background log work so it is not + * trying to cover the log as we clean up. We then need to unpin all objects in + * the log so we can then flush them out. Once they have completed their IO and + * run the callbacks removing themselves from the AIL, we can write the unmount + * record. */ void -xfs_log_unmount(xfs_mount_t *mp) +xfs_log_quiesce( + struct xfs_mount *mp) { - cancel_delayed_work_sync(&mp->m_sync_work); + cancel_delayed_work_sync(&mp->m_log->l_work); + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * The superblock buffer is uncached and while xfs_ail_push_all_sync() + * will push it, xfs_wait_buftarg() will not wait for it. Further, + * xfs_buf_iowait() cannot be used because it was pushed with the + * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for + * the IO to complete. + */ + xfs_ail_push_all_sync(mp->m_ail); + xfs_wait_buftarg(mp->m_ddev_targp); + xfs_buf_lock(mp->m_sb_bp); + xfs_buf_unlock(mp->m_sb_bp); + + xfs_log_unmount_write(mp); +} + +/* + * Shut down and release the AIL and Log. + * + * During unmount, we need to ensure we flush all the dirty metadata objects + * from the AIL so that the log is empty before we write the unmount record to + * the log. Once this is done, we can tear down the AIL and the log. + */ +void +xfs_log_unmount( + struct xfs_mount *mp) +{ + xfs_log_quiesce(mp); + xfs_trans_ail_destroy(mp); xlog_dealloc_log(mp->m_log); } @@ -893,7 +970,7 @@ int xfs_log_need_covered(xfs_mount_t *mp) { int needed = 0; - xlog_t *log = mp->m_log; + struct xlog *log = mp->m_log; if (!xfs_fs_writable(mp)) return 0; @@ -1024,9 +1101,9 @@ xlog_space_left( void xlog_iodone(xfs_buf_t *bp) { - xlog_in_core_t *iclog = bp->b_fspriv; - xlog_t *l = iclog->ic_log; - int aborted = 0; + struct xlog_in_core *iclog = bp->b_fspriv; + struct xlog *l = iclog->ic_log; + int aborted = 0; /* * Race to shutdown the filesystem if we see an error. @@ -1054,8 +1131,7 @@ xlog_iodone(xfs_buf_t *bp) * with it being freed after writing the unmount record to the * log. */ - -} /* xlog_iodone */ +} /* * Return size of each in-core log record buffer. @@ -1067,8 +1143,9 @@ xlog_iodone(xfs_buf_t *bp) */ STATIC void -xlog_get_iclog_buffer_size(xfs_mount_t *mp, - xlog_t *log) +xlog_get_iclog_buffer_size( + struct xfs_mount *mp, + struct xlog *log) { int size; int xhdrs; @@ -1124,18 +1201,53 @@ done: } /* xlog_get_iclog_buffer_size */ +void +xfs_log_work_queue( + struct xfs_mount *mp) +{ + queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work, + msecs_to_jiffies(xfs_syncd_centisecs * 10)); +} + +/* + * Every sync period we need to unpin all items in the AIL and push them to + * disk. If there is nothing dirty, then we might need to cover the log to + * indicate that the filesystem is idle. + */ +void +xfs_log_worker( + struct work_struct *work) +{ + struct xlog *log = container_of(to_delayed_work(work), + struct xlog, l_work); + struct xfs_mount *mp = log->l_mp; + + /* dgc: errors ignored - not fatal and nowhere to report them */ + if (xfs_log_need_covered(mp)) + xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); + + /* start pushing all the metadata that is currently dirty */ + xfs_ail_push_all(mp->m_ail); + + /* queue us up again */ + xfs_log_work_queue(mp); +} + /* * This routine initializes some of the log structure for a given mount point. * Its primary purpose is to fill in enough, so recovery can occur. However, * some other stuff may be filled in too. */ -STATIC xlog_t * -xlog_alloc_log(xfs_mount_t *mp, - xfs_buftarg_t *log_target, - xfs_daddr_t blk_offset, - int num_bblks) +STATIC struct xlog * +xlog_alloc_log( + struct xfs_mount *mp, + struct xfs_buftarg *log_target, + xfs_daddr_t blk_offset, + int num_bblks) { - xlog_t *log; + struct xlog *log; xlog_rec_header_t *head; xlog_in_core_t **iclogp; xlog_in_core_t *iclog, *prev_iclog=NULL; @@ -1144,7 +1256,7 @@ xlog_alloc_log(xfs_mount_t *mp, int error = ENOMEM; uint log2_size = 0; - log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); + log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL); if (!log) { xfs_warn(mp, "Log allocation failed: No memory!"); goto out; @@ -1157,6 +1269,7 @@ xlog_alloc_log(xfs_mount_t *mp, log->l_logBBsize = num_bblks; log->l_covered_state = XLOG_STATE_COVER_IDLE; log->l_flags |= XLOG_ACTIVE_RECOVERY; + INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); log->l_prev_block = -1; /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ @@ -1379,6 +1492,84 @@ xlog_grant_push_ail( } /* + * Stamp cycle number in every block + */ +STATIC void +xlog_pack_data( + struct xlog *log, + struct xlog_in_core *iclog, + int roundoff) +{ + int i, j, k; + int size = iclog->ic_offset + roundoff; + __be32 cycle_lsn; + xfs_caddr_t dp; + + cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); + + dp = iclog->ic_datap; + for (i = 0; i < BTOBB(size); i++) { + if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) + break; + iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + xlog_in_core_2_t *xhdr = iclog->ic_data; + + for ( ; i < BTOBB(size); i++) { + j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + + for (i = 1; i < log->l_iclog_heads; i++) + xhdr[i].hic_xheader.xh_cycle = cycle_lsn; + } +} + +/* + * Calculate the checksum for a log buffer. + * + * This is a little more complicated than it should be because the various + * headers and the actual data are non-contiguous. + */ +__le32 +xlog_cksum( + struct xlog *log, + struct xlog_rec_header *rhead, + char *dp, + int size) +{ + __uint32_t crc; + + /* first generate the crc for the record header ... */ + crc = xfs_start_cksum((char *)rhead, + sizeof(struct xlog_rec_header), + offsetof(struct xlog_rec_header, h_crc)); + + /* ... then for additional cycle data for v2 logs ... */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; + int i; + + for (i = 1; i < log->l_iclog_heads; i++) { + crc = crc32c(crc, &xhdr[i].hic_xheader, + sizeof(struct xlog_rec_ext_header)); + } + } + + /* ... and finally for the payload */ + crc = crc32c(crc, dp, size); + + return xfs_end_cksum(crc); +} + +/* * The bdstrat callback function for log bufs. This gives us a central * place to trap bufs in case we get hit by a log I/O error and need to * shutdown. Actually, in practice, even when we didn't get a log error, @@ -1434,10 +1625,10 @@ xlog_bdstrat( */ STATIC int -xlog_sync(xlog_t *log, - xlog_in_core_t *iclog) +xlog_sync( + struct xlog *log, + struct xlog_in_core *iclog) { - xfs_caddr_t dptr; /* pointer to byte sized element */ xfs_buf_t *bp; int i; uint count; /* byte count of bwrite */ @@ -1446,6 +1637,7 @@ xlog_sync(xlog_t *log, int split = 0; /* split write into two regions */ int error; int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); + int size; XFS_STATS_INC(xs_log_writes); ASSERT(atomic_read(&iclog->ic_refcnt) == 0); @@ -1476,13 +1668,10 @@ xlog_sync(xlog_t *log, xlog_pack_data(log, iclog, roundoff); /* real byte length */ - if (v2) { - iclog->ic_header.h_len = - cpu_to_be32(iclog->ic_offset + roundoff); - } else { - iclog->ic_header.h_len = - cpu_to_be32(iclog->ic_offset); - } + size = iclog->ic_offset; + if (v2) + size += roundoff; + iclog->ic_header.h_len = cpu_to_be32(size); bp = iclog->ic_bp; XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); @@ -1491,12 +1680,36 @@ xlog_sync(xlog_t *log, /* Do we need to split this write into 2 parts? */ if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { + char *dptr; + split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); - iclog->ic_bwritecnt = 2; /* split into 2 writes */ + iclog->ic_bwritecnt = 2; + + /* + * Bump the cycle numbers at the start of each block in the + * part of the iclog that ends up in the buffer that gets + * written to the start of the log. + * + * Watch out for the header magic number case, though. + */ + dptr = (char *)&iclog->ic_header + count; + for (i = 0; i < split; i += BBSIZE) { + __uint32_t cycle = be32_to_cpu(*(__be32 *)dptr); + if (++cycle == XLOG_HEADER_MAGIC_NUM) + cycle++; + *(__be32 *)dptr = cpu_to_be32(cycle); + + dptr += BBSIZE; + } } else { iclog->ic_bwritecnt = 1; } + + /* calculcate the checksum */ + iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, + iclog->ic_datap, size); + bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); @@ -1550,19 +1763,6 @@ xlog_sync(xlog_t *log, bp->b_flags |= XBF_SYNCIO; if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) bp->b_flags |= XBF_FUA; - dptr = bp->b_addr; - /* - * Bump the cycle numbers at the start of each block - * since this part of the buffer is at the start of - * a new cycle. Watch out for the header magic number - * case, though. - */ - for (i = 0; i < split; i += BBSIZE) { - be32_add_cpu((__be32 *)dptr, 1); - if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM) - be32_add_cpu((__be32 *)dptr, 1); - dptr += BBSIZE; - } ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); @@ -1579,12 +1779,12 @@ xlog_sync(xlog_t *log, return 0; } /* xlog_sync */ - /* * Deallocate a log structure */ STATIC void -xlog_dealloc_log(xlog_t *log) +xlog_dealloc_log( + struct xlog *log) { xlog_in_core_t *iclog, *next_iclog; int i; @@ -1616,10 +1816,11 @@ xlog_dealloc_log(xlog_t *log) */ /* ARGSUSED */ static inline void -xlog_state_finish_copy(xlog_t *log, - xlog_in_core_t *iclog, - int record_cnt, - int copy_bytes) +xlog_state_finish_copy( + struct xlog *log, + struct xlog_in_core *iclog, + int record_cnt, + int copy_bytes) { spin_lock(&log->l_icloglock); @@ -2142,7 +2343,8 @@ xlog_write( * State Change: DIRTY -> ACTIVE */ STATIC void -xlog_state_clean_log(xlog_t *log) +xlog_state_clean_log( + struct xlog *log) { xlog_in_core_t *iclog; int changed = 0; @@ -2222,7 +2424,7 @@ xlog_state_clean_log(xlog_t *log) STATIC xfs_lsn_t xlog_get_lowest_lsn( - xlog_t *log) + struct xlog *log) { xlog_in_core_t *lsn_log; xfs_lsn_t lowest_lsn, lsn; @@ -2245,9 +2447,9 @@ xlog_get_lowest_lsn( STATIC void xlog_state_do_callback( - xlog_t *log, - int aborted, - xlog_in_core_t *ciclog) + struct xlog *log, + int aborted, + struct xlog_in_core *ciclog) { xlog_in_core_t *iclog; xlog_in_core_t *first_iclog; /* used to know when we've @@ -2345,14 +2547,27 @@ xlog_state_do_callback( /* - * update the last_sync_lsn before we drop the + * Completion of a iclog IO does not imply that + * a transaction has completed, as transactions + * can be large enough to span many iclogs. We + * cannot change the tail of the log half way + * through a transaction as this may be the only + * transaction in the log and moving th etail to + * point to the middle of it will prevent + * recovery from finding the start of the + * transaction. Hence we should only update the + * last_sync_lsn if this iclog contains + * transaction completion callbacks on it. + * + * We have to do this before we drop the * icloglock to ensure we are the only one that * can update it. */ ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); - atomic64_set(&log->l_last_sync_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)); + if (iclog->ic_callback) + atomic64_set(&log->l_last_sync_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)); } else ioerrors++; @@ -2467,7 +2682,7 @@ xlog_state_done_syncing( xlog_in_core_t *iclog, int aborted) { - xlog_t *log = iclog->ic_log; + struct xlog *log = iclog->ic_log; spin_lock(&log->l_icloglock); @@ -2521,12 +2736,13 @@ xlog_state_done_syncing( * is copied. */ STATIC int -xlog_state_get_iclog_space(xlog_t *log, - int len, - xlog_in_core_t **iclogp, - xlog_ticket_t *ticket, - int *continued_write, - int *logoffsetp) +xlog_state_get_iclog_space( + struct xlog *log, + int len, + struct xlog_in_core **iclogp, + struct xlog_ticket *ticket, + int *continued_write, + int *logoffsetp) { int log_offset; xlog_rec_header_t *head; @@ -2631,8 +2847,9 @@ restart: * move grant reservation head forward. */ STATIC void -xlog_regrant_reserve_log_space(xlog_t *log, - xlog_ticket_t *ticket) +xlog_regrant_reserve_log_space( + struct xlog *log, + struct xlog_ticket *ticket) { trace_xfs_log_regrant_reserve_enter(log, ticket); @@ -2677,8 +2894,9 @@ xlog_regrant_reserve_log_space(xlog_t *log, * in the current reservation field. */ STATIC void -xlog_ungrant_log_space(xlog_t *log, - xlog_ticket_t *ticket) +xlog_ungrant_log_space( + struct xlog *log, + struct xlog_ticket *ticket) { int bytes; @@ -2717,8 +2935,8 @@ xlog_ungrant_log_space(xlog_t *log, */ STATIC int xlog_state_release_iclog( - xlog_t *log, - xlog_in_core_t *iclog) + struct xlog *log, + struct xlog_in_core *iclog) { int sync = 0; /* do we sync? */ @@ -2768,9 +2986,10 @@ xlog_state_release_iclog( * that every data block. We have run out of space in this log record. */ STATIC void -xlog_state_switch_iclogs(xlog_t *log, - xlog_in_core_t *iclog, - int eventual_size) +xlog_state_switch_iclogs( + struct xlog *log, + struct xlog_in_core *iclog, + int eventual_size) { ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); if (!eventual_size) @@ -3114,7 +3333,9 @@ xfs_log_force_lsn( * disk. */ STATIC void -xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) +xlog_state_want_sync( + struct xlog *log, + struct xlog_in_core *iclog) { assert_spin_locked(&log->l_icloglock); @@ -3158,7 +3379,7 @@ xfs_log_ticket_get( /* * Allocate and initialise a new log ticket. */ -xlog_ticket_t * +struct xlog_ticket * xlog_ticket_alloc( struct xlog *log, int unit_bytes, @@ -3346,9 +3567,10 @@ xlog_verify_grant_tail( /* check if it will fit */ STATIC void -xlog_verify_tail_lsn(xlog_t *log, - xlog_in_core_t *iclog, - xfs_lsn_t tail_lsn) +xlog_verify_tail_lsn( + struct xlog *log, + struct xlog_in_core *iclog, + xfs_lsn_t tail_lsn) { int blocks; @@ -3385,10 +3607,11 @@ xlog_verify_tail_lsn(xlog_t *log, * the cycle numbers agree with the current cycle number. */ STATIC void -xlog_verify_iclog(xlog_t *log, - xlog_in_core_t *iclog, - int count, - boolean_t syncing) +xlog_verify_iclog( + struct xlog *log, + struct xlog_in_core *iclog, + int count, + boolean_t syncing) { xlog_op_header_t *ophead; xlog_in_core_t *icptr; @@ -3482,7 +3705,7 @@ xlog_verify_iclog(xlog_t *log, */ STATIC int xlog_state_ioerror( - xlog_t *log) + struct xlog *log) { xlog_in_core_t *iclog, *ic; @@ -3527,7 +3750,7 @@ xfs_log_force_umount( struct xfs_mount *mp, int logerror) { - xlog_t *log; + struct xlog *log; int retval; log = mp->m_log; @@ -3634,7 +3857,8 @@ xfs_log_force_umount( } STATIC int -xlog_iclogs_empty(xlog_t *log) +xlog_iclogs_empty( + struct xlog *log) { xlog_in_core_t *iclog; @@ -3649,3 +3873,4 @@ xlog_iclogs_empty(xlog_t *log) } while (iclog != log->l_iclog); return 1; } + diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 748d312850e..5caee96059d 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -181,5 +181,9 @@ int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, xfs_lsn_t *commit_lsn, int flags); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); +void xfs_log_work_queue(struct xfs_mount *mp); +void xfs_log_worker(struct work_struct *work); +void xfs_log_quiesce(struct xfs_mount *mp); + #endif #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 72eba2201b1..16d8d12ea3b 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -139,7 +139,6 @@ static inline uint xlog_get_client_id(__be32 i) /* * Flags for log structure */ -#define XLOG_CHKSUM_MISMATCH 0x1 /* used only during recovery */ #define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ #define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ #define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being @@ -291,7 +290,7 @@ typedef struct xlog_rec_header { __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ __be64 h_lsn; /* lsn of this LR : 8 */ __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ - __be32 h_chksum; /* may not be used; non-zero if used : 4 */ + __le32 h_crc; /* crc of log record : 4 */ __be32 h_prev_block; /* block number to previous LR : 4 */ __be32 h_num_logops; /* number of log operations in this LR : 4 */ __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; @@ -487,7 +486,7 @@ struct xlog_grant_head { * overflow 31 bits worth of byte offset, so using a byte number will mean * that round off problems won't occur when releasing partial reservations. */ -typedef struct xlog { +struct xlog { /* The following fields don't need locking */ struct xfs_mount *l_mp; /* mount point */ struct xfs_ail *l_ailp; /* AIL log is working with */ @@ -495,6 +494,7 @@ typedef struct xlog { struct xfs_buf *l_xbuf; /* extra buffer for log * wrapping */ struct xfs_buftarg *l_targ; /* buftarg of log */ + struct delayed_work l_work; /* background flush work */ uint l_flags; uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ struct list_head *l_buf_cancel_table; @@ -540,7 +540,7 @@ typedef struct xlog { char *l_iclog_bak[XLOG_MAX_ICLOGS]; #endif -} xlog_t; +}; #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE)) @@ -548,9 +548,15 @@ typedef struct xlog { #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) /* common routines */ -extern int xlog_recover(xlog_t *log); -extern int xlog_recover_finish(xlog_t *log); -extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); +extern int +xlog_recover( + struct xlog *log); +extern int +xlog_recover_finish( + struct xlog *log); + +extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, + char *dp, int size); extern kmem_zone_t *xfs_log_ticket_zone; struct xlog_ticket * diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a7be98abd6a..96fcbb85ff8 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -41,12 +41,22 @@ #include "xfs_trans_priv.h" #include "xfs_quota.h" #include "xfs_utils.h" +#include "xfs_cksum.h" #include "xfs_trace.h" +#include "xfs_icache.h" -STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); -STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); +STATIC int +xlog_find_zeroed( + struct xlog *, + xfs_daddr_t *); +STATIC int +xlog_clear_stale_blocks( + struct xlog *, + xfs_lsn_t); #if defined(DEBUG) -STATIC void xlog_recover_check_summary(xlog_t *); +STATIC void +xlog_recover_check_summary( + struct xlog *); #else #define xlog_recover_check_summary(log) #endif @@ -74,7 +84,7 @@ struct xfs_buf_cancel { static inline int xlog_buf_bbcount_valid( - xlog_t *log, + struct xlog *log, int bbcount) { return bbcount > 0 && bbcount <= log->l_logBBsize; @@ -87,7 +97,7 @@ xlog_buf_bbcount_valid( */ STATIC xfs_buf_t * xlog_get_bp( - xlog_t *log, + struct xlog *log, int nbblks) { struct xfs_buf *bp; @@ -138,10 +148,10 @@ xlog_put_bp( */ STATIC xfs_caddr_t xlog_align( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp) + struct xfs_buf *bp) { xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); @@ -155,10 +165,10 @@ xlog_align( */ STATIC int xlog_bread_noalign( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp) + struct xfs_buf *bp) { int error; @@ -189,10 +199,10 @@ xlog_bread_noalign( STATIC int xlog_bread( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp, + struct xfs_buf *bp, xfs_caddr_t *offset) { int error; @@ -211,10 +221,10 @@ xlog_bread( */ STATIC int xlog_bread_offset( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, /* block to read from */ int nbblks, /* blocks to read */ - xfs_buf_t *bp, + struct xfs_buf *bp, xfs_caddr_t offset) { xfs_caddr_t orig_offset = bp->b_addr; @@ -241,10 +251,10 @@ xlog_bread_offset( */ STATIC int xlog_bwrite( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp) + struct xfs_buf *bp) { int error; @@ -378,8 +388,8 @@ xlog_recover_iodone( */ STATIC int xlog_find_cycle_start( - xlog_t *log, - xfs_buf_t *bp, + struct xlog *log, + struct xfs_buf *bp, xfs_daddr_t first_blk, xfs_daddr_t *last_blk, uint cycle) @@ -421,7 +431,7 @@ xlog_find_cycle_start( */ STATIC int xlog_find_verify_cycle( - xlog_t *log, + struct xlog *log, xfs_daddr_t start_blk, int nbblks, uint stop_on_cycle_no, @@ -490,7 +500,7 @@ out: */ STATIC int xlog_find_verify_log_record( - xlog_t *log, + struct xlog *log, xfs_daddr_t start_blk, xfs_daddr_t *last_blk, int extra_bblks) @@ -600,7 +610,7 @@ out: */ STATIC int xlog_find_head( - xlog_t *log, + struct xlog *log, xfs_daddr_t *return_head_blk) { xfs_buf_t *bp; @@ -871,7 +881,7 @@ validate_head: */ STATIC int xlog_find_tail( - xlog_t *log, + struct xlog *log, xfs_daddr_t *head_blk, xfs_daddr_t *tail_blk) { @@ -1080,7 +1090,7 @@ done: */ STATIC int xlog_find_zeroed( - xlog_t *log, + struct xlog *log, xfs_daddr_t *blk_no) { xfs_buf_t *bp; @@ -1183,7 +1193,7 @@ bp_err: */ STATIC void xlog_add_record( - xlog_t *log, + struct xlog *log, xfs_caddr_t buf, int cycle, int block, @@ -1205,7 +1215,7 @@ xlog_add_record( STATIC int xlog_write_log_records( - xlog_t *log, + struct xlog *log, int cycle, int start_block, int blocks, @@ -1305,7 +1315,7 @@ xlog_write_log_records( */ STATIC int xlog_clear_stale_blocks( - xlog_t *log, + struct xlog *log, xfs_lsn_t tail_lsn) { int tail_cycle, head_cycle; @@ -2050,11 +2060,11 @@ xfs_qm_dqcheck( */ STATIC void xlog_recover_do_dquot_buffer( - xfs_mount_t *mp, - xlog_t *log, - xlog_recover_item_t *item, - xfs_buf_t *bp, - xfs_buf_log_format_t *buf_f) + struct xfs_mount *mp, + struct xlog *log, + struct xlog_recover_item *item, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f) { uint type; @@ -2108,9 +2118,9 @@ xlog_recover_do_dquot_buffer( */ STATIC int xlog_recover_buffer_pass2( - xlog_t *log, - struct list_head *buffer_list, - xlog_recover_item_t *item) + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; xfs_mount_t *mp = log->l_mp; @@ -2135,7 +2145,7 @@ xlog_recover_buffer_pass2( buf_flags |= XBF_UNMAPPED; bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, - buf_flags); + buf_flags, NULL); if (!bp) return XFS_ERROR(ENOMEM); error = bp->b_error; @@ -2189,9 +2199,9 @@ xlog_recover_buffer_pass2( STATIC int xlog_recover_inode_pass2( - xlog_t *log, - struct list_head *buffer_list, - xlog_recover_item_t *item) + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item) { xfs_inode_log_format_t *in_f; xfs_mount_t *mp = log->l_mp; @@ -2228,7 +2238,8 @@ xlog_recover_inode_pass2( } trace_xfs_log_recover_inode_recover(log, in_f); - bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0); + bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, + NULL); if (!bp) { error = ENOMEM; goto error; @@ -2452,14 +2463,14 @@ error: } /* - * Recover QUOTAOFF records. We simply make a note of it in the xlog_t + * Recover QUOTAOFF records. We simply make a note of it in the xlog * structure, so that we know not to do any dquot item or dquot buffer recovery, * of that type. */ STATIC int xlog_recover_quotaoff_pass1( - xlog_t *log, - xlog_recover_item_t *item) + struct xlog *log, + struct xlog_recover_item *item) { xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr; ASSERT(qoff_f); @@ -2483,9 +2494,9 @@ xlog_recover_quotaoff_pass1( */ STATIC int xlog_recover_dquot_pass2( - xlog_t *log, - struct list_head *buffer_list, - xlog_recover_item_t *item) + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item) { xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; @@ -2539,7 +2550,8 @@ xlog_recover_dquot_pass2( ASSERT(dq_f->qlf_len == 1); error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, - XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp); + XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, + NULL); if (error) return error; @@ -2578,9 +2590,9 @@ xlog_recover_dquot_pass2( */ STATIC int xlog_recover_efi_pass2( - xlog_t *log, - xlog_recover_item_t *item, - xfs_lsn_t lsn) + struct xlog *log, + struct xlog_recover_item *item, + xfs_lsn_t lsn) { int error; xfs_mount_t *mp = log->l_mp; @@ -2616,8 +2628,8 @@ xlog_recover_efi_pass2( */ STATIC int xlog_recover_efd_pass2( - xlog_t *log, - xlog_recover_item_t *item) + struct xlog *log, + struct xlog_recover_item *item) { xfs_efd_log_format_t *efd_formatp; xfs_efi_log_item_t *efip = NULL; @@ -2812,9 +2824,9 @@ xlog_recover_unmount_trans( */ STATIC int xlog_recover_process_data( - xlog_t *log, + struct xlog *log, struct hlist_head rhash[], - xlog_rec_header_t *rhead, + struct xlog_rec_header *rhead, xfs_caddr_t dp, int pass) { @@ -2986,7 +2998,7 @@ abort_error: */ STATIC int xlog_recover_process_efis( - xlog_t *log) + struct xlog *log) { xfs_log_item_t *lip; xfs_efi_log_item_t *efip; @@ -3098,7 +3110,7 @@ xlog_recover_process_one_iunlink( /* * Get the on disk inode to find the next inode in the bucket. */ - error = xfs_itobp(mp, NULL, ip, &dip, &ibp, 0); + error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0); if (error) goto fail_iput; @@ -3147,7 +3159,7 @@ xlog_recover_process_one_iunlink( */ STATIC void xlog_recover_process_iunlinks( - xlog_t *log) + struct xlog *log) { xfs_mount_t *mp; xfs_agnumber_t agno; @@ -3205,80 +3217,58 @@ xlog_recover_process_iunlinks( mp->m_dmevmask = mp_dmevmask; } - -#ifdef DEBUG -STATIC void -xlog_pack_data_checksum( - xlog_t *log, - xlog_in_core_t *iclog, - int size) -{ - int i; - __be32 *up; - uint chksum = 0; - - up = (__be32 *)iclog->ic_datap; - /* divide length by 4 to get # words */ - for (i = 0; i < (size >> 2); i++) { - chksum ^= be32_to_cpu(*up); - up++; - } - iclog->ic_header.h_chksum = cpu_to_be32(chksum); -} -#else -#define xlog_pack_data_checksum(log, iclog, size) -#endif - /* - * Stamp cycle number in every block + * Upack the log buffer data and crc check it. If the check fails, issue a + * warning if and only if the CRC in the header is non-zero. This makes the + * check an advisory warning, and the zero CRC check will prevent failure + * warnings from being emitted when upgrading the kernel from one that does not + * add CRCs by default. + * + * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log + * corruption failure */ -void -xlog_pack_data( - xlog_t *log, - xlog_in_core_t *iclog, - int roundoff) +STATIC int +xlog_unpack_data_crc( + struct xlog_rec_header *rhead, + xfs_caddr_t dp, + struct xlog *log) { - int i, j, k; - int size = iclog->ic_offset + roundoff; - __be32 cycle_lsn; - xfs_caddr_t dp; - - xlog_pack_data_checksum(log, iclog, size); - - cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); - - dp = iclog->ic_datap; - for (i = 0; i < BTOBB(size) && - i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { - iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; - *(__be32 *)dp = cycle_lsn; - dp += BBSIZE; - } - - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - xlog_in_core_2_t *xhdr = iclog->ic_data; - - for ( ; i < BTOBB(size); i++) { - j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; - *(__be32 *)dp = cycle_lsn; - dp += BBSIZE; + __le32 crc; + + crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); + if (crc != rhead->h_crc) { + if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + xfs_alert(log->l_mp, + "log record CRC mismatch: found 0x%x, expected 0x%x.\n", + le32_to_cpu(rhead->h_crc), + le32_to_cpu(crc)); + xfs_hex_dump(dp, 32); } - for (i = 1; i < log->l_iclog_heads; i++) { - xhdr[i].hic_xheader.xh_cycle = cycle_lsn; - } + /* + * If we've detected a log record corruption, then we can't + * recover past this point. Abort recovery if we are enforcing + * CRC protection by punting an error back up the stack. + */ + if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) + return EFSCORRUPTED; } + + return 0; } -STATIC void +STATIC int xlog_unpack_data( - xlog_rec_header_t *rhead, + struct xlog_rec_header *rhead, xfs_caddr_t dp, - xlog_t *log) + struct xlog *log) { int i, j, k; + int error; + + error = xlog_unpack_data_crc(rhead, dp, log); + if (error) + return error; for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { @@ -3295,12 +3285,14 @@ xlog_unpack_data( dp += BBSIZE; } } + + return 0; } STATIC int xlog_valid_rec_header( - xlog_t *log, - xlog_rec_header_t *rhead, + struct xlog *log, + struct xlog_rec_header *rhead, xfs_daddr_t blkno) { int hlen; @@ -3343,7 +3335,7 @@ xlog_valid_rec_header( */ STATIC int xlog_do_recovery_pass( - xlog_t *log, + struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk, int pass) @@ -3426,9 +3418,13 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, - rhash, rhead, offset, pass))) + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, + rhash, rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks + hblks; } @@ -3533,14 +3529,19 @@ xlog_do_recovery_pass( * - order is important. */ error = xlog_bread_offset(log, 0, - bblks - split_bblks, hbp, + bblks - split_bblks, dbp, offset + BBTOB(split_bblks)); if (error) goto bread_err2; } - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, rhash, - rhead, offset, pass))) + + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, rhash, + rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks; } @@ -3565,9 +3566,13 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, rhash, - rhead, offset, pass))) + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, rhash, + rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks + hblks; } @@ -3595,7 +3600,7 @@ xlog_do_recovery_pass( */ STATIC int xlog_do_log_recovery( - xlog_t *log, + struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { @@ -3646,7 +3651,7 @@ xlog_do_log_recovery( */ STATIC int xlog_do_recover( - xlog_t *log, + struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { @@ -3681,13 +3686,14 @@ xlog_do_recover( /* * Now that we've finished replaying all buffer and inode - * updates, re-read in the superblock. + * updates, re-read in the superblock and reverify it. */ bp = xfs_getsb(log->l_mp, 0); XFS_BUF_UNDONE(bp); ASSERT(!(XFS_BUF_ISWRITE(bp))); XFS_BUF_READ(bp); XFS_BUF_UNASYNC(bp); + bp->b_ops = &xfs_sb_buf_ops; xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); if (error) { @@ -3699,7 +3705,7 @@ xlog_do_recover( /* Convert superblock from on-disk format */ sbp = &log->l_mp->m_sb; - xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp)); + xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); ASSERT(xfs_sb_good_version(sbp)); xfs_buf_relse(bp); @@ -3721,7 +3727,7 @@ xlog_do_recover( */ int xlog_recover( - xlog_t *log) + struct xlog *log) { xfs_daddr_t head_blk, tail_blk; int error; @@ -3767,7 +3773,7 @@ xlog_recover( */ int xlog_recover_finish( - xlog_t *log) + struct xlog *log) { /* * Now we're ready to do the transactions needed for the @@ -3814,7 +3820,7 @@ xlog_recover_finish( */ void xlog_recover_check_summary( - xlog_t *log) + struct xlog *log) { xfs_mount_t *mp; xfs_agf_t *agfp; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 536021fb3d4..7d6df7c00c3 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -42,6 +42,7 @@ #include "xfs_fsops.h" #include "xfs_utils.h" #include "xfs_trace.h" +#include "xfs_icache.h" #ifdef HAVE_PERCPU_SB @@ -303,9 +304,8 @@ STATIC int xfs_mount_validate_sb( xfs_mount_t *mp, xfs_sb_t *sbp, - int flags) + bool check_inprogress) { - int loud = !(flags & XFS_MFSI_QUIET); /* * If the log device and data device have the @@ -315,21 +315,18 @@ xfs_mount_validate_sb( * a volume filesystem in a non-volume manner. */ if (sbp->sb_magicnum != XFS_SB_MAGIC) { - if (loud) - xfs_warn(mp, "bad magic number"); + xfs_warn(mp, "bad magic number"); return XFS_ERROR(EWRONGFS); } if (!xfs_sb_good_version(sbp)) { - if (loud) - xfs_warn(mp, "bad version"); + xfs_warn(mp, "bad version"); return XFS_ERROR(EWRONGFS); } if (unlikely( sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { - if (loud) - xfs_warn(mp, + xfs_warn(mp, "filesystem is marked as having an external log; " "specify logdev on the mount command line."); return XFS_ERROR(EINVAL); @@ -337,8 +334,7 @@ xfs_mount_validate_sb( if (unlikely( sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { - if (loud) - xfs_warn(mp, + xfs_warn(mp, "filesystem is marked as having an internal log; " "do not specify logdev on the mount command line."); return XFS_ERROR(EINVAL); @@ -372,8 +368,7 @@ xfs_mount_validate_sb( sbp->sb_dblocks == 0 || sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) { - if (loud) - XFS_CORRUPTION_ERROR("SB sanity check failed", + XFS_CORRUPTION_ERROR("SB sanity check failed", XFS_ERRLEVEL_LOW, mp, sbp); return XFS_ERROR(EFSCORRUPTED); } @@ -382,12 +377,10 @@ xfs_mount_validate_sb( * Until this is fixed only page-sized or smaller data blocks work. */ if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { - if (loud) { - xfs_warn(mp, + xfs_warn(mp, "File system with blocksize %d bytes. " "Only pagesize (%ld) or less will currently work.", sbp->sb_blocksize, PAGE_SIZE); - } return XFS_ERROR(ENOSYS); } @@ -401,23 +394,20 @@ xfs_mount_validate_sb( case 2048: break; default: - if (loud) - xfs_warn(mp, "inode size of %d bytes not supported", + xfs_warn(mp, "inode size of %d bytes not supported", sbp->sb_inodesize); return XFS_ERROR(ENOSYS); } if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { - if (loud) - xfs_warn(mp, + xfs_warn(mp, "file system too large to be mounted on this system."); return XFS_ERROR(EFBIG); } - if (unlikely(sbp->sb_inprogress)) { - if (loud) - xfs_warn(mp, "file system busy"); + if (check_inprogress && sbp->sb_inprogress) { + xfs_warn(mp, "Offline file system operation in progress!"); return XFS_ERROR(EFSCORRUPTED); } @@ -425,9 +415,7 @@ xfs_mount_validate_sb( * Version 1 directory format has never worked on Linux. */ if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { - if (loud) - xfs_warn(mp, - "file system using version 1 directory format"); + xfs_warn(mp, "file system using version 1 directory format"); return XFS_ERROR(ENOSYS); } @@ -440,7 +428,7 @@ xfs_initialize_perag( xfs_agnumber_t agcount, xfs_agnumber_t *maxagi) { - xfs_agnumber_t index, max_metadata; + xfs_agnumber_t index; xfs_agnumber_t first_initialised = 0; xfs_perag_t *pag; xfs_agino_t agino; @@ -500,43 +488,10 @@ xfs_initialize_perag( else mp->m_flags &= ~XFS_MOUNT_32BITINODES; - if (mp->m_flags & XFS_MOUNT_32BITINODES) { - /* - * Calculate how much should be reserved for inodes to meet - * the max inode percentage. - */ - if (mp->m_maxicount) { - __uint64_t icount; - - icount = sbp->sb_dblocks * sbp->sb_imax_pct; - do_div(icount, 100); - icount += sbp->sb_agblocks - 1; - do_div(icount, sbp->sb_agblocks); - max_metadata = icount; - } else { - max_metadata = agcount; - } - - for (index = 0; index < agcount; index++) { - ino = XFS_AGINO_TO_INO(mp, index, agino); - if (ino > XFS_MAXINUMBER_32) { - index++; - break; - } - - pag = xfs_perag_get(mp, index); - pag->pagi_inodeok = 1; - if (index < max_metadata) - pag->pagf_metadata = 1; - xfs_perag_put(pag); - } - } else { - for (index = 0; index < agcount; index++) { - pag = xfs_perag_get(mp, index); - pag->pagi_inodeok = 1; - xfs_perag_put(pag); - } - } + if (mp->m_flags & XFS_MOUNT_32BITINODES) + index = xfs_set_inode32(mp); + else + index = xfs_set_inode64(mp); if (maxagi) *maxagi = index; @@ -553,11 +508,9 @@ out_unwind: void xfs_sb_from_disk( - struct xfs_mount *mp, + struct xfs_sb *to, xfs_dsb_t *from) { - struct xfs_sb *to = &mp->m_sb; - to->sb_magicnum = be32_to_cpu(from->sb_magicnum); to->sb_blocksize = be32_to_cpu(from->sb_blocksize); to->sb_dblocks = be64_to_cpu(from->sb_dblocks); @@ -659,6 +612,72 @@ xfs_sb_to_disk( } } +static void +xfs_sb_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_sb sb; + int error; + + xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); + + /* + * Only check the in progress field for the primary superblock as + * mkfs.xfs doesn't clear it from secondary superblocks. + */ + error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR); + if (error) + xfs_buf_ioerror(bp, error); +} + +static void +xfs_sb_read_verify( + struct xfs_buf *bp) +{ + xfs_sb_verify(bp); +} + +/* + * We may be probed for a filesystem match, so we may not want to emit + * messages when the superblock buffer is not actually an XFS superblock. + * If we find an XFS superblock, the run a normal, noisy mount because we are + * really going to mount it and want to know about errors. + */ +static void +xfs_sb_quiet_read_verify( + struct xfs_buf *bp) +{ + struct xfs_sb sb; + + xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); + + if (sb.sb_magicnum == XFS_SB_MAGIC) { + /* XFS filesystem, verify noisily! */ + xfs_sb_read_verify(bp); + return; + } + /* quietly fail */ + xfs_buf_ioerror(bp, EWRONGFS); +} + +static void +xfs_sb_write_verify( + struct xfs_buf *bp) +{ + xfs_sb_verify(bp); +} + +const struct xfs_buf_ops xfs_sb_buf_ops = { + .verify_read = xfs_sb_read_verify, + .verify_write = xfs_sb_write_verify, +}; + +static const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { + .verify_read = xfs_sb_quiet_read_verify, + .verify_write = xfs_sb_write_verify, +}; + /* * xfs_readsb * @@ -684,26 +703,27 @@ xfs_readsb(xfs_mount_t *mp, int flags) reread: bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), 0); + BTOBB(sector_size), 0, + loud ? &xfs_sb_buf_ops + : &xfs_sb_quiet_buf_ops); if (!bp) { if (loud) xfs_warn(mp, "SB buffer read failed"); return EIO; } - - /* - * Initialize the mount structure from the superblock. - * But first do some basic consistency checking. - */ - xfs_sb_from_disk(mp, XFS_BUF_TO_SBP(bp)); - error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); - if (error) { + if (bp->b_error) { + error = bp->b_error; if (loud) xfs_warn(mp, "SB validate failed"); goto release_buf; } /* + * Initialize the mount structure from the superblock. + */ + xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); + + /* * We must be able to do sector-sized and sector-aligned IO. */ if (sector_size > mp->m_sb.sb_sectsize) { @@ -1034,7 +1054,7 @@ xfs_check_sizes(xfs_mount_t *mp) } bp = xfs_buf_read_uncached(mp->m_ddev_targp, d - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_FSS_TO_BB(mp, 1), 0, NULL); if (!bp) { xfs_warn(mp, "last sector read failed"); return EIO; @@ -1049,7 +1069,7 @@ xfs_check_sizes(xfs_mount_t *mp) } bp = xfs_buf_read_uncached(mp->m_logdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0, NULL); if (!bp) { xfs_warn(mp, "log device read failed"); return EIO; @@ -1200,8 +1220,6 @@ xfs_mountfs( xfs_set_maxicount(mp); - mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog); - error = xfs_uuid_mount(mp); if (error) goto out; @@ -1462,6 +1480,8 @@ xfs_unmountfs( __uint64_t resblks; int error; + cancel_delayed_work_sync(&mp->m_eofblocks_work); + xfs_qm_unmount_quotas(mp); xfs_rtunmount_inodes(mp); IRELE(mp->m_rootip); @@ -1485,21 +1505,16 @@ xfs_unmountfs( /* * And reclaim all inodes. At this point there should be no dirty - * inode, and none should be pinned or locked, but use synchronous - * reclaim just to be sure. + * inodes and none should be pinned or locked, but use synchronous + * reclaim just to be sure. We can stop background inode reclaim + * here as well if it is still running. */ + cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_WAIT); xfs_qm_unmount(mp); /* - * Flush out the log synchronously so that we know for sure - * that nothing is pinned. This is important because bflush() - * will skip pinned buffers. - */ - xfs_log_force(mp, XFS_LOG_SYNC); - - /* * Unreserve any blocks we have so that when we unmount we don't account * the reserved free space as used. This is really only necessary for * lazy superblock counting because it trusts the incore superblock @@ -1524,14 +1539,6 @@ xfs_unmountfs( xfs_warn(mp, "Unable to update superblock counters. " "Freespace may not be correct on next mount."); - /* - * At this point we might have modified the superblock again and thus - * added an item to the AIL, thus flush it again. - */ - xfs_ail_push_all_sync(mp->m_ail); - xfs_wait_buftarg(mp->m_ddev_targp); - - xfs_log_unmount_write(mp); xfs_log_unmount(mp); xfs_uuid_unmount(mp); @@ -1544,7 +1551,7 @@ xfs_unmountfs( int xfs_fs_writable(xfs_mount_t *mp) { - return !(xfs_test_for_freeze(mp) || XFS_FORCED_SHUTDOWN(mp) || + return !(mp->m_super->s_writers.frozen || XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_RDONLY)); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 90c1fc9eaea..bab8314507e 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -51,15 +51,8 @@ typedef struct xfs_trans_reservations { #else /* __KERNEL__ */ -#include "xfs_sync.h" - struct xlog; -struct xfs_mount_args; struct xfs_inode; -struct xfs_bmbt_irec; -struct xfs_bmap_free; -struct xfs_extdelta; -struct xfs_swapext; struct xfs_mru_cache; struct xfs_nameops; struct xfs_ail; @@ -176,7 +169,6 @@ typedef struct xfs_mount { uint m_qflags; /* quota status flags */ xfs_trans_reservations_t m_reservations;/* precomputed res values */ __uint64_t m_maxicount; /* maximum inode count */ - __uint64_t m_maxioffset; /* maximum inode offset */ __uint64_t m_resblks; /* total reserved blocks */ __uint64_t m_resblks_avail;/* available reserved blocks */ __uint64_t m_resblks_save; /* reserved blks @ remount,ro */ @@ -203,9 +195,9 @@ typedef struct xfs_mount { struct mutex m_icsb_mutex; /* balancer sync lock */ #endif struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ - struct delayed_work m_sync_work; /* background sync work */ struct delayed_work m_reclaim_work; /* background inode reclaim */ - struct work_struct m_flush_work; /* background inode flush */ + struct delayed_work m_eofblocks_work; /* background eof blocks + trimming */ __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ struct shrinker m_inode_shrink; /* inode reclaim shrinker */ @@ -215,6 +207,9 @@ typedef struct xfs_mount { struct workqueue_struct *m_data_workqueue; struct workqueue_struct *m_unwritten_workqueue; struct workqueue_struct *m_cil_workqueue; + struct workqueue_struct *m_reclaim_workqueue; + struct workqueue_struct *m_log_workqueue; + struct workqueue_struct *m_eofblocks_workqueue; } xfs_mount_t; /* @@ -297,8 +292,6 @@ xfs_preferred_iosize(xfs_mount_t *mp) PAGE_CACHE_SIZE)); } -#define XFS_MAXIOFFSET(mp) ((mp)->m_maxioffset) - #define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \ ((mp)->m_flags & XFS_MOUNT_WAS_CLEAN) #define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN) @@ -314,9 +307,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, #define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */ #define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */ -#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen) -#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l)) - /* * Flags for xfs_mountfs */ @@ -398,7 +388,9 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *); extern void xfs_mod_sb(struct xfs_trans *, __int64_t); extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, xfs_agnumber_t *); -extern void xfs_sb_from_disk(struct xfs_mount *, struct xfs_dsb *); +extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); +extern const struct xfs_buf_ops xfs_sb_buf_ops; + #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 249db198776..60eff476315 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -40,6 +40,7 @@ #include "xfs_utils.h" #include "xfs_qm.h" #include "xfs_trace.h" +#include "xfs_icache.h" /* * The global quota manager. There is only one of these for the entire @@ -891,7 +892,8 @@ xfs_qm_dqiter_bufs( while (blkcnt--) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), - mp->m_quotainfo->qi_dqchunklen, 0, &bp); + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + &xfs_dquot_buf_ops); if (error) break; @@ -940,7 +942,7 @@ xfs_qm_dqiterate( map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP); lblkno = 0; - maxlblkcnt = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); do { nmaps = XFS_DQITER_MAP_SIZE; /* @@ -978,7 +980,8 @@ xfs_qm_dqiterate( while (rablkcnt--) { xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, rablkno), - mp->m_quotainfo->qi_dqchunklen); + mp->m_quotainfo->qi_dqchunklen, + NULL); rablkno++; } } @@ -1453,7 +1456,7 @@ xfs_qm_dqreclaim_one( int error; if (!xfs_dqlock_nowait(dqp)) - goto out_busy; + goto out_move_tail; /* * This dquot has acquired a reference in the meantime remove it from @@ -1476,7 +1479,7 @@ xfs_qm_dqreclaim_one( * getting flushed to disk, we don't want to reclaim it. */ if (!xfs_dqflock_nowait(dqp)) - goto out_busy; + goto out_unlock_move_tail; if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; @@ -1487,7 +1490,7 @@ xfs_qm_dqreclaim_one( if (error) { xfs_warn(mp, "%s: dquot %p flush failed", __func__, dqp); - goto out_busy; + goto out_unlock_move_tail; } xfs_buf_delwri_queue(bp, buffer_list); @@ -1496,7 +1499,7 @@ xfs_qm_dqreclaim_one( * Give the dquot another try on the freelist, as the * flushing will take some time. */ - goto out_busy; + goto out_unlock_move_tail; } xfs_dqfunlock(dqp); @@ -1515,14 +1518,13 @@ xfs_qm_dqreclaim_one( XFS_STATS_INC(xs_qm_dqreclaims); return; -out_busy: - xfs_dqunlock(dqp); - /* * Move the dquot to the tail of the list so that we don't spin on it. */ +out_unlock_move_tail: + xfs_dqunlock(dqp); +out_move_tail: list_move_tail(&dqp->q_lru, &qi->qi_lru_list); - trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(xs_qm_dqreclaim_misses); } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 858a3b18611..8a59f854655 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -40,6 +40,7 @@ #include "xfs_utils.h" #include "xfs_qm.h" #include "xfs_trace.h" +#include "xfs_icache.h" STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, @@ -783,11 +784,11 @@ xfs_qm_scall_getquota( (XFS_IS_OQUOTA_ENFORCED(mp) && (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) && dst->d_id != 0) { - if (((int) dst->d_bcount > (int) dst->d_blk_softlimit) && + if ((dst->d_bcount > dst->d_blk_softlimit) && (dst->d_blk_softlimit > 0)) { ASSERT(dst->d_btimer != 0); } - if (((int) dst->d_icount > (int) dst->d_ino_softlimit) && + if ((dst->d_icount > dst->d_ino_softlimit) && (dst->d_ino_softlimit > 0)) { ASSERT(dst->d_itimer != 0); } @@ -845,7 +846,8 @@ STATIC int xfs_dqrele_inode( struct xfs_inode *ip, struct xfs_perag *pag, - int flags) + int flags, + void *args) { /* skip quota inodes */ if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || @@ -881,5 +883,5 @@ xfs_qm_dqrele_all_inodes( uint flags) { ASSERT(mp->m_quotainfo); - xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags); + xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL); } diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index fed504fc299..71926d63052 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -97,8 +97,7 @@ xfs_fs_set_xstate( STATIC int xfs_fs_get_dqblk( struct super_block *sb, - int type, - qid_t id, + struct kqid qid, struct fs_disk_quota *fdq) { struct xfs_mount *mp = XFS_M(sb); @@ -108,14 +107,14 @@ xfs_fs_get_dqblk( if (!XFS_IS_QUOTA_ON(mp)) return -ESRCH; - return -xfs_qm_scall_getquota(mp, id, xfs_quota_type(type), fdq); + return -xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid), + xfs_quota_type(qid.type), fdq); } STATIC int xfs_fs_set_dqblk( struct super_block *sb, - int type, - qid_t id, + struct kqid qid, struct fs_disk_quota *fdq) { struct xfs_mount *mp = XFS_M(sb); @@ -127,7 +126,8 @@ xfs_fs_set_dqblk( if (!XFS_IS_QUOTA_ON(mp)) return -ESRCH; - return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq); + return -xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid), + xfs_quota_type(qid.type), fdq); } const struct quotactl_ops xfs_quotactl_operations = { diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 92d4331cd4f..98dc670d3ee 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -38,6 +38,7 @@ #include "xfs_utils.h" #include "xfs_trace.h" #include "xfs_buf.h" +#include "xfs_icache.h" /* @@ -857,7 +858,7 @@ xfs_rtbuf_get( xfs_buf_t *bp; /* block buffer, result */ xfs_inode_t *ip; /* bitmap or summary inode */ xfs_bmbt_irec_t map; - int nmap; + int nmap = 1; int error; /* error value */ ip = issum ? mp->m_rsumip : mp->m_rbmip; @@ -869,7 +870,7 @@ xfs_rtbuf_get( ASSERT(map.br_startblock != NULLFSBLOCK); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map.br_startblock), - mp->m_bsize, 0, &bp); + mp->m_bsize, 0, &bp, NULL); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); @@ -1872,9 +1873,14 @@ xfs_growfs_rt( */ bp = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, nrblocks - 1), - XFS_FSB_TO_BB(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0, NULL); if (!bp) return EIO; + if (bp->b_error) { + error = bp->b_error; + xfs_buf_relse(bp); + return error; + } xfs_buf_relse(bp); /* @@ -2219,9 +2225,11 @@ xfs_rtmount_init( } bp = xfs_buf_read_uncached(mp->m_rtdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0); - if (!bp) { + XFS_FSB_TO_BB(mp, 1), 0, NULL); + if (!bp || bp->b_error) { xfs_warn(mp, "realtime device size check failed"); + if (bp) + xfs_buf_relse(bp); return EIO; } xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index f429d9d5d32..a05b45175fb 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -81,6 +81,7 @@ struct xfs_mount; #define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ #define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ #define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ +#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */ #define XFS_SB_VERSION2_OKREALFBITS \ (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ @@ -503,6 +504,12 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp) (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT); } +static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp) +{ + return (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_CRCBIT)); +} + /* * end of superblock version macros */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 0d9de41a715..ab8839b2627 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -49,7 +49,7 @@ #include "xfs_extfree_item.h" #include "xfs_mru_cache.h" #include "xfs_inode_item.h" -#include "xfs_sync.h" +#include "xfs_icache.h" #include "xfs_trace.h" #include <linux/namei.h> @@ -88,6 +88,8 @@ mempool_t *xfs_ioend_pool; * unwritten extent conversion */ #define MNTOPT_NOBARRIER "nobarrier" /* .. disable */ #define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ +#define MNTOPT_32BITINODE "inode32" /* inode allocation limited to + * XFS_MAXINUMBER_32 */ #define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ #define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */ #define MNTOPT_LARGEIO "largeio" /* report large I/O sizes in stat() */ @@ -120,12 +122,18 @@ mempool_t *xfs_ioend_pool; * in the future, too. */ enum { - Opt_barrier, Opt_nobarrier, Opt_err + Opt_barrier, + Opt_nobarrier, + Opt_inode64, + Opt_inode32, + Opt_err }; static const match_table_t tokens = { {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, + {Opt_inode64, "inode64"}, + {Opt_inode32, "inode32"}, {Opt_err, NULL} }; @@ -197,7 +205,9 @@ xfs_parseargs( */ mp->m_flags |= XFS_MOUNT_BARRIER; mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; +#if !XFS_BIG_INUMS mp->m_flags |= XFS_MOUNT_SMALL_INUMS; +#endif /* * These can be overridden by the mount option parsing. @@ -294,6 +304,8 @@ xfs_parseargs( return EINVAL; } dswidth = simple_strtoul(value, &eov, 10); + } else if (!strcmp(this_char, MNTOPT_32BITINODE)) { + mp->m_flags |= XFS_MOUNT_SMALL_INUMS; } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; #if !XFS_BIG_INUMS @@ -492,6 +504,7 @@ xfs_showargs( { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, + { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE }, { 0, NULL } }; static struct proc_xfs_info xfs_info_unset[] = { @@ -591,6 +604,80 @@ xfs_max_file_offset( return (((__uint64_t)pagefactor) << bitshift) - 1; } +xfs_agnumber_t +xfs_set_inode32(struct xfs_mount *mp) +{ + xfs_agnumber_t index = 0; + xfs_agnumber_t maxagi = 0; + xfs_sb_t *sbp = &mp->m_sb; + xfs_agnumber_t max_metadata; + xfs_agino_t agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks -1, 0); + xfs_ino_t ino = XFS_AGINO_TO_INO(mp, sbp->sb_agcount -1, agino); + xfs_perag_t *pag; + + /* Calculate how much should be reserved for inodes to meet + * the max inode percentage. + */ + if (mp->m_maxicount) { + __uint64_t icount; + + icount = sbp->sb_dblocks * sbp->sb_imax_pct; + do_div(icount, 100); + icount += sbp->sb_agblocks - 1; + do_div(icount, sbp->sb_agblocks); + max_metadata = icount; + } else { + max_metadata = sbp->sb_agcount; + } + + for (index = 0; index < sbp->sb_agcount; index++) { + ino = XFS_AGINO_TO_INO(mp, index, agino); + + if (ino > XFS_MAXINUMBER_32) { + pag = xfs_perag_get(mp, index); + pag->pagi_inodeok = 0; + pag->pagf_metadata = 0; + xfs_perag_put(pag); + continue; + } + + pag = xfs_perag_get(mp, index); + pag->pagi_inodeok = 1; + maxagi++; + if (index < max_metadata) + pag->pagf_metadata = 1; + xfs_perag_put(pag); + } + mp->m_flags |= (XFS_MOUNT_32BITINODES | + XFS_MOUNT_SMALL_INUMS); + + return maxagi; +} + +xfs_agnumber_t +xfs_set_inode64(struct xfs_mount *mp) +{ + xfs_agnumber_t index = 0; + + for (index = 0; index < mp->m_sb.sb_agcount; index++) { + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, index); + pag->pagi_inodeok = 1; + pag->pagf_metadata = 0; + xfs_perag_put(pag); + } + + /* There is no need for lock protection on m_flags, + * the rw_semaphore of the VFS superblock is locked + * during mount/umount/remount operations, so this is + * enough to avoid concurency on the m_flags field + */ + mp->m_flags &= ~(XFS_MOUNT_32BITINODES | + XFS_MOUNT_SMALL_INUMS); + return index; +} + STATIC int xfs_blkdev_get( xfs_mount_t *mp, @@ -776,8 +863,30 @@ xfs_init_mount_workqueues( WQ_MEM_RECLAIM, 0, mp->m_fsname); if (!mp->m_cil_workqueue) goto out_destroy_unwritten; + + mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", + WQ_NON_REENTRANT, 0, mp->m_fsname); + if (!mp->m_reclaim_workqueue) + goto out_destroy_cil; + + mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", + WQ_NON_REENTRANT, 0, mp->m_fsname); + if (!mp->m_log_workqueue) + goto out_destroy_reclaim; + + mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", + WQ_NON_REENTRANT, 0, mp->m_fsname); + if (!mp->m_eofblocks_workqueue) + goto out_destroy_log; + return 0; +out_destroy_log: + destroy_workqueue(mp->m_log_workqueue); +out_destroy_reclaim: + destroy_workqueue(mp->m_reclaim_workqueue); +out_destroy_cil: + destroy_workqueue(mp->m_cil_workqueue); out_destroy_unwritten: destroy_workqueue(mp->m_unwritten_workqueue); out_destroy_data_iodone_queue: @@ -790,11 +899,32 @@ STATIC void xfs_destroy_mount_workqueues( struct xfs_mount *mp) { + destroy_workqueue(mp->m_eofblocks_workqueue); + destroy_workqueue(mp->m_log_workqueue); + destroy_workqueue(mp->m_reclaim_workqueue); destroy_workqueue(mp->m_cil_workqueue); destroy_workqueue(mp->m_data_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); } +/* + * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK + * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting + * for IO to complete so that we effectively throttle multiple callers to the + * rate at which IO is completing. + */ +void +xfs_flush_inodes( + struct xfs_mount *mp) +{ + struct super_block *sb = mp->m_super; + + if (down_read_trylock(&sb->s_umount)) { + sync_inodes_sb(sb); + up_read(&sb->s_umount); + } +} + /* Catch misguided souls that try to use this interface on XFS */ STATIC struct inode * xfs_fs_alloc_inode( @@ -868,67 +998,14 @@ xfs_fs_inode_init_once( "xfsino", ip->i_ino); } -/* - * This is called by the VFS when dirtying inode metadata. This can happen - * for a few reasons, but we only care about timestamp updates, given that - * we handled the rest ourselves. In theory no other calls should happen, - * but for example generic_write_end() keeps dirtying the inode after - * updating i_size. Thus we check that the flags are exactly I_DIRTY_SYNC, - * and skip this call otherwise. - * - * We'll hopefull get a different method just for updating timestamps soon, - * at which point this hack can go away, and maybe we'll also get real - * error handling here. - */ -STATIC void -xfs_fs_dirty_inode( - struct inode *inode, - int flags) -{ - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; - int error; - - if (flags != I_DIRTY_SYNC) - return; - - trace_xfs_dirty_inode(ip); - - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - goto trouble; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - /* - * Grab all the latest timestamps from the Linux inode. - */ - ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; - ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; - ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec; - ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec; - ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec; - - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); - error = xfs_trans_commit(tp, 0); - if (error) - goto trouble; - return; - -trouble: - xfs_warn(mp, "failed to update timestamps for inode 0x%llx", ip->i_ino); -} - STATIC void xfs_fs_evict_inode( struct inode *inode) { xfs_inode_t *ip = XFS_I(inode); + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + trace_xfs_evict_inode(ip); truncate_inode_pages(&inode->i_data, 0); @@ -937,22 +1014,6 @@ xfs_fs_evict_inode( XFS_STATS_INC(vn_remove); XFS_STATS_DEC(vn_active); - /* - * The iolock is used by the file system to coordinate reads, - * writes, and block truncates. Up to this point the lock - * protected concurrent accesses by users of the inode. But - * from here forward we're doing some final processing of the - * inode because we're done with it, and although we reuse the - * iolock for protection it is really a distinct lock class - * (in the lockdep sense) from before. To keep lockdep happy - * (and basically indicate what we are doing), we explicitly - * re-init the iolock here. - */ - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - lockdep_set_class_and_name(&ip->i_iolock.mr_lock, - &xfs_iolock_reclaimable, "xfs_iolock_reclaimable"); - xfs_inactive(ip); } @@ -989,7 +1050,7 @@ xfs_fs_put_super( xfs_filestream_unmount(mp); xfs_unmountfs(mp); - xfs_syncd_stop(mp); + xfs_freesb(mp); xfs_icsb_destroy_counters(mp); xfs_destroy_mount_workqueues(mp); @@ -1004,7 +1065,6 @@ xfs_fs_sync_fs( int wait) { struct xfs_mount *mp = XFS_M(sb); - int error; /* * Doing anything during the async pass would be counterproductive. @@ -1012,17 +1072,14 @@ xfs_fs_sync_fs( if (!wait) return 0; - error = xfs_quiesce_data(mp); - if (error) - return -error; - + xfs_log_force(mp, XFS_LOG_SYNC); if (laptop_mode) { /* * The disk must be active because we're syncing. - * We schedule xfssyncd now (now that the disk is + * We schedule log work now (now that the disk is * active) instead of later (when it might not be). */ - flush_delayed_work_sync(&mp->m_sync_work); + flush_delayed_work(&mp->m_log->l_work); } return 0; @@ -1099,6 +1156,48 @@ xfs_restore_resvblks(struct xfs_mount *mp) xfs_reserve_blocks(mp, &resblks, NULL); } +/* + * Trigger writeback of all the dirty metadata in the file system. + * + * This ensures that the metadata is written to their location on disk rather + * than just existing in transactions in the log. This means after a quiesce + * there is no log replay required to write the inodes to disk - this is the + * primary difference between a sync and a quiesce. + * + * Note: xfs_log_quiesce() stops background log work - the callers must ensure + * it is started again when appropriate. + */ +void +xfs_quiesce_attr( + struct xfs_mount *mp) +{ + int error = 0; + + /* wait for all modifications to complete */ + while (atomic_read(&mp->m_active_trans) > 0) + delay(100); + + /* force the log to unpin objects from the now complete transactions */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* reclaim inodes to do any IO before the freeze completes */ + xfs_reclaim_inodes(mp, 0); + xfs_reclaim_inodes(mp, SYNC_WAIT); + + /* Push the superblock and write an unmount record */ + error = xfs_log_sbcount(mp); + if (error) + xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " + "Frozen image may not be consistent."); + /* + * Just warn here till VFS can correctly support + * read-only remount without racing. + */ + WARN_ON(atomic_read(&mp->m_active_trans) != 0); + + xfs_log_quiesce(mp); +} + STATIC int xfs_fs_remount( struct super_block *sb, @@ -1124,6 +1223,12 @@ xfs_fs_remount( case Opt_nobarrier: mp->m_flags &= ~XFS_MOUNT_BARRIER; break; + case Opt_inode64: + mp->m_maxagi = xfs_set_inode64(mp); + break; + case Opt_inode32: + mp->m_maxagi = xfs_set_inode32(mp); + break; default: /* * Logically we would return an error here to prevent @@ -1173,20 +1278,18 @@ xfs_fs_remount( * value if it is non-zero, otherwise go with the default. */ xfs_restore_resvblks(mp); + xfs_log_work_queue(mp); } /* rw -> ro */ if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { /* - * After we have synced the data but before we sync the - * metadata, we need to free up the reserve block pool so that - * the used block count in the superblock on disk is correct at - * the end of the remount. Stash the current reserve pool size - * so that if we get remounted rw, we can return it to the same - * size. + * Before we sync the metadata, we need to free up the reserve + * block pool so that the used block count in the superblock on + * disk is correct at the end of the remount. Stash the current + * reserve pool size so that if we get remounted rw, we can + * return it to the same size. */ - - xfs_quiesce_data(mp); xfs_save_resvblks(mp); xfs_quiesce_attr(mp); mp->m_flags |= XFS_MOUNT_RDONLY; @@ -1218,6 +1321,7 @@ xfs_fs_unfreeze( struct xfs_mount *mp = XFS_M(sb); xfs_restore_resvblks(mp); + xfs_log_work_queue(mp); return 0; } @@ -1296,6 +1400,8 @@ xfs_fs_fill_super( spin_lock_init(&mp->m_sb_lock); mutex_init(&mp->m_growlock); atomic_set(&mp->m_active_trans, 0); + INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); + INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); mp->m_super = sb; sb->s_fs_info = mp; @@ -1346,10 +1452,6 @@ xfs_fs_fill_super( /* * we must configure the block size in the superblock before we run the * full mount process as the mount process can lookup and cache inodes. - * For the same reason we must also initialise the syncd and register - * the inode cache shrinker so that inodes can be reclaimed during - * operations like a quotacheck that iterate all inodes in the - * filesystem. */ sb->s_magic = XFS_SB_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; @@ -1359,13 +1461,9 @@ xfs_fs_fill_super( sb->s_time_gran = 1; set_posix_acl_flag(sb); - error = xfs_syncd_init(mp); - if (error) - goto out_filestream_unmount; - error = xfs_mountfs(mp); if (error) - goto out_syncd_stop; + goto out_filestream_unmount; root = igrab(VFS_I(mp->m_rootip)); if (!root) { @@ -1383,8 +1481,7 @@ xfs_fs_fill_super( } return 0; - out_syncd_stop: - xfs_syncd_stop(mp); + out_filestream_unmount: xfs_filestream_unmount(mp); out_free_sb: @@ -1404,7 +1501,6 @@ out_destroy_workqueues: out_unmount: xfs_filestream_unmount(mp); xfs_unmountfs(mp); - xfs_syncd_stop(mp); goto out_free_sb; } @@ -1436,7 +1532,6 @@ xfs_fs_free_cached_objects( static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, - .dirty_inode = xfs_fs_dirty_inode, .evict_inode = xfs_fs_evict_inode, .drop_inode = xfs_fs_drop_inode, .put_super = xfs_fs_put_super, @@ -1491,13 +1586,9 @@ xfs_init_zones(void) if (!xfs_da_state_zone) goto out_destroy_btree_cur_zone; - xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf"); - if (!xfs_dabuf_zone) - goto out_destroy_da_state_zone; - xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork"); if (!xfs_ifork_zone) - goto out_destroy_dabuf_zone; + goto out_destroy_da_state_zone; xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans"); if (!xfs_trans_zone) @@ -1514,9 +1605,8 @@ xfs_init_zones(void) * size possible under XFS. This wastes a little bit of memory, * but it is much faster. */ - xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + - (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / - NBWORD) * sizeof(int))), "xfs_buf_item"); + xfs_buf_item_zone = kmem_zone_init(sizeof(struct xfs_buf_log_item), + "xfs_buf_item"); if (!xfs_buf_item_zone) goto out_destroy_log_item_desc_zone; @@ -1561,8 +1651,6 @@ xfs_init_zones(void) kmem_zone_destroy(xfs_trans_zone); out_destroy_ifork_zone: kmem_zone_destroy(xfs_ifork_zone); - out_destroy_dabuf_zone: - kmem_zone_destroy(xfs_dabuf_zone); out_destroy_da_state_zone: kmem_zone_destroy(xfs_da_state_zone); out_destroy_btree_cur_zone: @@ -1582,6 +1670,11 @@ xfs_init_zones(void) STATIC void xfs_destroy_zones(void) { + /* + * Make sure all delayed rcu free are flushed before we + * destroy caches. + */ + rcu_barrier(); kmem_zone_destroy(xfs_ili_zone); kmem_zone_destroy(xfs_inode_zone); kmem_zone_destroy(xfs_efi_zone); @@ -1590,7 +1683,6 @@ xfs_destroy_zones(void) kmem_zone_destroy(xfs_log_item_desc_zone); kmem_zone_destroy(xfs_trans_zone); kmem_zone_destroy(xfs_ifork_zone); - kmem_zone_destroy(xfs_dabuf_zone); kmem_zone_destroy(xfs_da_state_zone); kmem_zone_destroy(xfs_btree_cur_zone); kmem_zone_destroy(xfs_bmap_free_item_zone); @@ -1604,16 +1696,6 @@ STATIC int __init xfs_init_workqueues(void) { /* - * We never want to the same work item to run twice, reclaiming inodes - * or idling the log is not going to get any faster by multiple CPUs - * competing for ressources. Use the default large max_active value - * so that even lots of filesystems can perform these task in parallel. - */ - xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0); - if (!xfs_syncd_wq) - return -ENOMEM; - - /* * The allocation workqueue can be used in memory reclaim situations * (writepage path), and parallelism is only limited by the number of * AGs in all the filesystems mounted. Hence use the default large @@ -1621,20 +1703,15 @@ xfs_init_workqueues(void) */ xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0); if (!xfs_alloc_wq) - goto out_destroy_syncd; + return -ENOMEM; return 0; - -out_destroy_syncd: - destroy_workqueue(xfs_syncd_wq); - return -ENOMEM; } STATIC void xfs_destroy_workqueues |