aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/Kconfig1
-rw-r--r--fs/xfs/Makefile4
-rw-r--r--fs/xfs/uuid.h6
-rw-r--r--fs/xfs/xfs_acl.c4
-rw-r--r--fs/xfs/xfs_ag.h5
-rw-r--r--fs/xfs/xfs_alloc.c183
-rw-r--r--fs/xfs/xfs_alloc.h6
-rw-r--r--fs/xfs/xfs_alloc_btree.c79
-rw-r--r--fs/xfs/xfs_alloc_btree.h16
-rw-r--r--fs/xfs/xfs_aops.c208
-rw-r--r--fs/xfs/xfs_aops.h14
-rw-r--r--fs/xfs/xfs_attr.c173
-rw-r--r--fs/xfs/xfs_attr_leaf.c412
-rw-r--r--fs/xfs/xfs_attr_leaf.h27
-rw-r--r--fs/xfs/xfs_bmap.c129
-rw-r--r--fs/xfs/xfs_bmap.h9
-rw-r--r--fs/xfs/xfs_bmap_btree.c63
-rw-r--r--fs/xfs/xfs_bmap_btree.h1
-rw-r--r--fs/xfs/xfs_btree.c111
-rw-r--r--fs/xfs/xfs_btree.h22
-rw-r--r--fs/xfs/xfs_buf.c336
-rw-r--r--fs/xfs/xfs_buf.h168
-rw-r--r--fs/xfs/xfs_buf_item.c408
-rw-r--r--fs/xfs/xfs_buf_item.h38
-rw-r--r--fs/xfs/xfs_cksum.h63
-rw-r--r--fs/xfs/xfs_da_btree.c952
-rw-r--r--fs/xfs/xfs_da_btree.h46
-rw-r--r--fs/xfs/xfs_dfrag.c47
-rw-r--r--fs/xfs/xfs_dinode.h2
-rw-r--r--fs/xfs/xfs_dir2.c4
-rw-r--r--fs/xfs/xfs_dir2_block.c544
-rw-r--r--fs/xfs/xfs_dir2_data.c220
-rw-r--r--fs/xfs/xfs_dir2_leaf.c773
-rw-r--r--fs/xfs/xfs_dir2_node.c518
-rw-r--r--fs/xfs/xfs_dir2_priv.h63
-rw-r--r--fs/xfs/xfs_dir2_sf.c4
-rw-r--r--fs/xfs/xfs_discard.c6
-rw-r--r--fs/xfs/xfs_dquot.c134
-rw-r--r--fs/xfs/xfs_dquot.h2
-rw-r--r--fs/xfs/xfs_export.c4
-rw-r--r--fs/xfs/xfs_file.c450
-rw-r--r--fs/xfs/xfs_fs.h33
-rw-r--r--fs/xfs/xfs_fs_subr.c96
-rw-r--r--fs/xfs/xfs_fsops.c158
-rw-r--r--fs/xfs/xfs_globals.c4
-rw-r--r--fs/xfs/xfs_ialloc.c527
-rw-r--r--fs/xfs/xfs_ialloc.h6
-rw-r--r--fs/xfs/xfs_ialloc_btree.c55
-rw-r--r--fs/xfs/xfs_ialloc_btree.h2
-rw-r--r--fs/xfs/xfs_icache.c (renamed from fs/xfs/xfs_sync.c)909
-rw-r--r--fs/xfs/xfs_icache.h (renamed from fs/xfs/xfs_sync.h)28
-rw-r--r--fs/xfs/xfs_iget.c720
-rw-r--r--fs/xfs/xfs_inode.c580
-rw-r--r--fs/xfs/xfs_inode.h25
-rw-r--r--fs/xfs/xfs_ioctl.c93
-rw-r--r--fs/xfs/xfs_ioctl32.c12
-rw-r--r--fs/xfs/xfs_iomap.c54
-rw-r--r--fs/xfs/xfs_iops.c59
-rw-r--r--fs/xfs/xfs_itable.c6
-rw-r--r--fs/xfs/xfs_linux.h2
-rw-r--r--fs/xfs/xfs_log.c483
-rw-r--r--fs/xfs/xfs_log.h4
-rw-r--r--fs/xfs/xfs_log_priv.h20
-rw-r--r--fs/xfs/xfs_log_recover.c280
-rw-r--r--fs/xfs/xfs_mount.c201
-rw-r--r--fs/xfs/xfs_mount.h24
-rw-r--r--fs/xfs/xfs_qm.c24
-rw-r--r--fs/xfs/xfs_qm_syscalls.c10
-rw-r--r--fs/xfs/xfs_quotaops.c12
-rw-r--r--fs/xfs/xfs_rtalloc.c18
-rw-r--r--fs/xfs/xfs_sb.h7
-rw-r--r--fs/xfs/xfs_super.c335
-rw-r--r--fs/xfs/xfs_super.h3
-rw-r--r--fs/xfs/xfs_sysctl.c9
-rw-r--r--fs/xfs/xfs_sysctl.h1
-rw-r--r--fs/xfs/xfs_trace.h64
-rw-r--r--fs/xfs/xfs_trans.c17
-rw-r--r--fs/xfs/xfs_trans.h49
-rw-r--r--fs/xfs/xfs_trans_ail.c35
-rw-r--r--fs/xfs/xfs_trans_buf.c98
-rw-r--r--fs/xfs/xfs_trans_dquot.c8
-rw-r--r--fs/xfs/xfs_trans_priv.h1
-rw-r--r--fs/xfs/xfs_types.h14
-rw-r--r--fs/xfs/xfs_utils.c17
-rw-r--r--fs/xfs/xfs_vnodeops.c449
-rw-r--r--fs/xfs/xfs_vnodeops.h9
86 files changed, 6974 insertions, 4842 deletions
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 6100ec0fa1d..5a7ffe54f5d 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -2,6 +2,7 @@ config XFS_FS
tristate "XFS filesystem support"
depends on BLOCK
select EXPORTFS
+ select LIBCRC32C
help
XFS is a high performance journaling filesystem which originated
on the SGI IRIX platform. It is completely multi-threaded, can
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d2bf974b1a2..d02201df855 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -37,9 +37,8 @@ xfs-y += xfs_aops.o \
xfs_file.o \
xfs_filestream.o \
xfs_fsops.o \
- xfs_fs_subr.o \
xfs_globals.o \
- xfs_iget.o \
+ xfs_icache.o \
xfs_ioctl.o \
xfs_iomap.o \
xfs_iops.o \
@@ -47,7 +46,6 @@ xfs-y += xfs_aops.o \
xfs_message.o \
xfs_mru_cache.o \
xfs_super.o \
- xfs_sync.o \
xfs_xattr.o \
xfs_rename.o \
xfs_utils.o \
diff --git a/fs/xfs/uuid.h b/fs/xfs/uuid.h
index 4732d71262c..104db0f3bed 100644
--- a/fs/xfs/uuid.h
+++ b/fs/xfs/uuid.h
@@ -26,4 +26,10 @@ extern int uuid_is_nil(uuid_t *uuid);
extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
+static inline void
+uuid_copy(uuid_t *dst, uuid_t *src)
+{
+ memcpy(dst, src, sizeof(uuid_t));
+}
+
#endif /* __XFS_SUPPORT_UUID_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index ac702a6eab9..1d32f1d5276 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -337,7 +337,7 @@ xfs_xattr_acl_get(struct dentry *dentry, const char *name,
if (acl == NULL)
return -ENODATA;
- error = posix_acl_to_xattr(acl, value, size);
+ error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
posix_acl_release(acl);
return error;
@@ -361,7 +361,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
if (!value)
goto set_acl;
- acl = posix_acl_from_xattr(value, size);
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
if (!acl) {
/*
* acl_set_file(3) may request that we set default ACLs with
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 44d65c1533c..f2aeedb6a57 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -108,6 +108,8 @@ typedef struct xfs_agf {
extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+
/*
* Size of the unlinked inode hash table in the agi.
*/
@@ -161,6 +163,8 @@ typedef struct xfs_agi {
extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, struct xfs_buf **bpp);
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+
/*
* The third a.g. block contains the a.g. freelist, an array
* of block pointers to blocks owned by the allocation btree code.
@@ -233,6 +237,7 @@ typedef struct xfs_perag {
#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup
in xfs_inode_ag_iterator */
#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
+#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */
#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 4f33c32affe..393055fe3ae 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -430,6 +430,60 @@ xfs_alloc_fixup_trees(
return 0;
}
+static void
+xfs_agfl_verify(
+ struct xfs_buf *bp)
+{
+#ifdef WHEN_CRCS_COME_ALONG
+ /*
+ * we cannot actually do any verification of the AGFL because mkfs does
+ * not initialise the AGFL to zero or NULL. Hence the only valid part of
+ * the AGFL is what the AGF says is active. We can't get to the AGF, so
+ * we can't verify just those entries are valid.
+ *
+ * This problem goes away when the CRC format change comes along as that
+ * requires the AGFL to be initialised by mkfs. At that point, we can
+ * verify the blocks in the agfl -active or not- lie within the bounds
+ * of the AG. Until then, just leave this check ifdef'd out.
+ */
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
+ int agfl_ok = 1;
+
+ int i;
+
+ for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
+ if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK ||
+ be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
+ agfl_ok = 0;
+ }
+
+ if (!agfl_ok) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+#endif
+}
+
+static void
+xfs_agfl_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_agfl_verify(bp);
+}
+
+static void
+xfs_agfl_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_agfl_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_agfl_buf_ops = {
+ .verify_read = xfs_agfl_read_verify,
+ .verify_write = xfs_agfl_write_verify,
+};
+
/*
* Read in the allocation group free block array.
*/
@@ -447,7 +501,7 @@ xfs_alloc_read_agfl(
error = xfs_trans_read_buf(
mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, &bp);
+ XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
if (error)
return error;
ASSERT(!xfs_buf_geterror(bp));
@@ -1866,6 +1920,7 @@ xfs_alloc_fix_freelist(
/*
* Initialize the args structure.
*/
+ memset(&targs, 0, sizeof(targs));
targs.tp = tp;
targs.mp = mp;
targs.agbp = agbp;
@@ -2090,6 +2145,63 @@ xfs_alloc_put_freelist(
return 0;
}
+static void
+xfs_agf_verify(
+ struct xfs_buf *bp)
+ {
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_agf *agf;
+ int agf_ok;
+
+ agf = XFS_BUF_TO_AGF(bp);
+
+ agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
+ XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+ be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
+ be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
+ be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
+ be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp);
+
+ /*
+ * during growfs operations, the perag is not fully initialised,
+ * so we can't use it for any useful checking. growfs ensures we can't
+ * use it by using uncached buffers that don't have the perag attached
+ * so we can detect and avoid this problem.
+ */
+ if (bp->b_pag)
+ agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) ==
+ bp->b_pag->pag_agno;
+
+ if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+ agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
+ be32_to_cpu(agf->agf_length);
+
+ if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
+ XFS_RANDOM_ALLOC_READ_AGF))) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+}
+
+static void
+xfs_agf_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_agf_verify(bp);
+}
+
+static void
+xfs_agf_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_agf_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_agf_buf_ops = {
+ .verify_read = xfs_agf_read_verify,
+ .verify_write = xfs_agf_write_verify,
+};
+
/*
* Read in the allocation group header (free/alloc section).
*/
@@ -2101,44 +2213,19 @@ xfs_read_agf(
int flags, /* XFS_BUF_ */
struct xfs_buf **bpp) /* buffer for the ag freelist header */
{
- struct xfs_agf *agf; /* ag freelist header */
- int agf_ok; /* set if agf is consistent */
int error;
ASSERT(agno != NULLAGNUMBER);
error = xfs_trans_read_buf(
mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), flags, bpp);
+ XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
if (error)
return error;
if (!*bpp)
return 0;
ASSERT(!(*bpp)->b_error);
- agf = XFS_BUF_TO_AGF(*bpp);
-
- /*
- * Validate the magic number of the agf block.
- */
- agf_ok =
- agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
- XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
- be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
- be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
- be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
- be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) &&
- be32_to_cpu(agf->agf_seqno) == agno;
- if (xfs_sb_version_haslazysbcount(&mp->m_sb))
- agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
- be32_to_cpu(agf->agf_length);
- if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
- XFS_RANDOM_ALLOC_READ_AGF))) {
- XFS_CORRUPTION_ERROR("xfs_alloc_read_agf",
- XFS_ERRLEVEL_LOW, mp, agf);
- xfs_trans_brelse(tp, *bpp);
- return XFS_ERROR(EFSCORRUPTED);
- }
xfs_buf_set_ref(*bpp, XFS_AGF_REF);
return 0;
}
@@ -2207,7 +2294,7 @@ xfs_alloc_read_agf(
* group or loop over the allocation groups to find the result.
*/
int /* error */
-__xfs_alloc_vextent(
+xfs_alloc_vextent(
xfs_alloc_arg_t *args) /* allocation argument structure */
{
xfs_agblock_t agsize; /* allocation group size */
@@ -2417,46 +2504,6 @@ error0:
return error;
}
-static void
-xfs_alloc_vextent_worker(
- struct work_struct *work)
-{
- struct xfs_alloc_arg *args = container_of(work,
- struct xfs_alloc_arg, work);
- unsigned long pflags;
-
- /* we are in a transaction context here */
- current_set_flags_nested(&pflags, PF_FSTRANS);
-
- args->result = __xfs_alloc_vextent(args);
- complete(args->done);
-
- current_restore_flags_nested(&pflags, PF_FSTRANS);
-}
-
-/*
- * Data allocation requests often come in with little stack to work on. Push
- * them off to a worker thread so there is lots of stack to use. Metadata
- * requests, OTOH, are generally from low stack usage paths, so avoid the
- * context switch overhead here.
- */
-int
-xfs_alloc_vextent(
- struct xfs_alloc_arg *args)
-{
- DECLARE_COMPLETION_ONSTACK(done);
-
- if (!args->userdata)
- return __xfs_alloc_vextent(args);
-
-
- args->done = &done;
- INIT_WORK_ONSTACK(&args->work, xfs_alloc_vextent_worker);
- queue_work(xfs_alloc_wq, &args->work);
- wait_for_completion(&done);
- return args->result;
-}
-
/*
* Free an extent.
* Just break up the extent address and hand off to xfs_free_ag_extent
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 93be4a667ca..99d0a610155 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -120,9 +120,6 @@ typedef struct xfs_alloc_arg {
char isfl; /* set if is freelist blocks - !acctg */
char userdata; /* set if this is user data */
xfs_fsblock_t firstblock; /* io first block allocated */
- struct completion *done;
- struct work_struct work;
- int result;
} xfs_alloc_arg_t;
/*
@@ -234,4 +231,7 @@ xfs_alloc_get_rec(
xfs_extlen_t *len, /* output: length of extent */
int *stat); /* output: success/failure */
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+extern const struct xfs_buf_ops xfs_agfl_buf_ops;
+
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index f1647caace8..b1ddef6b268 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -121,6 +121,8 @@ xfs_allocbt_free_block(
xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
XFS_EXTENT_BUSY_SKIP_DISCARD);
xfs_trans_agbtree_delta(cur->bc_tp, -1);
+
+ xfs_trans_binval(cur->bc_tp, bp);
return 0;
}
@@ -270,6 +272,82 @@ xfs_allocbt_key_diff(
return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
}
+static void
+xfs_allocbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_perag *pag = bp->b_pag;
+ unsigned int level;
+ int sblock_ok; /* block passes checks */
+
+ /*
+ * magic number and level verification
+ *
+ * During growfs operations, we can't verify the exact level as the
+ * perag is not fully initialised and hence not attached to the buffer.
+ * In this case, check against the maximum tree depth.
+ */
+ level = be16_to_cpu(block->bb_level);
+ switch (block->bb_magic) {
+ case cpu_to_be32(XFS_ABTB_MAGIC):
+ if (pag)
+ sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi];
+ else
+ sblock_ok = level < mp->m_ag_maxlevels;
+ break;
+ case cpu_to_be32(XFS_ABTC_MAGIC):
+ if (pag)
+ sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi];
+ else
+ sblock_ok = level < mp->m_ag_maxlevels;
+ break;
+ default:
+ sblock_ok = 0;
+ break;
+ }
+
+ /* numrecs verification */
+ sblock_ok = sblock_ok &&
+ be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0];
+
+ /* sibling pointer verification */
+ sblock_ok = sblock_ok &&
+ (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
+ be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
+ block->bb_u.s.bb_leftsib &&
+ (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
+ be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
+ block->bb_u.s.bb_rightsib;
+
+ if (!sblock_ok) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+}
+
+static void
+xfs_allocbt_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_allocbt_verify(bp);
+}
+
+static void
+xfs_allocbt_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_allocbt_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_allocbt_buf_ops = {
+ .verify_read = xfs_allocbt_read_verify,
+ .verify_write = xfs_allocbt_write_verify,
+};
+
+
#ifdef DEBUG
STATIC int
xfs_allocbt_keys_inorder(
@@ -325,6 +403,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
.key_diff = xfs_allocbt_key_diff,
+ .buf_ops = &xfs_allocbt_buf_ops,
#ifdef DEBUG
.keys_inorder = xfs_allocbt_keys_inorder,
.recs_inorder = xfs_allocbt_recs_inorder,
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
index a6caa0022c9..7e89a2b429d 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -51,20 +51,6 @@ typedef struct xfs_alloc_rec_incore {
typedef __be32 xfs_alloc_ptr_t;
/*
- * Minimum and maximum blocksize and sectorsize.
- * The blocksize upper limit is pretty much arbitrary.
- * The sectorsize upper limit is due to sizeof(sb_sectsize).
- */
-#define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */
-#define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */
-#define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG)
-#define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG)
-#define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */
-#define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */
-#define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG)
-#define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG)
-
-/*
* Block numbers in the AG:
* SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3.
*/
@@ -107,4 +93,6 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
xfs_agnumber_t, xfs_btnum_t);
extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
+extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+
#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 8dad722c004..5f707e53717 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -86,11 +86,11 @@ xfs_destroy_ioend(
}
if (ioend->io_iocb) {
+ inode_dio_done(ioend->io_inode);
if (ioend->io_isasync) {
aio_complete(ioend->io_iocb, ioend->io_error ?
ioend->io_error : ioend->io_result, 0);
}
- inode_dio_done(ioend->io_inode);
}
mempool_free(ioend, xfs_ioend_pool);
@@ -124,6 +124,12 @@ xfs_setfilesize_trans_alloc(
ioend->io_append_trans = tp;
/*
+ * We may pass freeze protection with a transaction. So tell lockdep
+ * we released it.
+ */
+ rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+ 1, _THIS_IP_);
+ /*
* We hand off the transaction to the completion thread now, so
* clear the flag here.
*/
@@ -143,11 +149,13 @@ xfs_setfilesize(
xfs_fsize_t isize;
/*
- * The transaction was allocated in the I/O submission thread,
- * thus we need to mark ourselves as beeing in a transaction
- * manually.
+ * The transaction may have been allocated in the I/O submission thread,
+ * thus we need to mark ourselves as being in a transaction manually.
+ * Similarly for freeze protection.
*/
current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+ 0, 1, _THIS_IP_);
xfs_ilock(ip, XFS_ILOCK_EXCL);
isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
@@ -179,9 +187,10 @@ xfs_finish_ioend(
if (atomic_dec_and_test(&ioend->io_remaining)) {
struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
- if (ioend->io_type == IO_UNWRITTEN)
+ if (ioend->io_type == XFS_IO_UNWRITTEN)
queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
- else if (ioend->io_append_trans)
+ else if (ioend->io_append_trans ||
+ (ioend->io_isdirect && xfs_ioend_is_append(ioend)))
queue_work(mp->m_data_workqueue, &ioend->io_work);
else
xfs_destroy_ioend(ioend);
@@ -210,36 +219,32 @@ xfs_end_io(
* For unwritten extents we need to issue transactions to convert a
* range to normal written extens after the data I/O has finished.
*/
- if (ioend->io_type == IO_UNWRITTEN) {
+ if (ioend->io_type == XFS_IO_UNWRITTEN) {
+ error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
+ ioend->io_size);
+ } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
/*
- * For buffered I/O we never preallocate a transaction when
- * doing the unwritten extent conversion, but for direct I/O
- * we do not know if we are converting an unwritten extent
- * or not at the point where we preallocate the transaction.
+ * For direct I/O we do not know if we need to allocate blocks
+ * or not so we can't preallocate an append transaction as that
+ * results in nested reservations and log space deadlocks. Hence
+ * allocate the transaction here. While this is sub-optimal and
+ * can block IO completion for some time, we're stuck with doing
+ * it this way until we can pass the ioend to the direct IO
+ * allocation callbacks and avoid nesting that way.
*/
- if (ioend->io_append_trans) {
- ASSERT(ioend->io_isdirect);
-
- current_set_flags_nested(
- &ioend->io_append_trans->t_pflags, PF_FSTRANS);
- xfs_trans_cancel(ioend->io_append_trans, 0);
- }
-
- error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
- ioend->io_size);
- if (error) {
- ioend->io_error = -error;
+ error = xfs_setfilesize_trans_alloc(ioend);
+ if (error)
goto done;
- }
+ error = xfs_setfilesize(ioend);
} else if (ioend->io_append_trans) {
error = xfs_setfilesize(ioend);
- if (error)
- ioend->io_error = -error;
} else {
ASSERT(!xfs_ioend_is_append(ioend));
}
done:
+ if (error)
+ ioend->io_error = -error;
xfs_destroy_ioend(ioend);
}
@@ -312,7 +317,7 @@ xfs_map_blocks(
if (XFS_FORCED_SHUTDOWN(mp))
return -XFS_ERROR(EIO);
- if (type == IO_UNWRITTEN)
+ if (type == XFS_IO_UNWRITTEN)
bmapi_flags |= XFS_BMAPI_IGSTATE;
if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
@@ -323,10 +328,10 @@ xfs_map_blocks(
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
(ip->i_df.if_flags & XFS_IFEXTENTS));
- ASSERT(offset <= mp->m_maxioffset);
+ ASSERT(offset <= mp->m_super->s_maxbytes);
- if (offset + count > mp->m_maxioffset)
- count = mp->m_maxioffset - offset;
+ if (offset + count > mp->m_super->s_maxbytes)
+ count = mp->m_super->s_maxbytes - offset;
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
@@ -336,7 +341,7 @@ xfs_map_blocks(
if (error)
return -XFS_ERROR(error);
- if (type == IO_DELALLOC &&
+ if (type == XFS_IO_DELALLOC &&
(!nimaps || isnullstartblock(imap->br_startblock))) {
error = xfs_iomap_write_allocate(ip, offset, count, imap);
if (!error)
@@ -345,7 +350,7 @@ xfs_map_blocks(
}
#ifdef DEBUG
- if (type == IO_UNWRITTEN) {
+ if (type == XFS_IO_UNWRITTEN) {
ASSERT(nimaps);
ASSERT(imap->br_startblock != HOLESTARTBLOCK);
ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
@@ -466,11 +471,17 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
*
* The fix is two passes across the ioend list - one to start writeback on the
* buffer_heads, and then submit them for I/O on the second pass.
+ *
+ * If @fail is non-zero, it means that we have a situation where some part of
+ * the submission process has failed after we have marked pages for writeback
+ * and unlocked them. In this situation, we need to fail the ioend chain rather
+ * than submit it to IO. This typically only happens on a filesystem shutdown.
*/
STATIC void
xfs_submit_ioend(
struct writeback_control *wbc,
- xfs_ioend_t *ioend)
+ xfs_ioend_t *ioend,
+ int fail)
{
xfs_ioend_t *head = ioend;
xfs_ioend_t *next;
@@ -491,6 +502,18 @@ xfs_submit_ioend(
next = ioend->io_list;
bio = NULL;
+ /*
+ * If we are failing the IO now, just mark the ioend with an
+ * error and finish it. This will run IO completion immediately
+ * as there is only one reference to the ioend at this point in
+ * time.
+ */
+ if (fail) {
+ ioend->io_error = -fail;
+ xfs_finish_ioend(ioend);
+ continue;
+ }
+
for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
if (!bio) {
@@ -634,11 +657,11 @@ xfs_check_page_type(
bh = head = page_buffers(page);
do {
if (buffer_unwritten(bh))
- acceptable += (type == IO_UNWRITTEN);
+ acceptable += (type == XFS_IO_UNWRITTEN);
else if (buffer_delay(bh))
- acceptable += (type == IO_DELALLOC);
+ acceptable += (type == XFS_IO_DELALLOC);
else if (buffer_dirty(bh) && buffer_mapped(bh))
- acceptable += (type == IO_OVERWRITE);
+ acceptable += (type == XFS_IO_OVERWRITE);
else
break;
} while ((bh = bh->b_this_page) != head);
@@ -721,11 +744,11 @@ xfs_convert_page(
if (buffer_unwritten(bh) || buffer_delay(bh) ||
buffer_mapped(bh)) {
if (buffer_unwritten(bh))
- type = IO_UNWRITTEN;
+ type = XFS_IO_UNWRITTEN;
else if (buffer_delay(bh))
- type = IO_DELALLOC;
+ type = XFS_IO_DELALLOC;
else
- type = IO_OVERWRITE;
+ type = XFS_IO_OVERWRITE;
if (!xfs_imap_valid(inode, imap, offset)) {
done = 1;
@@ -733,7 +756,7 @@ xfs_convert_page(
}
lock_buffer(bh);
- if (type != IO_OVERWRITE)
+ if (type != XFS_IO_OVERWRITE)
xfs_map_at_offset(inode, bh, imap, offset);
xfs_add_to_ioend(inode, bh, offset, type,
ioendp, done);
@@ -831,7 +854,7 @@ xfs_aops_discard_page(
struct buffer_head *bh, *head;
loff_t offset = page_offset(page);
- if (!xfs_check_page_type(page, IO_DELALLOC))
+ if (!xfs_check_page_type(page, XFS_IO_DELALLOC))
goto out_invalidate;
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -927,11 +950,26 @@ xfs_vm_writepage(
end_index = offset >> PAGE_CACHE_SHIFT;
last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
if (page->index >= end_index) {
- if ((page->index >= end_index + 1) ||
- !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
+ unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);
+
+ /*
+ * Just skip the page if it is fully outside i_size, e.g. due
+ * to a truncate operation that is in progress.
+ */
+ if (page->index >= end_index + 1 || offset_into_page == 0) {
unlock_page(page);
return 0;
}
+
+ /*
+ * The page straddles i_size. It must be zeroed out on each
+ * and every writepage invocation because it may be mmapped.
+ * "A file is mapped in multiples of the page size. For a file
+ * that is not a multiple of the page size, the remaining
+ * memory is zeroed when mapped, and writes to that region are
+ * not written out to the file."
+ */
+ zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE);
}
end_offset = min_t(unsigned long long,
@@ -941,7 +979,7 @@ xfs_vm_writepage(
bh = head = page_buffers(page);
offset = page_offset(page);
- type = IO_OVERWRITE;
+ type = XFS_IO_OVERWRITE;
if (wbc->sync_mode == WB_SYNC_NONE)
nonblocking = 1;
@@ -966,18 +1004,18 @@ xfs_vm_writepage(
}
if (buffer_unwritten(bh)) {
- if (type != IO_UNWRITTEN) {
- type = IO_UNWRITTEN;
+ if (type != XFS_IO_UNWRITTEN) {
+ type = XFS_IO_UNWRITTEN;
imap_valid = 0;
}
} else if (buffer_delay(bh)) {
- if (type != IO_DELALLOC) {
- type = IO_DELALLOC;
+ if (type != XFS_IO_DELALLOC) {
+ type = XFS_IO_DELALLOC;
imap_valid = 0;
}
} else if (buffer_uptodate(bh)) {
- if (type != IO_OVERWRITE) {
- type = IO_OVERWRITE;
+ if (type != XFS_IO_OVERWRITE) {
+ type = XFS_IO_OVERWRITE;
imap_valid = 0;
}
} else {
@@ -1013,7 +1051,7 @@ xfs_vm_writepage(
}
if (imap_valid) {
lock_buffer(bh);
- if (type != IO_OVERWRITE)
+ if (type != XFS_IO_OVERWRITE)
xfs_map_at_offset(inode, bh, &imap, offset);
xfs_add_to_ioend(inode, bh, offset, type, &ioend,
new_ioend);
@@ -1030,7 +1068,18 @@ xfs_vm_writepage(
xfs_start_page_writeback(page, 1, count);
- if (ioend && imap_valid) {
+ /* if there is no IO to be submitted for this page, we are done */
+ if (!ioend)
+ return 0;
+
+ ASSERT(iohead);
+
+ /*
+ * Any errors from this point onwards need to be reported through the IO
+ * completion path as we have marked the initial page as under writeback
+ * and unlocked it.
+ */
+ if (imap_valid) {
xfs_off_t end_index;
end_index = imap.br_startoff + imap.br_blockcount;
@@ -1049,20 +1098,15 @@ xfs_vm_writepage(
wbc, end_index);
}
- if (iohead) {
- /*
- * Reserve log space if we might write beyond the on-disk
- * inode size.
- */
- if (ioend->io_type != IO_UNWRITTEN &&
- xfs_ioend_is_append(ioend)) {
- err = xfs_setfilesize_trans_alloc(ioend);
- if (err)
- goto error;
- }
- xfs_submit_ioend(wbc, iohead);
- }
+ /*
+ * Reserve log space if we might write beyond the on-disk inode size.
+ */
+ err = 0;
+ if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
+ err = xfs_setfilesize_trans_alloc(ioend);
+
+ xfs_submit_ioend(wbc, iohead, err);
return 0;
@@ -1162,9 +1206,9 @@ __xfs_get_blocks(
lockmode = xfs_ilock_map_shared(ip);
}
- ASSERT(offset <= mp->m_maxioffset);
- if (offset + size > mp->m_maxioffset)
- size = mp->m_maxioffset - offset;
+ ASSERT(offset <= mp->m_super->s_maxbytes);
+ if (offset + size > mp->m_super->s_maxbytes)
+ size = mp->m_super->s_maxbytes - offset;
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -1351,7 +1395,7 @@ xfs_end_io_direct_write(
ioend->io_iocb = iocb;
ioend->io_result = ret;
if (private && size > 0)
- ioend->io_type = IO_UNWRITTEN;
+ ioend->io_type = XFS_IO_UNWRITTEN;
if (is_async) {
ioend->io_isasync = 1;
@@ -1378,25 +1422,21 @@ xfs_vm_direct_IO(
size_t size = iov_length(iov, nr_segs);
/*
- * We need to preallocate a transaction for a size update
- * here. In the case that this write both updates the size
- * and converts at least on unwritten extent we will cancel
- * the still clean transaction after the I/O has finished.
+ * We cannot preallocate a size update transaction here as we
+ * don't know whether allocation is necessary or not. Hence we
+ * can only tell IO completion that one is necessary if we are
+ * not doing unwritten extent conversion.
*/
- iocb->private = ioend = xfs_alloc_ioend(inode, IO_DIRECT);
- if (offset + size > XFS_I(inode)->i_d.di_size) {
- ret = xfs_setfilesize_trans_alloc(ioend);
- if (ret)
- goto out_destroy_ioend;
+ iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
+ if (offset + size > XFS_I(inode)->i_d.di_size)
ioend->io_isdirect = 1;
- }
ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
offset, nr_segs,
xfs_get_blocks_direct,
xfs_end_io_direct_write, NULL, 0);
if (ret != -EIOCBQUEUED && iocb->private)
- goto out_trans_cancel;
+ goto out_destroy_ioend;
} else {
ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
offset, nr_segs,
@@ -1406,12 +1446,6 @@ xfs_vm_direct_IO(
return ret;
-out_trans_cancel:
- if (ioend->io_append_trans) {
- current_set_flags_nested(&ioend->io_append_trans->t_pflags,
- PF_FSTRANS);
- xfs_trans_cancel(ioend->io_append_trans, 0);
- }
out_destroy_ioend:
xfs_destroy_ioend(ioend);
return ret;
@@ -1584,7 +1618,7 @@ xfs_vm_bmap(
trace_xfs_vm_bmap(XFS_I(inode));
xfs_ilock(ip, XFS_IOLOCK_SHARED);
- xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
+ filemap_write_and_wait(mapping);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return generic_block_bmap(mapping, block, xfs_get_blocks);
}
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 84eafbcb0d9..c325abb8d61 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -24,17 +24,17 @@ extern mempool_t *xfs_ioend_pool;
* Types of I/O for bmap clustering and I/O completion tracking.
*/
enum {
- IO_DIRECT = 0, /* special case for direct I/O ioends */
- IO_DELALLOC, /* mapping covers delalloc region */
- IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
- IO_OVERWRITE, /* mapping covers already allocated extent */
+ XFS_IO_DIRECT = 0, /* special case for direct I/O ioends */
+ XFS_IO_DELALLOC, /* covers delalloc region */
+ XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */
+ XFS_IO_OVERWRITE, /* covers already allocated extent */
};
#define XFS_IO_TYPES \
{ 0, "" }, \
- { IO_DELALLOC, "delalloc" }, \
- { IO_UNWRITTEN, "unwritten" }, \
- { IO_OVERWRITE, "overwrite" }
+ { XFS_IO_DELALLOC, "delalloc" }, \
+ { XFS_IO_UNWRITTEN, "unwritten" }, \
+ { XFS_IO_OVERWRITE, "overwrite" }
/*
* xfs_ioend struct manages large extent writes for XFS.
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index a17ff01b5ad..aaf472532b3 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -893,7 +893,7 @@ STATIC int
xfs_attr_leaf_addname(xfs_da_args_t *args)
{
xfs_inode_t *dp;
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
int retval, error, committed, forkoff;
trace_xfs_attr_leaf_addname(args);
@@ -903,11 +903,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
*/
dp = args->dp;
args->blkno = 0;
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
- XFS_ATTR_FORK);
+ error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
if (error)
- return(error);
- ASSERT(bp != NULL);
+ return error;
/*
* Look up the given attribute in the leaf block. Figure out if
@@ -915,11 +913,11 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
*/
retval = xfs_attr_leaf_lookup_int(bp, args);
if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
- xfs_da_brelse(args->trans, bp);
+ xfs_trans_brelse(args->trans, bp);
return(retval);
} else if (retval == EEXIST) {
if (args->flags & ATTR_CREATE) { /* pure create op */
- xfs_da_brelse(args->trans, bp);
+ xfs_trans_brelse(args->trans, bp);
return(retval);
}
@@ -937,7 +935,6 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* if required.
*/
retval = xfs_attr_leaf_add(bp, args);
- xfs_da_buf_done(bp);
if (retval == ENOSPC) {
/*
* Promote the attribute list to the Btree format, then
@@ -1032,12 +1029,12 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* Read in the block containing the "old" attr, then
* remove the "old" attr from that block (neat, huh!)
*/
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1,
- &bp, XFS_ATTR_FORK);
+ error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno,
+ -1, &bp);
if (error)
- return(error);
- ASSERT(bp != NULL);
- (void)xfs_attr_leaf_remove(bp, args);
+ return error;
+
+ xfs_attr_leaf_remove(bp, args);
/*
* If the result is small enough, shrink it all into the inode.
@@ -1065,8 +1062,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
*/
if (committed)
xfs_trans_ijoin(args->trans, dp, 0);
- } else
- xfs_da_buf_done(bp);
+ }
/*
* Commit the remove and start the next trans in series.
@@ -1092,7 +1088,7 @@ STATIC int
xfs_attr_leaf_removename(xfs_da_args_t *args)
{
xfs_inode_t *dp;
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
int error, committed, forkoff;
trace_xfs_attr_leaf_removename(args);
@@ -1102,20 +1098,17 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
*/
dp = args->dp;
args->blkno = 0;
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
- XFS_ATTR_FORK);
- if (error) {
- return(error);
- }
+ error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ if (error)
+ return error;
- ASSERT(bp != NULL);
error = xfs_attr_leaf_lookup_int(bp, args);
if (error == ENOATTR) {
- xfs_da_brelse(args->trans, bp);
+ xfs_trans_brelse(args->trans, bp);
return(error);
}
- (void)xfs_attr_leaf_remove(bp, args);
+ xfs_attr_leaf_remove(bp, args);
/*
* If the result is small enough, shrink it all into the inode.
@@ -1141,8 +1134,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
*/
if (committed)
xfs_trans_ijoin(args->trans, dp, 0);
- } else
- xfs_da_buf_done(bp);
+ }
return(0);
}
@@ -1155,23 +1147,23 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
STATIC int
xfs_attr_leaf_get(xfs_da_args_t *args)
{
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
int error;
+ trace_xfs_attr_leaf_get(args);
+
args->blkno = 0;
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
- XFS_ATTR_FORK);
+ error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
if (error)
- return(error);
- ASSERT(bp != NULL);
+ return error;
error = xfs_attr_leaf_lookup_int(bp, args);
if (error != EEXIST) {
- xfs_da_brelse(args->trans, bp);
+ xfs_trans_brelse(args->trans, bp);
return(error);
}
error = xfs_attr_leaf_getvalue(bp, args);
- xfs_da_brelse(args->trans, bp);
+ xfs_trans_brelse(args->trans, bp);
if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
error = xfs_attr_rmtval_get(args);
}
@@ -1184,25 +1176,18 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
STATIC int
xfs_attr_leaf_list(xfs_attr_list_context_t *context)
{
- xfs_attr_leafblock_t *leaf;
int error;
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
+
+ trace_xfs_attr_leaf_list(context);
context->cursor->blkno = 0;
- error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK);
+ error = xfs_attr_leaf_read(NULL, context->dp, 0, -1, &bp);
if (error)
return XFS_ERROR(error);
- ASSERT(bp != NULL);
- leaf = bp->data;
- if (unlikely(leaf->hdr.info.magic != cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) {
- XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW,
- context->dp->i_mount, leaf);
- xfs_da_brelse(NULL, bp);
- return XFS_ERROR(EFSCORRUPTED);
- }
error = xfs_attr_leaf_list_int(bp, context);
- xfs_da_brelse(NULL, bp);
+ xfs_trans_brelse(NULL, bp);
return XFS_ERROR(error);
}
@@ -1489,7 +1474,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
xfs_da_state_t *state;
xfs_da_state_blk_t *blk;
xfs_inode_t *dp;
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
int retval, error, committed, forkoff;
trace_xfs_attr_node_removename(args);
@@ -1601,15 +1586,11 @@ xfs_attr_node_removename(xfs_da_args_t *args)
*/
ASSERT(state->path.active == 1);
ASSERT(state->path.blk[0].bp);
- xfs_da_buf_done(state->path.blk[0].bp);
state->path.blk[0].bp = NULL;
- error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
- XFS_ATTR_FORK);
+ error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp);
if (error)
goto out;
- ASSERT((((xfs_attr_leafblock_t *)bp->data)->hdr.info.magic) ==
- cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
xfs_bmap_init(args->flist, args->firstblock);
@@ -1635,7 +1616,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
if (committed)
xfs_trans_ijoin(args->trans, dp, 0);
} else
- xfs_da_brelse(args->trans, bp);
+ xfs_trans_brelse(args->trans, bp);
}
error = 0;
@@ -1657,6 +1638,8 @@ xfs_attr_fillstate(xfs_da_state_t *state)
xfs_da_state_blk_t *blk;
int level;
+ trace_xfs_attr_fillstate(state->args);
+
/*
* Roll down the "path" in the state structure, storing the on-disk
* block number for those buffers in the "path".
@@ -1665,8 +1648,7 @@ xfs_attr_fillstate(xfs_da_state_t *state)
ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
if (blk->bp) {
- blk->disk_blkno = xfs_da_blkno(blk->bp);
- xfs_da_buf_done(blk->bp);
+ blk->disk_blkno = XFS_BUF_ADDR(blk->bp);
blk->bp = NULL;
} else {
blk->disk_blkno = 0;
@@ -1681,8 +1663,7 @@ xfs_attr_fillstate(xfs_da_state_t *state)
ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
if (blk->bp) {
- blk->disk_blkno = xfs_da_blkno(blk->bp);
- xfs_da_buf_done(blk->bp);
+ blk->disk_blkno = XFS_BUF_ADDR(blk->bp);
blk->bp = NULL;
} else {
blk->disk_blkno = 0;
@@ -1705,6 +1686,8 @@ xfs_attr_refillstate(xfs_da_state_t *state)
xfs_da_state_blk_t *blk;
int level, error;
+ trace_xfs_attr_refillstate(state->args);
+
/*
* Roll down the "path" in the state structure, storing the on-disk
* block number for those buffers in the "path".
@@ -1713,7 +1696,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
if (blk->disk_blkno) {
- error = xfs_da_read_buf(state->args->trans,
+ error = xfs_da_node_read(state->args->trans,
state->args->dp,
blk->blkno, blk->disk_blkno,
&blk->bp, XFS_ATTR_FORK);
@@ -1732,7 +1715,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
if (blk->disk_blkno) {
- error = xfs_da_read_buf(state->args->trans,
+ error = xfs_da_node_read(state->args->trans,
state->args->dp,
blk->blkno, blk->disk_blkno,
&blk->bp, XFS_ATTR_FORK);
@@ -1761,6 +1744,8 @@ xfs_attr_node_get(xfs_da_args_t *args)
int error, retval;
int i;
+ trace_xfs_attr_node_get(args);
+
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
@@ -1792,7 +1777,7 @@ xfs_attr_node_get(xfs_da_args_t *args)
* If not in a transaction, we have to release all the buffers.
*/
for (i = 0; i < state->path.active; i++) {
- xfs_da_brelse(args->trans, state->path.blk[i].bp);
+ xfs_trans_brelse(args->trans, state->path.blk[i].bp);
state->path.blk[i].bp = NULL;
}
@@ -1808,7 +1793,9 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
xfs_da_intnode_t *node;
xfs_da_node_entry_t *btree;
int error, i;
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
+
+ trace_xfs_attr_node_list(context);
cursor = context->cursor;
cursor->initted = 1;
@@ -1820,35 +1807,35 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
*/
bp = NULL;
if (cursor->blkno > 0) {
- error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
+ error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1,
&bp, XFS_ATTR_FORK);
if ((error != 0) && (error != EFSCORRUPTED))
return(error);
if (bp) {
- node = bp->data;
+ node = bp->b_addr;
switch (be16_to_cpu(node->hdr.info.magic)) {
case XFS_DA_NODE_MAGIC:
trace_xfs_attr_list_wrong_blk(context);
- xfs_da_brelse(NULL, bp);
+ xfs_trans_brelse(NULL, bp);
bp = NULL;
break;
case XFS_ATTR_LEAF_MAGIC:
- leaf = bp->data;
+ leaf = bp->b_addr;
if (cursor->hashval > be32_to_cpu(leaf->entries[
be16_to_cpu(leaf->hdr.count)-1].hashval)) {
trace_xfs_attr_list_wrong_blk(context);
- xfs_da_brelse(NULL, bp);
+ xfs_trans_brelse(NULL, bp);
bp = NULL;
} else if (cursor->hashval <=
be32_to_cpu(leaf->entries[0].hashval)) {
trace_xfs_attr_list_wrong_blk(context);
- xfs_da_brelse(NULL, bp);
+ xfs_trans_brelse(NULL, bp);
bp = NULL;
}
break;
default:
trace_xfs_attr_list_wrong_blk(context);
- xfs_da_brelse(NULL, bp);
+ xfs_trans_brelse(NULL, bp);
bp = NULL;
}
}
@@ -1862,18 +1849,12 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
if (bp == NULL) {
cursor->blkno = 0;
for (;;) {
- error = xfs_da_read_buf(NULL, context->dp,
+ error = xfs_da_node_read(NULL, context->dp,
cursor->blkno, -1, &bp,
XFS_ATTR_FORK);
if (error)
return(error);
- if (unlikely(bp == NULL)) {
- XFS_ERROR_REPORT("xfs_attr_node_list(2)",
- XFS_ERRLEVEL_LOW,
- context->dp->i_mount);
- return(XFS_ERROR(EFSCORRUPTED));
- }
- node = bp->data;
+ node = bp->b_addr;
if (node->hdr.info.magic ==
cpu_to_be16(XFS_ATTR_LEAF_MAGIC))
break;
@@ -1883,7 +1864,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
XFS_ERRLEVEL_LOW,
context->dp->i_mount,
node);
- xfs_da_brelse(NULL, bp);
+ xfs_trans_brelse(NULL, bp);
return(XFS_ERROR(EFSCORRUPTED));
}
btree = node->btree;
@@ -1898,10 +1879,10 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
}
}
if (i == be16_to_cpu(node->hdr.count)) {
- xfs_da_brelse(NULL, bp);
+ xfs_trans_brelse(NULL, bp);
return(0);
}
- xfs_da_brelse(NULL, bp);
+ xfs_trans_brelse(NULL, bp);
}
}
ASSERT(bp != NULL);
@@ -1912,36 +1893,22 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
* adding the information.
*/
for (;;) {
- leaf = bp->data;
- if (unlikely(leaf->hdr.info.magic !=
- cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) {
- XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)",
- XFS_ERRLEVEL_LOW,
- context->dp->i_mount, leaf);
- xfs_da_brelse(NULL, bp);
- return(XFS_ERROR(EFSCORRUPTED));
- }
+ leaf = bp->b_addr;
error = xfs_attr_leaf_list_int(bp, context);
if (error) {
- xfs_da_brelse(NULL, bp);
+ xfs_trans_brelse(NULL, bp);
return error;
}
if (context->seen_enough || leaf->hdr.info.forw == 0)
break;
cursor->blkno = be32_to_cpu(leaf->hdr.info.forw);
- xfs_da_brelse(NULL, bp);
- error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
- &bp, XFS_ATTR_FORK);
+ xfs_trans_brelse(NULL, bp);
+ error = xfs_attr_leaf_read(NULL, context->dp, cursor->blkno, -1,
+ &bp);
if (error)
- return(error);
- if (unlikely((bp == NULL))) {
- XFS_ERROR_REPORT("xfs_attr_node_list(5)",
- XFS_ERRLEVEL_LOW,
- context->dp->i_mount);
- return(XFS_ERROR(EFSCORRUPTED));
- }
+ return error;
}
- xfs_da_brelse(NULL, bp);
+ xfs_trans_brelse(NULL, bp);
return(0);
}
@@ -1965,6 +1932,8 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
int nmap, error, tmp, valuelen, blkcnt, i;
xfs_dablk_t lblkno;
+ trace_xfs_attr_rmtval_get(args);
+
ASSERT(!(args->flags & ATTR_KERNOVAL));
mp = args->dp->i_mount;
@@ -1986,7 +1955,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
- dblkno, blkcnt, 0, &bp);
+ dblkno, blkcnt, 0, &bp, NULL);
if (error)
return(error);
@@ -2020,6 +1989,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
xfs_dablk_t lblkno;
int blkcnt, valuelen, nmap, error, tmp, committed;
+ trace_xfs_attr_rmtval_set(args);
+
dp = args->dp;
mp = dp->i_mount;
src = args->value;
@@ -2149,6 +2120,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
xfs_dablk_t lblkno;
int valuelen, blkcnt, nmap, error, done, committed;
+ trace_xfs_attr_rmtval_remove(args);
+
mp = args->dp->i_mount;
/*
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 7d89d800f51..ee24993c7d1 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -54,10 +54,11 @@
* Routines used for growing the Btree.
*/
STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block,
- xfs_dabuf_t **bpp);
-STATIC int xfs_attr_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args,
- int freemap_index);
-STATIC void xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer);
+ struct xfs_buf **bpp);
+STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer,
+ xfs_da_args_t *args, int freemap_index);
+STATIC void xfs_attr_leaf_compact(struct xfs_da_args *args,
+ struct xfs_buf *leaf_buffer);
STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state,
xfs_da_state_blk_t *blk1,
xfs_da_state_blk_t *blk2);
@@ -71,9 +72,9 @@ STATIC int xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
* Routines used for shrinking the Btree.
*/
STATIC int xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
- xfs_dabuf_t *bp, int level);
+ struct xfs_buf *bp, int level);
STATIC int xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
- xfs_dabuf_t *bp);
+ struct xfs_buf *bp);
STATIC int xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
xfs_dablk_t blkno, int blkcnt);
@@ -87,6 +88,52 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf,
xfs_mount_t *mp);
STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
+static void
+xfs_attr_leaf_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_attr_leaf_hdr *hdr = bp->b_addr;
+ int block_ok = 0;
+
+ block_ok = hdr->info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC);
+ if (!block_ok) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+}
+
+static void
+xfs_attr_leaf_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_attr_leaf_verify(bp);
+}
+
+static void
+xfs_attr_leaf_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_attr_leaf_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_attr_leaf_buf_ops = {
+ .verify_read = xfs_attr_leaf_read_verify,
+ .verify_write = xfs_attr_leaf_write_verify,
+};
+
+int
+xfs_attr_leaf_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp)
+{
+ return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+ XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops);
+}
+
/*========================================================================
* Namespace helper routines
*========================================================================*/
@@ -480,7 +527,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
char *tmpbuffer;
int error, i, size;
xfs_dablk_t blkno;
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
xfs_ifork_t *ifp;
trace_xfs_attr_sf_to_leaf(args);
@@ -550,8 +597,6 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
error = 0;
out:
- if(bp)
- xfs_da_buf_done(bp);
kmem_free(tmpbuffer);
return(error);
}
@@ -737,14 +782,16 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
* a shortform attribute list.
*/
int
-xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp)
+xfs_attr_shortform_allfit(
+ struct xfs_buf *bp,
+ struct xfs_inode *dp)
{
xfs_attr_leafblock_t *leaf;
xfs_attr_leaf_entry_t *entry;
xfs_attr_leaf_name_local_t *name_loc;
int bytes, i;
- leaf = bp->data;
+ leaf = bp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
entry = &leaf->entries[0];
@@ -774,7 +821,10 @@ xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp)
* Convert a leaf attribute list to shortform attribute list
*/
int
-xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
+xfs_attr_leaf_to_shortform(
+ struct xfs_buf *bp,
+ xfs_da_args_t *args,
+ int forkoff)
{
xfs_attr_leafblock_t *leaf;
xfs_attr_leaf_entry_t *entry;
@@ -791,10 +841,10 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
ASSERT(tmpbuffer != NULL);
ASSERT(bp != NULL);
- memcpy(tmpbuffer, bp->data, XFS_LBSIZE(dp->i_mount));
+ memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(dp->i_mount));
leaf = (xfs_attr_leafblock_t *)tmpbuffer;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- memset(bp->data, 0, XFS_LBSIZE(dp->i_mount));
+ memset(bp->b_addr, 0, XFS_LBSIZE(dp->i_mount));
/*
* Clean out the prior contents of the attribute list.
@@ -855,7 +905,7 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args)
xfs_attr_leafblock_t *leaf;
xfs_da_intnode_t *node;
xfs_inode_t *dp;
- xfs_dabuf_t *bp1, *bp2;
+ struct xfs_buf *bp1, *bp2;
xfs_dablk_t blkno;
int error;
@@ -866,21 +916,19 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args)
error = xfs_da_grow_inode(args, &blkno);
if (error)
goto out;
- error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1,
- XFS_ATTR_FORK);
+ error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp1);
if (error)
goto out;
- ASSERT(bp1 != NULL);
+
bp2 = NULL;
error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2,
XFS_ATTR_FORK);
if (error)
goto out;
- ASSERT(bp2 != NULL);
- memcpy(bp2->data, bp1->data, XFS_LBSIZE(dp->i_mount));
- xfs_da_buf_done(bp1);
+ bp2->b_ops = bp1->b_ops;
+ memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount));
bp1 = NULL;
- xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
+ xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
/*
* Set up the new root node.
@@ -888,21 +936,17 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args)
error = xfs_da_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
if (error)
goto out;
- node = bp1->data;
- leaf = bp2->data;
+ node = bp1->b_addr;
+ leaf = bp2->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
/* both on-disk, don't endian-flip twice */
node->btree[0].hashval =
leaf->entries[be16_to_cpu(leaf->hdr.count)-1 ].hashval;
node->btree[0].before = cpu_to_be32(blkno);
node->hdr.count = cpu_to_be16(1);
- xfs_da_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1);
+ xfs_trans_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1);
error = 0;
out:
- if (bp1)
- xfs_da_buf_done(bp1);
- if (bp2)
- xfs_da_buf_done(bp2);
return(error);
}
@@ -916,12 +960,15 @@ out:
* or a leaf in a node attribute list.
*/
STATIC int
-xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
+xfs_attr_leaf_create(
+ xfs_da_args_t *args,
+ xfs_dablk_t blkno,
+ struct xfs_buf **bpp)
{
xfs_attr_leafblock_t *leaf;
xfs_attr_leaf_hdr_t *hdr;
xfs_inode_t *dp;
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
int error;
trace_xfs_attr_leaf_create(args);
@@ -932,8 +979,8 @@ xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
XFS_ATTR_FORK);
if (error)
return(error);
- ASSERT(bp != NULL);
- leaf = bp->data;
+ bp->b_ops = &xfs_attr_leaf_buf_ops;
+ leaf = bp->b_addr;
memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
hdr = &leaf->hdr;
hdr->info.magic = cpu_to_be16(XFS_ATTR_LEAF_MAGIC);
@@ -947,7 +994,7 @@ xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
hdr->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr->firstused) -
sizeof(xfs_attr_leaf_hdr_t));
- xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1);
+ xfs_trans_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1);
*bpp = bp;
return(0);
@@ -1014,7 +1061,9 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
* Add a name to the leaf attribute list structure.
*/
int
-xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args)
+xfs_attr_leaf_add(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args)
{
xfs_attr_leafblock_t *leaf;
xfs_attr_leaf_hdr_t *hdr;
@@ -1023,7 +1072,7 @@ xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args)
trace_xfs_attr_leaf_add(args);
- leaf = bp->data;
+ leaf = bp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
ASSERT((args->index >= 0)
&& (args->index <= be16_to_cpu(leaf->hdr.count)));
@@ -1068,7 +1117,7 @@ xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args)
* Compact the entries to coalesce free space.
* This may change the hdr->count via dropping INCOMPLETE entries.
*/
- xfs_attr_leaf_compact(args->trans, bp);
+ xfs_attr_leaf_compact(args, bp);
/*
* After compaction, the block is guaranteed to have only one
@@ -1085,7 +1134,10 @@ xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args)
* Add a name to a leaf attribute list structure.
*/
STATIC int
-xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
+xfs_attr_leaf_add_work(
+ struct xfs_buf *bp,
+ xfs_da_args_t *args,
+ int mapindex)
{
xfs_attr_leafblock_t *leaf;
xfs_attr_leaf_hdr_t *hdr;
@@ -1096,7 +1148,9 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
xfs_mount_t *mp;
int tmp, i;
- leaf = bp->data;
+ trace_xfs_attr_leaf_add_work(args);
+
+ leaf = bp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
hdr = &leaf->hdr;
ASSERT((mapindex >= 0) && (mapindex < XFS_ATTR_LEAF_MAPSIZE));
@@ -1110,7 +1164,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
tmp = be16_to_cpu(hdr->count) - args->index;
tmp *= sizeof(xfs_attr_leaf_entry_t);
memmove((char *)(entry+1), (char *)entry, tmp);
- xfs_da_log_buf(args->trans, bp,
+ xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
}
be16_add_cpu(&hdr->count, 1);
@@ -1142,7 +1196,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
args->index2++;
}
}
- xfs_da_log_buf(args->trans, bp,
+ xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
ASSERT((args->index == 0) ||
(be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval)));
@@ -1174,7 +1228,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
args->rmtblkno = 1;
args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen);
}
- xfs_da_log_buf(args->trans, bp,
+ xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index),
xfs_attr_leaf_entsize(leaf, args->index)));
@@ -1198,7 +1252,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
}
}
be16_add_cpu(&hdr->usedbytes, xfs_attr_leaf_entsize(leaf, args->index));
- xfs_da_log_buf(args->trans, bp,
+ xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
return(0);
}
@@ -1207,24 +1261,28 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
* Garbage collect a leaf attribute list block by copying it to a new buffer.
*/
STATIC void
-xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp)
+xfs_attr_leaf_compact(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp)
{
- xfs_attr_leafblock_t *leaf_s, *leaf_d;
- xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
- xfs_mount_t *mp;
- char *tmpbuffer;
+ xfs_attr_leafblock_t *leaf_s, *leaf_d;
+ xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
+ struct xfs_trans *trans = args->trans;
+ struct xfs_mount *mp = trans->t_mountp;
+ char *tmpbuffer;
+
+ trace_xfs_attr_leaf_compact(args);
- mp = trans->t_mountp;
tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
ASSERT(tmpbuffer != NULL);
- memcpy(tmpbuffer, bp->data, XFS_LBSIZE(mp));
- memset(bp->data, 0, XFS_LBSIZE(mp));
+ memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp));
+ memset(bp->b_addr, 0, XFS_LBSIZE(mp));
/*
* Copy basic information
*/
leaf_s = (xfs_attr_leafblock_t *)tmpbuffer;
- leaf_d = bp->data;
+ leaf_d = bp->b_addr;
hdr_s = &leaf_s->hdr;
hdr_d = &leaf_d->hdr;
hdr_d->info = hdr_s->info; /* struct copy */
@@ -1247,7 +1305,7 @@ xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp)
*/
xfs_attr_leaf_moveents(leaf_s, 0, leaf_d, 0,
be16_to_cpu(hdr_s->count), mp);
- xfs_da_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1);
+ xfs_trans_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1);
kmem_free(tmpbuffer);
}
@@ -1279,10 +1337,11 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
*/
ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC);
ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
- leaf1 = blk1->bp->data;
- leaf2 = blk2->bp->data;
+ leaf1 = blk1->bp->b_addr;
+ leaf2 = blk2->bp->b_addr;
ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+ ASSERT(leaf2->hdr.count == 0);
args = state->args;
trace_xfs_attr_leaf_rebalance(args);
@@ -1298,8 +1357,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
tmp_blk = blk1;
blk1 = blk2;
blk2 = tmp_blk;
- leaf1 = blk1->bp->data;
- leaf2 = blk2->bp->data;
+ leaf1 = blk1->bp->b_addr;
+ leaf2 = blk2->bp->b_addr;
swap = 1;
}
hdr1 = &leaf1->hdr;
@@ -1336,9 +1395,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
max = be16_to_cpu(hdr2->firstused)
- sizeof(xfs_attr_leaf_hdr_t);
max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t);
- if (space > max) {
- xfs_attr_leaf_compact(args->trans, blk2->bp);
- }
+ if (space > max)
+ xfs_attr_leaf_compact(args, blk2->bp);
/*
* Move high entries from leaf1 to low end of leaf2.
@@ -1346,13 +1404,14 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
xfs_attr_leaf_moveents(leaf1, be16_to_cpu(hdr1->count) - count,
leaf2, 0, count, state->mp);
- xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
- xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
+ xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
+ xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
} else if (count > be16_to_cpu(hdr1->count)) {
/*
* I assert that since all callers pass in an empty
* second buffer, this code should never execute.
*/
+ ASSERT(0);
/*
* Figure the total bytes to be added to the destination leaf.
@@ -1368,9 +1427,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
max = be16_to_cpu(hdr1->firstused)
- sizeof(xfs_attr_leaf_hdr_t);
max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t);
- if (space > max) {
- xfs_attr_leaf_compact(args->trans, blk1->bp);
- }
+ if (space > max)
+ xfs_attr_leaf_compact(args, blk1->bp);
/*
* Move low entries from leaf2 to high end of leaf1.
@@ -1378,8 +1436,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
xfs_attr_leaf_moveents(leaf2, 0, leaf1,
be16_to_cpu(hdr1->count), count, state->mp);
- xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
- xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
+ xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
+ xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
}
/*
@@ -1414,10 +1472,24 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
args->index2 = 0;
args->blkno2 = blk2->blkno;
} else {
+ /*
+ * On a double leaf split, the original attr location
+ * is already stored in blkno2/index2, so don't
+ * overwrite it otherwise we corrupt the tree.
+ */
blk2->index = blk1->index
- be16_to_cpu(leaf1->hdr.count);
- args->index = args->index2 = blk2->index;
- args->blkno = args->blkno2 = blk2->blkno;
+ args->index = blk2->index;
+ args->blkno = blk2->blkno;
+ if (!state->extravalid) {
+ /*
+ * set the new attr location to match the old
+ * one and let the higher level split code
+ * decide where in the leaf to place it.
+ */
+ args->index2 = blk2->index;
+ args->blkno2 = blk2->blkno;
+ }
}
} else {
ASSERT(state->inleaf == 1);
@@ -1448,8 +1520,8 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
/*
* Set up environment.
*/
- leaf1 = blk1->bp->data;
- leaf2 = blk2->bp->data;
+ leaf1 = blk1->bp->b_addr;
+ leaf2 = blk2->bp->b_addr;
hdr1 = &leaf1->hdr;
hdr2 = &leaf2->hdr;
foundit = 0;
@@ -1551,7 +1623,9 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
xfs_da_blkinfo_t *info;
int count, bytes, forward, error, retval, i;
xfs_dablk_t blkno;
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
+
+ trace_xfs_attr_leaf_toosmall(state->args);
/*
* Check for the degenerate case of the block being over 50% full.
@@ -1559,7 +1633,7 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
* to coalesce with a sibling.
*/
blk = &state->path.blk[ state->path.active-1 ];
- info = blk->bp->data;
+ info = blk->bp->b_addr;
ASSERT(info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
leaf = (xfs_attr_leafblock_t *)info;
count = be16_to_cpu(leaf->hdr.count);
@@ -1612,23 +1686,21 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
blkno = be32_to_cpu(info->back);
if (blkno == 0)
continue;
- error = xfs_da_read_buf(state->args->trans, state->args->dp,
- blkno, -1, &bp, XFS_ATTR_FORK);
+ error = xfs_attr_leaf_read(state->args->trans, state->args->dp,
+ blkno, -1, &bp);
if (error)
return(error);
- ASSERT(bp != NULL);
leaf = (xfs_attr_leafblock_t *)info;
count = be16_to_cpu(leaf->hdr.count);
bytes = state->blocksize - (state->blocksize>>2);
bytes -= be16_to_cpu(leaf->hdr.usedbytes);
- leaf = bp->data;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+ leaf = bp->b_addr;
count += be16_to_cpu(leaf->hdr.count);
bytes -= be16_to_cpu(leaf->hdr.usedbytes);
bytes -= count * sizeof(xfs_attr_leaf_entry_t);
bytes -= sizeof(xfs_attr_leaf_hdr_t);
- xfs_da_brelse(state->args->trans, bp);
+ xfs_trans_brelse(state->args->trans, bp);
if (bytes >= 0)
break; /* fits with at least 25% to spare */
}
@@ -1666,7 +1738,9 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
* If two leaves are 37% full, when combined they will leave 25% free.
*/
int
-xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
+xfs_attr_leaf_remove(
+ struct xfs_buf *bp,
+ xfs_da_args_t *args)
{
xfs_attr_leafblock_t *leaf;
xfs_attr_leaf_hdr_t *hdr;
@@ -1676,7 +1750,9 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
int tablesize, tmp, i;
xfs_mount_t *mp;
- leaf = bp->data;
+ trace_xfs_attr_leaf_remove(args);
+
+ leaf = bp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
hdr = &leaf->hdr;
mp = args->trans->t_mountp;
@@ -1769,7 +1845,7 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
*/
memset(xfs_attr_leaf_name(leaf, args->index), 0, entsize);
be16_add_cpu(&hdr->usedbytes, -entsize);
- xfs_da_log_buf(args->trans, bp,
+ xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index),
entsize));
@@ -1777,7 +1853,7 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
* sizeof(xfs_attr_leaf_entry_t);
memmove((char *)entry, (char *)(entry+1), tmp);
be16_add_cpu(&hdr->count, -1);
- xfs_da_log_buf(args->trans, bp,
+ xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
entry = &leaf->entries[be16_to_cpu(hdr->count)];
memset((char *)entry, 0, sizeof(xfs_attr_leaf_entry_t));
@@ -1807,7 +1883,7 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
} else {
hdr->holes = 1; /* mark as needing compaction */
}
- xfs_da_log_buf(args->trans, bp,
+ xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
/*
@@ -1840,8 +1916,8 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
mp = state->mp;
ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC);
ASSERT(save_blk->magic == XFS_ATTR_LEAF_MAGIC);
- drop_leaf = drop_blk->bp->data;
- save_leaf = save_blk->bp->data;
+ drop_leaf = drop_blk->bp->b_addr;
+ save_leaf = save_blk->bp->b_addr;
ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
drop_hdr = &drop_leaf->hdr;
@@ -1906,7 +1982,7 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
kmem_free(tmpbuffer);
}
- xfs_da_log_buf(state->args->trans, save_blk->bp, 0,
+ xfs_trans_log_buf(state->args->trans, save_blk->bp, 0,
state->blocksize - 1);
/*
@@ -1934,7 +2010,9 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
* Don't change the args->value unless we find the attribute.
*/
int
-xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
+xfs_attr_leaf_lookup_int(
+ struct xfs_buf *bp,
+ xfs_da_args_t *args)
{
xfs_attr_leafblock_t *leaf;
xfs_attr_leaf_entry_t *entry;
@@ -1945,7 +2023,7 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
trace_xfs_attr_leaf_lookup(args);
- leaf = bp->data;
+ leaf = bp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
ASSERT(be16_to_cpu(leaf->hdr.count)
< (XFS_LBSIZE(args->dp->i_mount)/8));
@@ -2041,7 +2119,9 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
* list structure.
*/
int
-xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args)
+xfs_attr_leaf_getvalue(
+ struct xfs_buf *bp,
+ xfs_da_args_t *args)
{
int valuelen;
xfs_attr_leafblock_t *leaf;
@@ -2049,7 +2129,7 @@ xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args)
xfs_attr_leaf_name_local_t *name_loc;
xfs_attr_leaf_name_remote_t *name_rmt;
- leaf = bp->data;
+ leaf = bp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
ASSERT(be16_to_cpu(leaf->hdr.count)
< (XFS_LBSIZE(args->dp->i_mount)/8));
@@ -2247,12 +2327,14 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
* Return 0 unless leaf2 should go before leaf1.
*/
int
-xfs_attr_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp)
+xfs_attr_leaf_order(
+ struct xfs_buf *leaf1_bp,
+ struct xfs_buf *leaf2_bp)
{
xfs_attr_leafblock_t *leaf1, *leaf2;
- leaf1 = leaf1_bp->data;
- leaf2 = leaf2_bp->data;
+ leaf1 = leaf1_bp->b_addr;
+ leaf2 = leaf2_bp->b_addr;
ASSERT((leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) &&
(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)));
if ((be16_to_cpu(leaf1->hdr.count) > 0) &&
@@ -2272,11 +2354,13 @@ xfs_attr_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp)
* Pick up the last hashvalue from a leaf block.
*/
xfs_dahash_t
-xfs_attr_leaf_lasthash(xfs_dabuf_t *bp, int *count)
+xfs_attr_leaf_lasthash(
+ struct xfs_buf *bp,
+ int *count)
{
xfs_attr_leafblock_t *leaf;
- leaf = bp->data;
+ leaf = bp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
if (count)
*count = be16_to_cpu(leaf->hdr.count);
@@ -2337,7 +2421,9 @@ xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local)
* Copy out attribute list entries for attr_list(), for leaf attribute lists.
*/
int
-xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
+xfs_attr_leaf_list_int(
+ struct xfs_buf *bp,
+ xfs_attr_list_context_t *context)
{
attrlist_cursor_kern_t *cursor;
xfs_attr_leafblock_t *leaf;
@@ -2345,7 +2431,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
int retval, i;
ASSERT(bp != NULL);
- leaf = bp->data;
+ leaf = bp->b_addr;
cursor = context->cursor;
cursor->initted = 1;
@@ -2463,7 +2549,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
xfs_attr_leafblock_t *leaf;
xfs_attr_leaf_entry_t *entry;
xfs_attr_leaf_name_remote_t *name_rmt;
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
int error;
#ifdef DEBUG
xfs_attr_leaf_name_local_t *name_loc;
@@ -2475,15 +2561,11 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
/*
* Set up the operation.
*/
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
- XFS_ATTR_FORK);
- if (error) {
+ error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ if (error)
return(error);
- }
- ASSERT(bp != NULL);
- leaf = bp->data;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+ leaf = bp->b_addr;
ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
ASSERT(args->index >= 0);
entry = &leaf->entries[ args->index ];
@@ -2505,7 +2587,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
#endif /* DEBUG */
entry->flags &= ~XFS_ATTR_INCOMPLETE;
- xfs_da_log_buf(args->trans, bp,
+ xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
if (args->rmtblkno) {
@@ -2513,10 +2595,9 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
name_rmt->valuelen = cpu_to_be32(args->valuelen);
- xfs_da_log_buf(args->trans, bp,
+ xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
}
- xfs_da_buf_done(bp);
/*
* Commit the flag value change and start the next trans in series.
@@ -2533,7 +2614,7 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
xfs_attr_leafblock_t *leaf;
xfs_attr_leaf_entry_t *entry;
xfs_attr_leaf_name_remote_t *name_rmt;
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
int error;
trace_xfs_attr_leaf_setflag(args);
@@ -2541,31 +2622,26 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
/*
* Set up the operation.
*/
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
- XFS_ATTR_FORK);
- if (error) {
+ error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ if (error)
return(error);
- }
- ASSERT(bp != NULL);
- leaf = bp->data;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+ leaf = bp->b_addr;
ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
ASSERT(args->index >= 0);
entry = &leaf->entries[ args->index ];
ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0);
entry->flags |= XFS_ATTR_INCOMPLETE;
- xfs_da_log_buf(args->trans, bp,
+ xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
name_rmt->valueblk = 0;
name_rmt->valuelen = 0;
- xfs_da_log_buf(args->trans, bp,
+ xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
}
- xfs_da_buf_done(bp);
/*
* Commit the flag value change and start the next trans in series.
@@ -2586,7 +2662,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
xfs_attr_leafblock_t *leaf1, *leaf2;
xfs_attr_leaf_entry_t *entry1, *entry2;
xfs_attr_leaf_name_remote_t *name_rmt;
- xfs_dabuf_t *bp1, *bp2;
+ struct xfs_buf *bp1, *bp2;
int error;
#ifdef DEBUG
xfs_attr_leaf_name_local_t *name_loc;
@@ -2599,35 +2675,28 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
/*
* Read the block containing the "old" attr
*/
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1,
- XFS_ATTR_FORK);
- if (error) {
- return(error);
- }
- ASSERT(bp1 != NULL);
+ error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
+ if (error)
+ return error;
/*
* Read the block containing the "new" attr, if it is different
*/
if (args->blkno2 != args->blkno) {
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno2,
- -1, &bp2, XFS_ATTR_FORK);
- if (error) {
- return(error);
- }
- ASSERT(bp2 != NULL);
+ error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno2,
+ -1, &bp2);
+ if (error)
+ return error;
} else {
bp2 = bp1;
}
- leaf1 = bp1->data;
- ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+ leaf1 = bp1->b_addr;
ASSERT(args->index < be16_to_cpu(leaf1->hdr.count));
ASSERT(args->index >= 0);
entry1 = &leaf1->entries[ args->index ];
- leaf2 = bp2->data;
- ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+ leaf2 = bp2->b_addr;
ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count));
ASSERT(args->index2 >= 0);
entry2 = &leaf2->entries[ args->index2 ];
@@ -2660,30 +2729,27 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0);
entry1->flags &= ~XFS_ATTR_INCOMPLETE;
- xfs_da_log_buf(args->trans, bp1,
+ xfs_trans_log_buf(args->trans, bp1,
XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
if (args->rmtblkno) {
ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index);
name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
name_rmt->valuelen = cpu_to_be32(args->valuelen);
- xfs_da_log_buf(args->trans, bp1,
+ xfs_trans_log_buf(args->trans, bp1,
XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
}
entry2->flags |= XFS_ATTR_INCOMPLETE;
- xfs_da_log_buf(args->trans, bp2,
+ xfs_trans_log_buf(args->trans, bp2,
XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2);
name_rmt->valueblk = 0;
name_rmt->valuelen = 0;
- xfs_da_log_buf(args->trans, bp2,
+ xfs_trans_log_buf(args->trans, bp2,
XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt)));
}
- xfs_da_buf_done(bp1);
- if (bp1 != bp2)
- xfs_da_buf_done(bp2);
/*
* Commit the flag value change and start the next trans in series.
@@ -2706,7 +2772,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
{
xfs_da_blkinfo_t *info;
xfs_daddr_t blkno;
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
int error;
/*
@@ -2715,23 +2781,23 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
* the extents in reverse order the extent containing
* block 0 must still be there.
*/
- error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
+ error = xfs_da_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
if (error)
return(error);
- blkno = xfs_da_blkno(bp);
+ blkno = XFS_BUF_ADDR(bp);
/*
* Invalidate the tree, even if the "tree" is only a single leaf block.
* This is a depth-first traversal!
*/
- info = bp->data;
+ info = bp->b_addr;
if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) {
error = xfs_attr_node_inactive(trans, dp, bp, 1);
} else if (info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) {
error = xfs_attr_leaf_inactive(trans, dp, bp);
} else {
error = XFS_ERROR(EIO);
- xfs_da_brelse(*trans, bp);
+ xfs_trans_brelse(*trans, bp);
}
if (error)
return(error);
@@ -2742,7 +2808,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
if (error)
return(error);
- xfs_da_binval(*trans, bp); /* remove from cache */
+ xfs_trans_binval(*trans, bp); /* remove from cache */
/*
* Commit the invalidate and start the next transaction.
*/
@@ -2756,34 +2822,37 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
* We're doing a depth-first traversal in order to invalidate everything.
*/
STATIC int
-xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp,
- int level)
+xfs_attr_node_inactive(
+ struct xfs_trans **trans,
+ struct xfs_inode *dp,
+ struct xfs_buf *bp,
+ int level)
{
xfs_da_blkinfo_t *info;
xfs_da_intnode_t *node;
xfs_dablk_t child_fsb;
xfs_daddr_t parent_blkno, child_blkno;
int error, count, i;
- xfs_dabuf_t *child_bp;
+ struct xfs_buf *child_bp;
/*
* Since this code is recursive (gasp!) we must protect ourselves.
*/
if (level > XFS_DA_NODE_MAXDEPTH) {
- xfs_da_brelse(*trans, bp); /* no locks for later trans */
+ xfs_trans_brelse(*trans, bp); /* no locks for later trans */
return(XFS_ERROR(EIO));
}
- node = bp->data;
+ node = bp->b_addr;
ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- parent_blkno = xfs_da_blkno(bp); /* save for re-read later */
+ parent_blkno = XFS_BUF_ADDR(bp); /* save for re-read later */
count = be16_to_cpu(node->hdr.count);
if (!count) {
- xfs_da_brelse(*trans, bp);
+ xfs_trans_brelse(*trans, bp);
return(0);
}
child_fsb = be32_to_cpu(node->btree[0].before);
- xfs_da_brelse(*trans, bp); /* no locks for later trans */
+ xfs_trans_brelse(*trans, bp); /* no locks for later trans */
/*
* If this is the node level just above the leaves, simply loop
@@ -2797,18 +2866,18 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp,
* traversal of the tree so we may deal with many blocks
* before we come back to this one.
*/
- error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp,
+ error = xfs_da_node_read(*trans, dp, child_fsb, -2, &child_bp,
XFS_ATTR_FORK);
if (error)
return(error);
if (child_bp) {
/* save for re-read later */
- child_blkno = xfs_da_blkno(child_bp);
+ child_blkno = XFS_BUF_ADDR(child_bp);
/*
* Invalidate the subtree, however we have to.
*/
- info = child_bp->data;
+ info = child_bp->b_addr;
if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) {
error = xfs_attr_node_inactive(trans, dp,
child_bp, level+1);
@@ -2817,7 +2886,7 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp,
child_bp);
} else {
error = XFS_ERROR(EIO);
- xfs_da_brelse(*trans, child_bp);
+ xfs_trans_brelse(*trans, child_bp);
}
if (error)
return(error);
@@ -2830,7 +2899,7 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp,
&child_bp, XFS_ATTR_FORK);
if (error)
return(error);
- xfs_da_binval(*trans, child_bp);
+ xfs_trans_binval(*trans, child_bp);
}
/*
@@ -2838,12 +2907,12 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp,
* child block number.
*/
if ((i+1) < count) {
- error = xfs_da_read_buf(*trans, dp, 0, parent_blkno,
- &bp, XFS_ATTR_FORK);
+ error = xfs_da_node_read(*trans, dp, 0, parent_blkno,
+ &bp, XFS_ATTR_FORK);
if (error)
return(error);
child_fsb = be32_to_cpu(node->btree[i+1].before);
- xfs_da_brelse(*trans, bp);
+ xfs_trans_brelse(*trans, bp);
}
/*
* Atomically commit the whole invalidate stuff.
@@ -2863,7 +2932,10 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp,
* caught holding something that the logging code wants to flush to disk.
*/
STATIC int
-xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp)
+xfs_attr_leaf_inactive(
+ struct xfs_trans **trans,
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
{
xfs_attr_leafblock_t *leaf;
xfs_attr_leaf_entry_t *entry;
@@ -2871,7 +2943,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp)
xfs_attr_inactive_list_t *list, *lp;
int error, count, size, tmp, i;
- leaf = bp->data;
+ leaf = bp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
/*
@@ -2892,7 +2964,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp)
* If there are no "remote" values, we're done.
*/
if (count == 0) {
- xfs_da_brelse(*trans, bp);
+ xfs_trans_brelse(*trans, bp);
return(0);
}
@@ -2919,7 +2991,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp)
}
}
}
- xfs_da_brelse(*trans, bp); /* unlock for trans. in freextent() */
+ xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */
/*
* Invalidate each of the "remote" value extents.
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index 9c7d22fdcf4..77de139a58f 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -31,7 +31,6 @@
struct attrlist;
struct attrlist_cursor_kern;
struct xfs_attr_list_context;
-struct xfs_dabuf;
struct xfs_da_args;
struct xfs_da_state;
struct xfs_da_state_blk;
@@ -215,7 +214,7 @@ int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
int xfs_attr_shortform_remove(struct xfs_da_args *args);
int xfs_attr_shortform_list(struct xfs_attr_list_context *context);
-int xfs_attr_shortform_allfit(struct xfs_dabuf *bp, struct xfs_inode *dp);
+int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
@@ -223,7 +222,7 @@ int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
* Internal routines when attribute fork size == XFS_LBSIZE(mp).
*/
int xfs_attr_leaf_to_node(struct xfs_da_args *args);
-int xfs_attr_leaf_to_shortform(struct xfs_dabuf *bp,
+int xfs_attr_leaf_to_shortform(struct xfs_buf *bp,
struct xfs_da_args *args, int forkoff);
int xfs_attr_leaf_clearflag(struct xfs_da_args *args);
int xfs_attr_leaf_setflag(struct xfs_da_args *args);
@@ -235,14 +234,14 @@ int xfs_attr_leaf_flipflags(xfs_da_args_t *args);
int xfs_attr_leaf_split(struct xfs_da_state *state,
struct xfs_da_state_blk *oldblk,
struct xfs_da_state_blk *newblk);
-int xfs_attr_leaf_lookup_int(struct xfs_dabuf *leaf,
+int xfs_attr_leaf_lookup_int(struct xfs_buf *leaf,
struct xfs_da_args *args);
-int xfs_attr_leaf_getvalue(struct xfs_dabuf *bp, struct xfs_da_args *args);
-int xfs_attr_leaf_add(struct xfs_dabuf *leaf_buffer,
+int xfs_attr_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args);
+int xfs_attr_leaf_add(struct xfs_buf *leaf_buffer,
struct xfs_da_args *args);
-int xfs_attr_leaf_remove(struct xfs_dabuf *leaf_buffer,
+int xfs_attr_leaf_remove(struct xfs_buf *leaf_buffer,
struct xfs_da_args *args);
-int xfs_attr_leaf_list_int(struct xfs_dabuf *bp,
+int xfs_attr_leaf_list_int(struct xfs_buf *bp,
struct xfs_attr_list_context *context);
/*
@@ -257,9 +256,15 @@ int xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
/*
* Utility routines.
*/
-xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_dabuf *bp, int *count);
-int xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp,
- struct xfs_dabuf *leaf2_bp);
+xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_buf *bp, int *count);
+int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
+ struct xfs_buf *leaf2_bp);
int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
int *local);
+int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t bno, xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp);
+
+extern const struct xfs_buf_ops xfs_attr_leaf_buf_ops;
+
#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 58b815ec8c9..cdb2d334858 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2437,6 +2437,7 @@ xfs_bmap_btalloc(
* Normal allocation, done through xfs_alloc_vextent.
*/
tryagain = isaligned = 0;
+ memset(&args, 0, sizeof(args));
args.tp = ap->tp;
args.mp = mp;
args.fsbno = ap->blkno;
@@ -2661,8 +2662,9 @@ xfs_bmap_btree_to_extents(
if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
return error;
#endif
- if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp,
- XFS_BMAP_BTREE_REF)))
+ error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
return error;
cblock = XFS_BUF_TO_BLOCK(cbp);
if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
@@ -3082,6 +3084,7 @@ xfs_bmap_extents_to_btree(
* Convert to a btree with two levels, one record in root.
*/
XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
+ memset(&args, 0, sizeof(args));
args.tp = tp;
args.mp = mp;
args.firstblock = *firstblock;
@@ -3121,6 +3124,7 @@ xfs_bmap_extents_to_btree(
/*
* Fill in the child block.
*/
+ abp->b_ops = &xfs_bmbt_buf_ops;
ablock = XFS_BUF_TO_BLOCK(abp);
ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
ablock->bb_level = 0;
@@ -3237,6 +3241,7 @@ xfs_bmap_local_to_extents(
xfs_buf_t *bp; /* buffer for extent block */
xfs_bmbt_rec_host_t *ep;/* extent record pointer */
+ memset(&args, 0, sizeof(args));
args.tp = tp;
args.mp = ip->i_mount;
args.firstblock = *firstblock;
@@ -3266,6 +3271,7 @@ xfs_bmap_local_to_extents(
ASSERT(args.len == 1);
*firstblock = args.fsbno;
bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
+ bp->b_ops = &xfs_bmbt_buf_ops;
memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
@@ -4075,8 +4081,9 @@ xfs_bmap_read_extents(
* pointer (leftmost) at each level.
*/
while (level-- > 0) {
- if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
- XFS_BMAP_BTREE_REF)))
+ error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+ if (error)
return error;
block = XFS_BUF_TO_BLOCK(bp);
XFS_WANT_CORRUPTED_GOTO(
@@ -4121,7 +4128,8 @@ xfs_bmap_read_extents(
*/
nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
if (nextbno != NULLFSBLOCK)
- xfs_btree_reada_bufl(mp, nextbno, 1);
+ xfs_btree_reada_bufl(mp, nextbno, 1,
+ &xfs_bmbt_buf_ops);
/*
* Copy records into the extent records.
*/
@@ -4153,8 +4161,9 @@ xfs_bmap_read_extents(
*/
if (bno == NULLFSBLOCK)
break;
- if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
- XFS_BMAP_BTREE_REF)))
+ error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+ if (error)
return error;
block = XFS_BUF_TO_BLOCK(bp);
}
@@ -4616,12 +4625,11 @@ xfs_bmapi_delay(
STATIC int
-xfs_bmapi_allocate(
- struct xfs_bmalloca *bma,
- int flags)
+__xfs_bmapi_allocate(
+ struct xfs_bmalloca *bma)
{
struct xfs_mount *mp = bma->ip->i_mount;
- int whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+ int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ?
XFS_ATTR_FORK : XFS_DATA_FORK;
struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
int tmp_logflags = 0;
@@ -4654,19 +4662,19 @@ xfs_bmapi_allocate(
* Indicate if this is the first user data in the file, or just any
* user data.
*/
- if (!(flags & XFS_BMAPI_METADATA)) {
+ if (!(bma->flags & XFS_BMAPI_METADATA)) {
bma->userdata = (bma->offset == 0) ?
XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
}
- bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
+ bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
/*
* Only want to do the alignment at the eof if it is userdata and
* allocation length is larger than a stripe unit.
*/
if (mp->m_dalign && bma->length >= mp->m_dalign &&
- !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
+ !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
error = xfs_bmap_isaeof(bma, whichfork);
if (error)
return error;
@@ -4706,7 +4714,7 @@ xfs_bmapi_allocate(
* A wasdelay extent has been initialized, so shouldn't be flagged
* as unwritten.
*/
- if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) &&
+ if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) &&
xfs_sb_version_hasextflgbit(&mp->m_sb))
bma->got.br_state = XFS_EXT_UNWRITTEN;
@@ -4734,6 +4742,45 @@ xfs_bmapi_allocate(
return 0;
}
+static void
+xfs_bmapi_allocate_worker(
+ struct work_struct *work)
+{
+ struct xfs_bmalloca *args = container_of(work,
+ struct xfs_bmalloca, work);
+ unsigned long pflags;
+
+ /* we are in a transaction context here */
+ current_set_flags_nested(&pflags, PF_FSTRANS);
+
+ args->result = __xfs_bmapi_allocate(args);
+ complete(args->done);
+
+ current_restore_flags_nested(&pflags, PF_FSTRANS);
+}
+
+/*
+ * Some allocation requests come in with little stack left to work with. Push
+ * those off to a worker thread so there is plenty of stack to use. Otherwise,
+ * just call directly to avoid the context switch overhead.
+ */
+int
+xfs_bmapi_allocate(
+ struct xfs_bmalloca *args)
+{
+ DECLARE_COMPLETION_ONSTACK(done);
+
+ if (!args->stack_switch)
+ return __xfs_bmapi_allocate(args);
+
+
+ args->done = &done;
+ INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker);
+ queue_work(xfs_alloc_wq, &args->work);
+ wait_for_completion(&done);
+ return args->result;
+}
+
STATIC int
xfs_bmapi_convert_unwritten(
struct xfs_bmalloca *bma,
@@ -4906,6 +4953,9 @@ xfs_bmapi_write(
bma.flist = flist;
bma.firstblock = firstblock;
+ if (flags & XFS_BMAPI_STACK_SWITCH)
+ bma.stack_switch = 1;
+
while (bno < end && n < *nmap) {
inhole = eof || bma.got.br_startoff > bno;
wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
@@ -4919,6 +4969,7 @@ xfs_bmapi_write(
bma.conv = !!(flags & XFS_BMAPI_CONVERT);
bma.wasdel = wasdelay;
bma.offset = bno;
+ bma.flags = flags;
/*
* There's a 32/64 bit type mismatch between the
@@ -4934,7 +4985,7 @@ xfs_bmapi_write(
ASSERT(len > 0);
ASSERT(bma.length > 0);
- error = xfs_bmapi_allocate(&bma, flags);
+ error = xfs_bmapi_allocate(&bma);
if (error)
goto error0;
if (bma.blkno == NULLFSBLOCK)
@@ -5517,7 +5568,7 @@ xfs_getbmap(
if (xfs_get_extsz_hint(ip) ||
ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
prealloced = 1;
- fixlen = XFS_MAXIOFFSET(mp);
+ fixlen = mp->m_super->s_maxbytes;
} else {
prealloced = 0;
fixlen = XFS_ISIZE(ip);
@@ -5554,7 +5605,7 @@ xfs_getbmap(
xfs_ilock(ip, XFS_IOLOCK_SHARED);
if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
- error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF);
+ error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
if (error)
goto out_unlock_iolock;
}
@@ -5823,15 +5874,16 @@ xfs_bmap_check_leaf_extents(
*/
while (level-- > 0) {
/* See if buf is in cur first */
+ bp_release = 0;
bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
- if (bp) {
- bp_release = 0;
- } else {
+ if (!bp) {
bp_release = 1;
+ error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ goto error_norelse;
}
- if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
- XFS_BMAP_BTREE_REF)))
- goto error_norelse;
block = XFS_BUF_TO_BLOCK(bp);
XFS_WANT_CORRUPTED_GOTO(
xfs_bmap_sanity_check(mp, bp, level),
@@ -5908,15 +5960,16 @@ xfs_bmap_check_leaf_extents(
if (bno == NULLFSBLOCK)
break;
+ bp_release = 0;
bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
- if (bp) {
- bp_release = 0;
- } else {
+ if (!bp) {
bp_release = 1;
+ error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ goto error_norelse;
}
- if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
- XFS_BMAP_BTREE_REF)))
- goto error_norelse;
block = XFS_BUF_TO_BLOCK(bp);
}
if (bp_release) {
@@ -6007,7 +6060,9 @@ xfs_bmap_count_tree(
struct xfs_btree_block *block, *nextblock;
int numrecs;
- if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF)))
+ error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
return error;
*count += 1;
block = XFS_BUF_TO_BLOCK(bp);
@@ -6016,8 +6071,10 @@ xfs_bmap_count_tree(
/* Not at node above leaves, count this level of nodes */
nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
while (nextbno != NULLFSBLOCK) {
- if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
- 0, &nbp, XFS_BMAP_BTREE_REF)))
+ error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
+ XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
return error;
*count += 1;
nextblock = XFS_BUF_TO_BLOCK(nbp);
@@ -6046,8 +6103,10 @@ xfs_bmap_count_tree(
if (nextbno == NULLFSBLOCK)
break;
bno = nextbno;
- if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
- XFS_BMAP_BTREE_REF)))
+ error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
return error;
*count += 1;
block = XFS_BUF_TO_BLOCK(bp);
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 803b56d7ce1..5f469c3516e 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -77,6 +77,7 @@ typedef struct xfs_bmap_free
* from written to unwritten, otherwise convert from unwritten to written.
*/
#define XFS_BMAPI_CONVERT 0x040
+#define XFS_BMAPI_STACK_SWITCH 0x080
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
@@ -85,7 +86,8 @@ typedef struct xfs_bmap_free
{ XFS_BMAPI_PREALLOC, "PREALLOC" }, \
{ XFS_BMAPI_IGSTATE, "IGSTATE" }, \
{ XFS_BMAPI_CONTIG, "CONTIG" }, \
- { XFS_BMAPI_CONVERT, "CONVERT" }
+ { XFS_BMAPI_CONVERT, "CONVERT" }, \
+ { XFS_BMAPI_STACK_SWITCH, "STACK_SWITCH" }
static inline int xfs_bmapi_aflag(int w)
@@ -133,6 +135,11 @@ typedef struct xfs_bmalloca {
char userdata;/* set if is user data */
char aeof; /* allocated space at eof */
char conv; /* overwriting unwritten extents */
+ char stack_switch;
+ int flags;
+ struct completion *done;
+ struct work_struct work;
+ int result;
} xfs_bmalloca_t;
/*
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 862084a47a7..061b45cbe61 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -36,6 +36,7 @@
#include "xfs_bmap.h"
#include "xfs_error.h"
#include "xfs_quota.h"
+#include "xfs_trace.h"
/*
* Determine the extent state.
@@ -707,6 +708,67 @@ xfs_bmbt_key_diff(
cur->bc_rec.b.br_startoff;
}
+static void
+xfs_bmbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ unsigned int level;
+ int lblock_ok; /* block passes checks */
+
+ /* magic number and level verification.
+ *
+ * We don't know what fork we belong to, so just verify that the level
+ * is less than the maximum of the two. Later checks will be more
+ * precise.
+ */
+ level = be16_to_cpu(block->bb_level);
+ lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) &&
+ level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]);
+
+ /* numrecs verification */
+ lblock_ok = lblock_ok &&
+ be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0];
+
+ /* sibling pointer verification */
+ lblock_ok = lblock_ok &&
+ block->bb_u.l.bb_leftsib &&
+ (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
+ XFS_FSB_SANITY_CHECK(mp,
+ be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
+ block->bb_u.l.bb_rightsib &&
+ (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
+ XFS_FSB_SANITY_CHECK(mp,
+ be64_to_cpu(block->bb_u.l.bb_rightsib)));
+
+ if (!lblock_ok) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+}
+
+static void
+xfs_bmbt_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_bmbt_verify(bp);
+}
+
+static void
+xfs_bmbt_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_bmbt_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_bmbt_buf_ops = {
+ .verify_read = xfs_bmbt_read_verify,
+ .verify_write = xfs_bmbt_write_verify,
+};
+
+
#ifdef DEBUG
STATIC int
xfs_bmbt_keys_inorder(
@@ -746,6 +808,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
.init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
.key_diff = xfs_bmbt_key_diff,
+ .buf_ops = &xfs_bmbt_buf_ops,
#ifdef DEBUG
.keys_inorder = xfs_bmbt_keys_inorder,
.recs_inorder = xfs_bmbt_recs_inorder,
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 0e66c4ea0f8..88469ca0869 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -236,5 +236,6 @@ extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_inode *, int);
+extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index e53e317b158..db010408d70 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -266,9 +266,13 @@ xfs_btree_dup_cursor(
for (i = 0; i < new->bc_nlevels; i++) {
new->bc_ptrs[i] = cur->bc_ptrs[i];
new->bc_ra[i] = cur->bc_ra[i];
- if ((bp = cur->bc_bufs[i])) {
- if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) {
+ bp = cur->bc_bufs[i];
+ if (bp) {
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_BUF_ADDR(bp), mp->m_bsize,
+ 0, &bp,
+ cur->bc_ops->buf_ops);
+ if (error) {
xfs_btree_del_cursor(new, error);
*ncur = NULL;
return error;
@@ -609,25 +613,26 @@ xfs_btree_offsets(
* Get a buffer for the block, return it read in.
* Long-form addressing.
*/
-int /* error */
+int
xfs_btree_read_bufl(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_fsblock_t fsbno, /* file system block number */
- uint lock, /* lock flags for read_buf */
- xfs_buf_t **bpp, /* buffer for fsbno */
- int refval) /* ref count value for buffer */
-{
- xfs_buf_t *bp; /* return value */
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_fsblock_t fsbno, /* file system block number */
+ uint lock, /* lock flags for read_buf */
+ struct xfs_buf **bpp, /* buffer for fsbno */
+ int refval, /* ref count value for buffer */
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_buf *bp; /* return value */
xfs_daddr_t d; /* real disk block address */
- int error;
+ int error;
ASSERT(fsbno != NULLFSBLOCK);
d = XFS_FSB_TO_DADDR(mp, fsbno);
- if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
- mp->m_bsize, lock, &bp))) {
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
+ mp->m_bsize, lock, &bp, ops);
+ if (error)
return error;
- }
ASSERT(!xfs_buf_geterror(bp));
if (bp)
xfs_buf_set_ref(bp, refval);
@@ -642,15 +647,16 @@ xfs_btree_read_bufl(
/* ARGSUSED */
void
xfs_btree_reada_bufl(
- xfs_mount_t *mp, /* file system mount point */
- xfs_fsblock_t fsbno, /* file system block number */
- xfs_extlen_t count) /* count of filesystem blocks */
+ struct xfs_mount *mp, /* file system mount point */
+ xfs_fsblock_t fsbno, /* file system block number */
+ xfs_extlen_t count, /* count of filesystem blocks */
+ const struct xfs_buf_ops *ops)
{
xfs_daddr_t d;
ASSERT(fsbno != NULLFSBLOCK);
d = XFS_FSB_TO_DADDR(mp, fsbno);
- xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
+ xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
}
/*
@@ -660,17 +666,18 @@ xfs_btree_reada_bufl(
/* ARGSUSED */
void
xfs_btree_reada_bufs(
- xfs_mount_t *mp, /* file system mount point */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno, /* allocation group block number */
- xfs_extlen_t count) /* count of filesystem blocks */
+ struct xfs_mount *mp, /* file system mount point */
+ xfs_agnumber_t agno, /* allocation group number */
+ xfs_agblock_t agbno, /* allocation group block number */
+ xfs_extlen_t count, /* count of filesystem blocks */
+ const struct xfs_buf_ops *ops)
{
xfs_daddr_t d;
ASSERT(agno != NULLAGNUMBER);
ASSERT(agbno != NULLAGBLOCK);
d = XFS_AGB_TO_DADDR(mp, agno, agbno);
- xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
+ xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
}
STATIC int
@@ -684,12 +691,14 @@ xfs_btree_readahead_lblock(
xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
- xfs_btree_reada_bufl(cur->bc_mp, left, 1);
+ xfs_btree_reada_bufl(cur->bc_mp, left, 1,
+ cur->bc_ops->buf_ops);
rval++;
}
if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
- xfs_btree_reada_bufl(cur->bc_mp, right, 1);
+ xfs_btree_reada_bufl(cur->bc_mp, right, 1,
+ cur->bc_ops->buf_ops);
rval++;
}
@@ -709,13 +718,13 @@ xfs_btree_readahead_sblock(
if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
- left, 1);
+ left, 1, cur->bc_ops->buf_ops);
rval++;
}
if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
- right, 1);
+ right, 1, cur->bc_ops->buf_ops);
rval++;
}
@@ -853,18 +862,22 @@ xfs_btree_set_sibling(
}
}
-STATIC void
+void
xfs_btree_init_block(
- struct xfs_btree_cur *cur,
- int level,
- int numrecs,
- struct xfs_btree_block *new) /* new block */
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ __u32 magic,
+ __u16 level,
+ __u16 numrecs,
+ unsigned int flags)
{
- new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
+ struct xfs_btree_block *new = XFS_BUF_TO_BLOCK(bp);
+
+ new->bb_magic = cpu_to_be32(magic);
new->bb_level = cpu_to_be16(level);
new->bb_numrecs = cpu_to_be16(numrecs);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (flags & XFS_BTREE_LONG_PTRS) {
new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
} else {
@@ -873,6 +886,17 @@ xfs_btree_init_block(
}
}
+STATIC void
+xfs_btree_init_block_cur(
+ struct xfs_btree_cur *cur,
+ int level,
+ int numrecs,
+ struct xfs_buf *bp)
+{
+ xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum],
+ level, numrecs, cur->bc_flags);
+}
+
/*
* Return true if ptr is the last record in the btree and
* we need to track updates to this record. The decision
@@ -972,6 +996,7 @@ xfs_btree_get_buf_block(
if (!*bpp)
return ENOMEM;
+ (*bpp)->b_ops = cur->bc_ops->buf_ops;
*block = XFS_BUF_TO_BLOCK(*bpp);
return 0;
}
@@ -998,19 +1023,15 @@ xfs_btree_read_buf_block(
d = xfs_btree_ptr_to_daddr(cur, ptr);
error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
- mp->m_bsize, flags, bpp);
+ mp->m_bsize, flags, bpp,
+ cur->bc_ops->buf_ops);
if (error)
return error;
ASSERT(!xfs_buf_geterror(*bpp));
-
xfs_btree_set_refs(cur, *bpp);
*block = XFS_BUF_TO_BLOCK(*bpp);
-
- error = xfs_btree_check_block(cur, *block, level, *bpp);
- if (error)
- xfs_trans_brelse(cur->bc_tp, *bpp);
- return error;
+ return 0;
}
/*
@@ -2183,7 +2204,7 @@ xfs_btree_split(
goto error0;
/* Fill in the btree header for the new right block. */
- xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right);
+ xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp);
/*
* Split the entries between the old and the new block evenly.
@@ -2492,7 +2513,7 @@ xfs_btree_new_root(
nptr = 2;
}
/* Fill in the new block's btree header and log it. */
- xfs_btree_init_block(cur, cur->bc_nlevels, 2, new);
+ xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp);
xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
!xfs_btree_ptr_is_null(cur, &rptr));
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 5b240de104c..f932897194e 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -188,6 +188,8 @@ struct xfs_btree_ops {
__int64_t (*key_diff)(struct xfs_btree_cur *cur,
union xfs_btree_key *key);
+ const struct xfs_buf_ops *buf_ops;
+
#ifdef DEBUG
/* check that k1 is lower than k2 */
int (*keys_inorder)(struct xfs_btree_cur *cur,
@@ -355,7 +357,8 @@ xfs_btree_read_bufl(
xfs_fsblock_t fsbno, /* file system block number */
uint lock, /* lock flags for read_buf */
struct xfs_buf **bpp, /* buffer for fsbno */
- int refval);/* ref count value for buffer */
+ int refval, /* ref count value for buffer */
+ const struct xfs_buf_ops *ops);
/*
* Read-ahead the block, don't wait for it, don't return a buffer.
@@ -365,7 +368,8 @@ void /* error */
xfs_btree_reada_bufl(
struct xfs_mount *mp, /* file system mount point */
xfs_fsblock_t fsbno, /* file system block number */
- xfs_extlen_t count); /* count of filesystem blocks */
+ xfs_extlen_t count, /* count of filesystem blocks */
+ const struct xfs_buf_ops *ops);
/*
* Read-ahead the block, don't wait for it, don't return a buffer.
@@ -376,8 +380,20 @@ xfs_btree_reada_bufs(
struct xfs_mount *mp, /* file system mount point */
xfs_agnumber_t agno, /* allocation group number */
xfs_agblock_t agbno, /* allocation group block number */
- xfs_extlen_t count); /* count of filesystem blocks */
+ xfs_extlen_t count, /* count of filesystem blocks */
+ const struct xfs_buf_ops *ops);
+/*
+ * Initialise a new btree block header
+ */
+void
+xfs_btree_init_block(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ __u32 magic,
+ __u16 level,
+ __u16 numrecs,
+ unsigned int flags);
/*
* Common btree core entry points.
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 269b35c084d..fbbb9eb92e3 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -96,6 +96,7 @@ xfs_buf_lru_add(
atomic_inc(&bp->b_hold);
list_add_tail(&bp->b_lru, &btp->bt_lru);
btp->bt_lru_nr++;
+ bp->b_lru_flags &= ~_XBF_LRU_DISPOSE;
}
spin_unlock(&btp->bt_lru_lock);
}
@@ -154,7 +155,8 @@ xfs_buf_stale(
struct xfs_buftarg *btp = bp->b_target;
spin_lock(&btp->bt_lru_lock);
- if (!list_empty(&bp->b_lru)) {
+ if (!list_empty(&bp->b_lru) &&
+ !(bp->b_lru_flags & _XBF_LRU_DISPOSE)) {
list_del_init(&bp->b_lru);
btp->bt_lru_nr--;
atomic_dec(&bp->b_hold);
@@ -164,14 +166,49 @@ xfs_buf_stale(
ASSERT(atomic_read(&bp->b_hold) >= 1);
}
+static int
+xfs_buf_get_maps(
+ struct xfs_buf *bp,
+ int map_count)
+{
+ ASSERT(bp->b_maps == NULL);
+ bp->b_map_count = map_count;
+
+ if (map_count == 1) {
+ bp->b_maps = &bp->__b_map;
+ return 0;
+ }
+
+ bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
+ KM_NOFS);
+ if (!bp->b_maps)
+ return ENOMEM;
+ return 0;
+}
+
+/*
+ * Frees b_maps if it was allocated.
+ */
+static void
+xfs_buf_free_maps(
+ struct xfs_buf *bp)
+{
+ if (bp->b_maps != &bp->__b_map) {
+ kmem_free(bp->b_maps);
+ bp->b_maps = NULL;
+ }
+}
+
struct xfs_buf *
-xfs_buf_alloc(
+_xfs_buf_alloc(
struct xfs_buftarg *target,
- xfs_daddr_t blkno,
- size_t numblks,
+ struct xfs_buf_map *map,
+ int nmaps,
xfs_buf_flags_t flags)
{
struct xfs_buf *bp;
+ int error;
+ int i;
bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
if (unlikely(!bp))
@@ -192,16 +229,28 @@ xfs_buf_alloc(
sema_init(&bp->b_sema, 0); /* held, no waiters */
XB_SET_OWNER(bp);
bp->b_target = target;
+ bp->b_flags = flags;
/*
* Set length and io_length to the same value initially.
* I/O routines should use io_length, which will be the same in
* most cases but may be reset (e.g. XFS recovery).
*/
- bp->b_length = numblks;
- bp->b_io_length = numblks;
- bp->b_flags = flags;
- bp->b_bn = blkno;
+ error = xfs_buf_get_maps(bp, nmaps);
+ if (error) {
+ kmem_zone_free(xfs_buf_zone, bp);
+ return NULL;
+ }
+
+ bp->b_bn = map[0].bm_bn;
+ bp->b_length = 0;
+ for (i = 0; i < nmaps; i++) {
+ bp->b_maps[i].bm_bn = map[i].bm_bn;
+ bp->b_maps[i].bm_len = map[i].bm_len;
+ bp->b_length += map[i].bm_len;
+ }
+ bp->b_io_length = bp->b_length;
+
atomic_set(&bp->b_pin_count, 0);
init_waitqueue_head(&bp->b_waiters);
@@ -280,6 +329,7 @@ xfs_buf_free(
} else if (bp->b_flags & _XBF_KMEM)
kmem_free(bp->b_addr);
_xfs_buf_free_pages(bp);
+ xfs_buf_free_maps(bp);
kmem_zone_free(xfs_buf_zone, bp);
}
@@ -327,8 +377,9 @@ xfs_buf_allocate_memory(
}
use_alloc_page:
- start = BBTOB(bp->b_bn) >> PAGE_SHIFT;
- end = (BBTOB(bp->b_bn + bp->b_length) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT;
+ end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)
+ >> PAGE_SHIFT;
page_count = end - start;
error = _xfs_buf_get_pages(bp, page_count, flags);
if (unlikely(error))
@@ -425,8 +476,8 @@ _xfs_buf_map_pages(
xfs_buf_t *
_xfs_buf_find(
struct xfs_buftarg *btp,
- xfs_daddr_t blkno,
- size_t numblks,
+ struct xfs_buf_map *map,
+ int nmaps,
xfs_buf_flags_t flags,
xfs_buf_t *new_bp)
{
@@ -435,13 +486,36 @@ _xfs_buf_find(
struct rb_node **rbp;
struct rb_node *parent;
xfs_buf_t *bp;
+ xfs_daddr_t blkno = map[0].bm_bn;
+ xfs_daddr_t eofs;
+ int numblks = 0;
+ int i;
+ for (i = 0; i < nmaps; i++)
+ numblks += map[i].bm_len;
numbytes = BBTOB(numblks);
/* Check for IOs smaller than the sector size / not sector aligned */
ASSERT(!(numbytes < (1 << btp->bt_sshift)));
ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask));
+ /*
+ * Corrupted block numbers can get through to here, unfortunately, so we
+ * have to check that the buffer falls within the filesystem bounds.
+ */
+ eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
+ if (blkno >= eofs) {
+ /*
+ * XXX (dgc): we should really be returning EFSCORRUPTED here,
+ * but none of the higher level infrastructure supports
+ * returning a specific error on buffer lookup failures.
+ */
+ xfs_alert(btp->bt_mount,
+ "%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
+ __func__, blkno, eofs);
+ return NULL;
+ }
+
/* get tree root */
pag = xfs_perag_get(btp->bt_mount,
xfs_daddr_to_agno(btp->bt_mount, blkno));
@@ -513,7 +587,9 @@ found:
*/
if (bp->b_flags & XBF_STALE) {
ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
+ ASSERT(bp->b_iodone == NULL);
bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
+ bp->b_ops = NULL;
}
trace_xfs_buf_find(bp, flags, _RET_IP_);
@@ -527,31 +603,31 @@ found:
* more hits than misses.
*/
struct xfs_buf *
-xfs_buf_get(
- xfs_buftarg_t *target,
- xfs_daddr_t blkno,
- size_t numblks,
+xfs_buf_get_map(
+ struct xfs_buftarg *target,
+ struct xfs_buf_map *map,
+ int nmaps,
xfs_buf_flags_t flags)
{
struct xfs_buf *bp;
struct xfs_buf *new_bp;
int error = 0;
- bp = _xfs_buf_find(target, blkno, numblks, flags, NULL);
+ bp = _xfs_buf_find(target, map, nmaps, flags, NULL);
if (likely(bp))
goto found;
- new_bp = xfs_buf_alloc(target, blkno, numblks, flags);
+ new_bp = _xfs_buf_alloc(target, map, nmaps, flags);
if (unlikely(!new_bp))
return NULL;
error = xfs_buf_allocate_memory(new_bp, flags);
if (error) {
- kmem_zone_free(xfs_buf_zone, new_bp);
+ xfs_buf_free(new_bp);
return NULL;
}
- bp = _xfs_buf_find(target, blkno, numblks, flags, new_bp);
+ bp = _xfs_buf_find(target, map, nmaps, flags, new_bp);
if (!bp) {
xfs_buf_free(new_bp);
return NULL;
@@ -560,8 +636,6 @@ xfs_buf_get(
if (bp != new_bp)
xfs_buf_free(new_bp);
- bp->b_io_length = bp->b_length;
-
found:
if (!bp->b_addr) {
error = _xfs_buf_map_pages(bp, flags);
@@ -584,7 +658,7 @@ _xfs_buf_read(
xfs_buf_flags_t flags)
{
ASSERT(!(flags & XBF_WRITE));
- ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
+ ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
@@ -596,22 +670,24 @@ _xfs_buf_read(
}
xfs_buf_t *
-xfs_buf_read(
- xfs_buftarg_t *target,
- xfs_daddr_t blkno,
- size_t numblks,
- xfs_buf_flags_t flags)
+xfs_buf_read_map(
+ struct xfs_buftarg *target,
+ struct xfs_buf_map *map,
+ int nmaps,
+ xfs_buf_flags_t flags,
+ const struct xfs_buf_ops *ops)
{
- xfs_buf_t *bp;
+ struct xfs_buf *bp;
flags |= XBF_READ;
- bp = xfs_buf_get(target, blkno, numblks, flags);
+ bp = xfs_buf_get_map(target, map, nmaps, flags);
if (bp) {
trace_xfs_buf_read(bp, flags, _RET_IP_);
if (!XFS_BUF_ISDONE(bp)) {
XFS_STATS_INC(xb_get_read);
+ bp->b_ops = ops;
_xfs_buf_read(bp, flags);
} else if (flags & XBF_ASYNC) {
/*
@@ -634,16 +710,17 @@ xfs_buf_read(
* safe manner.
*/
void
-xfs_buf_readahead(
- xfs_buftarg_t *target,
- xfs_daddr_t blkno,
- size_t numblks)
+xfs_buf_readahead_map(
+ struct xfs_buftarg *target,
+ struct xfs_buf_map *map,
+ int nmaps,
+ const struct xfs_buf_ops *ops)
{
if (bdi_read_congested(target->bt_bdi))
return;
- xfs_buf_read(target, blkno, numblks,
- XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
+ xfs_buf_read_map(target, map, nmaps,
+ XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops);
}
/*
@@ -655,25 +732,24 @@ xfs_buf_read_uncached(
struct xfs_buftarg *target,
xfs_daddr_t daddr,
size_t numblks,
- int flags)
+ int flags,
+ const struct xfs_buf_ops *ops)
{
- xfs_buf_t *bp;
- int error;
+ struct xfs_buf *bp;
bp = xfs_buf_get_uncached(target, numblks, flags);
if (!bp)
return NULL;
/* set up the buffer for a read IO */
- XFS_BUF_SET_ADDR(bp, daddr);
- XFS_BUF_READ(bp);
+ ASSERT(bp->b_map_count == 1);
+ bp->b_bn = daddr;
+ bp->b_maps[0].bm_bn = daddr;
+ bp->b_flags |= XBF_READ;
+ bp->b_ops = ops;
xfsbdstrat(target->bt_mount, bp);
- error = xfs_buf_iowait(bp);
- if (error) {
- xfs_buf_relse(bp);
- return NULL;
- }
+ xfs_buf_iowait(bp);
return bp;
}
@@ -694,7 +770,11 @@ xfs_buf_set_empty(
bp->b_addr = NULL;
bp->b_length = numblks;
bp->b_io_length = numblks;
+
+ ASSERT(bp->b_map_count == 1);
bp->b_bn = XFS_BUF_DADDR_NULL;
+ bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL;
+ bp->b_maps[0].bm_len = bp->b_length;
}
static inline struct page *
@@ -758,9 +838,10 @@ xfs_buf_get_uncached(
{
unsigned long page_count;
int error, i;
- xfs_buf_t *bp;
+ struct xfs_buf *bp;
+ DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
- bp = xfs_buf_alloc(target, XFS_BUF_DADDR_NULL, numblks, 0);
+ bp = _xfs_buf_alloc(target, &map, 1, 0);
if (unlikely(bp == NULL))
goto fail;
@@ -791,6 +872,7 @@ xfs_buf_get_uncached(
__free_page(bp->b_pages[i]);
_xfs_buf_free_pages(bp);
fail_free_buf:
+ xfs_buf_free_maps(bp);
kmem_zone_free(xfs_buf_zone, bp);
fail:
return NULL;
@@ -937,27 +1019,37 @@ STATIC void
xfs_buf_iodone_work(
struct work_struct *work)
{
- xfs_buf_t *bp =
+ struct xfs_buf *bp =
container_of(work, xfs_buf_t, b_iodone_work);
+ bool read = !!(bp->b_flags & XBF_READ);
+
+ bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
+ if (read && bp->b_ops)
+ bp->b_ops->verify_read(bp);
if (bp->b_iodone)
(*(bp->b_iodone))(bp);
else if (bp->b_flags & XBF_ASYNC)
xfs_buf_relse(bp);
+ else {
+ ASSERT(read && bp->b_ops);
+ complete(&bp->b_iowait);
+ }
}
void
xfs_buf_ioend(
- xfs_buf_t *bp,
- int schedule)
+ struct xfs_buf *bp,
+ int schedule)
{
+ bool read = !!(bp->b_flags & XBF_READ);
+
trace_xfs_buf_iodone(bp, _RET_IP_);
- bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
if (bp->b_error == 0)
bp->b_flags |= XBF_DONE;
- if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
+ if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) {
if (schedule) {
INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
queue_work(xfslogd_workqueue, &bp->b_iodone_work);
@@ -965,6 +1057,7 @@ xfs_buf_ioend(
xfs_buf_iodone_work(&bp->b_iodone_work);
}
} else {
+ bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
complete(&bp->b_iowait);
}
}
@@ -1135,45 +1228,53 @@ xfs_buf_bio_end_io(
{
xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
- xfs_buf_ioerror(bp, -error);
+ /*
+ * don't overwrite existing errors - otherwise we can lose errors on
+ * buffers that require multiple bios to complete.
+ */
+ if (!bp->b_error)
+ xfs_buf_ioerror(bp, -error);
- if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
+ if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
_xfs_buf_ioend(bp, 1);
bio_put(bio);
}
-STATIC void
-_xfs_buf_ioapply(
- xfs_buf_t *bp)
+static void
+xfs_buf_ioapply_map(
+ struct xfs_buf *bp,
+ int map,
+ int *buf_offset,
+ int *count,
+ int rw)
{
- int rw, map_i, total_nr_pages, nr_pages;
- struct bio *bio;
- int offset = bp->b_offset;
- int size = BBTOB(bp->b_io_length);
- sector_t sector = bp->b_bn;
+ int page_index;
+ int total_nr_pages = bp->b_page_count;
+ int nr_pages;
+ struct bio *bio;
+ sector_t sector = bp->b_maps[map].bm_bn;
+ int size;
+ int offset;
total_nr_pages = bp->b_page_count;
- map_i = 0;
- if (bp->b_flags & XBF_WRITE) {
- if (bp->b_flags & XBF_SYNCIO)
- rw = WRITE_SYNC;
- else
- rw = WRITE;
- if (bp->b_flags & XBF_FUA)
- rw |= REQ_FUA;
- if (bp->b_flags & XBF_FLUSH)
- rw |= REQ_FLUSH;
- } else if (bp->b_flags & XBF_READ_AHEAD) {
- rw = READA;
- } else {
- rw = READ;
+ /* skip the pages in the buffer before the start offset */
+ page_index = 0;
+ offset = *buf_offset;
+ while (offset >= PAGE_SIZE) {
+ page_index++;
+ offset -= PAGE_SIZE;
}
- /* we only use the buffer cache for meta-data */
- rw |= REQ_META;
+ /*
+ * Limit the IO size to the length of the current vector, and update the
+ * remaining IO count for the next time around.
+ */
+ size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
+ *count -= size;
+ *buf_offset += size;
next_chunk:
atomic_inc(&bp->b_io_remaining);
@@ -1188,13 +1289,14 @@ next_chunk:
bio->bi_private = bp;
- for (; size && nr_pages; nr_pages--, map_i++) {
+ for (; size && nr_pages; nr_pages--, page_index++) {
int rbytes, nbytes = PAGE_SIZE - offset;
if (nbytes > size)
nbytes = size;
- rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset);
+ rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
+ offset);
if (rbytes < nbytes)
break;
@@ -1213,9 +1315,76 @@ next_chunk:
if (size)
goto next_chunk;
} else {
+ /*
+ * This is guaranteed not to be the last io reference count
+ * because the caller (xfs_buf_iorequest) holds a count itself.
+ */
+ atomic_dec(&bp->b_io_remaining);
xfs_buf_ioerror(bp, EIO);
bio_put(bio);
}
+
+}
+
+STATIC void
+_xfs_buf_ioapply(
+ struct xfs_buf *bp)
+{
+ struct blk_plug plug;
+ int rw;
+ int offset;
+ int size;
+ int i;
+
+ if (bp->b_flags & XBF_WRITE) {
+ if (bp->b_flags & XBF_SYNCIO)
+ rw = WRITE_SYNC;
+ else
+ rw = WRITE;
+ if (bp->b_flags & XBF_FUA)
+ rw |= REQ_FUA;
+ if (bp->b_flags & XBF_FLUSH)
+ rw |= REQ_FLUSH;
+
+ /*
+ * Run the write verifier callback function if it exists. If
+ * this function fails it will mark the buffer with an error and
+ * the IO should not be dispatched.
+ */
+ if (bp->b_ops) {
+ bp->b_ops->verify_write(bp);
+ if (bp->b_error) {
+ xfs_force_shutdown(bp->b_target->bt_mount,
+ SHUTDOWN_CORRUPT_INCORE);
+ return;
+ }
+ }
+ } else if (bp->b_flags & XBF_READ_AHEAD) {
+ rw = READA;
+ } else {
+ rw = READ;
+ }
+
+ /* we only use the buffer cache for meta-data */
+ rw |= REQ_META;
+
+ /*
+ * Walk all the vectors issuing IO on them. Set up the initial offset
+ * into the buffer and the desired IO size before we start -
+ * xfs_buf_ioapply_map() will modify them appropriately for each
+ * subsequent call.
+ */
+ offset = bp->b_offset;
+ size = BBTOB(bp->b_io_length);
+ blk_start_plug(&plug);
+ for (i = 0; i < bp->b_map_count; i++) {
+ xfs_buf_ioapply_map(bp, i, &offset, &size, rw);
+ if (bp->b_error)
+ break;
+ if (size <= 0)
+ break; /* all done */
+ }
+ blk_finish_plug(&plug);
}
void
@@ -1336,6 +1505,8 @@ restart:
while (!list_empty(&btp->bt_lru)) {
bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
if (atomic_read(&bp->b_hold) > 1) {
+ trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
+ list_move_tail(&bp->b_lru, &btp->bt_lru);
spin_unlock(&btp->bt_lru_lock);
delay(100);
goto restart;
@@ -1389,6 +1560,7 @@ xfs_buftarg_shrink(
*/
list_move(&bp->b_lru, &dispose);
btp->bt_lru_nr--;
+ bp->b_lru_flags |= _XBF_LRU_DISPOSE;
}
spin_unlock(&btp->bt_lru_lock);
@@ -1557,7 +1729,7 @@ xfs_buf_cmp(
struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
xfs_daddr_t diff;
- diff = ap->b_bn - bp->b_bn;
+ diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
if (diff < 0)
return -1;
if (diff > 0)
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 79344c48008..433a12ed7b1 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -38,26 +38,28 @@ typedef enum {
XBRW_ZERO = 3, /* Zero target memory */
} xfs_buf_rw_t;
-#define XBF_READ (1 << 0) /* buffer intended for reading from device */
-#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
-#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
-#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
-#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
-#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
+#define XBF_READ (1 << 0) /* buffer intended for reading from device */
+#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
+#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
+#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
+#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
+#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
/* I/O hints for the BIO layer */
-#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */
-#define XBF_FUA (1 << 11)/* force cache write through mode */
-#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */
+#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */
+#define XBF_FUA (1 << 11)/* force cache write through mode */
+#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */
/* flags used only as arguments to access routines */
-#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */
-#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */
+#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */
+#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */
/* flags used only internally */
-#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */
-#define _XBF_KMEM (1 << 21)/* backed by heap memory */
-#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
+#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */
+#define _XBF_KMEM (1 << 21)/* backed by heap memory */
+#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
+#define _XBF_COMPOUND (1 << 23)/* compound buffer */
+#define _XBF_LRU_DISPOSE (1 << 24)/* buffer being discarded */
typedef unsigned int xfs_buf_flags_t;
@@ -71,11 +73,13 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_SYNCIO, "SYNCIO" }, \
{ XBF_FUA, "FUA" }, \
{ XBF_FLUSH, "FLUSH" }, \
- { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\
+ { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\
{ XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
- { _XBF_DELWRI_Q, "DELWRI_Q" }
+ { _XBF_DELWRI_Q, "DELWRI_Q" }, \
+ { _XBF_COMPOUND, "COMPOUND" }, \
+ { _XBF_LRU_DISPOSE, "LRU_DISPOSE" }
typedef struct xfs_buftarg {
dev_t bt_dev;
@@ -96,8 +100,22 @@ typedef struct xfs_buftarg {
struct xfs_buf;
typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
+
#define XB_PAGES 2
+struct xfs_buf_map {
+ xfs_daddr_t bm_bn; /* block number for I/O */
+ int bm_len; /* size of I/O */
+};
+
+#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \
+ struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) };
+
+struct xfs_buf_ops {
+ void (*verify_read)(struct xfs_buf *);
+ void (*verify_write)(struct xfs_buf *);
+};
+
typedef struct xfs_buf {
/*
* first cacheline holds all the fields needed for an uncontended cache
@@ -107,14 +125,19 @@ typedef struct xfs_buf {
* fast-path on locking.
*/
struct rb_node b_rbnode; /* rbtree node */
- xfs_daddr_t b_bn; /* block number for I/O */
+ xfs_daddr_t b_bn; /* block number of buffer */
int b_length; /* size of buffer in BBs */
atomic_t b_hold; /* reference count */
atomic_t b_lru_ref; /* lru reclaim ref count */
xfs_buf_flags_t b_flags; /* status flags */
struct semaphore b_sema; /* semaphore for lockables */
+ /*
+ * concurrent access to b_lru and b_lru_flags is protected by
+ * bt_lru_lock and not by b_sema
+ */
struct list_head b_lru; /* lru list */
+ xfs_buf_flags_t b_lru_flags; /* internal lru status flags */
wait_queue_head_t b_waiters; /* unpin waiters */
struct list_head b_list;
struct xfs_perag *b_pag; /* contains rbtree root */
@@ -127,42 +150,107 @@ typedef struct xfs_buf {
struct xfs_trans *b_transp;
struct page **b_pages; /* array of page pointers */
struct page *b_page_array[XB_PAGES]; /* inline pages */
+ struct xfs_buf_map *b_maps; /* compound buffer map */
+ struct xfs_buf_map __b_map; /* inline compound buffer map */
+ int b_map_count;
int b_io_length; /* IO size in BBs */
atomic_t b_pin_count; /* pin count */
atomic_t b_io_remaining; /* #outstanding I/O requests */
unsigned int b_page_count; /* size of page array */
unsigned int b_offset; /* page offset in first page */
unsigned short b_error; /* error code on I/O */
+ const struct xfs_buf_ops *b_ops;
+
#ifdef XFS_BUF_LOCK_TRACKING
int b_last_holder;
#endif
} xfs_buf_t;
-
/* Finding and Reading Buffers */
-struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, xfs_daddr_t blkno,
- size_t numblks, xfs_buf_flags_t flags,
- struct xfs_buf *new_bp);
-#define xfs_incore(buftarg,blkno,len,lockit) \
- _xfs_buf_find(buftarg, blkno ,len, lockit, NULL)
-
-struct xfs_buf *xfs_buf_get(struct xfs_buftarg *target, xfs_daddr_t blkno,
- size_t numblks, xfs_buf_flags_t flags);
-struct xfs_buf *xfs_buf_read(struct xfs_buftarg *target, xfs_daddr_t blkno,
- size_t numblks, xfs_buf_flags_t flags);
-void xfs_buf_readahead(struct xfs_buftarg *target, xfs_daddr_t blkno,
- size_t numblks);
+struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target,
+ struct xfs_buf_map *map, int nmaps,
+ xfs_buf_flags_t flags, struct xfs_buf *new_bp);
+
+static inline struct xfs_buf *
+xfs_incore(
+ struct xfs_buftarg *target,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ xfs_buf_flags_t flags)
+{
+ DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+ return _xfs_buf_find(target, &map, 1, flags, NULL);
+}
+
+struct xfs_buf *_xfs_buf_alloc(struct xfs_buftarg *target,
+ struct xfs_buf_map *map, int nmaps,
+ xfs_buf_flags_t flags);
+
+static inline struct xfs_buf *
+xfs_buf_alloc(
+ struct xfs_buftarg *target,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ xfs_buf_flags_t flags)
+{
+ DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+ return _xfs_buf_alloc(target, &map, 1, flags);
+}
+
+struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target,
+ struct xfs_buf_map *map, int nmaps,
+ xfs_buf_flags_t flags);
+struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target,
+ struct xfs_buf_map *map, int nmaps,
+ xfs_buf_flags_t flags,
+ const struct xfs_buf_ops *ops);
+void xfs_buf_readahead_map(struct xfs_buftarg *target,
+ struct xfs_buf_map *map, int nmaps,
+ const struct xfs_buf_ops *ops);
+
+static inline struct xfs_buf *
+xfs_buf_get(
+ struct xfs_buftarg *target,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ xfs_buf_flags_t flags)
+{
+ DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+ return xfs_buf_get_map(target, &map, 1, flags);
+}
+
+static inline struct xfs_buf *
+xfs_buf_read(
+ struct xfs_buftarg *target,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ xfs_buf_flags_t flags,
+ const struct xfs_buf_ops *ops)
+{
+ DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+ return xfs_buf_read_map(target, &map, 1, flags, ops);
+}
+
+static inline void
+xfs_buf_readahead(
+ struct xfs_buftarg *target,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ const struct xfs_buf_ops *ops)
+{
+ DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+ return xfs_buf_readahead_map(target, &map, 1, ops);
+}
struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks);
-struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *target, xfs_daddr_t blkno,
- size_t numblks, xfs_buf_flags_t flags);
void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks);
int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
int flags);
struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target,
- xfs_daddr_t daddr, size_t numblks, int flags);
+ xfs_daddr_t daddr, size_t numblks, int flags,
+ const struct xfs_buf_ops *ops);
void xfs_buf_hold(struct xfs_buf *bp);
/* Releasing Buffers */
@@ -232,8 +320,18 @@ void xfs_buf_stale(struct xfs_buf *bp);
#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE)
#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE)
-#define XFS_BUF_ADDR(bp) ((bp)->b_bn)
-#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno))
+/*
+ * These macros use the IO block map rather than b_bn. b_bn is now really
+ * just for the buffer cache index for cached buffers. As IO does not use b_bn
+ * anymore, uncached buffers do not use b_bn at all and hence must modify the IO
+ * map directly. Uncached buffers are not allowed to be discontiguous, so this
+ * is safe to do.
+ *
+ * In future, uncached buffers will pass the block number directly to the io
+ * request function and hence these macros will go away at that point.
+ */
+#define XFS_BUF_ADDR(bp) ((bp)->b_maps[0].bm_bn)
+#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno))
static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
{
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index d9e451115f9..3f9949fee39 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -71,7 +71,7 @@ xfs_buf_item_log_debug(
chunk_num = byte >> XFS_BLF_SHIFT;
word_num = chunk_num >> BIT_TO_WORD_SHIFT;
bit_num = chunk_num & (NBWORD - 1);
- wordp = &(bip->bli_format.blf_data_map[word_num]);
+ wordp = &(bip->__bli_format.blf_data_map[word_num]);
bit_set = *wordp & (1 << bit_num);
ASSERT(bit_set);
byte++;
@@ -153,33 +153,25 @@ STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
* If the XFS_BLI_STALE flag has been set, then log nothing.
*/
STATIC uint
-xfs_buf_item_size(
- struct xfs_log_item *lip)
+xfs_buf_item_size_segment(
+ struct xfs_buf_log_item *bip,
+ struct xfs_buf_log_format *blfp)
{
- struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;
uint nvecs;
int next_bit;
int last_bit;
- ASSERT(atomic_read(&bip->bli_refcount) > 0);
- if (bip->bli_flags & XFS_BLI_STALE) {
- /*
- * The buffer is stale, so all we need to log
- * is the buf log format structure with the
- * cancel flag in it.
- */
- trace_xfs_buf_item_size_stale(bip);
- ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
- return 1;
- }
+ last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
+ if (last_bit == -1)
+ return 0;
+
+ /*
+ * initial count for a dirty buffer is 2 vectors - the format structure
+ * and the first dirty region.
+ */
+ nvecs = 2;
- ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
- nvecs = 1;
- last_bit = xfs_next_bit(bip->bli_format.blf_data_map,
- bip->bli_format.blf_map_size, 0);
- ASSERT(last_bit != -1);
- nvecs++;
while (last_bit != -1) {
/*
* This takes the bit number to start looking from and
@@ -187,16 +179,15 @@ xfs_buf_item_size(
* if there are no more bits set or the start bit is
* beyond the end of the bitmap.
*/
- next_bit = xfs_next_bit(bip->bli_format.blf_data_map,
- bip->bli_format.blf_map_size,
- last_bit + 1);
+ next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
+ last_bit + 1);
/*
* If we run out of bits, leave the loop,
* else if we find a new set of bits bump the number of vecs,
* else keep scanning the current set of bits.
*/
if (next_bit == -1) {
- last_bit = -1;
+ break;
} else if (next_bit != last_bit + 1) {
last_bit = next_bit;
nvecs++;
@@ -210,22 +201,73 @@ xfs_buf_item_size(
}
}
- trace_xfs_buf_item_size(bip);
return nvecs;
}
/*
- * This is called to fill in the vector of log iovecs for the
- * given log buf item. It fills the first entry with a buf log
- * format structure, and the rest point to contiguous chunks
- * within the buffer.
+ * This returns the number of log iovecs needed to log the given buf log item.
+ *
+ * It calculates this as 1 iovec for the buf log format structure and 1 for each
+ * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged
+ * in a single iovec.
+ *
+ * Discontiguous buffers need a format structure per region that is being
+ * logged. This makes the changes in the buffer appear to log recovery as though
+ * they came from separate buffers, just like would occur if multiple buffers
+ * were used instead of a single discontiguous buffer. This enables
+ * discontiguous buffers to be in-memory constructs, completely transparent to
+ * what ends up on disk.
+ *
+ * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
+ * format structures.
*/
-STATIC void
-xfs_buf_item_format(
- struct xfs_log_item *lip,
- struct xfs_log_iovec *vecp)
+STATIC uint
+xfs_buf_item_size(
+ struct xfs_log_item *lip)
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ uint nvecs;
+ int i;
+
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+ if (bip->bli_flags & XFS_BLI_STALE) {
+ /*
+ * The buffer is stale, so all we need to log
+ * is the buf log format structure with the
+ * cancel flag in it.
+ */
+ trace_xfs_buf_item_size_stale(bip);
+ ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
+ return bip->bli_format_count;
+ }
+
+ ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
+
+ /*
+ * the vector count is based on the number of buffer vectors we have
+ * dirty bits in. This will only be greater than one when we have a
+ * compound buffer with more than one segment dirty. Hence for compound
+ * buffers we need to track which segment the dirty bits correspond to,
+ * and when we move from one segment to the next increment the vector
+ * count for the extra buf log format structure that will need to be
+ * written.
+ */
+ nvecs = 0;
+ for (i = 0; i < bip->bli_format_count; i++) {
+ nvecs += xfs_buf_item_size_segment(bip, &bip->bli_formats[i]);
+ }
+
+ trace_xfs_buf_item_size(bip);
+ return nvecs;
+}
+
+static struct xfs_log_iovec *
+xfs_buf_item_format_segment(
+ struct xfs_buf_log_item *bip,
+ struct xfs_log_iovec *vecp,
+ uint offset,
+ struct xfs_buf_log_format *blfp)
+{
struct xfs_buf *bp = bip->bli_buf;
uint base_size;
uint nvecs;
@@ -235,40 +277,33 @@ xfs_buf_item_format(
uint nbits;
uint buffer_offset;
- ASSERT(atomic_read(&bip->bli_refcount) > 0);
- ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
- (bip->bli_flags & XFS_BLI_STALE));
+ /* copy the flags across from the base format item */
+ blfp->blf_flags = bip->__bli_format.blf_flags;
/*
- * The size of the base structure is the size of the
- * declared structure plus the space for the extra words
- * of the bitmap. We subtract one from the map size, because
- * the first element of the bitmap is accounted for in the
- * size of the base structure.
+ * Base size is the actual size of the ondisk structure - it reflects
+ * the actual size of the dirty bitmap rather than the size of the in
+ * memory structure.
*/
- base_size =
- (uint)(sizeof(xfs_buf_log_format_t) +
- ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
- vecp->i_addr = &bip->bli_format;
+ base_size = offsetof(struct xfs_buf_log_format, blf_data_map) +
+ (blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
+
+ nvecs = 0;
+ first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
+ if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
+ /*
+ * If the map is not dirty in the transaction, mark
+ * the size as zero and do not advance the vector pointer.
+ */
+ goto out;
+ }
+
+ vecp->i_addr = blfp;
vecp->i_len = base_size;
vecp->i_type = XLOG_REG_TYPE_BFORMAT;
vecp++;
nvecs = 1;
- /*
- * If it is an inode buffer, transfer the in-memory state to the
- * format flags and clear the in-memory state. We do not transfer
- * this state if the inode buffer allocation has not yet been committed
- * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
- * correct replay of the inode allocation.
- */
- if (bip->bli_flags & XFS_BLI_INODE_BUF) {
- if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
- xfs_log_item_in_current_chkpt(lip)))
- bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
- bip->bli_flags &= ~XFS_BLI_INODE_BUF;
- }
-
if (bip->bli_flags & XFS_BLI_STALE) {
/*
* The buffer is stale, so all we need to log
@@ -276,17 +311,14 @@ xfs_buf_item_format(
* cancel flag in it.
*/
trace_xfs_buf_item_format_stale(bip);
- ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
- bip->bli_format.blf_size = nvecs;
- return;
+ ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
+ goto out;
}
/*
* Fill in an iovec for each set of contiguous chunks.
*/
- first_bit = xfs_next_bit(bip->bli_format.blf_data_map,
- bip->bli_format.blf_map_size, 0);
- ASSERT(first_bit != -1);
+
last_bit = first_bit;
nbits = 1;
for (;;) {
@@ -296,9 +328,8 @@ xfs_buf_item_format(
* if there are no more bits set or the start bit is
* beyond the end of the bitmap.
*/
- next_bit = xfs_next_bit(bip->bli_format.blf_data_map,
- bip->bli_format.blf_map_size,
- (uint)last_bit + 1);
+ next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
+ (uint)last_bit + 1);
/*
* If we run out of bits fill in the last iovec and get
* out of the loop.
@@ -309,14 +340,14 @@ xfs_buf_item_format(
* keep counting and scanning.
*/
if (next_bit == -1) {
- buffer_offset = first_bit * XFS_BLF_CHUNK;
+ buffer_offset = offset + first_bit * XFS_BLF_CHUNK;
vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
vecp->i_len = nbits * XFS_BLF_CHUNK;
vecp->i_type = XLOG_REG_TYPE_BCHUNK;
nvecs++;
break;
} else if (next_bit != last_bit + 1) {
- buffer_offset = first_bit * XFS_BLF_CHUNK;
+ buffer_offset = offset + first_bit * XFS_BLF_CHUNK;
vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
vecp->i_len = nbits * XFS_BLF_CHUNK;
vecp->i_type = XLOG_REG_TYPE_BCHUNK;
@@ -325,14 +356,17 @@ xfs_buf_item_format(
first_bit = next_bit;
last_bit = next_bit;
nbits = 1;
- } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) !=
- (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) +
+ } else if (xfs_buf_offset(bp, offset +
+ (next_bit << XFS_BLF_SHIFT)) !=
+ (xfs_buf_offset(bp, offset +
+ (last_bit << XFS_BLF_SHIFT)) +
XFS_BLF_CHUNK)) {
- buffer_offset = first_bit * XFS_BLF_CHUNK;
+ buffer_offset = offset + first_bit * XFS_BLF_CHUNK;
vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
vecp->i_len = nbits * XFS_BLF_CHUNK;
vecp->i_type = XLOG_REG_TYPE_BCHUNK;
-/* You would think we need to bump the nvecs here too, but we do not
+/*
+ * You would think we need to bump the nvecs here too, but we do not
* this number is used by recovery, and it gets confused by the boundary
* split here
* nvecs++;
@@ -346,7 +380,50 @@ xfs_buf_item_format(
nbits++;
}
}
- bip->bli_format.blf_size = nvecs;
+out:
+ blfp->blf_size = nvecs;
+ return vecp;
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given log buf item. It fills the first entry with a buf log
+ * format structure, and the rest point to contiguous chunks
+ * within the buffer.
+ */
+STATIC void
+xfs_buf_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_iovec *vecp)
+{
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
+ uint offset = 0;
+ int i;
+
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+ ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
+ (bip->bli_flags & XFS_BLI_STALE));
+
+ /*
+ * If it is an inode buffer, transfer the in-memory state to the
+ * format flags and clear the in-memory state. We do not transfer
+ * this state if the inode buffer allocation has not yet been committed
+ * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
+ * correct replay of the inode allocation.
+ */
+ if (bip->bli_flags & XFS_BLI_INODE_BUF) {
+ if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
+ xfs_log_item_in_current_chkpt(lip)))
+ bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
+ bip->bli_flags &= ~XFS_BLI_INODE_BUF;
+ }
+
+ for (i = 0; i < bip->bli_format_count; i++) {
+ vecp = xfs_buf_item_format_segment(bip, vecp, offset,
+ &bip->bli_formats[i]);
+ offset += bp->b_maps[i].bm_len;
+ }
/*
* Check to make sure everything is consistent.
@@ -418,7 +495,7 @@ xfs_buf_item_unpin(
ASSERT(bip->bli_flags & XFS_BLI_STALE);
ASSERT(xfs_buf_islocked(bp));
ASSERT(XFS_BUF_ISSTALE(bp));
- ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
+ ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
trace_xfs_buf_item_unpin_stale(bip);
@@ -459,7 +536,25 @@ xfs_buf_item_unpin(
}
xfs_buf_relse(bp);
} else if (freed && remove) {
+ /*
+ * There are currently two references to the buffer - the active
+ * LRU reference and the buf log item. What we are about to do
+ * here - simulate a failed IO completion - requires 3
+ * references.
+ *
+ * The LRU reference is removed by the xfs_buf_stale() call. The
+ * buf item reference is removed by the xfs_buf_iodone()
+ * callback that is run by xfs_buf_do_callbacks() during ioend
+ * processing (via the bp->b_iodone callback), and then finally
+ * the ioend processing will drop the IO reference if the buffer
+ * is marked XBF_ASYNC.
+ *
+ * Hence we need to take an additional reference here so that IO
+ * completion processing doesn't free the buffer prematurely.
+ */
xfs_buf_lock(bp);
+ xfs_buf_hold(bp);
+ bp->b_flags |= XBF_ASYNC;
xfs_buf_ioerror(bp, EIO);
XFS_BUF_UNDONE(bp);
xfs_buf_stale(bp);
@@ -516,7 +611,7 @@ xfs_buf_item_unlock(
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;
- int aborted;
+ int aborted, clean, i;
uint hold;
/* Clear the buffer's association with this transaction. */
@@ -546,7 +641,7 @@ xfs_buf_item_unlock(
*/
if (bip->bli_flags & XFS_BLI_STALE) {
trace_xfs_buf_item_unlock_stale(bip);
- ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
+ ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
if (!aborted) {
atomic_dec(&bip->bli_refcount);
return;
@@ -557,12 +652,27 @@ xfs_buf_item_unlock(
/*
* If the buf item isn't tracking any data, free it, otherwise drop the
- * reference we hold to it.
+ * reference we hold to it. If we are aborting the transaction, this may
+ * be the only reference to the buf item, so we free it anyway
+ * regardless of whether it is dirty or not. A dirty abort implies a
+ * shutdown, anyway.
*/
- if (xfs_bitmap_empty(bip->bli_format.blf_data_map,
- bip->bli_format.blf_map_size))
+ clean = 1;
+ for (i = 0; i < bip->bli_format_count; i++) {
+ if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
+ bip->bli_formats[i].blf_map_size)) {
+ clean = 0;
+ break;
+ }
+ }
+ if (clean)
xfs_buf_item_relse(bp);
- else
+ else if (aborted) {
+ if (atomic_dec_and_test(&bip->bli_refcount)) {
+ ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
+ xfs_buf_item_relse(bp);
+ }
+ } else
atomic_dec(&bip->bli_refcount);
if (!hold)
@@ -622,6 +732,35 @@ static const struct xfs_item_ops xfs_buf_item_ops = {
.iop_committing = xfs_buf_item_committing
};
+/*
+ * Attach @count buf log format headers to @bip.
+ *
+ * An item with a single header uses the one embedded in the item itself;
+ * any other count gets a zeroed, separately allocated array. Returns 0 on
+ * success or ENOMEM when the array allocation yields NULL.
+ */
+STATIC int
+xfs_buf_item_get_format(
+	struct xfs_buf_log_item	*bip,
+	int			count)
+{
+	struct xfs_buf_log_format	*formats;
+
+	ASSERT(bip->bli_formats == NULL);
+	bip->bli_format_count = count;
+
+	if (count != 1) {
+		formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
+				      KM_SLEEP);
+	} else {
+		formats = &bip->__bli_format;
+	}
+
+	bip->bli_formats = formats;
+	return formats ? 0 : ENOMEM;
+}
+
+/*
+ * Release the format header array of @bip if it was allocated separately.
+ * When bli_formats points at the embedded header nothing is freed and the
+ * pointer is left as it is.
+ */
+STATIC void
+xfs_buf_item_free_format(
+	struct xfs_buf_log_item	*bip)
+{
+	if (bip->bli_formats == &bip->__bli_format)
+		return;
+
+	kmem_free(bip->bli_formats);
+	bip->bli_formats = NULL;
+}
/*
* Allocate a new buf log item to go with the given buffer.
@@ -639,6 +778,8 @@ xfs_buf_item_init(
xfs_buf_log_item_t *bip;
int chunks;
int map_size;
+ int error;
+ int i;
/*
* Check to see if there is already a buf log item for
@@ -650,25 +791,33 @@ xfs_buf_item_init(
if (lip != NULL && lip->li_type == XFS_LI_BUF)
return;
- /*
- * chunks is the number of XFS_BLF_CHUNK size pieces
- * the buffer can be divided into. Make sure not to
- * truncate any pieces. map_size is the size of the
- * bitmap needed to describe the chunks of the buffer.
- */
- chunks = (int)((BBTOB(bp->b_length) + (XFS_BLF_CHUNK - 1)) >>
- XFS_BLF_SHIFT);
- map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
-
- bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
- KM_SLEEP);
+ bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP);
xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
bip->bli_buf = bp;
xfs_buf_hold(bp);
- bip->bli_format.blf_type = XFS_LI_BUF;
- bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
- bip->bli_format.blf_len = (ushort)bp->b_length;
- bip->bli_format.blf_map_size = map_size;
+
+ /*
+ * chunks is the number of XFS_BLF_CHUNK size pieces the buffer
+ * can be divided into. Make sure not to truncate any pieces.
+ * map_size is the size of the bitmap needed to describe the
+ * chunks of the buffer.
+ *
+ * Discontiguous buffer support follows the layout of the underlying
+ * buffer. This makes the implementation as simple as possible.
+ */
+ error = xfs_buf_item_get_format(bip, bp->b_map_count);
+ ASSERT(error == 0);
+
+ for (i = 0; i < bip->bli_format_count; i++) {
+ chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
+ XFS_BLF_CHUNK);
+ map_size = DIV_ROUND_UP(chunks, NBWORD);
+
+ bip->bli_formats[i].blf_type = XFS_LI_BUF;
+ bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn;
+ bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len;
+ bip->bli_formats[i].blf_map_size = map_size;
+ }
#ifdef XFS_TRANS_DEBUG
/*
@@ -699,10 +848,11 @@ xfs_buf_item_init(
* item's bitmap.
*/
void
-xfs_buf_item_log(
- xfs_buf_log_item_t *bip,
+xfs_buf_item_log_segment(
+ struct xfs_buf_log_item *bip,
uint first,
- uint last)
+ uint last,
+ uint *map)
{
uint first_bit;
uint last_bit;
@@ -715,12 +865,6 @@ xfs_buf_item_log(
uint mask;
/*
- * Mark the item as having some dirty data for
- * quick reference in xfs_buf_item_dirty.
- */
- bip->bli_flags |= XFS_BLI_DIRTY;
-
- /*
* Convert byte offsets to bit numbers.
*/
first_bit = first >> XFS_BLF_SHIFT;
@@ -736,7 +880,7 @@ xfs_buf_item_log(
* to set a bit in.
*/
word_num = first_bit >> BIT_TO_WORD_SHIFT;
- wordp = &(bip->bli_format.blf_data_map[word_num]);
+ wordp = &map[word_num];
/*
* Calculate the starting bit in the first word.
@@ -783,6 +927,51 @@ xfs_buf_item_log(
xfs_buf_item_log_debug(bip, first, last);
}
+/*
+ * Mark bytes first through last inclusive as dirty in the buf
+ * item's bitmap.
+ *
+ * first and last are byte offsets into the whole buffer. Every segment of
+ * the buffer (bp->b_maps[i]) has its own format structure and dirty bitmap
+ * (bip->bli_formats[i]), so the range is split at segment boundaries and
+ * each piece is recorded in the bitmap of the segment that contains it.
+ */
+void
+xfs_buf_item_log(
+	xfs_buf_log_item_t	*bip,
+	uint			first,
+	uint			last)
+{
+	int			i;
+	uint			start;
+	uint			next;
+	uint			end;
+	struct xfs_buf		*bp = bip->bli_buf;
+
+	/*
+	 * Mark the item as having some dirty data for
+	 * quick reference in xfs_buf_item_dirty.
+	 */
+	bip->bli_flags |= XFS_BLI_DIRTY;
+
+	/*
+	 * walk each buffer segment and mark them dirty appropriately.
+	 * start is the byte offset of the current segment within the buffer,
+	 * next is the byte offset of the segment that follows it.
+	 */
+	start = 0;
+	for (i = 0; i < bip->bli_format_count; i++) {
+		if (start > last)
+			break;
+
+		/*
+		 * bm_len counts basic blocks while first/last/start are byte
+		 * offsets, so always convert before doing arithmetic with it.
+		 */
+		next = start + BBTOB(bp->b_maps[i].bm_len);
+
+		/* the range starts beyond this segment, try the next one */
+		if (first >= next) {
+			start = next;
+			continue;
+		}
+
+		/* clamp the range to the bytes this segment actually owns */
+		if (first < start)
+			first = start;
+		end = next - 1;
+		if (end > last)
+			end = last;
+
+		/*
+		 * Each segment's bitmap is indexed from that segment's first
+		 * byte (the format code adds the segment offset back in), so
+		 * hand over segment relative offsets.
+		 *
+		 * NOTE(review): xfs_buf_item_log_segment() forwards these
+		 * offsets to xfs_buf_item_log_debug(); for segments other than
+		 * the first they are no longer whole-buffer offsets - confirm
+		 * that is acceptable for XFS_TRANS_DEBUG builds.
+		 */
+		xfs_buf_item_log_segment(bip, first - start, end - start,
+					 &bip->bli_formats[i].blf_data_map[0]);
+
+		start = next;
+	}
+}
+
/*
* Return 1 if the buffer has some data that has been logged (at any
@@ -804,6 +993,7 @@ xfs_buf_item_free(
kmem_free(bip->bli_logged);
#endif /* XFS_TRANS_DEBUG */
+ xfs_buf_item_free_format(bip);
kmem_zone_free(xfs_buf_item_zone, bip);
}
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index b6ecd2061e7..16def435944 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -21,23 +21,6 @@
extern kmem_zone_t *xfs_buf_item_zone;
/*
- * This is the structure used to lay out a buf log item in the
- * log. The data map describes which 128 byte chunks of the buffer
- * have been logged.
- * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything.
- */
-typedef struct xfs_buf_log_format {
- unsigned short blf_type; /* buf log item type indicator */
- unsigned short blf_size; /* size of this item */
- ushort blf_flags; /* misc state */
- ushort blf_len; /* number of blocks in this buf */
- __int64_t blf_blkno; /* starting blkno of this buf */
- unsigned int blf_map_size; /* size of data bitmap in words */
- unsigned int blf_data_map[1];/* variable size bitmap of */
- /* regions of buffer in this item */
-} xfs_buf_log_format_t;
-
-/*
* This flag indicates that the buffer contains on disk inodes
* and requires special recovery handling.
*/
@@ -61,6 +44,23 @@ typedef struct xfs_buf_log_format {
#define NBWORD (NBBY * sizeof(unsigned int))
/*
+ * This is the structure used to lay out a buf log item in the
+ * log. The data map describes which 128 byte chunks of the buffer
+ * have been logged.
+ *
+ * XFS_BLF_DATAMAP_SIZE is the number of words declared for the data map:
+ * (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) bits, NBWORD bits per word.
+ * blf_map_size says how many of those words are in use.
+ */
+#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
+
+typedef struct xfs_buf_log_format {
+ unsigned short blf_type; /* buf log item type indicator */
+ unsigned short blf_size; /* size of this item */
+ ushort blf_flags; /* misc state */
+ ushort blf_len; /* number of blocks in this buf */
+ __int64_t blf_blkno; /* starting blkno of this buf */
+ unsigned int blf_map_size; /* used size of data bitmap in words */
+ unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
+} xfs_buf_log_format_t;
+
+/*
* buf log item flags
*/
#define XFS_BLI_HOLD 0x01
@@ -102,7 +102,9 @@ typedef struct xfs_buf_log_item {
char *bli_orig; /* original buffer copy */
char *bli_logged; /* bytes logged (bitmap) */
#endif
- xfs_buf_log_format_t bli_format; /* in-log header */
+ int bli_format_count; /* count of headers */
+ struct xfs_buf_log_format *bli_formats; /* array of in-log header ptrs */
+ struct xfs_buf_log_format __bli_format; /* embedded in-log header */
} xfs_buf_log_item_t;
void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
diff --git a/fs/xfs/xfs_cksum.h b/fs/xfs/xfs_cksum.h
new file mode 100644
index 00000000000..fad1676ad8c
--- /dev/null
+++ b/fs/xfs/xfs_cksum.h
@@ -0,0 +1,63 @@
+#ifndef _XFS_CKSUM_H
+#define _XFS_CKSUM_H 1
+
+#define XFS_CRC_SEED (~(__uint32_t)0)
+
+/*
+ * Compute the intermediate CRC32c of @buffer (@length bytes) with the 32 bit
+ * checksum field at byte offset @cksum_offset treated as if it held zeroes.
+ * The value returned still has to go through xfs_end_cksum() before it can
+ * be stored or compared.
+ */
+static inline __uint32_t
+xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+	char		*tail = &buffer[cksum_offset + sizeof(__be32)];
+	size_t		tail_len = length - (cksum_offset + sizeof(__be32));
+	__uint32_t	zero = 0;
+	__uint32_t	crc;
+
+	/* everything in front of the checksum field */
+	crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset);
+
+	/* the checksum field itself is fed in as zeroes */
+	crc = crc32c(crc, &zero, sizeof(__u32));
+
+	/* everything behind the checksum field */
+	crc = crc32c(crc, tail, tail_len);
+	return crc;
+}
+
+/*
+ * Turn an intermediate checksum into its final on-disk representation.
+ *
+ * The CRC32c calculation works in LE format even on BE machines but hands
+ * the result back in host endian format, so it is converted to LE here to
+ * keep the on-disk value consistent, and then inverted.
+ */
+static inline __le32
+xfs_end_cksum(__uint32_t crc)
+{
+	__le32	le_crc = cpu_to_le32(crc);
+
+	return ~le_crc;
+}
+
+/*
+ * Compute the checksum of @buffer and store it, in on-disk format, in the
+ * checksum field located @cksum_offset bytes into the buffer.
+ */
+static inline void
+xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+	__le32	*cksum = (__le32 *)(buffer + cksum_offset);
+
+	*cksum = xfs_end_cksum(xfs_start_cksum(buffer, length, cksum_offset));
+}
+
+/*
+ * Recompute the checksum of @buffer and compare it with the value stored
+ * @cksum_offset bytes into the buffer. Returns non-zero when they match.
+ */
+static inline int
+xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+	__le32	expected = xfs_end_cksum(xfs_start_cksum(buffer, length,
+							 cksum_offset));
+	__le32	*ondisk = (__le32 *)(buffer + cksum_offset);
+
+	return *ondisk == expected;
+}
+
+#endif /* _XFS_CKSUM_H */
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 015b946c580..4d7696a0241 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -83,14 +83,92 @@ STATIC void xfs_da_node_unbalance(xfs_da_state_t *state,
/*
* Utility routines.
*/
-STATIC uint xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count);
-STATIC int xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp);
-STATIC xfs_dabuf_t *xfs_da_buf_make(int nbuf, xfs_buf_t **bps);
+STATIC uint xfs_da_node_lasthash(struct xfs_buf *bp, int *count);
+STATIC int xfs_da_node_order(struct xfs_buf *node1_bp,
+ struct xfs_buf *node2_bp);
STATIC int xfs_da_blk_unlink(xfs_da_state_t *state,
xfs_da_state_blk_t *drop_blk,
xfs_da_state_blk_t *save_blk);
STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state);
+/*
+ * Check the header of a da btree node block: the magic number must be
+ * XFS_DA_NODE_MAGIC and both the level and the entry count must be non-zero.
+ * On failure a corruption error is reported and EFSCORRUPTED is set on the
+ * buffer.
+ */
+static void
+xfs_da_node_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_da_node_hdr	*hdr = bp->b_addr;
+
+	if (hdr->info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) &&
+	    be16_to_cpu(hdr->level) > 0 &&
+	    be16_to_cpu(hdr->count) > 0)
+		return;
+
+	XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+	xfs_buf_ioerror(bp, EFSCORRUPTED);
+}
+
+/*
+ * Write verifier for da btree node buffers; it simply runs the common node
+ * header checks in xfs_da_node_verify().
+ */
+static void
+xfs_da_node_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_da_node_verify(bp);
+}
+
+/*
+ * Read verifier for blocks that were read as da btree nodes.
+ *
+ * Leaf/node format detection on trees is sketchy, so a node read can end up
+ * on a leaf level block when the tree was wrongly identified as being in node
+ * format. When the magic number says the block is really an attr leaf or a
+ * dir2 leafn block, switch the buffer over to the matching ops and run that
+ * read verifier instead. Any other unknown magic number is corruption.
+ */
+static void
+xfs_da_node_read_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_da_blkinfo	*info = bp->b_addr;
+	const struct xfs_buf_ops *leaf_ops;
+
+	switch (be16_to_cpu(info->magic)) {
+	case XFS_DA_NODE_MAGIC:
+		xfs_da_node_verify(bp);
+		return;
+	case XFS_ATTR_LEAF_MAGIC:
+		leaf_ops = &xfs_attr_leaf_buf_ops;
+		break;
+	case XFS_DIR2_LEAFN_MAGIC:
+		leaf_ops = &xfs_dir2_leafn_buf_ops;
+		break;
+	default:
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
+				     mp, info);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		return;
+	}
+
+	/* the block is a leaf: re-type the buffer and verify it as such */
+	bp->b_ops = leaf_ops;
+	bp->b_ops->verify_read(bp);
+}
+
+/*
+ * Buffer verifier operations for da btree node blocks.
+ */
+const struct xfs_buf_ops xfs_da_node_buf_ops = {
+ .verify_read = xfs_da_node_read_verify,
+ .verify_write = xfs_da_node_write_verify,
+};
+
+
+/*
+ * Read a da btree block through xfs_da_read_buf(), passing
+ * xfs_da_node_buf_ops as the buffer ops argument. All other arguments and
+ * the return value are passed straight through.
+ */
+int
+xfs_da_node_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp,
+ int which_fork)
+{
+ return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+ which_fork, &xfs_da_node_buf_ops);
+}
+
/*========================================================================
* Routines used for growing the Btree.
*========================================================================*/
@@ -100,10 +178,10 @@ STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state);
*/
int
xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
- xfs_dabuf_t **bpp, int whichfork)
+ struct xfs_buf **bpp, int whichfork)
{
xfs_da_intnode_t *node;
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
int error;
xfs_trans_t *tp;
@@ -114,7 +192,7 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
if (error)
return(error);
ASSERT(bp != NULL);
- node = bp->data;
+ node = bp->b_addr;
node->hdr.info.forw = 0;
node->hdr.info.back = 0;
node->hdr.info.magic = cpu_to_be16(XFS_DA_NODE_MAGIC);
@@ -122,9 +200,10 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
node->hdr.count = 0;
node->hdr.level = cpu_to_be16(level);
- xfs_da_log_buf(tp, bp,
+ xfs_trans_log_buf(tp, bp,
XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+ bp->b_ops = &xfs_da_node_buf_ops;
*bpp = bp;
return(0);
}
@@ -138,7 +217,7 @@ xfs_da_split(xfs_da_state_t *state)
{
xfs_da_state_blk_t *oldblk, *newblk, *addblk;
xfs_da_intnode_t *node;
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
int max, action, error, i;
trace_xfs_da_split(state->args);
@@ -203,7 +282,6 @@ xfs_da_split(xfs_da_state_t *state)
case XFS_DA_NODE_MAGIC:
error = xfs_da_node_split(state, oldblk, newblk, addblk,
max - i, &action);
- xfs_da_buf_done(addblk->bp);
addblk->bp = NULL;
if (error)
return(error); /* GROT: dir is inconsistent */
@@ -221,13 +299,6 @@ xfs_da_split(xfs_da_state_t *state)
* Update the btree to show the new hashval for this child.
*/
xfs_da_fixhashpath(state, &state->path);
- /*
- * If we won't need this block again, it's getting dropped
- * from the active path by the loop control, so we need
- * to mark it done now.
- */
- if (i > 0 || !addblk)
- xfs_da_buf_done(oldblk->bp);
}
if (!addblk)
return(0);
@@ -239,8 +310,6 @@ xfs_da_split(xfs_da_state_t *state)
oldblk = &state->path.blk[0];
error = xfs_da_root_split(state, oldblk, addblk);
if (error) {
- xfs_da_buf_done(oldblk->bp);
- xfs_da_buf_done(addblk->bp);
addblk->bp = NULL;
return(error); /* GROT: dir is inconsistent */
}
@@ -252,7 +321,7 @@ xfs_da_split(xfs_da_state_t *state)
* and the original block 0 could be at any position in the list.
*/
- node = oldblk->bp->data;
+ node = oldblk->bp->b_addr;
if (node->hdr.info.forw) {
if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
bp = addblk->bp;
@@ -260,13 +329,13 @@ xfs_da_split(xfs_da_state_t *state)
ASSERT(state->extravalid);
bp = state->extrablk.bp;
}
- node = bp->data;
+ node = bp->b_addr;
node->hdr.info.back = cpu_to_be32(oldblk->blkno);
- xfs_da_log_buf(state->args->trans, bp,
+ xfs_trans_log_buf(state->args->trans, bp,
XFS_DA_LOGRANGE(node, &node->hdr.info,
sizeof(node->hdr.info)));
}
- node = oldblk->bp->data;
+ node = oldblk->bp->b_addr;
if (node->hdr.info.back) {
if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) {
bp = addblk->bp;
@@ -274,14 +343,12 @@ xfs_da_split(xfs_da_state_t *state)
ASSERT(state->extravalid);
bp = state->extrablk.bp;
}
- node = bp->data;
+ node = bp->b_addr;
node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
- xfs_da_log_buf(state->args->trans, bp,
+ xfs_trans_log_buf(state->args->trans, bp,
XFS_DA_LOGRANGE(node, &node->hdr.info,
sizeof(node->hdr.info)));
}
- xfs_da_buf_done(oldblk->bp);
- xfs_da_buf_done(addblk->bp);
addblk->bp = NULL;
return(0);
}
@@ -298,7 +365,7 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
xfs_da_intnode_t *node, *oldroot;
xfs_da_args_t *args;
xfs_dablk_t blkno;
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
int error, size;
xfs_inode_t *dp;
xfs_trans_t *tp;
@@ -323,8 +390,8 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
if (error)
return(error);
ASSERT(bp != NULL);
- node = bp->data;
- oldroot = blk1->bp->data;
+ node = bp->b_addr;
+ oldroot = blk1->bp->b_addr;
if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) {
size = (int)((char *)&oldroot->btree[be16_to_cpu(oldroot->hdr.count)] -
(char *)oldroot);
@@ -335,8 +402,9 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
(char *)leaf);
}
memcpy(node, oldroot, size);
- xfs_da_log_buf(tp, bp, 0, size - 1);
- xfs_da_buf_done(blk1->bp);
+ xfs_trans_log_buf(tp, bp, 0, size - 1);
+
+ bp->b_ops = blk1->bp->b_ops;
blk1->bp = bp;
blk1->blkno = blkno;
@@ -348,7 +416,7 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
be16_to_cpu(node->hdr.level) + 1, &bp, args->whichfork);
if (error)
return(error);
- node = bp->data;
+ node = bp->b_addr;
node->btree[0].hashval = cpu_to_be32(blk1->hashval);
node->btree[0].before = cpu_to_be32(blk1->blkno);
node->btree[1].hashval = cpu_to_be32(blk2->hashval);
@@ -365,10 +433,9 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
#endif
/* Header is already logged by xfs_da_node_create */
- xfs_da_log_buf(tp, bp,
+ xfs_trans_log_buf(tp, bp,
XFS_DA_LOGRANGE(node, node->btree,
sizeof(xfs_da_node_entry_t) * 2));
- xfs_da_buf_done(bp);
return(0);
}
@@ -389,7 +456,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
trace_xfs_da_node_split(state->args);
- node = oldblk->bp->data;
+ node = oldblk->bp->b_addr;
ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
/*
@@ -436,7 +503,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
*
* If we had double-split op below us, then add the extra block too.
*/
- node = oldblk->bp->data;
+ node = oldblk->bp->b_addr;
if (oldblk->index <= be16_to_cpu(node->hdr.count)) {
oldblk->index++;
xfs_da_node_add(state, oldblk, addblk);
@@ -477,8 +544,8 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
trace_xfs_da_node_rebalance(state->args);
- node1 = blk1->bp->data;
- node2 = blk2->bp->data;
+ node1 = blk1->bp->b_addr;
+ node2 = blk2->bp->b_addr;
/*
* Figure out how many entries need to move, and in which direction.
* Swap the nodes around if that makes it simpler.
@@ -532,7 +599,7 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
btree_d = &node1->btree[be16_to_cpu(node1->hdr.count)];
memcpy(btree_d, btree_s, tmp);
be16_add_cpu(&node1->hdr.count, count);
- xfs_da_log_buf(tp, blk1->bp,
+ xfs_trans_log_buf(tp, blk1->bp,
XFS_DA_LOGRANGE(node1, btree_d, tmp));
/*
@@ -549,9 +616,9 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
/*
* Log header of node 1 and all current bits of node 2.
*/
- xfs_da_log_buf(tp, blk1->bp,
+ xfs_trans_log_buf(tp, blk1->bp,
XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr)));
- xfs_da_log_buf(tp, blk2->bp,
+ xfs_trans_log_buf(tp, blk2->bp,
XFS_DA_LOGRANGE(node2, &node2->hdr,
sizeof(node2->hdr) +
sizeof(node2->btree[0]) * be16_to_cpu(node2->hdr.count)));
@@ -560,8 +627,8 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* Record the last hashval from each block for upward propagation.
* (note: don't use the swapped node pointers)
*/
- node1 = blk1->bp->data;
- node2 = blk2->bp->data;
+ node1 = blk1->bp->b_addr;
+ node2 = blk2->bp->b_addr;
blk1->hashval = be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval);
blk2->hashval = be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval);
@@ -587,7 +654,7 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
trace_xfs_da_node_add(state->args);
- node = oldblk->bp->data;
+ node = oldblk->bp->b_addr;
ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count)));
ASSERT(newblk->blkno != 0);
@@ -606,10 +673,10 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
}
btree->hashval = cpu_to_be32(newblk->hashval);
btree->before = cpu_to_be32(newblk->blkno);
- xfs_da_log_buf(state->args->trans, oldblk->bp,
+ xfs_trans_log_buf(state->args->trans, oldblk->bp,
XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree)));
be16_add_cpu(&node->hdr.count, 1);
- xfs_da_log_buf(state->args->trans, oldblk->bp,
+ xfs_trans_log_buf(state->args->trans, oldblk->bp,
XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
/*
@@ -735,7 +802,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
xfs_da_intnode_t *oldroot;
xfs_da_args_t *args;
xfs_dablk_t child;
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
int error;
trace_xfs_da_root_join(state->args);
@@ -743,7 +810,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
args = state->args;
ASSERT(args != NULL);
ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
- oldroot = root_blk->bp->data;
+ oldroot = root_blk->bp->b_addr;
ASSERT(oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
ASSERT(!oldroot->hdr.info.forw);
ASSERT(!oldroot->hdr.info.back);
@@ -760,16 +827,23 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
*/
child = be32_to_cpu(oldroot->btree[0].before);
ASSERT(child != 0);
- error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp,
+ error = xfs_da_node_read(args->trans, args->dp, child, -1, &bp,
args->whichfork);
if (error)
return(error);
ASSERT(bp != NULL);
- xfs_da_blkinfo_onlychild_validate(bp->data,
+ xfs_da_blkinfo_onlychild_validate(bp->b_addr,
be16_to_cpu(oldroot->hdr.level));
- memcpy(root_blk->bp->data, bp->data, state->blocksize);
- xfs_da_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
+ /*
+ * This could be copying a leaf back into the root block in the case of
+ * there only being a single leaf block left in the tree. Hence we have
+ * to update the b_ops pointer as well to match the buffer type change
+ * that could occur.
+ */
+ memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize);
+ root_blk->bp->b_ops = bp->b_ops;
+ xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
error = xfs_da_shrink_inode(args, child, bp);
return(error);
}
@@ -791,7 +865,9 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
xfs_da_blkinfo_t *info;
int count, forward, error, retval, i;
xfs_dablk_t blkno;
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
+
+ trace_xfs_da_node_toosmall(state->args);
/*
* Check for the degenerate case of the block being over 50% full.
@@ -799,7 +875,7 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
* to coalesce with a sibling.
*/
blk = &state->path.blk[ state->path.active-1 ];
- info = blk->bp->data;
+ info = blk->bp->b_addr;
ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
node = (xfs_da_intnode_t *)info;
count = be16_to_cpu(node->hdr.count);
@@ -849,7 +925,7 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
blkno = be32_to_cpu(info->back);
if (blkno == 0)
continue;
- error = xfs_da_read_buf(state->args->trans, state->args->dp,
+ error = xfs_da_node_read(state->args->trans, state->args->dp,
blkno, -1, &bp, state->args->whichfork);
if (error)
return(error);
@@ -859,10 +935,10 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
count = state->node_ents;
count -= state->node_ents >> 2;
count -= be16_to_cpu(node->hdr.count);
- node = bp->data;
+ node = bp->b_addr;
ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
count -= be16_to_cpu(node->hdr.count);
- xfs_da_brelse(state->args->trans, bp);
+ xfs_trans_brelse(state->args->trans, bp);
if (count >= 0)
break; /* fits with at least 25% to spare */
}
@@ -914,6 +990,8 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
xfs_dahash_t lasthash=0;
int level, count;
+ trace_xfs_da_fixhashpath(state->args);
+
level = path->active-1;
blk = &path->blk[ level ];
switch (blk->magic) {
@@ -934,14 +1012,14 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
break;
}
for (blk--, level--; level >= 0; blk--, level--) {
- node = blk->bp->data;
+ node = blk->bp->b_addr;
ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
btree = &node->btree[ blk->index ];
if (be32_to_cpu(btree->hashval) == lasthash)
break;
blk->hashval = lasthash;
btree->hashval = cpu_to_be32(lasthash);
- xfs_da_log_buf(state->args->trans, blk->bp,
+ xfs_trans_log_buf(state->args->trans, blk->bp,
XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
lasthash = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval);
@@ -960,7 +1038,7 @@ xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk)
trace_xfs_da_node_remove(state->args);
- node = drop_blk->bp->data;
+ node = drop_blk->bp->b_addr;
ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count));
ASSERT(drop_blk->index >= 0);
@@ -972,15 +1050,15 @@ xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk)
tmp = be16_to_cpu(node->hdr.count) - drop_blk->index - 1;
tmp *= (uint)sizeof(xfs_da_node_entry_t);
memmove(btree, btree + 1, tmp);
- xfs_da_log_buf(state->args->trans, drop_blk->bp,
+ xfs_trans_log_buf(state->args->trans, drop_blk->bp,
XFS_DA_LOGRANGE(node, btree, tmp));
btree = &node->btree[be16_to_cpu(node->hdr.count)-1];
}
memset((char *)btree, 0, sizeof(xfs_da_node_entry_t));
- xfs_da_log_buf(state->args->trans, drop_blk->bp,
+ xfs_trans_log_buf(state->args->trans, drop_blk->bp,
XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
be16_add_cpu(&node->hdr.count, -1);
- xfs_da_log_buf(state->args->trans, drop_blk->bp,
+ xfs_trans_log_buf(state->args->trans, drop_blk->bp,
XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
/*
@@ -1005,8 +1083,8 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
trace_xfs_da_node_unbalance(state->args);
- drop_node = drop_blk->bp->data;
- save_node = save_blk->bp->data;
+ drop_node = drop_blk->bp->b_addr;
+ save_node = save_blk->bp->b_addr;
ASSERT(drop_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
ASSERT(save_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
tp = state->args->trans;
@@ -1023,13 +1101,13 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
tmp = be16_to_cpu(save_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t);
memmove(btree, &save_node->btree[0], tmp);
btree = &save_node->btree[0];
- xfs_da_log_buf(tp, save_blk->bp,
+ xfs_trans_log_buf(tp, save_blk->bp,
XFS_DA_LOGRANGE(save_node, btree,
(be16_to_cpu(save_node->hdr.count) + be16_to_cpu(drop_node->hdr.count)) *
sizeof(xfs_da_node_entry_t)));
} else {
btree = &save_node->btree[be16_to_cpu(save_node->hdr.count)];
- xfs_da_log_buf(tp, save_blk->bp,
+ xfs_trans_log_buf(tp, save_blk->bp,
XFS_DA_LOGRANGE(save_node, btree,
be16_to_cpu(drop_node->hdr.count) *
sizeof(xfs_da_node_entry_t)));
@@ -1042,7 +1120,7 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
memcpy(btree, &drop_node->btree[0], tmp);
be16_add_cpu(&save_node->hdr.count, be16_to_cpu(drop_node->hdr.count));
- xfs_da_log_buf(tp, save_blk->bp,
+ xfs_trans_log_buf(tp, save_blk->bp,
XFS_DA_LOGRANGE(save_node, &save_node->hdr,
sizeof(save_node->hdr)));
@@ -1093,14 +1171,14 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
* Read the next node down in the tree.
*/
blk->blkno = blkno;
- error = xfs_da_read_buf(args->trans, args->dp, blkno,
+ error = xfs_da_node_read(args->trans, args->dp, blkno,
-1, &blk->bp, args->whichfork);
if (error) {
blk->blkno = 0;
state->path.active--;
return(error);
}
- curr = blk->bp->data;
+ curr = blk->bp->b_addr;
blk->magic = be16_to_cpu(curr->magic);
ASSERT(blk->magic == XFS_DA_NODE_MAGIC ||
blk->magic == XFS_DIR2_LEAFN_MAGIC ||
@@ -1110,7 +1188,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
* Search an intermediate node for a match.
*/
if (blk->magic == XFS_DA_NODE_MAGIC) {
- node = blk->bp->data;
+ node = blk->bp->b_addr;
max = be16_to_cpu(node->hdr.count);
blk->hashval = be32_to_cpu(node->btree[max-1].hashval);
@@ -1216,15 +1294,15 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
xfs_da_blkinfo_t *old_info, *new_info, *tmp_info;
xfs_da_args_t *args;
int before=0, error;
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
/*
* Set up environment.
*/
args = state->args;
ASSERT(args != NULL);
- old_info = old_blk->bp->data;
- new_info = new_blk->bp->data;
+ old_info = old_blk->bp->b_addr;
+ new_info = new_blk->bp->b_addr;
ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
old_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
old_blk->magic == XFS_ATTR_LEAF_MAGIC);
@@ -1255,18 +1333,17 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
new_info->forw = cpu_to_be32(old_blk->blkno);
new_info->back = old_info->back;
if (old_info->back) {
- error = xfs_da_read_buf(args->trans, args->dp,
+ error = xfs_da_node_read(args->trans, args->dp,
be32_to_cpu(old_info->back),
-1, &bp, args->whichfork);
if (error)
return(error);
ASSERT(bp != NULL);
- tmp_info = bp->data;
+ tmp_info = bp->b_addr;
ASSERT(be16_to_cpu(tmp_info->magic) == be16_to_cpu(old_info->magic));
ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno);
tmp_info->forw = cpu_to_be32(new_blk->blkno);
- xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
- xfs_da_buf_done(bp);
+ xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
}
old_info->back = cpu_to_be32(new_blk->blkno);
} else {
@@ -1277,24 +1354,23 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
new_info->forw = old_info->forw;
new_info->back = cpu_to_be32(old_blk->blkno);
if (old_info->forw) {
- error = xfs_da_read_buf(args->trans, args->dp,
+ error = xfs_da_node_read(args->trans, args->dp,
be32_to_cpu(old_info->forw),
-1, &bp, args->whichfork);
if (error)
return(error);
ASSERT(bp != NULL);
- tmp_info = bp->data;
+ tmp_info = bp->b_addr;
ASSERT(tmp_info->magic == old_info->magic);
ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno);
tmp_info->back = cpu_to_be32(new_blk->blkno);
- xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
- xfs_da_buf_done(bp);
+ xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
}
old_info->forw = cpu_to_be32(new_blk->blkno);
}
- xfs_da_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
- xfs_da_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
+ xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
+ xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
return(0);
}
@@ -1302,12 +1378,14 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
* Compare two intermediate nodes for "order".
*/
STATIC int
-xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp)
+xfs_da_node_order(
+ struct xfs_buf *node1_bp,
+ struct xfs_buf *node2_bp)
{
xfs_da_intnode_t *node1, *node2;
- node1 = node1_bp->data;
- node2 = node2_bp->data;
+ node1 = node1_bp->b_addr;
+ node2 = node2_bp->b_addr;
ASSERT(node1->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) &&
node2->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) &&
@@ -1324,11 +1402,13 @@ xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp)
* Pick up the last hashvalue from an intermediate node.
*/
STATIC uint
-xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count)
+xfs_da_node_lasthash(
+ struct xfs_buf *bp,
+ int *count)
{
xfs_da_intnode_t *node;
- node = bp->data;
+ node = bp->b_addr;
ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
if (count)
*count = be16_to_cpu(node->hdr.count);
@@ -1346,7 +1426,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
{
xfs_da_blkinfo_t *drop_info, *save_info, *tmp_info;
xfs_da_args_t *args;
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
int error;
/*
@@ -1354,8 +1434,8 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
*/
args = state->args;
ASSERT(args != NULL);
- save_info = save_blk->bp->data;
- drop_info = drop_blk->bp->data;
+ save_info = save_blk->bp->b_addr;
+ drop_info = drop_blk->bp->b_addr;
ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
save_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
save_blk->magic == XFS_ATTR_LEAF_MAGIC);
@@ -1374,41 +1454,39 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
trace_xfs_da_unlink_back(args);
save_info->back = drop_info->back;
if (drop_info->back) {
- error = xfs_da_read_buf(args->trans, args->dp,
+ error = xfs_da_node_read(args->trans, args->dp,
be32_to_cpu(drop_info->back),
-1, &bp, args->whichfork);
if (error)
return(error);
ASSERT(bp != NULL);
- tmp_info = bp->data;
+ tmp_info = bp->b_addr;
ASSERT(tmp_info->magic == save_info->magic);
ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno);
tmp_info->forw = cpu_to_be32(save_blk->blkno);
- xfs_da_log_buf(args->trans, bp, 0,
+ xfs_trans_log_buf(args->trans, bp, 0,
sizeof(*tmp_info) - 1);
- xfs_da_buf_done(bp);
}
} else {
trace_xfs_da_unlink_forward(args);
save_info->forw = drop_info->forw;
if (drop_info->forw) {
- error = xfs_da_read_buf(args->trans, args->dp,
+ error = xfs_da_node_read(args->trans, args->dp,
be32_to_cpu(drop_info->forw),
-1, &bp, args->whichfork);
if (error)
return(error);
ASSERT(bp != NULL);
- tmp_info = bp->data;
+ tmp_info = bp->b_addr;
ASSERT(tmp_info->magic == save_info->magic);
ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno);
tmp_info->back = cpu_to_be32(save_blk->blkno);
- xfs_da_log_buf(args->trans, bp, 0,
+ xfs_trans_log_buf(args->trans, bp, 0,
sizeof(*tmp_info) - 1);
- xfs_da_buf_done(bp);
}
}
- xfs_da_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
+ xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
return(0);
}
@@ -1431,6 +1509,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
xfs_dablk_t blkno=0;
int level, error;
+ trace_xfs_da_path_shift(state->args);
+
/*
* Roll up the Btree looking for the first block where our
* current index is not at the edge of the block. Note that
@@ -1443,7 +1523,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
level = (path->active-1) - 1; /* skip bottom layer in path */
for (blk = &path->blk[level]; level >= 0; blk--, level--) {
ASSERT(blk->bp != NULL);
- node = blk->bp->data;
+ node = blk->bp->b_addr;
ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
if (forward && (blk->index < be16_to_cpu(node->hdr.count)-1)) {
blk->index++;
@@ -1471,18 +1551,18 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
* (if it's dirty, trans won't actually let go)
*/
if (release)
- xfs_da_brelse(args->trans, blk->bp);
+ xfs_trans_brelse(args->trans, blk->bp);
/*
* Read the next child block.
*/
blk->blkno = blkno;
- error = xfs_da_read_buf(args->trans, args->dp, blkno, -1,
- &blk->bp, args->whichfork);
+ error = xfs_da_node_read(args->trans, args->dp, blkno, -1,
+ &blk->bp, args->whichfork);
if (error)
return(error);
ASSERT(blk->bp != NULL);
- info = blk->bp->data;
+ info = blk->bp->b_addr;
ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
@@ -1702,11 +1782,13 @@ xfs_da_grow_inode(
* a bmap btree split to do that.
*/
STATIC int
-xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
- xfs_dabuf_t **dead_bufp)
+xfs_da_swap_lastblock(
+ xfs_da_args_t *args,
+ xfs_dablk_t *dead_blknop,
+ struct xfs_buf **dead_bufp)
{
xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno;
- xfs_dabuf_t *dead_buf, *last_buf, *sib_buf, *par_buf;
+ struct xfs_buf *dead_buf, *last_buf, *sib_buf, *par_buf;
xfs_fileoff_t lastoff;
xfs_inode_t *ip;
xfs_trans_t *tp;
@@ -1739,14 +1821,15 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
* Read the last block in the btree space.
*/
last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
- if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w)))
+ error = xfs_da_node_read(tp, ip, last_blkno, -1, &last_buf, w);
+ if (error)
return error;
/*
* Copy the last block into the dead buffer and log it.
*/
- memcpy(dead_buf->data, last_buf->data, mp->m_dirblksize);
- xfs_da_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1);
- dead_info = dead_buf->data;
+ memcpy(dead_buf->b_addr, last_buf->b_addr, mp->m_dirblksize);
+ xfs_trans_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1);
+ dead_info = dead_buf->b_addr;
/*
* Get values from the moved block.
*/
@@ -1765,9 +1848,10 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
* If the moved block has a left sibling, fix up the pointers.
*/
if ((sib_blkno = be32_to_cpu(dead_info->back))) {
- if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
+ error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
+ if (error)
goto done;
- sib_info = sib_buf->data;
+ sib_info = sib_buf->b_addr;
if (unlikely(
be32_to_cpu(sib_info->forw) != last_blkno ||
sib_info->magic != dead_info->magic)) {
@@ -1777,19 +1861,19 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
goto done;
}
sib_info->forw = cpu_to_be32(dead_blkno);
- xfs_da_log_buf(tp, sib_buf,
+ xfs_trans_log_buf(tp, sib_buf,
XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
sizeof(sib_info->forw)));
- xfs_da_buf_done(sib_buf);
sib_buf = NULL;
}
/*
* If the moved block has a right sibling, fix up the pointers.
*/
if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
- if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
+ error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
+ if (error)
goto done;
- sib_info = sib_buf->data;
+ sib_info = sib_buf->b_addr;
if (unlikely(
be32_to_cpu(sib_info->back) != last_blkno ||
sib_info->magic != dead_info->magic)) {
@@ -1799,10 +1883,9 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
goto done;
}
sib_info->back = cpu_to_be32(dead_blkno);
- xfs_da_log_buf(tp, sib_buf,
+ xfs_trans_log_buf(tp, sib_buf,
XFS_DA_LOGRANGE(sib_info, &sib_info->back,
sizeof(sib_info->back)));
- xfs_da_buf_done(sib_buf);
sib_buf = NULL;
}
par_blkno = mp->m_dirleafblk;
@@ -1811,9 +1894,10 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
* Walk down the tree looking for the parent of the moved block.
*/
for (;;) {
- if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
+ error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
+ if (error)
goto done;
- par_node = par_buf->data;
+ par_node = par_buf->b_addr;
if (unlikely(par_node->hdr.info.magic !=
cpu_to_be16(XFS_DA_NODE_MAGIC) ||
(level >= 0 && level != be16_to_cpu(par_node->hdr.level) + 1))) {
@@ -1837,7 +1921,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
par_blkno = be32_to_cpu(par_node->btree[entno].before);
if (level == dead_level + 1)
break;
- xfs_da_brelse(tp, par_buf);
+ xfs_trans_brelse(tp, par_buf);
par_buf = NULL;
}
/*
@@ -1853,7 +1937,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
if (entno < be16_to_cpu(par_node->hdr.count))
break;
par_blkno = be32_to_cpu(par_node->hdr.info.forw);
- xfs_da_brelse(tp, par_buf);
+ xfs_trans_brelse(tp, par_buf);
par_buf = NULL;
if (unlikely(par_blkno == 0)) {
XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
@@ -1861,9 +1945,10 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
error = XFS_ERROR(EFSCORRUPTED);
goto done;
}
- if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
+ error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
+ if (error)
goto done;
- par_node = par_buf->data;
+ par_node = par_buf->b_addr;
if (unlikely(
be16_to_cpu(par_node->hdr.level) != level ||
par_node->hdr.info.magic != cpu_to_be16(XFS_DA_NODE_MAGIC))) {
@@ -1878,20 +1963,18 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
* Update the parent entry pointing to the moved block.
*/
par_node->btree[entno].before = cpu_to_be32(dead_blkno);
- xfs_da_log_buf(tp, par_buf,
+ xfs_trans_log_buf(tp, par_buf,
XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before,
sizeof(par_node->btree[entno].before)));
- xfs_da_buf_done(par_buf);
- xfs_da_buf_done(dead_buf);
*dead_blknop = last_blkno;
*dead_bufp = last_buf;
return 0;
done:
if (par_buf)
- xfs_da_brelse(tp, par_buf);
+ xfs_trans_brelse(tp, par_buf);
if (sib_buf)
- xfs_da_brelse(tp, sib_buf);
- xfs_da_brelse(tp, last_buf);
+ xfs_trans_brelse(tp, sib_buf);
+ xfs_trans_brelse(tp, last_buf);
return error;
}
@@ -1899,8 +1982,10 @@ done:
* Remove a btree block from a directory or attribute.
*/
int
-xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
- xfs_dabuf_t *dead_buf)
+xfs_da_shrink_inode(
+ xfs_da_args_t *args,
+ xfs_dablk_t dead_blkno,
+ struct xfs_buf *dead_buf)
{
xfs_inode_t *dp;
int done, error, w, count;
@@ -1935,7 +2020,7 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
break;
}
}
- xfs_da_binval(tp, dead_buf);
+ xfs_trans_binval(tp, dead_buf);
return error;
}
@@ -1967,35 +2052,75 @@ xfs_da_map_covers_blocks(
}
/*
- * Make a dabuf.
- * Used for get_buf, read_buf, read_bufr, and reada_buf.
+ * Convert a struct xfs_bmbt_irec to a struct xfs_buf_map.
+ *
+ * For the single map case, it is assumed that the caller has provided a pointer
+ * to a valid xfs_buf_map. For the multiple map case, this function will
+ * allocate the xfs_buf_map to hold all the maps and replace the caller's single
+ * map pointer with the allocated map.
+ *
+ * On entry *nmaps must be 1 and nirecs must be at least 1 (both are asserted).
+ * On return *nmaps holds nirecs, and map[i] carries the XFS_FSB_TO_DADDR()
+ * and XFS_FSB_TO_BB() conversions of irecs[i]'s start block and block count.
+ *
+ * Returns 0 on success, or ENOMEM if the multi-map array is not allocated.
*/
-STATIC int
-xfs_da_do_buf(
- xfs_trans_t *trans,
- xfs_inode_t *dp,
- xfs_dablk_t bno,
- xfs_daddr_t *mappedbnop,
- xfs_dabuf_t **bpp,
- int whichfork,
- int caller)
+static int
+xfs_buf_map_from_irec(
+ struct xfs_mount *mp,
+ struct xfs_buf_map **mapp,
+ unsigned int *nmaps,
+ struct xfs_bmbt_irec *irecs,
+ unsigned int nirecs)
{
- xfs_buf_t *bp = NULL;
- xfs_buf_t **bplist;
- int error=0;
- int i;
- xfs_bmbt_irec_t map;
- xfs_bmbt_irec_t *mapp;
- xfs_daddr_t mappedbno;
- xfs_mount_t *mp;
- int nbplist=0;
- int nfsb;
- int nmap;
- xfs_dabuf_t *rbp;
+ struct xfs_buf_map *map;
+ int i;
+
+ ASSERT(*nmaps == 1);
+ ASSERT(nirecs >= 1);
+
+ if (nirecs > 1) {
+ map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_SLEEP);
+ if (!map)
+ return ENOMEM;
+ *mapp = map;
+ }
+
+ *nmaps = nirecs;
+ map = *mapp;
+ for (i = 0; i < *nmaps; i++) {
+ ASSERT(irecs[i].br_startblock != DELAYSTARTBLOCK &&
+ irecs[i].br_startblock != HOLESTARTBLOCK);
+ map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock);
+ map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount);
+ }
+ return 0;
+}
+
+/*
+ * Map the block we are given ready for reading. There are three possible return
+ * values:
+ * -1 - will be returned if we land in a hole and mappedbno == -2 so the
+ * caller knows not to execute a subsequent read.
+ * 0 - if we mapped the block successfully
+ * >0 - positive error number if there was an error.
+ *
+ * On entry *map must point at a caller-supplied xfs_buf_map and *nmaps must be
+ * 1 (both are asserted). On success *map and *nmaps are filled in by
+ * xfs_buf_map_from_irec(). The extent records used for the lookup are
+ * temporary: if an array was allocated for them it is freed before returning.
+ */
+static int
+xfs_dabuf_map(
+ struct xfs_trans *trans,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ xfs_daddr_t mappedbno,
+ int whichfork,
+ struct xfs_buf_map **map,
+ int *nmaps)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ int nfsb;
+ int error = 0;
+ struct xfs_bmbt_irec irec;
+ struct xfs_bmbt_irec *irecs = &irec;
+ int nirecs;
+
+ ASSERT(map && *map);
+ ASSERT(*nmaps == 1);
- mp = dp->i_mount;
nfsb = (whichfork == XFS_DATA_FORK) ? mp->m_dirblkfsbs : 1;
- mappedbno = *mappedbnop;
+
/*
* Caller doesn't have a mapping. -2 means don't complain
* if we land in a hole.
@@ -2004,112 +2129,151 @@ xfs_da_do_buf(
/*
* Optimize the one-block case.
*/
- if (nfsb == 1)
- mapp = &map;
- else
- mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP);
+ if (nfsb != 1)
+ irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_SLEEP);
- nmap = nfsb;
- error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, mapp,
- &nmap, xfs_bmapi_aflag(whichfork));
+ nirecs = nfsb;
+ error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
+ &nirecs, xfs_bmapi_aflag(whichfork));
if (error)
- goto exit0;
+ goto out;
} else {
- map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
- map.br_startoff = (xfs_fileoff_t)bno;
- map.br_blockcount = nfsb;
- mapp = &map;
- nmap = 1;
+ irecs->br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
+ irecs->br_startoff = (xfs_fileoff_t)bno;
+ irecs->br_blockcount = nfsb;
+ irecs->br_state = 0;
+ nirecs = 1;
}
- if (!xfs_da_map_covers_blocks(nmap, mapp, bno, nfsb)) {
- error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED);
+
+ if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) {
+ error = mappedbno == -2 ? -1 : XFS_ERROR(EFSCORRUPTED);
if (unlikely(error == EFSCORRUPTED)) {
if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
+ int i;
xfs_alert(mp, "%s: bno %lld dir: inode %lld",
__func__, (long long)bno,
(long long)dp->i_ino);
- for (i = 0; i < nmap; i++) {
+ /* dump every irec the lookup returned; *nmaps is still 1 here */
+ for (i = 0; i < nirecs; i++) {
xfs_alert(mp,
"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
i,
- (long long)mapp[i].br_startoff,
- (long long)mapp[i].br_startblock,
- (long long)mapp[i].br_blockcount,
- mapp[i].br_state);
+ (long long)irecs[i].br_startoff,
+ (long long)irecs[i].br_startblock,
+ (long long)irecs[i].br_blockcount,
+ irecs[i].br_state);
}
}
XFS_ERROR_REPORT("xfs_da_do_buf(1)",
XFS_ERRLEVEL_LOW, mp);
}
- goto exit0;
+ goto out;
}
- if (caller != 3 && nmap > 1) {
- bplist = kmem_alloc(sizeof(*bplist) * nmap, KM_SLEEP);
- nbplist = 0;
- } else
- bplist = NULL;
- /*
- * Turn the mapping(s) into buffer(s).
- */
- for (i = 0; i < nmap; i++) {
- int nmapped;
-
- mappedbno = XFS_FSB_TO_DADDR(mp, mapp[i].br_startblock);
- if (i == 0)
- *mappedbnop = mappedbno;
- nmapped = (int)XFS_FSB_TO_BB(mp, mapp[i].br_blockcount);
- switch (caller) {
- case 0:
- bp = xfs_trans_get_buf(trans, mp->m_ddev_targp,
- mappedbno, nmapped, 0);
- error = bp ? bp->b_error : XFS_ERROR(EIO);
- break;
- case 1:
- case 2:
- bp = NULL;
- error = xfs_trans_read_buf(mp, trans, mp->m_ddev_targp,
- mappedbno, nmapped, 0, &bp);
- break;
- case 3:
- xfs_buf_readahead(mp->m_ddev_targp, mappedbno, nmapped);
+ error = xfs_buf_map_from_irec(mp, map, nmaps, irecs, nirecs);
+out:
+ if (irecs != &irec)
+ kmem_free(irecs);
+ return error;
+}
+
+/*
+ * Get a buffer for the dir/attr block.
+ *
+ * *bpp is set to NULL on entry and only receives the buffer on success.
+ * If the block maps to a hole (xfs_dabuf_map() returns -1) this returns 0
+ * with *bpp left NULL. Any map array allocated by the mapping step is freed
+ * before returning.
+ */
+int
+xfs_da_get_buf(
+ struct xfs_trans *trans,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp,
+ int whichfork)
+{
+ struct xfs_buf *bp;
+ struct xfs_buf_map map;
+ struct xfs_buf_map *mapp;
+ int nmap;
+ int error;
+
+ *bpp = NULL;
+ mapp = &map;
+ nmap = 1;
+ error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
+ &mapp, &nmap);
+ if (error) {
+ /* mapping a hole is not an error, but we don't continue */
+ if (error == -1)
error = 0;
- bp = NULL;
- break;
- }
- if (error) {
- if (bp)
- xfs_trans_brelse(trans, bp);
- goto exit1;
- }
- if (!bp)
- continue;
- if (caller == 1) {
- if (whichfork == XFS_ATTR_FORK)
- xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF);
- else
- xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);
- }
- if (bplist) {
- bplist[nbplist++] = bp;
- }
+ goto out_free;
}
- /*
- * Build a dabuf structure.
- */
- if (bplist) {
- rbp = xfs_da_buf_make(nbplist, bplist);
- } else if (bp)
- rbp = xfs_da_buf_make(1, &bp);
+
+ bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp,
+ mapp, nmap, 0);
+ error = bp ? bp->b_error : XFS_ERROR(EIO);
+ if (error) {
+ /* bp is NULL when no buffer was returned; nothing to release */
+ if (bp)
+ xfs_trans_brelse(trans, bp);
+ goto out_free;
+ }
+
+ *bpp = bp;
+
+out_free:
+ if (mapp != &map)
+ kmem_free(mapp);
+
+ return error;
+}
+
+/*
+ * Get a buffer for the dir/attr block, fill in the contents.
+ *
+ * *bpp is set to NULL on entry and only receives the buffer on success.
+ * If the block maps to a hole (xfs_dabuf_map() returns -1) this returns 0
+ * with *bpp left NULL. The buffer is read through xfs_trans_read_buf_map()
+ * with the caller's ops, and its reference hint is set from whichfork.
+ * A failed magic number check releases the buffer and returns EFSCORRUPTED.
+ * Any map array allocated by the mapping step is freed before returning.
+ */
+int
+xfs_da_read_buf(
+ struct xfs_trans *trans,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp,
+ int whichfork,
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_buf *bp;
+ struct xfs_buf_map map;
+ struct xfs_buf_map *mapp;
+ int nmap;
+ int error;
+
+ *bpp = NULL;
+ mapp = &map;
+ nmap = 1;
+ error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
+ &mapp, &nmap);
+ if (error) {
+ /* mapping a hole is not an error, but we don't continue */
+ if (error == -1)
+ error = 0;
+ goto out_free;
+ }
+
+ error = xfs_trans_read_buf_map(dp->i_mount, trans,
+ dp->i_mount->m_ddev_targp,
+ mapp, nmap, 0, &bp, ops);
+ if (error)
+ goto out_free;
+
+ if (whichfork == XFS_ATTR_FORK)
+ xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF);
else
- rbp = NULL;
+ xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);
+
/*
- * For read_buf, check the magic number.
+ * This verification code will be moved to a CRC verification callback
+ * function so just leave it here unchanged until then.
*/
- if (caller == 1) {
- xfs_dir2_data_hdr_t *hdr = rbp->data;
- xfs_dir2_free_t *free = rbp->data;
- xfs_da_blkinfo_t *info = rbp->data;
+ {
+ xfs_dir2_data_hdr_t *hdr = bp->b_addr;
+ xfs_dir2_free_t *free = bp->b_addr;
+ xfs_da_blkinfo_t *info = bp->b_addr;
uint magic, magic1;
+ struct xfs_mount *mp = dp->i_mount;
magic = be16_to_cpu(info->magic);
magic1 = be32_to_cpu(hdr->magic);
@@ -2123,66 +2287,20 @@ xfs_da_do_buf(
(free->hdr.magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)),
mp, XFS_ERRTAG_DA_READ_BUF,
XFS_RANDOM_DA_READ_BUF))) {
- trace_xfs_da_btree_corrupt(rbp->bps[0], _RET_IP_);
+ trace_xfs_da_btree_corrupt(bp, _RET_IP_);
XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)",
XFS_ERRLEVEL_LOW, mp, info);
error = XFS_ERROR(EFSCORRUPTED);
- xfs_da_brelse(trans, rbp);
- nbplist = 0;
- goto exit1;
+ xfs_trans_brelse(trans, bp);
+ goto out_free;
}
}
- if (bplist) {
- kmem_free(bplist);
- }
- if (mapp != &map) {
- kmem_free(mapp);
- }
- if (bpp)
- *bpp = rbp;
- return 0;
-exit1:
- if (bplist) {
- for (i = 0; i < nbplist; i++)
- xfs_trans_brelse(trans, bplist[i]);
- kmem_free(bplist);
- }
-exit0:
+ *bpp = bp;
+out_free:
if (mapp != &map)
kmem_free(mapp);
- if (bpp)
- *bpp = NULL;
- return error;
-}
-
-/*
- * Get a buffer for the dir/attr block.
- */
-int
-xfs_da_get_buf(
- xfs_trans_t *trans,
- xfs_inode_t *dp,
- xfs_dablk_t bno,
- xfs_daddr_t mappedbno,
- xfs_dabuf_t **bpp,
- int whichfork)
-{
- return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 0);
-}
-/*
- * Get a buffer for the dir/attr block, fill in the contents.
- */
-int
-xfs_da_read_buf(
- xfs_trans_t *trans,
- xfs_inode_t *dp,
- xfs_dablk_t bno,
- xfs_daddr_t mappedbno,
- xfs_dabuf_t **bpp,
- int whichfork)
-{
- return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 1);
+ return error;
}
/*
@@ -2190,22 +2308,42 @@ xfs_da_read_buf(
*/
xfs_daddr_t
xfs_da_reada_buf(
- xfs_trans_t *trans,
- xfs_inode_t *dp,
- xfs_dablk_t bno,
- int whichfork)
+ struct xfs_trans *trans,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ xfs_daddr_t mappedbno,
+ int whichfork,
+ const struct xfs_buf_ops *ops)
{
- xfs_daddr_t rval;
+ struct xfs_buf_map map;
+ struct xfs_buf_map *mapp;
+ int nmap;
+ int error;
+
+ mapp = &map;
+ nmap = 1;
+ error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
+ &mapp, &nmap);
+ if (error) {
+ /* mapping a hole is not an error, but we don't continue */
+ if (error == -1)
+ error = 0; /* NOTE(review): the caller's mappedbno is then returned unchanged - confirm callers expect that */
+ goto out_free;
+ }
+
+ mappedbno = mapp[0].bm_bn; /* hand back the first mapped extent's bm_bn */
+ xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops); /* issue readahead for every mapped extent */
+
+out_free:
+ if (mapp != &map)
+ kmem_free(mapp);
- rval = -1;
- if (xfs_da_do_buf(trans, dp, bno, &rval, NULL, whichfork, 3))
+ if (error) /* any mapping error is reported to the caller as -1 */
return -1;
- else
- return rval;
+ return mappedbno;
}
kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */
-kmem_zone_t *xfs_dabuf_zone; /* dabuf zone */
/*
* Allocate a dir-state structure.
@@ -2225,13 +2363,8 @@ xfs_da_state_kill_altpath(xfs_da_state_t *state)
{
int i;
- for (i = 0; i < state->altpath.active; i++) {
- if (state->altpath.blk[i].bp) {
- if (state->altpath.blk[i].bp != state->path.blk[i].bp)
- xfs_da_buf_done(state->altpath.blk[i].bp);
- state->altpath.blk[i].bp = NULL;
- }
- }
+ for (i = 0; i < state->altpath.active; i++)
+ state->altpath.blk[i].bp = NULL;
state->altpath.active = 0;
}
@@ -2241,204 +2374,9 @@ xfs_da_state_kill_altpath(xfs_da_state_t *state)
void
xfs_da_state_free(xfs_da_state_t *state)
{
- int i;
-
xfs_da_state_kill_altpath(state);
- for (i = 0; i < state->path.active; i++) {
- if (state->path.blk[i].bp)
- xfs_da_buf_done(state->path.blk[i].bp);
- }
- if (state->extravalid && state->extrablk.bp)
- xfs_da_buf_done(state->extrablk.bp);
#ifdef DEBUG
memset((char *)state, 0, sizeof(*state));
#endif /* DEBUG */
kmem_zone_free(xfs_da_state_zone, state);
}
-
-/*
- * Create a dabuf.
- */
-/* ARGSUSED */
-STATIC xfs_dabuf_t *
-xfs_da_buf_make(int nbuf, xfs_buf_t **bps)
-{
- xfs_buf_t *bp;
- xfs_dabuf_t *dabuf;
- int i;
- int off;
-
- if (nbuf == 1)
- dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_NOFS);
- else
- dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_NOFS);
- dabuf->dirty = 0;
- if (nbuf == 1) {
- dabuf->nbuf = 1;
- bp = bps[0];
- dabuf->bbcount = bp->b_length;
- dabuf->data = bp->b_addr;
- dabuf->bps[0] = bp;
- } else {
- dabuf->nbuf = nbuf;
- for (i = 0, dabuf->bbcount = 0; i < nbuf; i++) {
- dabuf->bps[i] = bp = bps[i];
- dabuf->bbcount += bp->b_length;
- }
- dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP);
- for (i = off = 0; i < nbuf; i++, off += BBTOB(bp->b_length)) {
- bp = bps[i];
- memcpy((char *)dabuf->data + off, bp->b_addr,
- BBTOB(bp->b_length));
- }
- }
- return dabuf;
-}
-
-/*
- * Un-dirty a dabuf.
- */
-STATIC void
-xfs_da_buf_clean(xfs_dabuf_t *dabuf)
-{
- xfs_buf_t *bp;
- int i;
- int off;
-
- if (dabuf->dirty) {
- ASSERT(dabuf->nbuf > 1);
- dabuf->dirty = 0;
- for (i = off = 0; i < dabuf->nbuf;
- i++, off += BBTOB(bp->b_length)) {
- bp = dabuf->bps[i];
- memcpy(bp->b_addr, dabuf->data + off,
- BBTOB(bp->b_length));
- }
- }
-}
-
-/*
- * Release a dabuf.
- */
-void
-xfs_da_buf_done(xfs_dabuf_t *dabuf)
-{
- ASSERT(dabuf);
- ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
- if (dabuf->dirty)
- xfs_da_buf_clean(dabuf);
- if (dabuf->nbuf > 1) {
- kmem_free(dabuf->data);
- kmem_free(dabuf);
- } else {
- kmem_zone_free(xfs_dabuf_zone, dabuf);
- }
-}
-
-/*
- * Log transaction from a dabuf.
- */
-void
-xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last)
-{
- xfs_buf_t *bp;
- uint f;
- int i;
- uint l;
- int off;
-
- ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
- if (dabuf->nbuf == 1) {
- ASSERT(dabuf->data == dabuf->bps[0]->b_addr);
- xfs_trans_log_buf(tp, dabuf->bps[0], first, last);
- return;
- }
- dabuf->dirty = 1;
- ASSERT(first <= last);
- for (i = off = 0; i < dabuf->nbuf; i++, off += BBTOB(bp->b_length)) {
- bp = dabuf->bps[i];
- f = off;
- l = f + BBTOB(bp->b_length) - 1;
- if (f < first)
- f = first;
- if (l > last)
- l = last;
- if (f <= l)
- xfs_trans_log_buf(tp, bp, f - off, l - off);
- /*
- * B_DONE is set by xfs_trans_log buf.
- * If we don't set it on a new buffer (get not read)
- * then if we don't put anything in the buffer it won't
- * be set, and at commit it it released into the cache,
- * and then a read will fail.
- */
- else if (!(XFS_BUF_ISDONE(bp)))
- XFS_BUF_DONE(bp);
- }
- ASSERT(last < off);
-}
-
-/*
- * Release dabuf from a transaction.
- * Have to free up the dabuf before the buffers are released,
- * since the synchronization on the dabuf is really the lock on the buffer.
- */
-void
-xfs_da_brelse(xfs_trans_t *tp, xfs_dabuf_t *dabuf)
-{
- xfs_buf_t *bp;
- xfs_buf_t **bplist;
- int i;
- int nbuf;
-
- ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
- if ((nbuf = dabuf->nbuf) == 1) {
- bplist = &bp;
- bp = dabuf->bps[0];
- } else {
- bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP);
- memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist));
- }
- xfs_da_buf_done(dabuf);
- for (i = 0; i < nbuf; i++)
- xfs_trans_brelse(tp, bplist[i]);
- if (bplist != &bp)
- kmem_free(bplist);
-}
-
-/*
- * Invalidate dabuf from a transaction.
- */
-void
-xfs_da_binval(xfs_trans_t *tp, xfs_dabuf_t *dabuf)
-{
- xfs_buf_t *bp;
- xfs_buf_t **bplist;
- int i;
- int nbuf;
-
- ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
- if ((nbuf = dabuf->nbuf) == 1) {
- bplist = &bp;
- bp = dabuf->bps[0];
- } else {
- bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP);
- memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist));
- }
- xfs_da_buf_done(dabuf);
- for (i = 0; i < nbuf; i++)
- xfs_trans_binval(tp, bplist[i]);
- if (bplist != &bp)
- kmem_free(bplist);
-}
-
-/*
- * Get the first daddr from a dabuf.
- */
-xfs_daddr_t
-xfs_da_blkno(xfs_dabuf_t *dabuf)
-{
- ASSERT(dabuf->nbuf);
- ASSERT(dabuf->data);
- return XFS_BUF_ADDR(dabuf->bps[0]);
-}
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index dbf7c074ae7..ee5170c46ae 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -18,7 +18,6 @@
#ifndef __XFS_DA_BTREE_H__
#define __XFS_DA_BTREE_H__
-struct xfs_buf;
struct xfs_bmap_free;
struct xfs_inode;
struct xfs_mount;
@@ -32,7 +31,7 @@ struct zone;
/*
* This structure is common to both leaf nodes and non-leaf nodes in the Btree.
*
- * Is is used to manage a doubly linked list of all blocks at the same
+ * It is used to manage a doubly linked list of all blocks at the same
* level in the Btree, and to identify which type of block this is.
*/
#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */
@@ -133,24 +132,6 @@ typedef struct xfs_da_args {
{ XFS_DA_OP_CILOOKUP, "CILOOKUP" }
/*
- * Structure to describe buffer(s) for a block.
- * This is needed in the directory version 2 format case, when
- * multiple non-contiguous fsblocks might be needed to cover one
- * logical directory block.
- * If the buffer count is 1 then the data pointer points to the
- * same place as the b_addr field for the buffer, else to kmem_alloced memory.
- */
-typedef struct xfs_dabuf {
- int nbuf; /* number of buffer pointers present */
- short dirty; /* data needs to be copied back */
- short bbcount; /* how large is data in bbs */
- void *data; /* pointer for buffers' data */
- struct xfs_buf *bps[1]; /* actually nbuf of these */
-} xfs_dabuf_t;
-#define XFS_DA_BUF_SIZE(n) \
- (sizeof(xfs_dabuf_t) + sizeof(struct xfs_buf *) * ((n) - 1))
-
-/*
* Storage for holding state during Btree searches and split/join ops.
*
* Only need space for 5 intermediate nodes. With a minimum of 62-way
@@ -158,7 +139,7 @@ typedef struct xfs_dabuf {
* which is slightly more than enough.
*/
typedef struct xfs_da_state_blk {
- xfs_dabuf_t *bp; /* buffer containing block */
+ struct xfs_buf *bp; /* buffer containing block */
xfs_dablk_t blkno; /* filesystem blkno of buffer */
xfs_daddr_t disk_blkno; /* on-disk blkno (in BBs) of buffer */
int index; /* relevant index into block */
@@ -211,7 +192,7 @@ struct xfs_nameops {
* Routines used for growing the Btree.
*/
int xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
- xfs_dabuf_t **bpp, int whichfork);
+ struct xfs_buf **bpp, int whichfork);
int xfs_da_split(xfs_da_state_t *state);
/*
@@ -232,6 +213,9 @@ int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
*/
int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
xfs_da_state_blk_t *new_blk);
+int xfs_da_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t bno, xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp, int which_fork);
/*
* Utility routines.
@@ -241,14 +225,16 @@ int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno,
int count);
int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
xfs_dablk_t bno, xfs_daddr_t mappedbno,
- xfs_dabuf_t **bp, int whichfork);
+ struct xfs_buf **bp, int whichfork);
int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
xfs_dablk_t bno, xfs_daddr_t mappedbno,
- xfs_dabuf_t **bpp, int whichfork);
+ struct xfs_buf **bpp, int whichfork,
+ const struct xfs_buf_ops *ops);
xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
- xfs_dablk_t bno, int whichfork);
+ xfs_dablk_t bno, xfs_daddr_t mapped_bno,
+ int whichfork, const struct xfs_buf_ops *ops);
int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
- xfs_dabuf_t *dead_buf);
+ struct xfs_buf *dead_buf);
uint xfs_da_hashname(const __uint8_t *name_string, int name_length);
enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
@@ -258,15 +244,7 @@ enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
xfs_da_state_t *xfs_da_state_alloc(void);
void xfs_da_state_free(xfs_da_state_t *state);
-void xfs_da_buf_done(xfs_dabuf_t *dabuf);
-void xfs_da_log_buf(struct xfs_trans *tp, xfs_dabuf_t *dabuf, uint first,
- uint last);
-void xfs_da_brelse(struct xfs_trans *tp, xfs_dabuf_t *dabuf);
-void xfs_da_binval(struct xfs_trans *tp, xfs_dabuf_t *dabuf);
-xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf);
-
extern struct kmem_zone *xfs_da_state_zone;
-extern struct kmem_zone *xfs_dabuf_zone;
extern const struct xfs_nameops xfs_default_nameops;
#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index e00de08dc8a..a8bd26b82ec 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -48,44 +48,44 @@ xfs_swapext(
xfs_swapext_t *sxp)
{
xfs_inode_t *ip, *tip;
- struct file *file, *tmp_file;
+ struct fd f, tmp;
int error = 0;
/* Pull information for the target fd */
- file = fget((int)sxp->sx_fdtarget);
- if (!file) {
+ f = fdget((int)sxp->sx_fdtarget);
+ if (!f.file) {
error = XFS_ERROR(EINVAL);
goto out;
}
- if (!(file->f_mode & FMODE_WRITE) ||
- !(file->f_mode & FMODE_READ) ||
- (file->f_flags & O_APPEND)) {
+ if (!(f.file->f_mode & FMODE_WRITE) ||
+ !(f.file->f_mode & FMODE_READ) ||
+ (f.file->f_flags & O_APPEND)) {
error = XFS_ERROR(EBADF);
goto out_put_file;
}
- tmp_file = fget((int)sxp->sx_fdtmp);
- if (!tmp_file) {
+ tmp = fdget((int)sxp->sx_fdtmp);
+ if (!tmp.file) {
error = XFS_ERROR(EINVAL);
goto out_put_file;
}
- if (!(tmp_file->f_mode & FMODE_WRITE) ||
- !(tmp_file->f_mode & FMODE_READ) ||
- (tmp_file->f_flags & O_APPEND)) {
+ if (!(tmp.file->f_mode & FMODE_WRITE) ||
+ !(tmp.file->f_mode & FMODE_READ) ||
+ (tmp.file->f_flags & O_APPEND)) {
error = XFS_ERROR(EBADF);
goto out_put_tmp_file;
}
- if (IS_SWAPFILE(file->f_path.dentry->d_inode) ||
- IS_SWAPFILE(tmp_file->f_path.dentry->d_inode)) {
+ if (IS_SWAPFILE(f.file->f_path.dentry->d_inode) ||
+ IS_SWAPFILE(tmp.file->f_path.dentry->d_inode)) {
error = XFS_ERROR(EINVAL);
goto out_put_tmp_file;
}
- ip = XFS_I(file->f_path.dentry->d_inode);
- tip = XFS_I(tmp_file->f_path.dentry->d_inode);
+ ip = XFS_I(f.file->f_path.dentry->d_inode);
+ tip = XFS_I(tmp.file->f_path.dentry->d_inode);
if (ip->i_mount != tip->i_mount) {
error = XFS_ERROR(EINVAL);
@@ -105,9 +105,9 @@ xfs_swapext(
error = xfs_swap_extents(ip, tip, sxp);
out_put_tmp_file:
- fput(tmp_file);
+ fdput(tmp);
out_put_file:
- fput(file);
+ fdput(f);
out:
return error;
}
@@ -246,12 +246,10 @@ xfs_swap_extents(
goto out_unlock;
}
- if (VN_CACHED(VFS_I(tip)) != 0) {
- error = xfs_flushinval_pages(tip, 0, -1,
- FI_REMAPF_LOCKED);
- if (error)
- goto out_unlock;
- }
+ error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
+ if (error)
+ goto out_unlock;
+ truncate_pagecache_range(VFS_I(tip), 0, -1);
/* Verify O_DIRECT for ftmp */
if (VN_CACHED(VFS_I(tip)) != 0) {
@@ -315,8 +313,7 @@ xfs_swap_extents(
* are safe. We don't really care if non-io related
* fields change.
*/
-
- xfs_tosspages(ip, 0, -1, FI_REMAPF);
+ truncate_pagecache_range(VFS_I(ip), 0, -1);
tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
if ((error = xfs_trans_reserve(tp, 0,
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index a3721633abc..1d9643b3dce 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -33,7 +33,7 @@ typedef struct xfs_timestamp {
* variable size the leftover area split into a data and an attribute fork.
* The format of the data and attribute fork depends on the format of the
* inode as indicated by di_format and di_aformat. To access the data and
- * attribute use the XFS_DFORK_PTR, XFS_DFORK_DPTR, and XFS_DFORK_PTR macros
+ * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros
* below.
*
* There is a very similar struct icdinode in xfs_inode which matches the
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 67a250c36d4..b26a50f9921 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -592,7 +592,7 @@ int
xfs_dir2_shrink_inode(
xfs_da_args_t *args,
xfs_dir2_db_t db,
- xfs_dabuf_t *bp)
+ struct xfs_buf *bp)
{
xfs_fileoff_t bno; /* directory file offset */
xfs_dablk_t da; /* directory file offset */
@@ -634,7 +634,7 @@ xfs_dir2_shrink_inode(
/*
* Invalidate the buffer from the transaction.
*/
- xfs_da_binval(tp, bp);
+ xfs_trans_binval(tp, bp);
/*
* If it's not a data block, we're done.
*/
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 586732f2d80..12afe07a91d 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -37,10 +37,10 @@
/*
* Local function prototypes.
*/
-static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, xfs_dabuf_t *bp, int first,
- int last);
-static void xfs_dir2_block_log_tail(xfs_trans_t *tp, xfs_dabuf_t *bp);
-static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **bpp,
+static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, struct xfs_buf *bp,
+ int first, int last);
+static void xfs_dir2_block_log_tail(xfs_trans_t *tp, struct xfs_buf *bp);
+static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, struct xfs_buf **bpp,
int *entno);
static int xfs_dir2_block_sort(const void *a, const void *b);
@@ -56,6 +56,214 @@ xfs_dir_startup(void)
xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
}
+static void
+xfs_dir2_block_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+ int block_ok = 0;
+
+ block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
+ block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
+
+ if (!block_ok) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+}
+
+static void
+xfs_dir2_block_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_dir2_block_verify(bp);
+}
+
+static void
+xfs_dir2_block_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_dir2_block_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_dir2_block_buf_ops = {
+ .verify_read = xfs_dir2_block_read_verify,
+ .verify_write = xfs_dir2_block_write_verify,
+};
+
+static int
+xfs_dir2_block_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+
+ return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp,
+ XFS_DATA_FORK, &xfs_dir2_block_buf_ops);
+}
+
+static void
+xfs_dir2_block_need_space(
+ struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_block_tail *btp,
+ struct xfs_dir2_leaf_entry *blp,
+ __be16 **tagpp,
+ struct xfs_dir2_data_unused **dupp,
+ struct xfs_dir2_data_unused **enddupp,
+ int *compact,
+ int len)
+{
+ struct xfs_dir2_data_free *bf;
+ __be16 *tagp = NULL;
+ struct xfs_dir2_data_unused *dup = NULL;
+ struct xfs_dir2_data_unused *enddup = NULL;
+
+ *compact = 0;
+ bf = hdr->bestfree;
+
+ /*
+ * If there are stale entries we'll use one for the leaf.
+ */
+ if (btp->stale) {
+ if (be16_to_cpu(bf[0].length) >= len) {
+ /*
+ * The biggest free entry is large enough to avoid compaction.
+ */
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)hdr + be16_to_cpu(bf[0].offset));
+ goto out;
+ }
+
+ /*
+ * Will need to compact to make this work.
+ * Tag just before the first leaf entry.
+ */
+ *compact = 1;
+ tagp = (__be16 *)blp - 1;
+
+ /* Data object just before the first leaf entry. */
+ dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+
+ /*
+ * If it's not free then the data will go where the
+ * leaf data starts now, if it works at all.
+ */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
+ (uint)sizeof(*blp) < len)
+ dup = NULL;
+ } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
+ dup = NULL;
+ else
+ dup = (xfs_dir2_data_unused_t *)blp;
+ goto out;
+ }
+
+ /*
+ * no stale entries, so just use free space.
+ * Tag just before the first leaf entry.
+ */
+ tagp = (__be16 *)blp - 1;
+
+ /* Data object just before the first leaf entry. */
+ enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+
+ /*
+ * If it's not free then can't do this add without cleaning up:
+ * the space before the first leaf entry needs to be free so it
+ * can be expanded to hold the pointer to the new entry.
+ */
+ if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ /*
+ * Check out the biggest freespace and see if it's the same one.
+ */
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)hdr + be16_to_cpu(bf[0].offset));
+ if (dup != enddup) {
+ /*
+ * Not the same free entry, just check its length.
+ */
+ if (be16_to_cpu(dup->length) < len)
+ dup = NULL;
+ goto out;
+ }
+
+ /*
+ * It is the biggest freespace, can it hold the leaf too?
+ */
+ if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
+ /*
+ * No, use the second-largest entry instead if it works.
+ */
+ if (be16_to_cpu(bf[1].length) >= len)
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)hdr + be16_to_cpu(bf[1].offset));
+ else
+ dup = NULL;
+ }
+ }
+out:
+ *tagpp = tagp;
+ *dupp = dup;
+ *enddupp = enddup;
+}
+
+/*
+ * compact the leaf entries.
+ * Leave the highest-numbered stale entry stale.
+ * XXX should be the one closest to mid but mid is not yet computed.
+ */
+static void
+xfs_dir2_block_compact(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_block_tail *btp,
+ struct xfs_dir2_leaf_entry *blp,
+ int *needlog,
+ int *lfloghigh,
+ int *lfloglow)
+{
+ int fromidx; /* source leaf index */
+ int toidx; /* target leaf index */
+ int needscan = 0;
+ int highstale; /* high stale index */
+
+ fromidx = toidx = be32_to_cpu(btp->count) - 1;
+ highstale = *lfloghigh = -1;
+ for (; fromidx >= 0; fromidx--) {
+ if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
+ if (highstale == -1)
+ highstale = toidx;
+ else {
+ if (*lfloghigh == -1)
+ *lfloghigh = toidx;
+ continue;
+ }
+ }
+ if (fromidx < toidx)
+ blp[toidx] = blp[fromidx];
+ toidx--;
+ }
+ *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
+ *lfloghigh -= be32_to_cpu(btp->stale) - 1;
+ be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
+ xfs_dir2_data_make_free(tp, bp,
+ (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
+ (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
+ needlog, &needscan);
+ blp += be32_to_cpu(btp->stale) - 1;
+ btp->stale = cpu_to_be32(1);
+ /*
+ * If we now need to rebuild the bestfree map, do so.
+ * This needs to happen before the next call to use_free.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(tp->t_mountp, hdr, needlog);
+}
+
/*
* Add an entry to a block directory.
*/
@@ -63,10 +271,9 @@ int /* error */
xfs_dir2_block_addname(
xfs_da_args_t *args) /* directory op arguments */
{
- xfs_dir2_data_free_t *bf; /* bestfree table in block */
xfs_dir2_data_hdr_t *hdr; /* block header */
xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
- xfs_dabuf_t *bp; /* buffer for block */
+ struct xfs_buf *bp; /* buffer for block */
xfs_dir2_block_tail_t *btp; /* block tail */
int compact; /* need to compact leaf ents */
xfs_dir2_data_entry_t *dep; /* block data entry */
@@ -94,204 +301,74 @@ xfs_dir2_block_addname(
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- /*
- * Read the (one and only) directory block into dabuf bp.
- */
- if ((error =
- xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
+
+ /* Read the (one and only) directory block into bp. */
+ error = xfs_dir2_block_read(tp, dp, &bp);
+ if (error)
return error;
- }
- ASSERT(bp != NULL);
- hdr = bp->data;
- /*
- * Check the magic number, corrupted if wrong.
- */
- if (unlikely(hdr->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))) {
- XFS_CORRUPTION_ERROR("xfs_dir2_block_addname",
- XFS_ERRLEVEL_LOW, mp, hdr);
- xfs_da_brelse(tp, bp);
- return XFS_ERROR(EFSCORRUPTED);
- }
+
len = xfs_dir2_data_entsize(args->namelen);
+
/*
* Set up pointers to parts of the block.
*/
- bf = hdr->bestfree;
+ hdr = bp->b_addr;
btp = xfs_dir2_block_tail_p(mp, hdr);
blp = xfs_dir2_block_leaf_p(btp);
+
/*
- * No stale entries? Need space for entry and new leaf.
- */
- if (!btp->stale) {
- /*
- * Tag just before the first leaf entry.
- */
- tagp = (__be16 *)blp - 1;
- /*
- * Data object just before the first leaf entry.
- */
- enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
- /*
- * If it's not free then can't do this add without cleaning up:
- * the space before the first leaf entry needs to be free so it
- * can be expanded to hold the pointer to the new entry.
- */
- if (be16_to_cpu(enddup->freetag) != XFS_DIR2_DATA_FREE_TAG)
- dup = enddup = NULL;
- /*
- * Check out the biggest freespace and see if it's the same one.
- */
- else {
- dup = (xfs_dir2_data_unused_t *)
- ((char *)hdr + be16_to_cpu(bf[0].offset));
- if (dup == enddup) {
- /*
- * It is the biggest freespace, is it too small
- * to hold the new leaf too?
- */
- if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
- /*
- * Yes, we use the second-largest
- * entry instead if it works.
- */
- if (be16_to_cpu(bf[1].length) >= len)
- dup = (xfs_dir2_data_unused_t *)
- ((char *)hdr +
- be16_to_cpu(bf[1].offset));
- else
- dup = NULL;
- }
- } else {
- /*
- * Not the same free entry,
- * just check its length.
- */
- if (be16_to_cpu(dup->length) < len) {
- dup = NULL;
- }
- }
- }
- compact = 0;
- }
- /*
- * If there are stale entries we'll use one for the leaf.
- * Is the biggest entry enough to avoid compaction?
+ * Find out if we can reuse stale entries or whether we need extra
+ * space for entry and new leaf.
*/
- else if (be16_to_cpu(bf[0].length) >= len) {
- dup = (xfs_dir2_data_unused_t *)
- ((char *)hdr + be16_to_cpu(bf[0].offset));
- compact = 0;
- }
+ xfs_dir2_block_need_space(hdr, btp, blp, &tagp, &dup,
+ &enddup, &compact, len);
+
/*
- * Will need to compact to make this work.
+ * If this is only a space check, we now have everything we need.
*/
- else {
- /*
- * Tag just before the first leaf entry.
- */
- tagp = (__be16 *)blp - 1;
- /*
- * Data object just before the first leaf entry.
- */
- dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
- /*
- * If it's not free then the data will go where the
- * leaf data starts now, if it works at all.
- */
- if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
- if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
- (uint)sizeof(*blp) < len)
- dup = NULL;
- } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
- dup = NULL;
- else
- dup = (xfs_dir2_data_unused_t *)blp;
- compact = 1;
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+ xfs_trans_brelse(tp, bp);
+ if (!dup)
+ return XFS_ERROR(ENOSPC);
+ return 0;
}
- /*
- * If this isn't a real add, we're done with the buffer.
- */
- if (args->op_flags & XFS_DA_OP_JUSTCHECK)
- xfs_da_brelse(tp, bp);
+
/*
* If we don't have space for the new entry & leaf ...
*/
if (!dup) {
- /*
- * Not trying to actually do anything, or don't have
- * a space reservation: return no-space.
- */
- if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
+ /* Don't have a space reservation: return no-space. */
+ if (args->total == 0)
return XFS_ERROR(ENOSPC);
/*
* Convert to the next larger format.
* Then add the new entry in that format.
*/
error = xfs_dir2_block_to_leaf(args, bp);
- xfs_da_buf_done(bp);
if (error)
return error;
return xfs_dir2_leaf_addname(args);
}
- /*
- * Just checking, and it would work, so say so.
- */
- if (args->op_flags & XFS_DA_OP_JUSTCHECK)
- return 0;
+
needlog = needscan = 0;
+
/*
* If need to compact the leaf entries, do it now.
- * Leave the highest-numbered stale entry stale.
- * XXX should be the one closest to mid but mid is not yet computed.
*/
if (compact) {
- int fromidx; /* source leaf index */
- int toidx; /* target leaf index */
-
- for (fromidx = toidx = be32_to_cpu(btp->count) - 1,
- highstale = lfloghigh = -1;
- fromidx >= 0;
- fromidx--) {
- if (blp[fromidx].address ==
- cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
- if (highstale == -1)
- highstale = toidx;
- else {
- if (lfloghigh == -1)
- lfloghigh = toidx;
- continue;
- }
- }
- if (fromidx < toidx)
- blp[toidx] = blp[fromidx];
- toidx--;
- }
- lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
- lfloghigh -= be32_to_cpu(btp->stale) - 1;
- be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
- xfs_dir2_data_make_free(tp, bp,
- (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
- (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
- &needlog, &needscan);
- blp += be32_to_cpu(btp->stale) - 1;
- btp->stale = cpu_to_be32(1);
+ xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog,
+ &lfloghigh, &lfloglow);
+ /* recalculate blp post-compaction */
+ blp = xfs_dir2_block_leaf_p(btp);
+ } else if (btp->stale) {
/*
- * If we now need to rebuild the bestfree map, do so.
- * This needs to happen before the next call to use_free.
+ * Set leaf logging boundaries to impossible state.
+ * For the no-stale case they're set explicitly.
*/
- if (needscan) {
- xfs_dir2_data_freescan(mp, hdr, &needlog);
- needscan = 0;
- }
- }
- /*
- * Set leaf logging boundaries to impossible state.
- * For the no-stale case they're set explicitly.
- */
- else if (btp->stale) {
lfloglow = be32_to_cpu(btp->count);
lfloghigh = -1;
}
+
/*
* Find the slot that's first lower than our hash value, -1 if none.
*/
@@ -422,7 +499,6 @@ xfs_dir2_block_addname(
xfs_dir2_block_log_tail(tp, bp);
xfs_dir2_data_log_entry(tp, bp, dep);
xfs_dir2_data_check(dp, bp);
- xfs_da_buf_done(bp);
return 0;
}
@@ -437,7 +513,7 @@ xfs_dir2_block_getdents(
filldir_t filldir)
{
xfs_dir2_data_hdr_t *hdr; /* block header */
- xfs_dabuf_t *bp; /* buffer for block */
+ struct xfs_buf *bp; /* buffer for block */
xfs_dir2_block_tail_t *btp; /* block tail */
xfs_dir2_data_entry_t *dep; /* block data entry */
xfs_dir2_data_unused_t *dup; /* block unused entry */
@@ -452,24 +528,19 @@ xfs_dir2_block_getdents(
/*
* If the block number in the offset is out of range, we're done.
*/
- if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) {
+ if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk)
return 0;
- }
- /*
- * Can't read the block, give up, else get dabuf in bp.
- */
- error = xfs_da_read_buf(NULL, dp, mp->m_dirdatablk, -1,
- &bp, XFS_DATA_FORK);
+
+ error = xfs_dir2_block_read(NULL, dp, &bp);
if (error)
return error;
- ASSERT(bp != NULL);
/*
* Extract the byte offset we start at from the seek pointer.
* We'll skip entries before this.
*/
wantoff = xfs_dir2_dataptr_to_off(mp, *offset);
- hdr = bp->data;
+ hdr = bp->b_addr;
xfs_dir2_data_check(dp, bp);
/*
* Set up values for the loop.
@@ -514,7 +585,7 @@ xfs_dir2_block_getdents(
cook & 0x7fffffff, be64_to_cpu(dep->inumber),
DT_UNKNOWN)) {
*offset = cook & 0x7fffffff;
- xfs_da_brelse(NULL, bp);
+ xfs_trans_brelse(NULL, bp);
return 0;
}
}
@@ -525,7 +596,7 @@ xfs_dir2_block_getdents(
*/
*offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
0x7fffffff;
- xfs_da_brelse(NULL, bp);
+ xfs_trans_brelse(NULL, bp);
return 0;
}
@@ -535,17 +606,17 @@ xfs_dir2_block_getdents(
static void
xfs_dir2_block_log_leaf(
xfs_trans_t *tp, /* transaction structure */
- xfs_dabuf_t *bp, /* block buffer */
+ struct xfs_buf *bp, /* block buffer */
int first, /* index of first logged leaf */
int last) /* index of last logged leaf */
{
- xfs_dir2_data_hdr_t *hdr = bp->data;
+ xfs_dir2_data_hdr_t *hdr = bp->b_addr;
xfs_dir2_leaf_entry_t *blp;
xfs_dir2_block_tail_t *btp;
btp = xfs_dir2_block_tail_p(tp->t_mountp, hdr);
blp = xfs_dir2_block_leaf_p(btp);
- xfs_da_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr),
+ xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr),
(uint)((char *)&blp[last + 1] - (char *)hdr - 1));
}
@@ -555,13 +626,13 @@ xfs_dir2_block_log_leaf(
static void
xfs_dir2_block_log_tail(
xfs_trans_t *tp, /* transaction structure */
- xfs_dabuf_t *bp) /* block buffer */
+ struct xfs_buf *bp) /* block buffer */
{
- xfs_dir2_data_hdr_t *hdr = bp->data;
+ xfs_dir2_data_hdr_t *hdr = bp->b_addr;
xfs_dir2_block_tail_t *btp;
btp = xfs_dir2_block_tail_p(tp->t_mountp, hdr);
- xfs_da_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr),
+ xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr),
(uint)((char *)(btp + 1) - (char *)hdr - 1));
}
@@ -575,7 +646,7 @@ xfs_dir2_block_lookup(
{
xfs_dir2_data_hdr_t *hdr; /* block header */
xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
- xfs_dabuf_t *bp; /* block buffer */
+ struct xfs_buf *bp; /* block buffer */
xfs_dir2_block_tail_t *btp; /* block tail */
xfs_dir2_data_entry_t *dep; /* block data entry */
xfs_inode_t *dp; /* incore inode */
@@ -593,7 +664,7 @@ xfs_dir2_block_lookup(
return error;
dp = args->dp;
mp = dp->i_mount;
- hdr = bp->data;
+ hdr = bp->b_addr;
xfs_dir2_data_check(dp, bp);
btp = xfs_dir2_block_tail_p(mp, hdr);
blp = xfs_dir2_block_leaf_p(btp);
@@ -607,7 +678,7 @@ xfs_dir2_block_lookup(
*/
args->inumber = be64_to_cpu(dep->inumber);
error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
- xfs_da_brelse(args->trans, bp);
+ xfs_trans_brelse(args->trans, bp);
return XFS_ERROR(error);
}
@@ -617,13 +688,13 @@ xfs_dir2_block_lookup(
static int /* error */
xfs_dir2_block_lookup_int(
xfs_da_args_t *args, /* dir lookup arguments */
- xfs_dabuf_t **bpp, /* returned block buffer */
+ struct xfs_buf **bpp, /* returned block buffer */
int *entno) /* returned entry number */
{
xfs_dir2_dataptr_t addr; /* data entry address */
xfs_dir2_data_hdr_t *hdr; /* block header */
xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
- xfs_dabuf_t *bp; /* block buffer */
+ struct xfs_buf *bp; /* block buffer */
xfs_dir2_block_tail_t *btp; /* block tail */
xfs_dir2_data_entry_t *dep; /* block data entry */
xfs_inode_t *dp; /* incore inode */
@@ -639,15 +710,12 @@ xfs_dir2_block_lookup_int(
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- /*
- * Read the buffer, return error if we can't get it.
- */
- if ((error =
- xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
+
+ error = xfs_dir2_block_read(tp, dp, &bp);
+ if (error)
return error;
- }
- ASSERT(bp != NULL);
- hdr = bp->data;
+
+ hdr = bp->b_addr;
xfs_dir2_data_check(dp, bp);
btp = xfs_dir2_block_tail_p(mp, hdr);
blp = xfs_dir2_block_leaf_p(btp);
@@ -666,7 +734,7 @@ xfs_dir2_block_lookup_int(
high = mid - 1;
if (low > high) {
ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
- xfs_da_brelse(tp, bp);
+ xfs_trans_brelse(tp, bp);
return XFS_ERROR(ENOENT);
}
}
@@ -714,7 +782,7 @@ xfs_dir2_block_lookup_int(
/*
* No match, release the buffer and return ENOENT.
*/
- xfs_da_brelse(tp, bp);
+ xfs_trans_brelse(tp, bp);
return XFS_ERROR(ENOENT);
}
@@ -728,7 +796,7 @@ xfs_dir2_block_removename(
{
xfs_dir2_data_hdr_t *hdr; /* block header */
xfs_dir2_leaf_entry_t *blp; /* block leaf pointer */
- xfs_dabuf_t *bp; /* block buffer */
+ struct xfs_buf *bp; /* block buffer */
xfs_dir2_block_tail_t *btp; /* block tail */
xfs_dir2_data_entry_t *dep; /* block data entry */
xfs_inode_t *dp; /* incore inode */
@@ -753,7 +821,7 @@ xfs_dir2_block_removename(
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- hdr = bp->data;
+ hdr = bp->b_addr;
btp = xfs_dir2_block_tail_p(mp, hdr);
blp = xfs_dir2_block_leaf_p(btp);
/*
@@ -790,10 +858,9 @@ xfs_dir2_block_removename(
* See if the size as a shortform is good enough.
*/
size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
- if (size > XFS_IFORK_DSIZE(dp)) {
- xfs_da_buf_done(bp);
+ if (size > XFS_IFORK_DSIZE(dp))
return 0;
- }
+
/*
* If it works, do the conversion.
*/
@@ -810,7 +877,7 @@ xfs_dir2_block_replace(
{
xfs_dir2_data_hdr_t *hdr; /* block header */
xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
- xfs_dabuf_t *bp; /* block buffer */
+ struct xfs_buf *bp; /* block buffer */
xfs_dir2_block_tail_t *btp; /* block tail */
xfs_dir2_data_entry_t *dep; /* block data entry */
xfs_inode_t *dp; /* incore inode */
@@ -829,7 +896,7 @@ xfs_dir2_block_replace(
}
dp = args->dp;
mp = dp->i_mount;
- hdr = bp->data;
+ hdr = bp->b_addr;
btp = xfs_dir2_block_tail_p(mp, hdr);
blp = xfs_dir2_block_leaf_p(btp);
/*
@@ -844,7 +911,6 @@ xfs_dir2_block_replace(
dep->inumber = cpu_to_be64(args->inumber);
xfs_dir2_data_log_entry(args->trans, bp, dep);
xfs_dir2_data_check(dp, bp);
- xfs_da_buf_done(bp);
return 0;
}
@@ -871,8 +937,8 @@ xfs_dir2_block_sort(
int /* error */
xfs_dir2_leaf_to_block(
xfs_da_args_t *args, /* operation arguments */
- xfs_dabuf_t *lbp, /* leaf buffer */
- xfs_dabuf_t *dbp) /* data buffer */
+ struct xfs_buf *lbp, /* leaf buffer */
+ struct xfs_buf *dbp) /* data buffer */
{
__be16 *bestsp; /* leaf bests table */
xfs_dir2_data_hdr_t *hdr; /* block header */
@@ -898,7 +964,7 @@ xfs_dir2_leaf_to_block(
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- leaf = lbp->data;
+ leaf = lbp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
ltp = xfs_dir2_leaf_tail_p(mp, leaf);
/*
@@ -914,21 +980,19 @@ xfs_dir2_leaf_to_block(
if ((error =
xfs_dir2_leaf_trim_data(args, lbp,
(xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1))))
- goto out;
- } else {
- error = 0;
- goto out;
- }
+ return error;
+ } else
+ return 0;
}
/*
* Read the data block if we don't already have it, give up if it fails.
*/
- if (dbp == NULL &&
- (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp,
- XFS_DATA_FORK))) {
- goto out;
+ if (!dbp) {
+ error = xfs_dir2_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp);
+ if (error)
+ return error;
}
- hdr = dbp->data;
+ hdr = dbp->b_addr;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
/*
* Size of the "leaf" area in the block.
@@ -944,13 +1008,13 @@ xfs_dir2_leaf_to_block(
* If it's not free or is too short we can't do it.
*/
if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG ||
- be16_to_cpu(dup->length) < size) {
- error = 0;
- goto out;
- }
+ be16_to_cpu(dup->length) < size)
+ return 0;
+
/*
* Start converting it to block form.
*/
+ dbp->b_ops = &xfs_dir2_block_buf_ops;
hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
needlog = 1;
needscan = 0;
@@ -989,25 +1053,17 @@ xfs_dir2_leaf_to_block(
* Pitch the old leaf block.
*/
error = xfs_da_shrink_inode(args, mp->m_dirleafblk, lbp);
- lbp = NULL;
- if (error) {
- goto out;
- }
+ if (error)
+ return error;
+
/*
* Now see if the resulting block can be shrunken to shortform.
*/
size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
- if (size > XFS_IFORK_DSIZE(dp)) {
- error = 0;
- goto out;
- }
+ if (size > XFS_IFORK_DSIZE(dp))
+ return 0;
+
return xfs_dir2_block_to_sf(args, dbp, size, &sfh);
-out:
- if (lbp)
- xfs_da_buf_done(lbp);
- if (dbp)
- xfs_da_buf_done(dbp);
- return error;
}
/*
@@ -1020,7 +1076,7 @@ xfs_dir2_sf_to_block(
xfs_dir2_db_t blkno; /* dir-relative block # (0) */
xfs_dir2_data_hdr_t *hdr; /* block header */
xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
- xfs_dabuf_t *bp; /* block buffer */
+ struct xfs_buf *bp; /* block buffer */
xfs_dir2_block_tail_t *btp; /* block tail pointer */
xfs_dir2_data_entry_t *dep; /* data entry pointer */
xfs_inode_t *dp; /* incore directory inode */
@@ -1088,7 +1144,8 @@ xfs_dir2_sf_to_block(
kmem_free(sfp);
return error;
}
- hdr = bp->data;
+ bp->b_ops = &xfs_dir2_block_buf_ops;
+ hdr = bp->b_addr;
hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
/*
* Compute size of block "tail" area.
@@ -1217,6 +1274,5 @@ xfs_dir2_sf_to_block(
xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1);
xfs_dir2_block_log_tail(tp, bp);
xfs_dir2_data_check(dp, bp);
- xfs_da_buf_done(bp);
return 0;
}
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 2046988e9eb..ffcf1774152 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -34,16 +34,15 @@
STATIC xfs_dir2_data_free_t *
xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup);
-#ifdef DEBUG
/*
* Check the consistency of the data block.
* The input can also be a block-format directory.
- * Pop an assert if we find anything bad.
+ * Return 0 if the buffer is good, otherwise an error.
*/
-void
-xfs_dir2_data_check(
- xfs_inode_t *dp, /* incore inode pointer */
- xfs_dabuf_t *bp) /* data block's buffer */
+int
+__xfs_dir2_data_check(
+ struct xfs_inode *dp, /* incore inode pointer */
+ struct xfs_buf *bp) /* data block's buffer */
{
xfs_dir2_dataptr_t addr; /* addr for leaf lookup */
xfs_dir2_data_free_t *bf; /* bestfree table */
@@ -64,18 +63,23 @@ xfs_dir2_data_check(
int stale; /* count of stale leaves */
struct xfs_name name;
- mp = dp->i_mount;
- hdr = bp->data;
+ mp = bp->b_target->bt_mount;
+ hdr = bp->b_addr;
bf = hdr->bestfree;
p = (char *)(hdr + 1);
- if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
+ switch (hdr->magic) {
+ case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
btp = xfs_dir2_block_tail_p(mp, hdr);
lep = xfs_dir2_block_leaf_p(btp);
endp = (char *)lep;
- } else {
- ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
+ break;
+ case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
endp = (char *)hdr + mp->m_dirblksize;
+ break;
+ default:
+ XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
+ return EFSCORRUPTED;
}
count = lastfree = freeseen = 0;
@@ -83,19 +87,22 @@ xfs_dir2_data_check(
* Account for zero bestfree entries.
*/
if (!bf[0].length) {
- ASSERT(!bf[0].offset);
+ XFS_WANT_CORRUPTED_RETURN(!bf[0].offset);
freeseen |= 1 << 0;
}
if (!bf[1].length) {
- ASSERT(!bf[1].offset);
+ XFS_WANT_CORRUPTED_RETURN(!bf[1].offset);
freeseen |= 1 << 1;
}
if (!bf[2].length) {
- ASSERT(!bf[2].offset);
+ XFS_WANT_CORRUPTED_RETURN(!bf[2].offset);
freeseen |= 1 << 2;
}
- ASSERT(be16_to_cpu(bf[0].length) >= be16_to_cpu(bf[1].length));
- ASSERT(be16_to_cpu(bf[1].length) >= be16_to_cpu(bf[2].length));
+
+ XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >=
+ be16_to_cpu(bf[1].length));
+ XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >=
+ be16_to_cpu(bf[2].length));
/*
* Loop over the data/unused entries.
*/
@@ -107,17 +114,20 @@ xfs_dir2_data_check(
* doesn't need to be there.
*/
if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
- ASSERT(lastfree == 0);
- ASSERT(be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
- (char *)dup - (char *)hdr);
+ XFS_WANT_CORRUPTED_RETURN(lastfree == 0);
+ XFS_WANT_CORRUPTED_RETURN(
+ be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
+ (char *)dup - (char *)hdr);
dfp = xfs_dir2_data_freefind(hdr, dup);
if (dfp) {
i = (int)(dfp - bf);
- ASSERT((freeseen & (1 << i)) == 0);
+ XFS_WANT_CORRUPTED_RETURN(
+ (freeseen & (1 << i)) == 0);
freeseen |= 1 << i;
} else {
- ASSERT(be16_to_cpu(dup->length) <=
- be16_to_cpu(bf[2].length));
+ XFS_WANT_CORRUPTED_RETURN(
+ be16_to_cpu(dup->length) <=
+ be16_to_cpu(bf[2].length));
}
p += be16_to_cpu(dup->length);
lastfree = 1;
@@ -130,10 +140,12 @@ xfs_dir2_data_check(
* The linear search is crude but this is DEBUG code.
*/
dep = (xfs_dir2_data_entry_t *)p;
- ASSERT(dep->namelen != 0);
- ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0);
- ASSERT(be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) ==
- (char *)dep - (char *)hdr);
+ XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0);
+ XFS_WANT_CORRUPTED_RETURN(
+ !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
+ XFS_WANT_CORRUPTED_RETURN(
+ be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) ==
+ (char *)dep - (char *)hdr);
count++;
lastfree = 0;
if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
@@ -148,27 +160,122 @@ xfs_dir2_data_check(
be32_to_cpu(lep[i].hashval) == hash)
break;
}
- ASSERT(i < be32_to_cpu(btp->count));
+ XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
}
p += xfs_dir2_data_entsize(dep->namelen);
}
/*
* Need to have seen all the entries and all the bestfree slots.
*/
- ASSERT(freeseen == 7);
+ XFS_WANT_CORRUPTED_RETURN(freeseen == 7);
if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
if (lep[i].address ==
cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
stale++;
if (i > 0)
- ASSERT(be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval));
+ XFS_WANT_CORRUPTED_RETURN(
+ be32_to_cpu(lep[i].hashval) >=
+ be32_to_cpu(lep[i - 1].hashval));
}
- ASSERT(count == be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
- ASSERT(stale == be32_to_cpu(btp->stale));
+ XFS_WANT_CORRUPTED_RETURN(count ==
+ be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
+ XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale));
}
+ return 0;
+}
+
+static void
+xfs_dir2_data_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+ int block_ok = 0;
+
+ block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+ block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
+
+ if (!block_ok) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+}
+
+/*
+ * Readahead of the first block of the directory when it is opened is completely
+ * oblivious to the format of the directory. Hence we can either get a block
+ * format buffer or a data format buffer on readahead.
+ */
+static void
+xfs_dir2_data_reada_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+
+ switch (hdr->magic) {
+ case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+ bp->b_ops = &xfs_dir2_block_buf_ops;
+ bp->b_ops->verify_read(bp);
+ return;
+ case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+ xfs_dir2_data_verify(bp);
+ return;
+ default:
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ break;
+ }
+}
+
+static void
+xfs_dir2_data_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_dir2_data_verify(bp);
+}
+
+static void
+xfs_dir2_data_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_dir2_data_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_dir2_data_buf_ops = {
+ .verify_read = xfs_dir2_data_read_verify,
+ .verify_write = xfs_dir2_data_write_verify,
+};
+
+static const struct xfs_buf_ops xfs_dir2_data_reada_buf_ops = {
+ .verify_read = xfs_dir2_data_reada_verify,
+ .verify_write = xfs_dir2_data_write_verify,
+};
+
+
+int
+xfs_dir2_data_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ xfs_daddr_t mapped_bno,
+ struct xfs_buf **bpp)
+{
+ return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
+ XFS_DATA_FORK, &xfs_dir2_data_buf_ops);
+}
+
+int
+xfs_dir2_data_readahead(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ xfs_daddr_t mapped_bno)
+{
+ return xfs_da_reada_buf(tp, dp, bno, mapped_bno,
+ XFS_DATA_FORK, &xfs_dir2_data_reada_buf_ops);
}
-#endif
/*
* Given a data block and an unused entry from that block,
@@ -389,9 +496,9 @@ int /* error */
xfs_dir2_data_init(
xfs_da_args_t *args, /* directory operation args */
xfs_dir2_db_t blkno, /* logical dir block number */
- xfs_dabuf_t **bpp) /* output block buffer */
+ struct xfs_buf **bpp) /* output block buffer */
{
- xfs_dabuf_t *bp; /* block buffer */
+ struct xfs_buf *bp; /* block buffer */
xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_inode_t *dp; /* incore directory inode */
xfs_dir2_data_unused_t *dup; /* unused entry pointer */
@@ -409,15 +516,14 @@ xfs_dir2_data_init(
*/
error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp,
XFS_DATA_FORK);
- if (error) {
+ if (error)
return error;
- }
- ASSERT(bp != NULL);
+ bp->b_ops = &xfs_dir2_data_buf_ops;
/*
* Initialize the header.
*/
- hdr = bp->data;
+ hdr = bp->b_addr;
hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
hdr->bestfree[0].offset = cpu_to_be16(sizeof(*hdr));
for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
@@ -449,16 +555,16 @@ xfs_dir2_data_init(
*/
void
xfs_dir2_data_log_entry(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_dabuf_t *bp, /* block buffer */
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
xfs_dir2_data_entry_t *dep) /* data entry pointer */
{
- xfs_dir2_data_hdr_t *hdr = bp->data;
+ xfs_dir2_data_hdr_t *hdr = bp->b_addr;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
- xfs_da_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr),
+ xfs_trans_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr),
(uint)((char *)(xfs_dir2_data_entry_tag_p(dep) + 1) -
(char *)hdr - 1));
}
@@ -468,15 +574,15 @@ xfs_dir2_data_log_entry(
*/
void
xfs_dir2_data_log_header(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_dabuf_t *bp) /* block buffer */
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
{
- xfs_dir2_data_hdr_t *hdr = bp->data;
+ xfs_dir2_data_hdr_t *hdr = bp->b_addr;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
- xfs_da_log_buf(tp, bp, 0, sizeof(*hdr) - 1);
+ xfs_trans_log_buf(tp, bp, 0, sizeof(*hdr) - 1);
}
/*
@@ -484,11 +590,11 @@ xfs_dir2_data_log_header(
*/
void
xfs_dir2_data_log_unused(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_dabuf_t *bp, /* block buffer */
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
xfs_dir2_data_unused_t *dup) /* data unused pointer */
{
- xfs_dir2_data_hdr_t *hdr = bp->data;
+ xfs_dir2_data_hdr_t *hdr = bp->b_addr;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
@@ -496,13 +602,13 @@ xfs_dir2_data_log_unused(
/*
* Log the first part of the unused entry.
*/
- xfs_da_log_buf(tp, bp, (uint)((char *)dup - (char *)hdr),
+ xfs_trans_log_buf(tp, bp, (uint)((char *)dup - (char *)hdr),
(uint)((char *)&dup->length + sizeof(dup->length) -
1 - (char *)hdr));
/*
* Log the end (tag) of the unused entry.
*/
- xfs_da_log_buf(tp, bp,
+ xfs_trans_log_buf(tp, bp,
(uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr),
(uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr +
sizeof(xfs_dir2_data_off_t) - 1));
@@ -514,8 +620,8 @@ xfs_dir2_data_log_unused(
*/
void
xfs_dir2_data_make_free(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_dabuf_t *bp, /* block buffer */
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
xfs_dir2_data_aoff_t offset, /* starting byte offset */
xfs_dir2_data_aoff_t len, /* length in bytes */
int *needlogp, /* out: log header */
@@ -531,7 +637,7 @@ xfs_dir2_data_make_free(
xfs_dir2_data_unused_t *prevdup; /* unused entry before us */
mp = tp->t_mountp;
- hdr = bp->data;
+ hdr = bp->b_addr;
/*
* Figure out where the end of the data area is.
@@ -696,8 +802,8 @@ xfs_dir2_data_make_free(
*/
void
xfs_dir2_data_use_free(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_dabuf_t *bp, /* data block buffer */
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
xfs_dir2_data_unused_t *dup, /* unused entry */
xfs_dir2_data_aoff_t offset, /* starting offset to use */
xfs_dir2_data_aoff_t len, /* length to use */
@@ -713,7 +819,7 @@ xfs_dir2_data_use_free(
xfs_dir2_data_unused_t *newdup2; /* another new unused entry */
int oldlen; /* old unused entry's length */
- hdr = bp->data;
+ hdr = bp->b_addr;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG);
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 397ffbcbab1..60cd2fa4e04 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -38,16 +38,93 @@
* Local function declarations.
*/
#ifdef DEBUG
-static void xfs_dir2_leaf_check(xfs_inode_t *dp, xfs_dabuf_t *bp);
+static void xfs_dir2_leaf_check(struct xfs_inode *dp, struct xfs_buf *bp);
#else
#define xfs_dir2_leaf_check(dp, bp)
#endif
-static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **lbpp,
- int *indexp, xfs_dabuf_t **dbpp);
-static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp,
+static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp,
+ int *indexp, struct xfs_buf **dbpp);
+static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp,
int first, int last);
-static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_dabuf *bp);
+static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp);
+static void
+xfs_dir2_leaf_verify(
+ struct xfs_buf *bp,
+ __be16 magic)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_dir2_leaf_hdr *hdr = bp->b_addr;
+ int block_ok = 0;
+
+ block_ok = hdr->info.magic == magic;
+ if (!block_ok) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+}
+
+static void
+xfs_dir2_leaf1_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
+}
+
+static void
+xfs_dir2_leaf1_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
+}
+
+void
+xfs_dir2_leafn_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+}
+
+void
+xfs_dir2_leafn_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+}
+
+static const struct xfs_buf_ops xfs_dir2_leaf1_buf_ops = {
+ .verify_read = xfs_dir2_leaf1_read_verify,
+ .verify_write = xfs_dir2_leaf1_write_verify,
+};
+
+const struct xfs_buf_ops xfs_dir2_leafn_buf_ops = {
+ .verify_read = xfs_dir2_leafn_read_verify,
+ .verify_write = xfs_dir2_leafn_write_verify,
+};
+
+static int
+xfs_dir2_leaf_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp)
+{
+ return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+ XFS_DATA_FORK, &xfs_dir2_leaf1_buf_ops);
+}
+
+int
+xfs_dir2_leafn_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp)
+{
+ return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+ XFS_DATA_FORK, &xfs_dir2_leafn_buf_ops);
+}
/*
* Convert a block form directory to a leaf form directory.
@@ -55,7 +132,7 @@ static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_dabuf *bp);
int /* error */
xfs_dir2_block_to_leaf(
xfs_da_args_t *args, /* operation arguments */
- xfs_dabuf_t *dbp) /* input block's buffer */
+ struct xfs_buf *dbp) /* input block's buffer */
{
__be16 *bestsp; /* leaf's bestsp entries */
xfs_dablk_t blkno; /* leaf block's bno */
@@ -64,7 +141,7 @@ xfs_dir2_block_to_leaf(
xfs_dir2_block_tail_t *btp; /* block's tail */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return code */
- xfs_dabuf_t *lbp; /* leaf block's buffer */
+ struct xfs_buf *lbp; /* leaf block's buffer */
xfs_dir2_db_t ldb; /* leaf block's bno */
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_tail_t *ltp; /* leaf's tail */
@@ -95,8 +172,8 @@ xfs_dir2_block_to_leaf(
return error;
}
ASSERT(lbp != NULL);
- leaf = lbp->data;
- hdr = dbp->data;
+ leaf = lbp->b_addr;
+ hdr = dbp->b_addr;
xfs_dir2_data_check(dp, dbp);
btp = xfs_dir2_block_tail_p(mp, hdr);
blp = xfs_dir2_block_leaf_p(btp);
@@ -125,6 +202,7 @@ xfs_dir2_block_to_leaf(
/*
* Fix up the block header, make it a data block.
*/
+ dbp->b_ops = &xfs_dir2_data_buf_ops;
hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
if (needscan)
xfs_dir2_data_freescan(mp, hdr, &needlog);
@@ -143,7 +221,6 @@ xfs_dir2_block_to_leaf(
xfs_dir2_leaf_check(dp, lbp);
xfs_dir2_data_check(dp, dbp);
xfs_dir2_leaf_log_bests(tp, lbp, 0, 0);
- xfs_da_buf_done(lbp);
return 0;
}
@@ -282,7 +359,7 @@ xfs_dir2_leaf_addname(
__be16 *bestsp; /* freespace table in leaf */
int compact; /* need to compact leaves */
xfs_dir2_data_hdr_t *hdr; /* data block header */
- xfs_dabuf_t *dbp; /* data block buffer */
+ struct xfs_buf *dbp; /* data block buffer */
xfs_dir2_data_entry_t *dep; /* data block entry */
xfs_inode_t *dp; /* incore directory inode */
xfs_dir2_data_unused_t *dup; /* data unused entry */
@@ -291,7 +368,7 @@ xfs_dir2_leaf_addname(
int highstale; /* index of next stale leaf */
int i; /* temporary, index */
int index; /* leaf table position */
- xfs_dabuf_t *lbp; /* leaf's buffer */
+ struct xfs_buf *lbp; /* leaf's buffer */
xfs_dir2_leaf_t *leaf; /* leaf structure */
int length; /* length of new entry */
xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */
@@ -312,15 +389,11 @@ xfs_dir2_leaf_addname(
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- /*
- * Read the leaf block.
- */
- error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
- XFS_DATA_FORK);
- if (error) {
+
+ error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
+ if (error)
return error;
- }
- ASSERT(lbp != NULL);
+
/*
* Look up the entry by hash value and name.
* We know it's not there, our caller has already done a lookup.
@@ -328,7 +401,7 @@ xfs_dir2_leaf_addname(
* But if there are dup hash values the index is of the first of those.
*/
index = xfs_dir2_leaf_search_hash(args, lbp);
- leaf = lbp->data;
+ leaf = lbp->b_addr;
ltp = xfs_dir2_leaf_tail_p(mp, leaf);
bestsp = xfs_dir2_leaf_bests_p(ltp);
length = xfs_dir2_data_entsize(args->namelen);
@@ -402,14 +475,13 @@ xfs_dir2_leaf_addname(
*/
if ((args->op_flags & XFS_DA_OP_JUSTCHECK) ||
args->total == 0) {
- xfs_da_brelse(tp, lbp);
+ xfs_trans_brelse(tp, lbp);
return XFS_ERROR(ENOSPC);
}
/*
* Convert to node form.
*/
error = xfs_dir2_leaf_to_node(args, lbp);
- xfs_da_buf_done(lbp);
if (error)
return error;
/*
@@ -427,7 +499,7 @@ xfs_dir2_leaf_addname(
* a new data block.
*/
if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
- xfs_da_brelse(tp, lbp);
+ xfs_trans_brelse(tp, lbp);
return use_block == -1 ? XFS_ERROR(ENOSPC) : 0;
}
/*
@@ -435,7 +507,7 @@ xfs_dir2_leaf_addname(
* changed anything.
*/
if (args->total == 0 && use_block == -1) {
- xfs_da_brelse(tp, lbp);
+ xfs_trans_brelse(tp, lbp);
return XFS_ERROR(ENOSPC);
}
/*
@@ -466,14 +538,14 @@ xfs_dir2_leaf_addname(
*/
if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE,
&use_block))) {
- xfs_da_brelse(tp, lbp);
+ xfs_trans_brelse(tp, lbp);
return error;
}
/*
* Initialize the block.
*/
if ((error = xfs_dir2_data_init(args, use_block, &dbp))) {
- xfs_da_brelse(tp, lbp);
+ xfs_trans_brelse(tp, lbp);
return error;
}
/*
@@ -493,25 +565,24 @@ xfs_dir2_leaf_addname(
*/
else
xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block);
- hdr = dbp->data;
+ hdr = dbp->b_addr;
bestsp[use_block] = hdr->bestfree[0].length;
grown = 1;
- }
- /*
- * Already had space in some data block.
- * Just read that one in.
- */
- else {
- if ((error =
- xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block),
- -1, &dbp, XFS_DATA_FORK))) {
- xfs_da_brelse(tp, lbp);
+ } else {
+ /*
+ * Already had space in some data block.
+ * Just read that one in.
+ */
+ error = xfs_dir2_data_read(tp, dp,
+ xfs_dir2_db_to_da(mp, use_block),
+ -1, &dbp);
+ if (error) {
+ xfs_trans_brelse(tp, lbp);
return error;
}
- hdr = dbp->data;
+ hdr = dbp->b_addr;
grown = 0;
}
- xfs_dir2_data_check(dp, dbp);
/*
* Point to the biggest freespace in our data block.
*/
@@ -570,9 +641,7 @@ xfs_dir2_leaf_addname(
xfs_dir2_leaf_log_header(tp, lbp);
xfs_dir2_leaf_log_ents(tp, lbp, lfloglow, lfloghigh);
xfs_dir2_leaf_check(dp, lbp);
- xfs_da_buf_done(lbp);
xfs_dir2_data_check(dp, dbp);
- xfs_da_buf_done(dbp);
return 0;
}
@@ -583,8 +652,8 @@ xfs_dir2_leaf_addname(
*/
STATIC void
xfs_dir2_leaf_check(
- xfs_inode_t *dp, /* incore directory inode */
- xfs_dabuf_t *bp) /* leaf's buffer */
+ struct xfs_inode *dp, /* incore directory inode */
+ struct xfs_buf *bp) /* leaf's buffer */
{
int i; /* leaf index */
xfs_dir2_leaf_t *leaf; /* leaf structure */
@@ -592,7 +661,7 @@ xfs_dir2_leaf_check(
xfs_mount_t *mp; /* filesystem mount point */
int stale; /* count of stale leaves */
- leaf = bp->data;
+ leaf = bp->b_addr;
mp = dp->i_mount;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
/*
@@ -628,14 +697,14 @@ xfs_dir2_leaf_check(
void
xfs_dir2_leaf_compact(
xfs_da_args_t *args, /* operation arguments */
- xfs_dabuf_t *bp) /* leaf buffer */
+ struct xfs_buf *bp) /* leaf buffer */
{
int from; /* source leaf index */
xfs_dir2_leaf_t *leaf; /* leaf structure */
int loglow; /* first leaf entry to log */
int to; /* target leaf index */
- leaf = bp->data;
+ leaf = bp->b_addr;
if (!leaf->hdr.stale) {
return;
}
@@ -677,7 +746,7 @@ xfs_dir2_leaf_compact(
*/
void
xfs_dir2_leaf_compact_x1(
- xfs_dabuf_t *bp, /* leaf buffer */
+ struct xfs_buf *bp, /* leaf buffer */
int *indexp, /* insertion index */
int *lowstalep, /* out: stale entry before us */
int *highstalep, /* out: stale entry after us */
@@ -693,7 +762,7 @@ xfs_dir2_leaf_compact_x1(
int newindex=0; /* new insertion index */
int to; /* destination copy index */
- leaf = bp->data;
+ leaf = bp->b_addr;
ASSERT(be16_to_cpu(leaf->hdr.stale) > 1);
index = *indexp;
@@ -763,6 +832,216 @@ xfs_dir2_leaf_compact_x1(
*highstalep = highstale;
}
+struct xfs_dir2_leaf_map_info {
+ xfs_extlen_t map_blocks; /* number of fsbs in map */
+ xfs_dablk_t map_off; /* last mapped file offset */
+ int map_size; /* total entries in *map */
+ int map_valid; /* valid entries in *map */
+ int nmap; /* mappings to ask xfs_bmapi */
+ xfs_dir2_db_t curdb; /* db for current block */
+ int ra_current; /* number of read-ahead blks */
+ int ra_index; /* *map index for read-ahead */
+ int ra_offset; /* map entry offset for ra */
+ int ra_want; /* readahead count wanted */
+ struct xfs_bmbt_irec map[]; /* map vector for blocks */
+};
+
+STATIC int
+xfs_dir2_leaf_readbuf(
+ struct xfs_inode *dp,
+ size_t bufsize,
+ struct xfs_dir2_leaf_map_info *mip,
+ xfs_dir2_off_t *curoff,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp = *bpp;
+ struct xfs_bmbt_irec *map = mip->map;
+ int error = 0;
+ int length;
+ int i;
+ int j;
+
+ /*
+ * If we have a buffer, we need to release it and
+ * take it out of the mapping.
+ */
+
+ if (bp) {
+ xfs_trans_brelse(NULL, bp);
+ bp = NULL;
+ mip->map_blocks -= mp->m_dirblkfsbs;
+ /*
+ * Loop to get rid of the extents for the
+ * directory block.
+ */
+ for (i = mp->m_dirblkfsbs; i > 0; ) {
+ j = min_t(int, map->br_blockcount, i);
+ map->br_blockcount -= j;
+ map->br_startblock += j;
+ map->br_startoff += j;
+ /*
+ * If mapping is done, pitch it from
+ * the table.
+ */
+ if (!map->br_blockcount && --mip->map_valid)
+ memmove(&map[0], &map[1],
+ sizeof(map[0]) * mip->map_valid);
+ i -= j;
+ }
+ }
+
+ /*
+ * Recalculate the readahead blocks wanted.
+ */
+ mip->ra_want = howmany(bufsize + mp->m_dirblksize,
+ mp->m_sb.sb_blocksize) - 1;
+ ASSERT(mip->ra_want >= 0);
+
+ /*
+ * If we don't have as many as we want, and we haven't
+ * run out of data blocks, get some more mappings.
+ */
+ if (1 + mip->ra_want > mip->map_blocks &&
+ mip->map_off < xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) {
+ /*
+ * Get more bmaps, fill in after the ones
+ * we already have in the table.
+ */
+ mip->nmap = mip->map_size - mip->map_valid;
+ error = xfs_bmapi_read(dp, mip->map_off,
+ xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) -
+ mip->map_off,
+ &map[mip->map_valid], &mip->nmap, 0);
+
+ /*
+ * Don't know if we should ignore this or try to return an
+ * error. The trouble with returning errors is that readdir
+ * will just stop without actually passing the error through.
+ */
+ if (error)
+ goto out; /* XXX */
+
+ /*
+ * If we got all the mappings we asked for, set the final map
+ * offset based on the last bmap value received. Otherwise,
+ * we've reached the end.
+ */
+ if (mip->nmap == mip->map_size - mip->map_valid) {
+ i = mip->map_valid + mip->nmap - 1;
+ mip->map_off = map[i].br_startoff + map[i].br_blockcount;
+ } else
+ mip->map_off = xfs_dir2_byte_to_da(mp,
+ XFS_DIR2_LEAF_OFFSET);
+
+ /*
+ * Look for holes in the mapping, and eliminate them. Count up
+ * the valid blocks.
+ */
+ for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) {
+ if (map[i].br_startblock == HOLESTARTBLOCK) {
+ mip->nmap--;
+ length = mip->map_valid + mip->nmap - i;
+ if (length)
+ memmove(&map[i], &map[i + 1],
+ sizeof(map[i]) * length);
+ } else {
+ mip->map_blocks += map[i].br_blockcount;
+ i++;
+ }
+ }
+ mip->map_valid += mip->nmap;
+ }
+
+ /*
+ * No valid mappings, so no more data blocks.
+ */
+ if (!mip->map_valid) {
+ *curoff = xfs_dir2_da_to_byte(mp, mip->map_off);
+ goto out;
+ }
+
+ /*
+ * Read the directory block starting at the first mapping.
+ */
+ mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
+ error = xfs_dir2_data_read(NULL, dp, map->br_startoff,
+ map->br_blockcount >= mp->m_dirblkfsbs ?
+ XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp);
+
+ /*
+ * Should just skip over the data block instead of giving up.
+ */
+ if (error)
+ goto out; /* XXX */
+
+ /*
+ * Adjust the current amount of read-ahead: we just read a block that
+ * was previously read ahead.
+ */
+ if (mip->ra_current)
+ mip->ra_current -= mp->m_dirblkfsbs;
+
+ /*
+ * Do we need more readahead?
+ */
+ for (mip->ra_index = mip->ra_offset = i = 0;
+ mip->ra_want > mip->ra_current && i < mip->map_blocks;
+ i += mp->m_dirblkfsbs) {
+ ASSERT(mip->ra_index < mip->map_valid);
+ /*
+ * Read-ahead a contiguous directory block.
+ */
+ if (i > mip->ra_current &&
+ map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) {
+ xfs_dir2_data_readahead(NULL, dp,
+ map[mip->ra_index].br_startoff + mip->ra_offset,
+ XFS_FSB_TO_DADDR(mp,
+ map[mip->ra_index].br_startblock +
+ mip->ra_offset));
+ mip->ra_current = i;
+ }
+
+ /*
+ * Read-ahead a non-contiguous directory block. This doesn't
+ * use our mapping, but this is a very rare case.
+ */
+ else if (i > mip->ra_current) {
+ xfs_dir2_data_readahead(NULL, dp,
+ map[mip->ra_index].br_startoff +
+ mip->ra_offset, -1);
+ mip->ra_current = i;
+ }
+
+ /*
+ * Advance offset through the mapping table.
+ */
+ for (j = 0; j < mp->m_dirblkfsbs; j++) {
+ /*
+ * The rest of this extent but not more than a dir
+ * block.
+ */
+ length = min_t(int, mp->m_dirblkfsbs,
+ map[mip->ra_index].br_blockcount -
+ mip->ra_offset);
+ j += length;
+ mip->ra_offset += length;
+
+ /*
+ * Advance to the next mapping if this one is used up.
+ */
+ if (mip->ra_offset == map[mip->ra_index].br_blockcount) {
+ mip->ra_offset = 0;
+ mip->ra_index++;
+ }
+ }
+ }
+
+out:
+ *bpp = bp;
+ return error;
+}
+
/*
* Getdents (readdir) for leaf and node directories.
* This reads the data blocks only, so is the same for both forms.
@@ -775,30 +1054,18 @@ xfs_dir2_leaf_getdents(
xfs_off_t *offset,
filldir_t filldir)
{
- xfs_dabuf_t *bp; /* data block buffer */
- int byteoff; /* offset in current block */
- xfs_dir2_db_t curdb; /* db for current block */
- xfs_dir2_off_t curoff; /* current overall offset */
+ struct xfs_buf *bp = NULL; /* data block buffer */
xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_dir2_data_entry_t *dep; /* data entry */
xfs_dir2_data_unused_t *dup; /* unused entry */
int error = 0; /* error return value */
- int i; /* temporary loop index */
- int j; /* temporary loop index */
int length; /* temporary length value */
- xfs_bmbt_irec_t *map; /* map vector for blocks */
- xfs_extlen_t map_blocks; /* number of fsbs in map */
- xfs_dablk_t map_off; /* last mapped file offset */
- int map_size; /* total entries in *map */
- int map_valid; /* valid entries in *map */
xfs_mount_t *mp; /* filesystem mount point */
+ int byteoff; /* offset in current block */
+ xfs_dir2_off_t curoff; /* current overall offset */
xfs_dir2_off_t newoff; /* new curoff after new blk */
- int nmap; /* mappings to ask xfs_bmapi */
char *ptr = NULL; /* pointer to current data */
- int ra_current; /* number of read-ahead blks */
- int ra_index; /* *map index for read-ahead */
- int ra_offset; /* map entry offset for ra */
- int ra_want; /* readahead count wanted */
+ struct xfs_dir2_leaf_map_info *map_info;
/*
* If the offset is at or past the largest allowed value,
@@ -814,10 +1081,12 @@ xfs_dir2_leaf_getdents(
* buffer size, the directory block size, and the filesystem
* block size.
*/
- map_size = howmany(bufsize + mp->m_dirblksize, mp->m_sb.sb_blocksize);
- map = kmem_alloc(map_size * sizeof(*map), KM_SLEEP);
- map_valid = ra_index = ra_offset = ra_current = map_blocks = 0;
- bp = NULL;
+ length = howmany(bufsize + mp->m_dirblksize,
+ mp->m_sb.sb_blocksize);
+ map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
+ (length * sizeof(struct xfs_bmbt_irec)),
+ KM_SLEEP);
+ map_info->map_size = length;
/*
* Inside the loop we keep the main offset value as a byte offset
@@ -829,7 +1098,9 @@ xfs_dir2_leaf_getdents(
* Force this conversion through db so we truncate the offset
* down to get the start of the data block.
*/
- map_off = xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, curoff));
+ map_info->map_off = xfs_dir2_db_to_da(mp,
+ xfs_dir2_byte_to_db(mp, curoff));
+
/*
* Loop over directory entries until we reach the end offset.
* Get more blocks and readahead as necessary.
@@ -839,191 +1110,17 @@ xfs_dir2_leaf_getdents(
* If we have no buffer, or we're off the end of the
* current buffer, need to get another one.
*/
- if (!bp || ptr >= (char *)bp->data + mp->m_dirblksize) {
- /*
- * If we have a buffer, we need to release it and
- * take it out of the mapping.
- */
- if (bp) {
- xfs_da_brelse(NULL, bp);
- bp = NULL;
- map_blocks -= mp->m_dirblkfsbs;
- /*
- * Loop to get rid of the extents for the
- * directory block.
- */
- for (i = mp->m_dirblkfsbs; i > 0; ) {
- j = MIN((int)map->br_blockcount, i);
- map->br_blockcount -= j;
- map->br_startblock += j;
- map->br_startoff += j;
- /*
- * If mapping is done, pitch it from
- * the table.
- */
- if (!map->br_blockcount && --map_valid)
- memmove(&map[0], &map[1],
- sizeof(map[0]) *
- map_valid);
- i -= j;
- }
- }
- /*
- * Recalculate the readahead blocks wanted.
- */
- ra_want = howmany(bufsize + mp->m_dirblksize,
- mp->m_sb.sb_blocksize) - 1;
- ASSERT(ra_want >= 0);
+ if (!bp || ptr >= (char *)bp->b_addr + mp->m_dirblksize) {
- /*
- * If we don't have as many as we want, and we haven't
- * run out of data blocks, get some more mappings.
- */
- if (1 + ra_want > map_blocks &&
- map_off <
- xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) {
- /*
- * Get more bmaps, fill in after the ones
- * we already have in the table.
- */
- nmap = map_size - map_valid;
- error = xfs_bmapi_read(dp, map_off,
- xfs_dir2_byte_to_da(mp,
- XFS_DIR2_LEAF_OFFSET) - map_off,
- &map[map_valid], &nmap, 0);
- /*
- * Don't know if we should ignore this or
- * try to return an error.
- * The trouble with returning errors
- * is that readdir will just stop without
- * actually passing the error through.
- */
- if (error)
- break; /* XXX */
- /*
- * If we got all the mappings we asked for,
- * set the final map offset based on the
- * last bmap value received.
- * Otherwise, we've reached the end.
- */
- if (nmap == map_size - map_valid)
- map_off =
- map[map_valid + nmap - 1].br_startoff +
- map[map_valid + nmap - 1].br_blockcount;
- else
- map_off =
- xfs_dir2_byte_to_da(mp,
- XFS_DIR2_LEAF_OFFSET);
- /*
- * Look for holes in the mapping, and
- * eliminate them. Count up the valid blocks.
- */
- for (i = map_valid; i < map_valid + nmap; ) {
- if (map[i].br_startblock ==
- HOLESTARTBLOCK) {
- nmap--;
- length = map_valid + nmap - i;
- if (length)
- memmove(&map[i],
- &map[i + 1],
- sizeof(map[i]) *
- length);
- } else {
- map_blocks +=
- map[i].br_blockcount;
- i++;
- }
- }
- map_valid += nmap;
- }
- /*
- * No valid mappings, so no more data blocks.
- */
- if (!map_valid) {
- curoff = xfs_dir2_da_to_byte(mp, map_off);
+ error = xfs_dir2_leaf_readbuf(dp, bufsize, map_info,
+ &curoff, &bp);
+ if (error || !map_info->map_valid)
break;
- }
- /*
- * Read the directory block starting at the first
- * mapping.
- */
- curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
- error = xfs_da_read_buf(NULL, dp, map->br_startoff,
- map->br_blockcount >= mp->m_dirblkfsbs ?
- XFS_FSB_TO_DADDR(mp, map->br_startblock) :
- -1,
- &bp, XFS_DATA_FORK);
- /*
- * Should just skip over the data block instead
- * of giving up.
- */
- if (error)
- break; /* XXX */
- /*
- * Adjust the current amount of read-ahead: we just
- * read a block that was previously ra.
- */
- if (ra_current)
- ra_current -= mp->m_dirblkfsbs;
- /*
- * Do we need more readahead?
- */
- for (ra_index = ra_offset = i = 0;
- ra_want > ra_current && i < map_blocks;
- i += mp->m_dirblkfsbs) {
- ASSERT(ra_index < map_valid);
- /*
- * Read-ahead a contiguous directory block.
- */
- if (i > ra_current &&
- map[ra_index].br_blockcount >=
- mp->m_dirblkfsbs) {
- xfs_buf_readahead(mp->m_ddev_targp,
- XFS_FSB_TO_DADDR(mp,
- map[ra_index].br_startblock +
- ra_offset),
- (int)BTOBB(mp->m_dirblksize));
- ra_current = i;
- }
- /*
- * Read-ahead a non-contiguous directory block.
- * This doesn't use our mapping, but this
- * is a very rare case.
- */
- else if (i > ra_current) {
- (void)xfs_da_reada_buf(NULL, dp,
- map[ra_index].br_startoff +
- ra_offset, XFS_DATA_FORK);
- ra_current = i;
- }
- /*
- * Advance offset through the mapping table.
- */
- for (j = 0; j < mp->m_dirblkfsbs; j++) {
- /*
- * The rest of this extent but not
- * more than a dir block.
- */
- length = MIN(mp->m_dirblkfsbs,
- (int)(map[ra_index].br_blockcount -
- ra_offset));
- j += length;
- ra_offset += length;
- /*
- * Advance to the next mapping if
- * this one is used up.
- */
- if (ra_offset ==
- map[ra_index].br_blockcount) {
- ra_offset = 0;
- ra_index++;
- }
- }
- }
+
/*
* Having done a read, we need to set a new offset.
*/
- newoff = xfs_dir2_db_off_to_byte(mp, curdb, 0);
+ newoff = xfs_dir2_db_off_to_byte(mp, map_info->curdb, 0);
/*
* Start of the current block.
*/
@@ -1034,8 +1131,8 @@ xfs_dir2_leaf_getdents(
*/
else if (curoff > newoff)
ASSERT(xfs_dir2_byte_to_db(mp, curoff) ==
- curdb);
- hdr = bp->data;
+ map_info->curdb);
+ hdr = bp->b_addr;
xfs_dir2_data_check(dp, bp);
/*
* Find our position in the block.
@@ -1117,9 +1214,9 @@ xfs_dir2_leaf_getdents(
*offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
else
*offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
- kmem_free(map);
+ kmem_free(map_info);
if (bp)
- xfs_da_brelse(NULL, bp);
+ xfs_trans_brelse(NULL, bp);
return error;
}
@@ -1130,10 +1227,10 @@ int
xfs_dir2_leaf_init(
xfs_da_args_t *args, /* operation arguments */
xfs_dir2_db_t bno, /* directory block number */
- xfs_dabuf_t **bpp, /* out: leaf buffer */
+ struct xfs_buf **bpp, /* out: leaf buffer */
int magic) /* magic number for block */
{
- xfs_dabuf_t *bp; /* leaf buffer */
+ struct xfs_buf *bp; /* leaf buffer */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return code */
xfs_dir2_leaf_t *leaf; /* leaf structure */
@@ -1151,15 +1248,14 @@ xfs_dir2_leaf_init(
* Get the buffer for the block.
*/
error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp,
- XFS_DATA_FORK);
- if (error) {
+ XFS_DATA_FORK);
+ if (error)
return error;
- }
- ASSERT(bp != NULL);
- leaf = bp->data;
+
/*
* Initialize the header.
*/
+ leaf = bp->b_addr;
leaf->hdr.info.magic = cpu_to_be16(magic);
leaf->hdr.info.forw = 0;
leaf->hdr.info.back = 0;
@@ -1172,10 +1268,12 @@ xfs_dir2_leaf_init(
* the block.
*/
if (magic == XFS_DIR2_LEAF1_MAGIC) {
+ bp->b_ops = &xfs_dir2_leaf1_buf_ops;
ltp = xfs_dir2_leaf_tail_p(mp, leaf);
ltp->bestcount = 0;
xfs_dir2_leaf_log_tail(tp, bp);
- }
+ } else
+ bp->b_ops = &xfs_dir2_leafn_buf_ops;
*bpp = bp;
return 0;
}
@@ -1186,7 +1284,7 @@ xfs_dir2_leaf_init(
static void
xfs_dir2_leaf_log_bests(
xfs_trans_t *tp, /* transaction pointer */
- xfs_dabuf_t *bp, /* leaf buffer */
+ struct xfs_buf *bp, /* leaf buffer */
int first, /* first entry to log */
int last) /* last entry to log */
{
@@ -1195,12 +1293,12 @@ xfs_dir2_leaf_log_bests(
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
- leaf = bp->data;
+ leaf = bp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
ltp = xfs_dir2_leaf_tail_p(tp->t_mountp, leaf);
firstb = xfs_dir2_leaf_bests_p(ltp) + first;
lastb = xfs_dir2_leaf_bests_p(ltp) + last;
- xfs_da_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf),
+ xfs_trans_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf),
(uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1));
}
@@ -1210,7 +1308,7 @@ xfs_dir2_leaf_log_bests(
void
xfs_dir2_leaf_log_ents(
xfs_trans_t *tp, /* transaction pointer */
- xfs_dabuf_t *bp, /* leaf buffer */
+ struct xfs_buf *bp, /* leaf buffer */
int first, /* first entry to log */
int last) /* last entry to log */
{
@@ -1218,12 +1316,12 @@ xfs_dir2_leaf_log_ents(
xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */
xfs_dir2_leaf_t *leaf; /* leaf structure */
- leaf = bp->data;
+ leaf = bp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
firstlep = &leaf->ents[first];
lastlep = &leaf->ents[last];
- xfs_da_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf),
+ xfs_trans_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf),
(uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1));
}
@@ -1232,15 +1330,15 @@ xfs_dir2_leaf_log_ents(
*/
void
xfs_dir2_leaf_log_header(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_dabuf_t *bp) /* leaf buffer */
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
{
xfs_dir2_leaf_t *leaf; /* leaf structure */
- leaf = bp->data;
+ leaf = bp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
- xfs_da_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf),
+ xfs_trans_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf),
(uint)(sizeof(leaf->hdr) - 1));
}
@@ -1249,18 +1347,18 @@ xfs_dir2_leaf_log_header(
*/
STATIC void
xfs_dir2_leaf_log_tail(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_dabuf_t *bp) /* leaf buffer */
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
{
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
xfs_mount_t *mp; /* filesystem mount point */
mp = tp->t_mountp;
- leaf = bp->data;
+ leaf = bp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
ltp = xfs_dir2_leaf_tail_p(mp, leaf);
- xfs_da_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf),
+ xfs_trans_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf),
(uint)(mp->m_dirblksize - 1));
}
@@ -1273,12 +1371,12 @@ int
xfs_dir2_leaf_lookup(
xfs_da_args_t *args) /* operation arguments */
{
- xfs_dabuf_t *dbp; /* data block buffer */
+ struct xfs_buf *dbp; /* data block buffer */
xfs_dir2_data_entry_t *dep; /* data block entry */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return code */
int index; /* found entry index */
- xfs_dabuf_t *lbp; /* leaf buffer */
+ struct xfs_buf *lbp; /* leaf buffer */
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
xfs_trans_t *tp; /* transaction pointer */
@@ -1294,7 +1392,7 @@ xfs_dir2_leaf_lookup(
tp = args->trans;
dp = args->dp;
xfs_dir2_leaf_check(dp, lbp);
- leaf = lbp->data;
+ leaf = lbp->b_addr;
/*
* Get to the leaf entry and contained data entry address.
*/
@@ -1303,15 +1401,15 @@ xfs_dir2_leaf_lookup(
* Point to the data entry.
*/
dep = (xfs_dir2_data_entry_t *)
- ((char *)dbp->data +
+ ((char *)dbp->b_addr +
xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address)));
/*
* Return the found inode number & CI name if appropriate
*/
args->inumber = be64_to_cpu(dep->inumber);
error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
- xfs_da_brelse(tp, dbp);
- xfs_da_brelse(tp, lbp);
+ xfs_trans_brelse(tp, dbp);
+ xfs_trans_brelse(tp, lbp);
return XFS_ERROR(error);
}
@@ -1324,17 +1422,17 @@ xfs_dir2_leaf_lookup(
static int /* error */
xfs_dir2_leaf_lookup_int(
xfs_da_args_t *args, /* operation arguments */
- xfs_dabuf_t **lbpp, /* out: leaf buffer */
+ struct xfs_buf **lbpp, /* out: leaf buffer */
int *indexp, /* out: index in leaf block */
- xfs_dabuf_t **dbpp) /* out: data buffer */
+ struct xfs_buf **dbpp) /* out: data buffer */
{
xfs_dir2_db_t curdb = -1; /* current data block number */
- xfs_dabuf_t *dbp = NULL; /* data buffer */
+ struct xfs_buf *dbp = NULL; /* data buffer */
xfs_dir2_data_entry_t *dep; /* data entry */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return code */
int index; /* index in leaf block */
- xfs_dabuf_t *lbp; /* leaf buffer */
+ struct xfs_buf *lbp; /* leaf buffer */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_mount_t *mp; /* filesystem mount point */
@@ -1346,15 +1444,13 @@ xfs_dir2_leaf_lookup_int(
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- /*
- * Read the leaf block into the buffer.
- */
- error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
- XFS_DATA_FORK);
+
+ error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
if (error)
return error;
+
*lbpp = lbp;
- leaf = lbp->data;
+ leaf = lbp->b_addr;
xfs_dir2_leaf_check(dp, lbp);
/*
* Look for the first leaf entry with our hash value.
@@ -1382,21 +1478,20 @@ xfs_dir2_leaf_lookup_int(
*/
if (newdb != curdb) {
if (dbp)
- xfs_da_brelse(tp, dbp);
- error = xfs_da_read_buf(tp, dp,
- xfs_dir2_db_to_da(mp, newdb),
- -1, &dbp, XFS_DATA_FORK);
+ xfs_trans_brelse(tp, dbp);
+ error = xfs_dir2_data_read(tp, dp,
+ xfs_dir2_db_to_da(mp, newdb),
+ -1, &dbp);
if (error) {
- xfs_da_brelse(tp, lbp);
+ xfs_trans_brelse(tp, lbp);
return error;
}
- xfs_dir2_data_check(dp, dbp);
curdb = newdb;
}
/*
* Point to the data entry.
*/
- dep = (xfs_dir2_data_entry_t *)((char *)dbp->data +
+ dep = (xfs_dir2_data_entry_t *)((char *)dbp->b_addr +
xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
/*
* Compare name and if it's an exact match, return the index
@@ -1424,12 +1519,12 @@ xfs_dir2_leaf_lookup_int(
if (args->cmpresult == XFS_CMP_CASE) {
ASSERT(cidb != -1);
if (cidb != curdb) {
- xfs_da_brelse(tp, dbp);
- error = xfs_da_read_buf(tp, dp,
- xfs_dir2_db_to_da(mp, cidb),
- -1, &dbp, XFS_DATA_FORK);
+ xfs_trans_brelse(tp, dbp);
+ error = xfs_dir2_data_read(tp, dp,
+ xfs_dir2_db_to_da(mp, cidb),
+ -1, &dbp);
if (error) {
- xfs_da_brelse(tp, lbp);
+ xfs_trans_brelse(tp, lbp);
return error;
}
}
@@ -1441,8 +1536,8 @@ xfs_dir2_leaf_lookup_int(
*/
ASSERT(cidb == -1);
if (dbp)
- xfs_da_brelse(tp, dbp);
- xfs_da_brelse(tp, lbp);
+ xfs_trans_brelse(tp, dbp);
+ xfs_trans_brelse(tp, lbp);
return XFS_ERROR(ENOENT);
}
@@ -1456,13 +1551,13 @@ xfs_dir2_leaf_removename(
__be16 *bestsp; /* leaf block best freespace */
xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_dir2_db_t db; /* data block number */
- xfs_dabuf_t *dbp; /* data block buffer */
+ struct xfs_buf *dbp; /* data block buffer */
xfs_dir2_data_entry_t *dep; /* data entry structure */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return code */
xfs_dir2_db_t i; /* temporary data block # */
int index; /* index into leaf entries */
- xfs_dabuf_t *lbp; /* leaf buffer */
+ struct xfs_buf *lbp; /* leaf buffer */
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
@@ -1483,8 +1578,8 @@ xfs_dir2_leaf_removename(
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- leaf = lbp->data;
- hdr = dbp->data;
+ leaf = lbp->b_addr;
+ hdr = dbp->b_addr;
xfs_dir2_data_check(dp, dbp);
/*
* Point to the leaf entry, use that to point to the data entry.
@@ -1541,12 +1636,9 @@ xfs_dir2_leaf_removename(
* Just go on, returning success, leaving the
* empty block in place.
*/
- if (error == ENOSPC && args->total == 0) {
- xfs_da_buf_done(dbp);
+ if (error == ENOSPC && args->total == 0)
error = 0;
- }
xfs_dir2_leaf_check(dp, lbp);
- xfs_da_buf_done(lbp);
return error;
}
dbp = NULL;
@@ -1577,10 +1669,9 @@ xfs_dir2_leaf_removename(
/*
* If the data block was not the first one, drop it.
*/
- else if (db != mp->m_dirdatablk && dbp != NULL) {
- xfs_da_buf_done(dbp);
+ else if (db != mp->m_dirdatablk)
dbp = NULL;
- }
+
xfs_dir2_leaf_check(dp, lbp);
/*
* See if we can convert to block form.
@@ -1595,12 +1686,12 @@ int /* error */
xfs_dir2_leaf_replace(
xfs_da_args_t *args) /* operation arguments */
{
- xfs_dabuf_t *dbp; /* data block buffer */
+ struct xfs_buf *dbp; /* data block buffer */
xfs_dir2_data_entry_t *dep; /* data block entry */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return code */
int index; /* index of leaf entry */
- xfs_dabuf_t *lbp; /* leaf buffer */
+ struct xfs_buf *lbp; /* leaf buffer */
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
xfs_trans_t *tp; /* transaction pointer */
@@ -1614,7 +1705,7 @@ xfs_dir2_leaf_replace(
return error;
}
dp = args->dp;
- leaf = lbp->data;
+ leaf = lbp->b_addr;
/*
* Point to the leaf entry, get data address from it.
*/
@@ -1623,7 +1714,7 @@ xfs_dir2_leaf_replace(
* Point to the data entry.
*/
dep = (xfs_dir2_data_entry_t *)
- ((char *)dbp->data +
+ ((char *)dbp->b_addr +
xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address)));
ASSERT(args->inumber != be64_to_cpu(dep->inumber));
/*
@@ -1632,9 +1723,8 @@ xfs_dir2_leaf_replace(
dep->inumber = cpu_to_be64(args->inumber);
tp = args->trans;
xfs_dir2_data_log_entry(tp, dbp, dep);
- xfs_da_buf_done(dbp);
xfs_dir2_leaf_check(dp, lbp);
- xfs_da_brelse(tp, lbp);
+ xfs_trans_brelse(tp, lbp);
return 0;
}
@@ -1646,7 +1736,7 @@ xfs_dir2_leaf_replace(
int /* index value */
xfs_dir2_leaf_search_hash(
xfs_da_args_t *args, /* operation arguments */
- xfs_dabuf_t *lbp) /* leaf buffer */
+ struct xfs_buf *lbp) /* leaf buffer */
{
xfs_dahash_t hash=0; /* hash from this entry */
xfs_dahash_t hashwant; /* hash value looking for */
@@ -1656,7 +1746,7 @@ xfs_dir2_leaf_search_hash(
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
int mid=0; /* current leaf index */
- leaf = lbp->data;
+ leaf = lbp->b_addr;
#ifndef __KERNEL__
if (!leaf->hdr.count)
return 0;
@@ -1699,11 +1789,11 @@ xfs_dir2_leaf_search_hash(
int /* error */
xfs_dir2_leaf_trim_data(
xfs_da_args_t *args, /* operation arguments */
- xfs_dabuf_t *lbp, /* leaf buffer */
+ struct xfs_buf *lbp, /* leaf buffer */
xfs_dir2_db_t db) /* data block number */
{
__be16 *bestsp; /* leaf bests table */
- xfs_dabuf_t *dbp; /* data block buffer */
+ struct xfs_buf *dbp; /* data block buffer */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return value */
xfs_dir2_leaf_t *leaf; /* leaf structure */
@@ -1717,17 +1807,16 @@ xfs_dir2_leaf_trim_data(
/*
* Read the offending data block. We need its buffer.
*/
- if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp,
- XFS_DATA_FORK))) {
+ error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp);
+ if (error)
return error;
- }
- leaf = lbp->data;
+ leaf = lbp->b_addr;
ltp = xfs_dir2_leaf_tail_p(mp, leaf);
#ifdef DEBUG
{
- struct xfs_dir2_data_hdr *hdr = dbp->data;
+ struct xfs_dir2_data_hdr *hdr = dbp->b_addr;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
ASSERT(be16_to_cpu(hdr->bestfree[0].length) ==
@@ -1741,7 +1830,7 @@ xfs_dir2_leaf_trim_data(
*/
if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
ASSERT(error != ENOSPC);
- xfs_da_brelse(tp, dbp);
+ xfs_trans_brelse(tp, dbp);
return error;
}
/*
@@ -1781,10 +1870,10 @@ xfs_dir2_node_to_leaf(
xfs_da_args_t *args; /* operation arguments */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return code */
- xfs_dabuf_t *fbp; /* buffer for freespace block */
+ struct xfs_buf *fbp; /* buffer for freespace block */
xfs_fileoff_t fo; /* freespace file offset */
xfs_dir2_free_t *free; /* freespace structure */
- xfs_dabuf_t *lbp; /* buffer for leaf block */
+ struct xfs_buf *lbp; /* buffer for leaf block */
xfs_dir2_leaf_tail_t *ltp; /* tail of leaf structure */
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_mount_t *mp; /* filesystem mount point */
@@ -1838,16 +1927,15 @@ xfs_dir2_node_to_leaf(
if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + mp->m_dirblksize)
return 0;
lbp = state->path.blk[0].bp;
- leaf = lbp->data;
+ leaf = lbp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
/*
* Read the freespace block.
*/
- if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp,
- XFS_DATA_FORK))) {
+ error = xfs_dir2_free_read(tp, dp, mp->m_dirfreeblk, &fbp);
+ if (error)
return error;
- }
- free = fbp->data;
+ free = fbp->b_addr;
ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
ASSERT(!free->hdr.firstdb);
@@ -1857,7 +1945,7 @@ xfs_dir2_node_to_leaf(
*/
if (xfs_dir2_leaf_size(&leaf->hdr, be32_to_cpu(free->hdr.nvalid)) >
mp->m_dirblksize) {
- xfs_da_brelse(tp, fbp);
+ xfs_trans_brelse(tp, fbp);
return 0;
}
@@ -1869,7 +1957,10 @@ xfs_dir2_node_to_leaf(
xfs_dir2_leaf_compact(args, lbp);
else
xfs_dir2_leaf_log_header(tp, lbp);
+
+ lbp->b_ops = &xfs_dir2_leaf1_buf_ops;
leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC);
+
/*
* Set up the leaf tail from the freespace block.
*/
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index b0f26780449..5980f9b7fa9 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -36,40 +36,108 @@
/*
* Function declarations.
*/
-static void xfs_dir2_free_log_header(xfs_trans_t *tp, xfs_dabuf_t *bp);
-static int xfs_dir2_leafn_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index);
+static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args,
+ int index);
#ifdef DEBUG
-static void xfs_dir2_leafn_check(xfs_inode_t *dp, xfs_dabuf_t *bp);
+static void xfs_dir2_leafn_check(struct xfs_inode *dp, struct xfs_buf *bp);
#else
#define xfs_dir2_leafn_check(dp, bp)
#endif
-static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, xfs_dabuf_t *bp_s,
- int start_s, xfs_dabuf_t *bp_d, int start_d,
- int count);
+static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, struct xfs_buf *bp_s,
+ int start_s, struct xfs_buf *bp_d,
+ int start_d, int count);
static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
xfs_da_state_blk_t *blk1,
xfs_da_state_blk_t *blk2);
-static int xfs_dir2_leafn_remove(xfs_da_args_t *args, xfs_dabuf_t *bp,
+static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
int index, xfs_da_state_blk_t *dblk,
int *rval);
static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
xfs_da_state_blk_t *fblk);
+static void
+xfs_dir2_free_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_dir2_free_hdr *hdr = bp->b_addr;
+ int block_ok = 0;
+
+ block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC);
+ if (!block_ok) {
+ XFS_CORRUPTION_ERROR("xfs_dir2_free_verify magic",
+ XFS_ERRLEVEL_LOW, mp, hdr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+}
+
+static void
+xfs_dir2_free_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_dir2_free_verify(bp);
+}
+
+static void
+xfs_dir2_free_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_dir2_free_verify(bp);
+}
+
+static const struct xfs_buf_ops xfs_dir2_free_buf_ops = {
+ .verify_read = xfs_dir2_free_read_verify,
+ .verify_write = xfs_dir2_free_write_verify,
+};
+
+
+static int
+__xfs_dir2_free_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp)
+{
+ return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+ XFS_DATA_FORK, &xfs_dir2_free_buf_ops);
+}
+
+int
+xfs_dir2_free_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ struct xfs_buf **bpp)
+{
+ return __xfs_dir2_free_read(tp, dp, fbno, -1, bpp);
+}
+
+static int
+xfs_dir2_free_try_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ struct xfs_buf **bpp)
+{
+ return __xfs_dir2_free_read(tp, dp, fbno, -2, bpp);
+}
+
/*
* Log entries from a freespace block.
*/
STATIC void
xfs_dir2_free_log_bests(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_dabuf_t *bp, /* freespace buffer */
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
int first, /* first entry to log */
int last) /* last entry to log */
{
xfs_dir2_free_t *free; /* freespace structure */
- free = bp->data;
+ free = bp->b_addr;
ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
- xfs_da_log_buf(tp, bp,
+ xfs_trans_log_buf(tp, bp,
(uint)((char *)&free->bests[first] - (char *)free),
(uint)((char *)&free->bests[last] - (char *)free +
sizeof(free->bests[0]) - 1));
@@ -80,14 +148,14 @@ xfs_dir2_free_log_bests(
*/
static void
xfs_dir2_free_log_header(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_dabuf_t *bp) /* freespace buffer */
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
{
xfs_dir2_free_t *free; /* freespace structure */
- free = bp->data;
+ free = bp->b_addr;
ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
- xfs_da_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free),
+ xfs_trans_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free),
(uint)(sizeof(xfs_dir2_free_hdr_t) - 1));
}
@@ -99,11 +167,11 @@ xfs_dir2_free_log_header(
int /* error */
xfs_dir2_leaf_to_node(
xfs_da_args_t *args, /* operation arguments */
- xfs_dabuf_t *lbp) /* leaf buffer */
+ struct xfs_buf *lbp) /* leaf buffer */
{
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return value */
- xfs_dabuf_t *fbp; /* freespace buffer */
+ struct xfs_buf *fbp; /* freespace buffer */
xfs_dir2_db_t fdb; /* freespace block number */
xfs_dir2_free_t *free; /* freespace structure */
__be16 *from; /* pointer to freespace entry */
@@ -131,13 +199,14 @@ xfs_dir2_leaf_to_node(
/*
* Get the buffer for the new freespace block.
*/
- if ((error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp,
- XFS_DATA_FORK))) {
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp,
+ XFS_DATA_FORK);
+ if (error)
return error;
- }
- ASSERT(fbp != NULL);
- free = fbp->data;
- leaf = lbp->data;
+ fbp->b_ops = &xfs_dir2_free_buf_ops;
+
+ free = fbp->b_addr;
+ leaf = lbp->b_addr;
ltp = xfs_dir2_leaf_tail_p(mp, leaf);
/*
* Initialize the freespace block header.
@@ -157,14 +226,16 @@ xfs_dir2_leaf_to_node(
*to = cpu_to_be16(off);
}
free->hdr.nused = cpu_to_be32(n);
+
+ lbp->b_ops = &xfs_dir2_leafn_buf_ops;
leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
+
/*
* Log everything.
*/
xfs_dir2_leaf_log_header(tp, lbp);
xfs_dir2_free_log_header(tp, fbp);
xfs_dir2_free_log_bests(tp, fbp, 0, be32_to_cpu(free->hdr.nvalid) - 1);
- xfs_da_buf_done(fbp);
xfs_dir2_leafn_check(dp, lbp);
return 0;
}
@@ -175,7 +246,7 @@ xfs_dir2_leaf_to_node(
*/
static int /* error */
xfs_dir2_leafn_add(
- xfs_dabuf_t *bp, /* leaf buffer */
+ struct xfs_buf *bp, /* leaf buffer */
xfs_da_args_t *args, /* operation arguments */
int index) /* insertion pt for new entry */
{
@@ -195,7 +266,7 @@ xfs_dir2_leafn_add(
dp = args->dp;
mp = dp->i_mount;
tp = args->trans;
- leaf = bp->data;
+ leaf = bp->b_addr;
/*
* Quick check just to make sure we are not going to index
@@ -261,15 +332,15 @@ xfs_dir2_leafn_add(
*/
void
xfs_dir2_leafn_check(
- xfs_inode_t *dp, /* incore directory inode */
- xfs_dabuf_t *bp) /* leaf buffer */
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
{
int i; /* leaf index */
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_mount_t *mp; /* filesystem mount point */
int stale; /* count of stale leaves */
- leaf = bp->data;
+ leaf = bp->b_addr;
mp = dp->i_mount;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp));
@@ -291,12 +362,12 @@ xfs_dir2_leafn_check(
*/
xfs_dahash_t /* hash value */
xfs_dir2_leafn_lasthash(
- xfs_dabuf_t *bp, /* leaf buffer */
+ struct xfs_buf *bp, /* leaf buffer */
int *count) /* count of entries in leaf */
{
xfs_dir2_leaf_t *leaf; /* leaf structure */
- leaf = bp->data;
+ leaf = bp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
if (count)
*count = be16_to_cpu(leaf->hdr.count);
@@ -311,12 +382,12 @@ xfs_dir2_leafn_lasthash(
*/
STATIC int
xfs_dir2_leafn_lookup_for_addname(
- xfs_dabuf_t *bp, /* leaf buffer */
+ struct xfs_buf *bp, /* leaf buffer */
xfs_da_args_t *args, /* operation arguments */
int *indexp, /* out: leaf entry index */
xfs_da_state_t *state) /* state to fill in */
{
- xfs_dabuf_t *curbp = NULL; /* current data/free buffer */
+ struct xfs_buf *curbp = NULL; /* current data/free buffer */
xfs_dir2_db_t curdb = -1; /* current data block number */
xfs_dir2_db_t curfdb = -1; /* current free block number */
xfs_inode_t *dp; /* incore directory inode */
@@ -335,7 +406,7 @@ xfs_dir2_leafn_lookup_for_addname(
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- leaf = bp->data;
+ leaf = bp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
#ifdef __KERNEL__
ASSERT(be16_to_cpu(leaf->hdr.count) > 0);
@@ -352,7 +423,7 @@ xfs_dir2_leafn_lookup_for_addname(
/* If so, it's a free block buffer, get the block number. */
curbp = state->extrablk.bp;
curfdb = state->extrablk.blkno;
- free = curbp->data;
+ free = curbp->b_addr;
ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
}
length = xfs_dir2_data_entsize(args->namelen);
@@ -394,16 +465,14 @@ xfs_dir2_leafn_lookup_for_addname(
* If we had one before, drop it.
*/
if (curbp)
- xfs_da_brelse(tp, curbp);
- /*
- * Read the free block.
- */
- error = xfs_da_read_buf(tp, dp,
+ xfs_trans_brelse(tp, curbp);
+
+ error = xfs_dir2_free_read(tp, dp,
xfs_dir2_db_to_da(mp, newfdb),
- -1, &curbp, XFS_DATA_FORK);
+ &curbp);
if (error)
return error;
- free = curbp->data;
+ free = curbp->b_addr;
ASSERT(be32_to_cpu(free->hdr.magic) ==
XFS_DIR2_FREE_MAGIC);
ASSERT((be32_to_cpu(free->hdr.firstdb) %
@@ -424,7 +493,7 @@ xfs_dir2_leafn_lookup_for_addname(
XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
XFS_ERRLEVEL_LOW, mp);
if (curfdb != newfdb)
- xfs_da_brelse(tp, curbp);
+ xfs_trans_brelse(tp, curbp);
return XFS_ERROR(EFSCORRUPTED);
}
curfdb = newfdb;
@@ -459,12 +528,12 @@ out:
*/
STATIC int
xfs_dir2_leafn_lookup_for_entry(
- xfs_dabuf_t *bp, /* leaf buffer */
+ struct xfs_buf *bp, /* leaf buffer */
xfs_da_args_t *args, /* operation arguments */
int *indexp, /* out: leaf entry index */
xfs_da_state_t *state) /* state to fill in */
{
- xfs_dabuf_t *curbp = NULL; /* current data/free buffer */
+ struct xfs_buf *curbp = NULL; /* current data/free buffer */
xfs_dir2_db_t curdb = -1; /* current data block number */
xfs_dir2_data_entry_t *dep; /* data block entry */
xfs_inode_t *dp; /* incore directory inode */
@@ -480,7 +549,7 @@ xfs_dir2_leafn_lookup_for_entry(
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- leaf = bp->data;
+ leaf = bp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
#ifdef __KERNEL__
ASSERT(be16_to_cpu(leaf->hdr.count) > 0);
@@ -525,7 +594,7 @@ xfs_dir2_leafn_lookup_for_entry(
*/
if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT ||
curdb != state->extrablk.blkno))
- xfs_da_brelse(tp, curbp);
+ xfs_trans_brelse(tp, curbp);
/*
* If needing the block that is saved with a CI match,
* use it otherwise read in the new data block.
@@ -535,9 +604,9 @@ xfs_dir2_leafn_lookup_for_entry(
ASSERT(state->extravalid);
curbp = state->extrablk.bp;
} else {
- error = xfs_da_read_buf(tp, dp,
+ error = xfs_dir2_data_read(tp, dp,
xfs_dir2_db_to_da(mp, newdb),
- -1, &curbp, XFS_DATA_FORK);
+ -1, &curbp);
if (error)
return error;
}
@@ -547,7 +616,7 @@ xfs_dir2_leafn_lookup_for_entry(
/*
* Point to the data entry.
*/
- dep = (xfs_dir2_data_entry_t *)((char *)curbp->data +
+ dep = (xfs_dir2_data_entry_t *)((char *)curbp->b_addr +
xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
/*
* Compare the entry and if it's an exact match, return
@@ -559,7 +628,7 @@ xfs_dir2_leafn_lookup_for_entry(
/* If there is a CI match block, drop it */
if (args->cmpresult != XFS_CMP_DIFFERENT &&
curdb != state->extrablk.blkno)
- xfs_da_brelse(tp, state->extrablk.bp);
+ xfs_trans_brelse(tp, state->extrablk.bp);
args->cmpresult = cmp;
args->inumber = be64_to_cpu(dep->inumber);
*indexp = index;
@@ -567,8 +636,9 @@ xfs_dir2_leafn_lookup_for_entry(
state->extrablk.bp = curbp;
state->extrablk.blkno = curdb;
state->extrablk.index = (int)((char *)dep -
- (char *)curbp->data);
+ (char *)curbp->b_addr);
state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+ curbp->b_ops = &xfs_dir2_data_buf_ops;
if (cmp == XFS_CMP_EXACT)
return XFS_ERROR(EEXIST);
}
@@ -583,10 +653,11 @@ xfs_dir2_leafn_lookup_for_entry(
state->extrablk.index = -1;
state->extrablk.blkno = curdb;
state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+ curbp->b_ops = &xfs_dir2_data_buf_ops;
} else {
/* If the curbp is not the CI match block, drop it */
if (state->extrablk.bp != curbp)
- xfs_da_brelse(tp, curbp);
+ xfs_trans_brelse(tp, curbp);
}
} else {
state->extravalid = 0;
@@ -602,7 +673,7 @@ xfs_dir2_leafn_lookup_for_entry(
*/
int
xfs_dir2_leafn_lookup_int(
- xfs_dabuf_t *bp, /* leaf buffer */
+ struct xfs_buf *bp, /* leaf buffer */
xfs_da_args_t *args, /* operation arguments */
int *indexp, /* out: leaf entry index */
xfs_da_state_t *state) /* state to fill in */
@@ -620,9 +691,9 @@ xfs_dir2_leafn_lookup_int(
static void
xfs_dir2_leafn_moveents(
xfs_da_args_t *args, /* operation arguments */
- xfs_dabuf_t *bp_s, /* source leaf buffer */
+ struct xfs_buf *bp_s, /* source leaf buffer */
int start_s, /* source leaf index */
- xfs_dabuf_t *bp_d, /* destination leaf buffer */
+ struct xfs_buf *bp_d, /* destination leaf buffer */
int start_d, /* destination leaf index */
int count) /* count of leaves to copy */
{
@@ -640,8 +711,8 @@ xfs_dir2_leafn_moveents(
return;
}
tp = args->trans;
- leaf_s = bp_s->data;
- leaf_d = bp_d->data;
+ leaf_s = bp_s->b_addr;
+ leaf_d = bp_d->b_addr;
/*
* If the destination index is not the end of the current
* destination leaf entries, open up a hole in the destination
@@ -702,14 +773,14 @@ xfs_dir2_leafn_moveents(
*/
int /* sort order */
xfs_dir2_leafn_order(
- xfs_dabuf_t *leaf1_bp, /* leaf1 buffer */
- xfs_dabuf_t *leaf2_bp) /* leaf2 buffer */
+ struct xfs_buf *leaf1_bp, /* leaf1 buffer */
+ struct xfs_buf *leaf2_bp) /* leaf2 buffer */
{
xfs_dir2_leaf_t *leaf1; /* leaf1 structure */
xfs_dir2_leaf_t *leaf2; /* leaf2 structure */
- leaf1 = leaf1_bp->data;
- leaf2 = leaf2_bp->data;
+ leaf1 = leaf1_bp->b_addr;
+ leaf2 = leaf2_bp->b_addr;
ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
if (be16_to_cpu(leaf1->hdr.count) > 0 &&
@@ -757,8 +828,8 @@ xfs_dir2_leafn_rebalance(
blk1 = blk2;
blk2 = tmp;
}
- leaf1 = blk1->bp->data;
- leaf2 = blk2->bp->data;
+ leaf1 = blk1->bp->b_addr;
+ leaf2 = blk2->bp->b_addr;
oldsum = be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count);
#ifdef DEBUG
oldstale = be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale);
@@ -826,6 +897,77 @@ xfs_dir2_leafn_rebalance(
}
}
+/*
+ * Update the free block entry (index findex in free block fdb, buffer fbp)
+ * that tracks a data block an entry was just removed from.
+ *
+ * hdr is the data block header; the caller passes NULL once the data block
+ * itself has been removed.  longest is the data block's new longest free
+ * length.  Returns 0, or the error from xfs_dir2_shrink_inode() unless that
+ * error is ENOSPC with no space reservation (args->total == 0).
+ */
+static int
+xfs_dir2_data_block_free(
+	xfs_da_args_t		*args,
+	struct xfs_dir2_data_hdr *hdr,
+	struct xfs_dir2_free	*free,
+	xfs_dir2_db_t		fdb,
+	int			findex,
+	struct xfs_buf		*fbp,
+	int			longest)
+{
+	struct xfs_trans	*tp = args->trans;
+	int			log_entry;	/* bests[findex] needs logging */
+	int			last;		/* last valid bests[] index */
+	int			error;
+
+	/*
+	 * The data block is still there: just record its new longest free
+	 * space in the free entry and log that entry.
+	 */
+	if (hdr) {
+		free->bests[findex] = cpu_to_be16(longest);
+		xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+		return 0;
+	}
+
+	/* The data block is gone, so the table has one less used entry. */
+	be32_add_cpu(&free->hdr.nused, -1);
+	xfs_dir2_free_log_header(tp, fbp);
+
+	if (findex == be32_to_cpu(free->hdr.nvalid) - 1) {
+		/*
+		 * This was the last valid entry.  Trim the table back past
+		 * it and past any preceding entries that also refer to
+		 * non-existent data blocks.
+		 */
+		last = findex - 1;
+		while (last >= 0 &&
+		       free->bests[last] == cpu_to_be16(NULLDATAOFF))
+			last--;
+		free->hdr.nvalid = cpu_to_be32(last + 1);
+		log_entry = 0;
+	} else {
+		/* Somewhere in the middle: punch the entry out. */
+		free->bests[findex] = cpu_to_be16(NULLDATAOFF);
+		log_entry = 1;
+	}
+
+	/* Nothing useful left in the free block: try to get rid of it. */
+	if (!free->hdr.nused) {
+		error = xfs_dir2_shrink_inode(args, fdb, fbp);
+		if (error == 0)
+			return 0;	/* block is gone, nothing to log */
+		/*
+		 * ENOSPC without a space reservation is tolerated; some one
+		 * else will eventually get rid of this block.  Anything
+		 * else is a real failure.
+		 */
+		if (error != ENOSPC || args->total != 0)
+			return error;
+	}
+
+	/* Log the free entry that changed, unless it was trimmed away. */
+	if (log_entry)
+		xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+	return 0;
+}
+
/*
* Remove an entry from a node directory.
* This removes the leaf entry and the data entry,
@@ -834,14 +976,14 @@ xfs_dir2_leafn_rebalance(
static int /* error */
xfs_dir2_leafn_remove(
xfs_da_args_t *args, /* operation arguments */
- xfs_dabuf_t *bp, /* leaf buffer */
+ struct xfs_buf *bp, /* leaf buffer */
int index, /* leaf entry index */
xfs_da_state_blk_t *dblk, /* data block */
int *rval) /* resulting block needs join */
{
xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_dir2_db_t db; /* data block number */
- xfs_dabuf_t *dbp; /* data block buffer */
+ struct xfs_buf *dbp; /* data block buffer */
xfs_dir2_data_entry_t *dep; /* data block entry */
xfs_inode_t *dp; /* incore directory inode */
xfs_dir2_leaf_t *leaf; /* leaf structure */
@@ -858,7 +1000,7 @@ xfs_dir2_leafn_remove(
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- leaf = bp->data;
+ leaf = bp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
/*
* Point to the entry we're removing.
@@ -884,7 +1026,7 @@ xfs_dir2_leafn_remove(
* in the data block in case it changes.
*/
dbp = dblk->bp;
- hdr = dbp->data;
+ hdr = dbp->b_addr;
dep = (xfs_dir2_data_entry_t *)((char *)hdr + off);
longest = be16_to_cpu(hdr->bestfree[0].length);
needlog = needscan = 0;
@@ -905,22 +1047,21 @@ xfs_dir2_leafn_remove(
*/
if (longest < be16_to_cpu(hdr->bestfree[0].length)) {
int error; /* error return value */
- xfs_dabuf_t *fbp; /* freeblock buffer */
+ struct xfs_buf *fbp; /* freeblock buffer */
xfs_dir2_db_t fdb; /* freeblock block number */
int findex; /* index in freeblock entries */
xfs_dir2_free_t *free; /* freeblock structure */
- int logfree; /* need to log free entry */
/*
* Convert the data block number to a free block,
* read in the free block.
*/
fdb = xfs_dir2_db_to_fdb(mp, db);
- if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb),
- -1, &fbp, XFS_DATA_FORK))) {
+ error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, fdb),
+ &fbp);
+ if (error)
return error;
- }
- free = fbp->data;
+ free = fbp->b_addr;
ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
ASSERT(be32_to_cpu(free->hdr.firstdb) ==
xfs_dir2_free_max_bests(mp) *
@@ -948,82 +1089,19 @@ xfs_dir2_leafn_remove(
* In this case just drop the buffer and some one else
* will eventually get rid of the empty block.
*/
- else if (error == ENOSPC && args->total == 0)
- xfs_da_buf_done(dbp);
- else
+ else if (!(error == ENOSPC && args->total == 0))
return error;
}
/*
* If we got rid of the data block, we can eliminate that entry
* in the free block.
*/
- if (hdr == NULL) {
- /*
- * One less used entry in the free table.
- */
- be32_add_cpu(&free->hdr.nused, -1);
- xfs_dir2_free_log_header(tp, fbp);
- /*
- * If this was the last entry in the table, we can
- * trim the table size back. There might be other
- * entries at the end referring to non-existent
- * data blocks, get those too.
- */
- if (findex == be32_to_cpu(free->hdr.nvalid) - 1) {
- int i; /* free entry index */
-
- for (i = findex - 1;
- i >= 0 &&
- free->bests[i] == cpu_to_be16(NULLDATAOFF);
- i--)
- continue;
- free->hdr.nvalid = cpu_to_be32(i + 1);
- logfree = 0;
- }
- /*
- * Not the last entry, just punch it out.
- */
- else {
- free->bests[findex] = cpu_to_be16(NULLDATAOFF);
- logfree = 1;
- }
- /*
- * If there are no useful entries left in the block,
- * get rid of the block if we can.
- */
- if (!free->hdr.nused) {
- error = xfs_dir2_shrink_inode(args, fdb, fbp);
- if (error == 0) {
- fbp = NULL;
- logfree = 0;
- } else if (error != ENOSPC || args->total != 0)
- return error;
- /*
- * It's possible to get ENOSPC if there is no
- * space reservation. In this case some one
- * else will eventually get rid of this block.
- */
- }
- }
- /*
- * Data block is not empty, just set the free entry to
- * the new value.
- */
- else {
- free->bests[findex] = cpu_to_be16(longest);
- logfree = 1;
- }
- /*
- * Log the free entry that changed, unless we got rid of it.
- */
- if (logfree)
- xfs_dir2_free_log_bests(tp, fbp, findex, findex);
- /*
- * Drop the buffer if we still have it.
- */
- if (fbp)
- xfs_da_buf_done(fbp);
+ error = xfs_dir2_data_block_free(args, hdr, free,
+ fdb, findex, fbp, longest);
+ if (error)
+ return error;
}
+
xfs_dir2_leafn_check(dp, bp);
/*
* Return indication of whether this leaf block is empty enough
@@ -1114,7 +1192,7 @@ xfs_dir2_leafn_toosmall(
{
xfs_da_state_blk_t *blk; /* leaf block */
xfs_dablk_t blkno; /* leaf block number */
- xfs_dabuf_t *bp; /* leaf buffer */
+ struct xfs_buf *bp; /* leaf buffer */
int bytes; /* bytes in use */
int count; /* leaf live entry count */
int error; /* error return value */
@@ -1130,7 +1208,7 @@ xfs_dir2_leafn_toosmall(
* to coalesce with a sibling.
*/
blk = &state->path.blk[state->path.active - 1];
- info = blk->bp->data;
+ info = blk->bp->b_addr;
ASSERT(info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
leaf = (xfs_dir2_leaf_t *)info;
count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale);
@@ -1177,19 +1255,18 @@ xfs_dir2_leafn_toosmall(
/*
* Read the sibling leaf block.
*/
- if ((error =
- xfs_da_read_buf(state->args->trans, state->args->dp, blkno,
- -1, &bp, XFS_DATA_FORK))) {
+ error = xfs_dir2_leafn_read(state->args->trans, state->args->dp,
+ blkno, -1, &bp);
+ if (error)
return error;
- }
- ASSERT(bp != NULL);
+
/*
* Count bytes in the two blocks combined.
*/
leaf = (xfs_dir2_leaf_t *)info;
count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale);
bytes = state->blocksize - (state->blocksize >> 2);
- leaf = bp->data;
+ leaf = bp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
count += be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale);
bytes -= count * (uint)sizeof(leaf->ents[0]);
@@ -1198,7 +1275,7 @@ xfs_dir2_leafn_toosmall(
*/
if (bytes >= 0)
break;
- xfs_da_brelse(state->args->trans, bp);
+ xfs_trans_brelse(state->args->trans, bp);
}
/*
* Didn't like either block, give up.
@@ -1207,11 +1284,7 @@ xfs_dir2_leafn_toosmall(
*action = 0;
return 0;
}
- /*
- * Done with the sibling leaf block here, drop the dabuf
- * so path_shift can get it.
- */
- xfs_da_buf_done(bp);
+
/*
* Make altpath point to the block we want to keep (the lower
* numbered block) and path point to the block we want to drop.
@@ -1247,8 +1320,8 @@ xfs_dir2_leafn_unbalance(
args = state->args;
ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC);
- drop_leaf = drop_blk->bp->data;
- save_leaf = save_blk->bp->data;
+ drop_leaf = drop_blk->bp->b_addr;
+ save_leaf = save_blk->bp->b_addr;
ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
/*
@@ -1356,13 +1429,13 @@ xfs_dir2_node_addname_int(
{
xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_dir2_db_t dbno; /* data block number */
- xfs_dabuf_t *dbp; /* data block buffer */
+ struct xfs_buf *dbp; /* data block buffer */
xfs_dir2_data_entry_t *dep; /* data entry pointer */
xfs_inode_t *dp; /* incore directory inode */
xfs_dir2_data_unused_t *dup; /* data unused entry pointer */
int error; /* error return value */
xfs_dir2_db_t fbno; /* freespace block number */
- xfs_dabuf_t *fbp; /* freespace buffer */
+ struct xfs_buf *fbp; /* freespace buffer */
int findex; /* freespace entry index */
xfs_dir2_free_t *free=NULL; /* freespace block structure */
xfs_dir2_db_t ifbno; /* initial freespace block no */
@@ -1390,7 +1463,7 @@ xfs_dir2_node_addname_int(
* Remember initial freespace block number.
*/
ifbno = fblk->blkno;
- free = fbp->data;
+ free = fbp->b_addr;
ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
findex = fblk->index;
/*
@@ -1466,15 +1539,14 @@ xfs_dir2_node_addname_int(
* This should be really rare, so there's no reason
* to avoid it.
*/
- if ((error = xfs_da_read_buf(tp, dp,
- xfs_dir2_db_to_da(mp, fbno), -2, &fbp,
- XFS_DATA_FORK))) {
+ error = xfs_dir2_free_try_read(tp, dp,
+ xfs_dir2_db_to_da(mp, fbno),
+ &fbp);
+ if (error)
return error;
- }
- if (unlikely(fbp == NULL)) {
+ if (!fbp)
continue;
- }
- free = fbp->data;
+ free = fbp->b_addr;
ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
findex = 0;
}
@@ -1492,7 +1564,7 @@ xfs_dir2_node_addname_int(
/*
* Drop the block.
*/
- xfs_da_brelse(tp, fbp);
+ xfs_trans_brelse(tp, fbp);
fbp = NULL;
if (fblk && fblk->bp)
fblk->bp = NULL;
@@ -1507,36 +1579,23 @@ xfs_dir2_node_addname_int(
/*
* Not allowed to allocate, return failure.
*/
- if ((args->op_flags & XFS_DA_OP_JUSTCHECK) ||
- args->total == 0) {
- /*
- * Drop the freespace buffer unless it came from our
- * caller.
- */
- if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
- xfs_da_buf_done(fbp);
+ if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
return XFS_ERROR(ENOSPC);
- }
+
/*
* Allocate and initialize the new data block.
*/
if (unlikely((error = xfs_dir2_grow_inode(args,
XFS_DIR2_DATA_SPACE,
&dbno)) ||
- (error = xfs_dir2_data_init(args, dbno, &dbp)))) {
- /*
- * Drop the freespace buffer unless it came from our
- * caller.
- */
- if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
- xfs_da_buf_done(fbp);
+ (error = xfs_dir2_data_init(args, dbno, &dbp))))
return error;
- }
+
/*
* If (somehow) we have a freespace block, get rid of it.
*/
if (fbp)
- xfs_da_brelse(tp, fbp);
+ xfs_trans_brelse(tp, fbp);
if (fblk && fblk->bp)
fblk->bp = NULL;
@@ -1545,12 +1604,12 @@ xfs_dir2_node_addname_int(
* that was just allocated.
*/
fbno = xfs_dir2_db_to_fdb(mp, dbno);
- if (unlikely(error = xfs_da_read_buf(tp, dp,
- xfs_dir2_db_to_da(mp, fbno), -2, &fbp,
- XFS_DATA_FORK))) {
- xfs_da_buf_done(dbp);
+ error = xfs_dir2_free_try_read(tp, dp,
+ xfs_dir2_db_to_da(mp, fbno),
+ &fbp);
+ if (error)
return error;
- }
+
/*
* If there wasn't a freespace block, the read will
* return a NULL fbp. Allocate and initialize a new one.
@@ -1587,18 +1646,18 @@ xfs_dir2_node_addname_int(
/*
* Get a buffer for the new block.
*/
- if ((error = xfs_da_get_buf(tp, dp,
- xfs_dir2_db_to_da(mp, fbno),
- -1, &fbp, XFS_DATA_FORK))) {
+ error = xfs_da_get_buf(tp, dp,
+ xfs_dir2_db_to_da(mp, fbno),
+ -1, &fbp, XFS_DATA_FORK);
+ if (error)
return error;
- }
- ASSERT(fbp != NULL);
+ fbp->b_ops = &xfs_dir2_free_buf_ops;
/*
* Initialize the new block to be empty, and remember
* its first slot as our empty slot.
*/
- free = fbp->data;
+ free = fbp->b_addr;
free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC);
free->hdr.firstdb = cpu_to_be32(
(fbno - XFS_DIR2_FREE_FIRSTDB(mp)) *
@@ -1606,7 +1665,7 @@ xfs_dir2_node_addname_int(
free->hdr.nvalid = 0;
free->hdr.nused = 0;
} else {
- free = fbp->data;
+ free = fbp->b_addr;
ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
}
@@ -1639,7 +1698,7 @@ xfs_dir2_node_addname_int(
* We haven't allocated the data entry yet so this will
* change again.
*/
- hdr = dbp->data;
+ hdr = dbp->b_addr;
free->bests[findex] = hdr->bestfree[0].length;
logfree = 1;
}
@@ -1650,22 +1709,17 @@ xfs_dir2_node_addname_int(
/*
* If just checking, we succeeded.
*/
- if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
- if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
- xfs_da_buf_done(fbp);
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK)
return 0;
- }
+
/*
* Read the data block in.
*/
- if (unlikely(
- error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno),
- -1, &dbp, XFS_DATA_FORK))) {
- if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
- xfs_da_buf_done(fbp);
+ error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno),
+ -1, &dbp);
+ if (error)
return error;
- }
- hdr = dbp->data;
+ hdr = dbp->b_addr;
logfree = 0;
}
ASSERT(be16_to_cpu(hdr->bestfree[0].length) >= length);
@@ -1714,16 +1768,10 @@ xfs_dir2_node_addname_int(
if (logfree)
xfs_dir2_free_log_bests(tp, fbp, findex, findex);
/*
- * If the caller didn't hand us the freespace block, drop it.
- */
- if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
- xfs_da_buf_done(fbp);
- /*
* Return the data block and offset in args, then drop the data block.
*/
args->blkno = (xfs_dablk_t)dbno;
args->index = be16_to_cpu(*tagp);
- xfs_da_buf_done(dbp);
return 0;
}
@@ -1761,22 +1809,23 @@ xfs_dir2_node_lookup(
/* If a CI match, dup the actual name and return EEXIST */
xfs_dir2_data_entry_t *dep;
- dep = (xfs_dir2_data_entry_t *)((char *)state->extrablk.bp->
- data + state->extrablk.index);
+ dep = (xfs_dir2_data_entry_t *)
+ ((char *)state->extrablk.bp->b_addr +
+ state->extrablk.index);
rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
}
/*
* Release the btree blocks and leaf block.
*/
for (i = 0; i < state->path.active; i++) {
- xfs_da_brelse(args->trans, state->path.blk[i].bp);
+ xfs_trans_brelse(args->trans, state->path.blk[i].bp);
state->path.blk[i].bp = NULL;
}
/*
* Release the data block if we have it.
*/
if (state->extravalid && state->extrablk.bp) {
- xfs_da_brelse(args->trans, state->extrablk.bp);
+ xfs_trans_brelse(args->trans, state->extrablk.bp);
state->extrablk.bp = NULL;
}
xfs_da_state_free(state);
@@ -1893,13 +1942,13 @@ xfs_dir2_node_replace(
*/
blk = &state->path.blk[state->path.active - 1];
ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
- leaf = blk->bp->data;
+ leaf = blk->bp->b_addr;
lep = &leaf->ents[blk->index];
ASSERT(state->extravalid);
/*
* Point to the data entry.
*/
- hdr = state->extrablk.bp->data;
+ hdr = state->extrablk.bp->b_addr;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
dep = (xfs_dir2_data_entry_t *)
((char *)hdr +
@@ -1916,14 +1965,14 @@ xfs_dir2_node_replace(
* Didn't find it, and we're holding a data block. Drop it.
*/
else if (state->extravalid) {
- xfs_da_brelse(args->trans, state->extrablk.bp);
+ xfs_trans_brelse(args->trans, state->extrablk.bp);
state->extrablk.bp = NULL;
}
/*
* Release all the buffers in the cursor.
*/
for (i = 0; i < state->path.active; i++) {
- xfs_da_brelse(args->trans, state->path.blk[i].bp);
+ xfs_trans_brelse(args->trans, state->path.blk[i].bp);
state->path.blk[i].bp = NULL;
}
xfs_da_state_free(state);
@@ -1940,7 +1989,7 @@ xfs_dir2_node_trim_free(
xfs_fileoff_t fo, /* free block number */
int *rvalp) /* out: did something */
{
- xfs_dabuf_t *bp; /* freespace buffer */
+ struct xfs_buf *bp; /* freespace buffer */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return code */
xfs_dir2_free_t *free; /* freespace structure */
@@ -1953,25 +2002,22 @@ xfs_dir2_node_trim_free(
/*
* Read the freespace block.
*/
- if (unlikely(error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp,
- XFS_DATA_FORK))) {
+ error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
+ if (error)
return error;
- }
-
/*
* There can be holes in freespace. If fo is a hole, there's
* nothing to do.
*/
- if (bp == NULL) {
+ if (!bp)
return 0;
- }
- free = bp->data;
+ free = bp->b_addr;
ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
/*
* If there are used entries, there's nothing to do.
*/
if (be32_to_cpu(free->hdr.nused) > 0) {
- xfs_da_brelse(tp, bp);
+ xfs_trans_brelse(tp, bp);
*rvalp = 0;
return 0;
}
@@ -1987,7 +2033,7 @@ xfs_dir2_node_trim_free(
* pieces. This is the last block of an extent.
*/
ASSERT(error != ENOSPC);
- xfs_da_brelse(tp, bp);
+ xfs_trans_brelse(tp, bp);
return error;
}
/*
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 067f403ecf8..7da79f6515f 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -25,11 +25,13 @@ extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
xfs_dir2_db_t *dbp);
extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
- struct xfs_dabuf *bp);
+ struct xfs_buf *bp);
extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
const unsigned char *name, int len);
/* xfs_dir2_block.c */
+extern const struct xfs_buf_ops xfs_dir2_block_buf_ops;
+
extern int xfs_dir2_block_addname(struct xfs_da_args *args);
extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent,
xfs_off_t *offset, filldir_t filldir);
@@ -37,57 +39,70 @@ extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
extern int xfs_dir2_block_removename(struct xfs_da_args *args);
extern int xfs_dir2_block_replace(struct xfs_da_args *args);
extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
- struct xfs_dabuf *lbp, struct xfs_dabuf *dbp);
+ struct xfs_buf *lbp, struct xfs_buf *dbp);
/* xfs_dir2_data.c */
#ifdef DEBUG
-extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_dabuf *bp);
+/* No trailing semicolon: the call site supplies it, keeping if/else safe. */
+#define	xfs_dir2_data_check(dp,bp) __xfs_dir2_data_check(dp, bp)
#else
#define xfs_dir2_data_check(dp,bp)
#endif
+
+extern const struct xfs_buf_ops xfs_dir2_data_buf_ops;
+
+extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
+extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
+extern int xfs_dir2_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t bno, xfs_daddr_t mapped_bno);
+
extern struct xfs_dir2_data_free *
xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
struct xfs_dir2_data_unused *dup, int *loghead);
extern void xfs_dir2_data_freescan(struct xfs_mount *mp,
struct xfs_dir2_data_hdr *hdr, int *loghead);
extern int xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
- struct xfs_dabuf **bpp);
-extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_dabuf *bp,
+ struct xfs_buf **bpp);
+extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp,
struct xfs_dir2_data_entry *dep);
extern void xfs_dir2_data_log_header(struct xfs_trans *tp,
- struct xfs_dabuf *bp);
-extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_dabuf *bp,
+ struct xfs_buf *bp);
+extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_buf *bp,
struct xfs_dir2_data_unused *dup);
-extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_dabuf *bp,
+extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_buf *bp,
xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
int *needlogp, int *needscanp);
-extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_dabuf *bp,
+extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset,
xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
/* xfs_dir2_leaf.c */
+extern const struct xfs_buf_ops xfs_dir2_leafn_buf_ops;
+
+extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
- struct xfs_dabuf *dbp);
+ struct xfs_buf *dbp);
extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
extern void xfs_dir2_leaf_compact(struct xfs_da_args *args,
- struct xfs_dabuf *bp);
-extern void xfs_dir2_leaf_compact_x1(struct xfs_dabuf *bp, int *indexp,
+ struct xfs_buf *bp);
+extern void xfs_dir2_leaf_compact_x1(struct xfs_buf *bp, int *indexp,
int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent,
size_t bufsize, xfs_off_t *offset, filldir_t filldir);
extern int xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno,
- struct xfs_dabuf **bpp, int magic);
-extern void xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_dabuf *bp,
+ struct xfs_buf **bpp, int magic);
+extern void xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp,
int first, int last);
extern void xfs_dir2_leaf_log_header(struct xfs_trans *tp,
- struct xfs_dabuf *bp);
+ struct xfs_buf *bp);
extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args);
extern int xfs_dir2_leaf_removename(struct xfs_da_args *args);
extern int xfs_dir2_leaf_replace(struct xfs_da_args *args);
extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args,
- struct xfs_dabuf *lbp);
+ struct xfs_buf *lbp);
extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args,
- struct xfs_dabuf *lbp, xfs_dir2_db_t db);
+ struct xfs_buf *lbp, xfs_dir2_db_t db);
extern struct xfs_dir2_leaf_entry *
xfs_dir2_leaf_find_entry(struct xfs_dir2_leaf *leaf, int index, int compact,
int lowstale, int highstale,
@@ -96,13 +111,13 @@ extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state);
/* xfs_dir2_node.c */
extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
- struct xfs_dabuf *lbp);
-extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count);
-extern int xfs_dir2_leafn_lookup_int(struct xfs_dabuf *bp,
+ struct xfs_buf *lbp);
+extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_buf *bp, int *count);
+extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp,
struct xfs_da_args *args, int *indexp,
struct xfs_da_state *state);
-extern int xfs_dir2_leafn_order(struct xfs_dabuf *leaf1_bp,
- struct xfs_dabuf *leaf2_bp);
+extern int xfs_dir2_leafn_order(struct xfs_buf *leaf1_bp,
+ struct xfs_buf *leaf2_bp);
extern int xfs_dir2_leafn_split(struct xfs_da_state *state,
struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk);
extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action);
@@ -115,6 +130,8 @@ extern int xfs_dir2_node_removename(struct xfs_da_args *args);
extern int xfs_dir2_node_replace(struct xfs_da_args *args);
extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
int *rvalp);
+extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t fbno, struct xfs_buf **bpp);
/* xfs_dir2_sf.c */
extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp);
@@ -122,7 +139,7 @@ extern xfs_ino_t xfs_dir2_sfe_get_ino(struct xfs_dir2_sf_hdr *sfp,
struct xfs_dir2_sf_entry *sfep);
extern int xfs_dir2_block_sfsize(struct xfs_inode *dp,
struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp);
-extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_dabuf *bp,
+extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
int size, xfs_dir2_sf_hdr_t *sfhp);
extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 19bf0c5e38f..1b9fc3ec7e4 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -222,7 +222,7 @@ xfs_dir2_block_sfsize(
int /* error */
xfs_dir2_block_to_sf(
xfs_da_args_t *args, /* operation arguments */
- xfs_dabuf_t *bp, /* block buffer */
+ struct xfs_buf *bp,
int size, /* shortform directory size */
xfs_dir2_sf_hdr_t *sfhp) /* shortform directory hdr */
{
@@ -249,7 +249,7 @@ xfs_dir2_block_to_sf(
* and add local data.
*/
hdr = kmem_alloc(mp->m_dirblksize, KM_SLEEP);
- memcpy(hdr, bp->data, mp->m_dirblksize);
+ memcpy(hdr, bp->b_addr, mp->m_dirblksize);
logflags = XFS_ILOG_CORE;
if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) {
ASSERT(error != ENOSPC);
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index f9c3fe304a1..69cf4fcde03 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -179,12 +179,14 @@ xfs_ioc_trim(
* used by the fstrim application. In the end it really doesn't
* matter as trimming blocks is an advisory interface.
*/
+ if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
+ range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)))
+ return -XFS_ERROR(EINVAL);
+
start = BTOBB(range.start);
end = start + BTOBBT(range.len) - 1;
minlen = BTOBB(max_t(u64, granularity, range.minlen));
- if (XFS_BB_TO_FSB(mp, start) >= mp->m_sb.sb_dblocks)
- return -XFS_ERROR(EINVAL);
if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1)
end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index bf27fcca484..9e1bf5294c9 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -248,7 +248,59 @@ xfs_qm_init_dquot_blk(
xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
}
+/*
+ * Run xfs_qm_dqcheck() over every dquot in a dquot buffer.  The first dquot
+ * that fails gets a corruption report and marks the buffer EFSCORRUPTED;
+ * the remaining dquots are then not examined.
+ */
+static void
+xfs_dquot_buf_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dqblk	*d = (struct xfs_dqblk *)bp->b_addr;
+	xfs_dqid_t		first_id = 0;
+	int			n;
+
+	/*
+	 * The expected id of each dquot is not known here, only that ids
+	 * increase by one per slot.  The id stored in slot 0 is therefore
+	 * taken as the base, which means a corrupt first id shows up as a
+	 * failure on the second dquot rather than the first.
+	 */
+	for (n = 0; n < mp->m_quotainfo->qi_dqperchunk; n++) {
+		struct xfs_disk_dquot	*ddq = &d[n].dd_diskdq;
+
+		if (n == 0)
+			first_id = be32_to_cpu(ddq->d_id);
+
+		if (!xfs_qm_dqcheck(mp, ddq, first_id + n, 0, XFS_QMOPT_DOWARN,
+				    "xfs_dquot_read_verify"))
+			continue;
+
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		return;
+	}
+}
+
+/* Read-side verifier: same checks as the common dquot buffer verifier. */
+static void
+xfs_dquot_buf_read_verify(
+	struct xfs_buf		*bp)
+{
+	xfs_dquot_buf_verify(bp);
+}
+
+/* Write-side verifier: same checks as the common dquot buffer verifier. */
+void
+xfs_dquot_buf_write_verify(
+	struct xfs_buf		*bp)
+{
+	xfs_dquot_buf_verify(bp);
+}
+
+/* Buffer ops attached to dquot buffers (assigned to bp->b_ops). */
+const struct xfs_buf_ops xfs_dquot_buf_ops = {
+	.verify_read	= xfs_dquot_buf_read_verify,
+	.verify_write	= xfs_dquot_buf_write_verify,
+};
/*
* Allocate a block and fill it with dquots.
@@ -315,6 +367,7 @@ xfs_qm_dqalloc(
error = xfs_buf_geterror(bp);
if (error)
goto error1;
+ bp->b_ops = &xfs_dquot_buf_ops;
/*
* Make a chunk of dquots out of this buffer and log
@@ -359,6 +412,51 @@ xfs_qm_dqalloc(
return (error);
}
+/*
+ * Re-read the dquot buffer backing dqp with no read verifier attached and
+ * run xfs_qm_dqcheck() in repair mode over every dquot in it, expecting ids
+ * that count up from firstid.
+ *
+ * On success returns 0 with *bpp pointing at the locked buffer, which has
+ * xfs_dquot_buf_ops attached so it is verified on write.  On failure
+ * returns the read error, or EIO if a dquot could not be repaired; the
+ * buffer is not held and *bpp is NULL.
+ */
+STATIC int
+xfs_qm_dqrepair(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_dquot	*dqp,
+	xfs_dqid_t		firstid,
+	struct xfs_buf		**bpp)
+{
+	int			error;
+	struct xfs_disk_dquot	*ddq;
+	struct xfs_dqblk	*d;
+	int			i;
+
+	/*
+	 * Read the buffer without verification so we get the corrupted
+	 * buffer returned to us. Make sure we verify it on write, though.
+	 */
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno,
+				   mp->m_quotainfo->qi_dqchunklen,
+				   0, bpp, NULL);
+
+	if (error) {
+		ASSERT(*bpp == NULL);
+		return XFS_ERROR(error);
+	}
+	(*bpp)->b_ops = &xfs_dquot_buf_ops;
+
+	ASSERT(xfs_buf_islocked(*bpp));
+	d = (struct xfs_dqblk *)(*bpp)->b_addr;
+
+	/* Do the actual repair of dquots in this buffer */
+	for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
+		ddq = &d[i].dd_diskdq;
+		error = xfs_qm_dqcheck(mp, ddq, firstid + i,
+				       dqp->dq_flags & XFS_DQ_ALLTYPES,
+				       XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair");
+		if (error) {
+			/*
+			 * Repair failed.  Release the buffer and clear the
+			 * caller's pointer: the caller asserts it is NULL on
+			 * error and must not be left with a stale reference.
+			 */
+			xfs_trans_brelse(tp, *bpp);
+			*bpp = NULL;
+			return XFS_ERROR(EIO);
+		}
+	}
+
+	return 0;
+}
/*
* Maps a dquot to the buffer containing its on-disk version.
@@ -378,7 +476,6 @@ xfs_qm_dqtobp(
xfs_buf_t *bp;
xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp);
xfs_mount_t *mp = dqp->q_mount;
- xfs_disk_dquot_t *ddq;
xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
xfs_trans_t *tp = (tpp ? *tpp : NULL);
@@ -439,33 +536,24 @@ xfs_qm_dqtobp(
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
dqp->q_blkno,
mp->m_quotainfo->qi_dqchunklen,
- 0, &bp);
- if (error || !bp)
- return XFS_ERROR(error);
- }
-
- ASSERT(xfs_buf_islocked(bp));
+ 0, &bp, &xfs_dquot_buf_ops);
- /*
- * calculate the location of the dquot inside the buffer.
- */
- ddq = bp->b_addr + dqp->q_bufoffset;
+ if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
+ xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff *
+ mp->m_quotainfo->qi_dqperchunk;
+ ASSERT(bp == NULL);
+ error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp);
+ }
- /*
- * A simple sanity check in case we got a corrupted dquot...
- */
- error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES,
- flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
- "dqtobp");
- if (error) {
- if (!(flags & XFS_QMOPT_DQREPAIR)) {
- xfs_trans_brelse(tp, bp);
- return XFS_ERROR(EIO);
+ if (error) {
+ ASSERT(bp == NULL);
+ return XFS_ERROR(error);
}
}
+ ASSERT(xfs_buf_islocked(bp));
*O_bpp = bp;
- *O_ddpp = ddq;
+ *O_ddpp = bp->b_addr + dqp->q_bufoffset;
return (0);
}
@@ -920,7 +1008,7 @@ xfs_qm_dqflush(
* Get the buffer containing the on-disk dquot
*/
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL);
if (error)
goto out_unlock;
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 7d20af27346..c694a8469c4 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -161,4 +161,6 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
return dqp;
}
+extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+
#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 42679223a0f..a83611849ce 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -29,6 +29,7 @@
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
/*
* Note that we only accept fileids which are long enough rather than allow
@@ -189,6 +190,9 @@ xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid,
struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid;
struct inode *inode = NULL;
+ if (fh_len < xfs_fileid_length(fileid_type))
+ return NULL;
+
switch (fileid_type) {
case FILEID_INO32_GEN_PARENT:
inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 9f7ec15a652..67284edb84d 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -31,11 +31,14 @@
#include "xfs_error.h"
#include "xfs_vnodeops.h"
#include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
#include <linux/dcache.h>
#include <linux/falloc.h>
+#include <linux/pagevec.h>
static const struct vm_operations_struct xfs_file_vm_ops;
@@ -83,7 +86,7 @@ xfs_rw_ilock_demote(
* valid before the operation, it will be read from disk before
* being partially zeroed.
*/
-STATIC int
+int
xfs_iozero(
struct xfs_inode *ip, /* inode */
loff_t pos, /* offset in file */
@@ -236,7 +239,6 @@ xfs_file_aio_read(
ssize_t ret = 0;
int ioflags = 0;
xfs_fsize_t n;
- unsigned long seg;
XFS_STATS_INC(xs_read_calls);
@@ -247,33 +249,22 @@ xfs_file_aio_read(
if (file->f_mode & FMODE_NOCMTIME)
ioflags |= IO_INVIS;
- /* START copy & waste from filemap.c */
- for (seg = 0; seg < nr_segs; seg++) {
- const struct iovec *iv = &iovp[seg];
-
- /*
- * If any segment has a negative length, or the cumulative
- * length ever wraps negative then return -EINVAL.
- */
- size += iv->iov_len;
- if (unlikely((ssize_t)(size|iv->iov_len) < 0))
- return XFS_ERROR(-EINVAL);
- }
- /* END copy & waste from filemap.c */
+ ret = generic_segment_checks(iovp, &nr_segs, &size, VERIFY_WRITE);
+ if (ret < 0)
+ return ret;
if (unlikely(ioflags & IO_ISDIRECT)) {
xfs_buftarg_t *target =
XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
- if ((iocb->ki_pos & target->bt_smask) ||
- (size & target->bt_smask)) {
- if (iocb->ki_pos == i_size_read(inode))
+ if ((pos & target->bt_smask) || (size & target->bt_smask)) {
+ if (pos == i_size_read(inode))
return 0;
return -XFS_ERROR(EINVAL);
}
}
- n = XFS_MAXIOFFSET(mp) - iocb->ki_pos;
+ n = mp->m_super->s_maxbytes - pos;
if (n <= 0 || size == 0)
return 0;
@@ -299,20 +290,21 @@ xfs_file_aio_read(
xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
if (inode->i_mapping->nrpages) {
- ret = -xfs_flushinval_pages(ip,
- (iocb->ki_pos & PAGE_CACHE_MASK),
- -1, FI_REMAPF_LOCKED);
+ ret = -filemap_write_and_wait_range(
+ VFS_I(ip)->i_mapping,
+ pos, -1);
if (ret) {
xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
}
+ truncate_pagecache_range(VFS_I(ip), pos, -1);
}
xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
}
- trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
+ trace_xfs_file_read(ip, size, pos, ioflags);
- ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
+ ret = generic_file_aio_read(iocb, iovp, nr_segs, pos);
if (ret > 0)
XFS_STATS_ADD(xs_read_bytes, ret);
@@ -680,10 +672,11 @@ xfs_file_dio_aio_write(
goto out;
if (mapping->nrpages) {
- ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
- FI_REMAPF_LOCKED);
+ ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+ pos, -1);
if (ret)
goto out;
+ truncate_pagecache_range(VFS_I(ip), pos, -1);
}
/*
@@ -738,16 +731,17 @@ xfs_file_buffered_aio_write(
write_retry:
trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
ret = generic_file_buffered_write(iocb, iovp, nr_segs,
- pos, &iocb->ki_pos, count, ret);
+ pos, &iocb->ki_pos, count, 0);
+
/*
- * if we just got an ENOSPC, flush the inode now we aren't holding any
- * page locks and retry *once*
+ * If we just got an ENOSPC, try to write back all dirty inodes to
+ * convert delalloc space to free up some of the excess reserved
+ * metadata space.
*/
if (ret == -ENOSPC && !enospc) {
enospc = 1;
- ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
- if (!ret)
- goto write_retry;
+ xfs_flush_inodes(ip->i_mount);
+ goto write_retry;
}
current->backing_dev_info = NULL;
@@ -781,10 +775,12 @@ xfs_file_aio_write(
if (ocount == 0)
return 0;
- xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
+ sb_start_write(inode->i_sb);
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return -EIO;
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ ret = -EIO;
+ goto out;
+ }
if (unlikely(file->f_flags & O_DIRECT))
ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount);
@@ -803,6 +799,8 @@ xfs_file_aio_write(
ret = err;
}
+out:
+ sb_end_write(inode->i_sb);
return ret;
}
@@ -895,7 +893,7 @@ xfs_dir_open(
*/
mode = xfs_ilock_map_shared(ip);
if (ip->i_d.di_nextents > 0)
- xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
+ xfs_dir2_data_readahead(NULL, ip, 0, -1);
xfs_iunlock(ip, mode);
return 0;
}
@@ -946,7 +944,6 @@ xfs_file_mmap(
struct vm_area_struct *vma)
{
vma->vm_ops = &xfs_file_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
file_accessed(filp);
return 0;
@@ -966,17 +963,232 @@ xfs_vm_page_mkwrite(
return block_page_mkwrite(vma, vmf, xfs_get_blocks);
}
+/*
+ * This type is designed to indicate the type of offset we would like
+ * to search from page cache for either xfs_seek_data() or xfs_seek_hole().
+ */
+enum {
+ HOLE_OFF = 0,
+ DATA_OFF,
+};
+
+/*
+ * Lookup the desired type of offset from the given page.
+ *
+ * On success, return true and the offset argument will point to the
+ * start of the region that was found. Otherwise this function will
+ * return false and keep the offset argument unchanged.
+ */
+STATIC bool
+xfs_lookup_buffer_offset(
+ struct page *page,
+ loff_t *offset,
+ unsigned int type)
+{
+ loff_t lastoff = page_offset(page);
+ bool found = false;
+ struct buffer_head *bh, *head;
+
+ bh = head = page_buffers(page);
+ do {
+ /*
+ * Unwritten extents that have data in the page
+ * cache covering them can be identified by the
+ * BH_Unwritten state flag. Pages with multiple
+ * buffers might have a mix of holes, data and
+ * unwritten extents - any buffer with valid
+ * data in it should have BH_Uptodate flag set
+ * on it.
+ */
+ if (buffer_unwritten(bh) ||
+ buffer_uptodate(bh)) {
+ if (type == DATA_OFF)
+ found = true;
+ } else {
+ if (type == HOLE_OFF)
+ found = true;
+ }
+
+ if (found) {
+ *offset = lastoff;
+ break;
+ }
+ lastoff += bh->b_size;
+ } while ((bh = bh->b_this_page) != head);
+
+ return found;
+}
+
+/*
+ * This routine is called to find out and return a data or hole offset
+ * from the page cache for unwritten extents according to the desired
+ * type for xfs_seek_data() or xfs_seek_hole().
+ *
+ * The argument offset is used to tell where we start to search from the
+ * page cache. Map is used to figure out the end points of the range to
+ * lookup pages.
+ *
+ * Return true if the desired type of offset was found, and the argument
+ * offset is filled with that address. Otherwise, return false and keep
+ * offset unchanged.
+ */
+STATIC bool
+xfs_find_get_desired_pgoff(
+ struct inode *inode,
+ struct xfs_bmbt_irec *map,
+ unsigned int type,
+ loff_t *offset)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct pagevec pvec;
+ pgoff_t index;
+ pgoff_t end;
+ loff_t endoff;
+ loff_t startoff = *offset;
+ loff_t lastoff = startoff;
+ bool found = false;
+
+ pagevec_init(&pvec, 0);
+
+ index = startoff >> PAGE_CACHE_SHIFT;
+ endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount);
+ end = endoff >> PAGE_CACHE_SHIFT;
+ do {
+ int want;
+ unsigned nr_pages;
+ unsigned int i;
+
+ want = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
+ nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
+ want);
+ /*
+ * No page mapped into given range. If we are searching holes
+ * and if this is the first time we got into the loop, it means
+ * that the given offset is landed in a hole, return it.
+ *
+ * If we have already stepped through some block buffers looking
+ * for holes but they all contain data, the last offset has
+ * already been updated to point to the end of the last mapped
+ * page. If that does not reach the end of the search range,
+ * there must be a hole between them.
+ */
+ if (nr_pages == 0) {
+ /* Data search found nothing */
+ if (type == DATA_OFF)
+ break;
+
+ ASSERT(type == HOLE_OFF);
+ if (lastoff == startoff || lastoff < endoff) {
+ found = true;
+ *offset = lastoff;
+ }
+ break;
+ }
+
+ /*
+ * At least we found one page. If this is the first time we
+ * step into the loop, and if the first page index offset is
+ * greater than the given search offset, a hole was found.
+ */
+ if (type == HOLE_OFF && lastoff == startoff &&
+ lastoff < page_offset(pvec.pages[0])) {
+ found = true;
+ break;
+ }
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ loff_t b_offset;
+
+ /*
+ * At this point, the page may be truncated or
+ * invalidated (changing page->mapping to NULL),
+ * or even swizzled back from swapper_space to tmpfs
+ * file mapping. However, page->index will not change
+ * because we have a reference on the page.
+ *
+ * Searching is done if the page index is out of range.
+ * If the current offset has not reached the end of
+ * the specified search range, there should be a hole
+ * between them.
+ */
+ if (page->index > end) {
+ if (type == HOLE_OFF && lastoff < endoff) {
+ *offset = lastoff;
+ found = true;
+ }
+ goto out;
+ }
+
+ lock_page(page);
+ /*
+ * Page truncated or invalidated(page->mapping == NULL).
+ * We can freely skip it and proceed to check the next
+ * page.
+ */
+ if (unlikely(page->mapping != inode->i_mapping)) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (!page_has_buffers(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ found = xfs_lookup_buffer_offset(page, &b_offset, type);
+ if (found) {
+ /*
+ * The found offset may be less than the start
+ * point to search if this is the first time to
+ * come here.
+ */
+ *offset = max_t(loff_t, startoff, b_offset);
+ unlock_page(page);
+ goto out;
+ }
+
+ /*
+ * We are either searching for data and found none, or
+ * searching for a hole and found a data buffer. In either
+ * case the next page may contain what we want, so advance
+ * the last offset to the start of the next page.
+ */
+ lastoff = page_offset(page) + PAGE_SIZE;
+ unlock_page(page);
+ }
+
+ /*
+ * Fewer pages were returned than we asked for, so the search
+ * is done. In this case a data search found nothing, but a
+ * hole search found a hole starting at the last offset.
+ */
+ if (nr_pages < want) {
+ if (type == HOLE_OFF) {
+ *offset = lastoff;
+ found = true;
+ }
+ break;
+ }
+
+ index = pvec.pages[i - 1]->index + 1;
+ pagevec_release(&pvec);
+ } while (index <= end);
+
+out:
+ pagevec_release(&pvec);
+ return found;
+}
+
STATIC loff_t
xfs_seek_data(
struct file *file,
- loff_t start,
- u32 type)
+ loff_t start)
{
struct inode *inode = file->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- struct xfs_bmbt_irec map[2];
- int nmap = 2;
loff_t uninitialized_var(offset);
xfs_fsize_t isize;
xfs_fileoff_t fsbno;
@@ -992,36 +1204,74 @@ xfs_seek_data(
goto out_unlock;
}
- fsbno = XFS_B_TO_FSBT(mp, start);
-
/*
* Try to read extents from the first block indicated
* by fsbno to the end block of the file.
*/
+ fsbno = XFS_B_TO_FSBT(mp, start);
end = XFS_B_TO_FSB(mp, isize);
+ for (;;) {
+ struct xfs_bmbt_irec map[2];
+ int nmap = 2;
+ unsigned int i;
- error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
- XFS_BMAPI_ENTIRE);
- if (error)
- goto out_unlock;
+ error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
+ XFS_BMAPI_ENTIRE);
+ if (error)
+ goto out_unlock;
- /*
- * Treat unwritten extent as data extent since it might
- * contains dirty data in page cache.
- */
- if (map[0].br_startblock != HOLESTARTBLOCK) {
- offset = max_t(loff_t, start,
- XFS_FSB_TO_B(mp, map[0].br_startoff));
- } else {
+ /* No extents at given offset, must be beyond EOF */
+ if (nmap == 0) {
+ error = ENXIO;
+ goto out_unlock;
+ }
+
+ for (i = 0; i < nmap; i++) {
+ offset = max_t(loff_t, start,
+ XFS_FSB_TO_B(mp, map[i].br_startoff));
+
+ /* Landed in a data extent */
+ if (map[i].br_startblock == DELAYSTARTBLOCK ||
+ (map[i].br_state == XFS_EXT_NORM &&
+ !isnullstartblock(map[i].br_startblock)))
+ goto out;
+
+ /*
+ * Landed in an unwritten extent, try to search data
+ * from page cache.
+ */
+ if (map[i].br_state == XFS_EXT_UNWRITTEN) {
+ if (xfs_find_get_desired_pgoff(inode, &map[i],
+ DATA_OFF, &offset))
+ goto out;
+ }
+ }
+
+ /*
+ * map[0] is a hole or it's an unwritten extent
+ * without data in the page cache. If there is nothing in
+ * map[1], we are probably seeking beyond EOF.
+ */
if (nmap == 1) {
error = ENXIO;
goto out_unlock;
}
- offset = max_t(loff_t, start,
- XFS_FSB_TO_B(mp, map[1].br_startoff));
+ ASSERT(i > 1);
+
+ /*
+ * Nothing was found; proceed to the next round of the search
+ * if the new offset has not reached or passed EOF.
+ */
+ fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
+ start = XFS_FSB_TO_B(mp, fsbno);
+ if (start >= isize) {
+ error = ENXIO;
+ goto out_unlock;
+ }
}
+out:
if (offset != file->f_pos)
file->f_pos = offset;
@@ -1036,16 +1286,15 @@ out_unlock:
STATIC loff_t
xfs_seek_hole(
struct file *file,
- loff_t start,
- u32 type)
+ loff_t start)
{
struct inode *inode = file->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
loff_t uninitialized_var(offset);
- loff_t holeoff;
xfs_fsize_t isize;
xfs_fileoff_t fsbno;
+ xfs_filblks_t end;
uint lock;
int error;
@@ -1061,21 +1310,77 @@ xfs_seek_hole(
}
fsbno = XFS_B_TO_FSBT(mp, start);
- error = xfs_bmap_first_unused(NULL, ip, 1, &fsbno, XFS_DATA_FORK);
- if (error)
- goto out_unlock;
+ end = XFS_B_TO_FSB(mp, isize);
+
+ for (;;) {
+ struct xfs_bmbt_irec map[2];
+ int nmap = 2;
+ unsigned int i;
+
+ error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
+ XFS_BMAPI_ENTIRE);
+ if (error)
+ goto out_unlock;
+
+ /* No extents at given offset, must be beyond EOF */
+ if (nmap == 0) {
+ error = ENXIO;
+ goto out_unlock;
+ }
+
+ for (i = 0; i < nmap; i++) {
+ offset = max_t(loff_t, start,
+ XFS_FSB_TO_B(mp, map[i].br_startoff));
+
+ /* Landed in a hole */
+ if (map[i].br_startblock == HOLESTARTBLOCK)
+ goto out;
+
+ /*
+ * Landed in an unwritten extent, try to search hole
+ * from page cache.
+ */
+ if (map[i].br_state == XFS_EXT_UNWRITTEN) {
+ if (xfs_find_get_desired_pgoff(inode, &map[i],
+ HOLE_OFF, &offset))
+ goto out;
+ }
+ }
- holeoff = XFS_FSB_TO_B(mp, fsbno);
- if (holeoff <= start)
- offset = start;
- else {
/*
- * xfs_bmap_first_unused() could return a value bigger than
- * isize if there are no more holes past the supplied offset.
+ * map[0] contains data, or it's unwritten but has
+ * data in the page cache, which probably means we are
+ * seeking beyond EOF. Set the offset to point
+ * to the end of the file (i.e., there is an implicit
+ * hole at the end of any file).
*/
- offset = min_t(loff_t, holeoff, isize);
+ if (nmap == 1) {
+ offset = isize;
+ break;
+ }
+
+ ASSERT(i > 1);
+
+ /*
+ * Both mappings contain data; proceed to the next round of the
+ * search if the current offset has not reached or passed EOF.
+ */
+ fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
+ start = XFS_FSB_TO_B(mp, fsbno);
+ if (start >= isize) {
+ offset = isize;
+ break;
+ }
}
+out:
+ /*
+ * At this point, we must have found a hole. However, the returned
+ * offset may be bigger than the file size, as it may be aligned to
+ * a page boundary for unwritten extents, so we need to clamp it
+ * to the file size.
+ */
+ offset = min_t(loff_t, offset, isize);
if (offset != file->f_pos)
file->f_pos = offset;
@@ -1099,9 +1404,9 @@ xfs_file_llseek(
case SEEK_SET:
return generic_file_llseek(file, offset, origin);
case SEEK_DATA:
- return xfs_seek_data(file, offset, origin);
+ return xfs_seek_data(file, offset);
case SEEK_HOLE:
- return xfs_seek_hole(file, offset, origin);
+ return xfs_seek_hole(file, offset);
default:
return -EINVAL;
}
@@ -1141,4 +1446,5 @@ const struct file_operations xfs_dir_file_operations = {
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = xfs_vm_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index c13fed8c394..6dda3f949b0 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -233,7 +233,8 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */
#define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */
#define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */
-#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */
+#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */
+#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */
#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */
@@ -339,6 +340,35 @@ typedef struct xfs_error_injection {
/*
+ * Speculative preallocation trimming.
+ */
+#define XFS_EOFBLOCKS_VERSION 1
+struct xfs_eofblocks {
+ __u32 eof_version;
+ __u32 eof_flags;
+ uid_t eof_uid;
+ gid_t eof_gid;
+ prid_t eof_prid;
+ __u32 pad32;
+ __u64 eof_min_file_size;
+ __u64 pad64[12];
+};
+
+/* eof_flags values */
+#define XFS_EOF_FLAGS_SYNC (1 << 0) /* sync/wait mode scan */
+#define XFS_EOF_FLAGS_UID (1 << 1) /* filter by uid */
+#define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */
+#define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */
+#define XFS_EOF_FLAGS_MINFILESIZE (1 << 4) /* filter by min file size */
+#define XFS_EOF_FLAGS_VALID \
+ (XFS_EOF_FLAGS_SYNC | \
+ XFS_EOF_FLAGS_UID | \
+ XFS_EOF_FLAGS_GID | \
+ XFS_EOF_FLAGS_PRID | \
+ XFS_EOF_FLAGS_MINFILESIZE)
+
+
+/*
* The user-level Handle Request interface structure.
*/
typedef struct xfs_fsop_handlereq {
@@ -456,6 +486,7 @@ typedef struct xfs_handle {
/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */
#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap)
#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64)
+#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_eofblocks)
/*
* ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
deleted file mode 100644
index 652b875a9d4..00000000000
--- a/fs/xfs/xfs_fs_subr.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_vnodeops.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_trace.h"
-
-/*
- * note: all filemap functions return negative error codes. These
- * need to be inverted before returning to the xfs core functions.
- */
-void
-xfs_tosspages(
- xfs_inode_t *ip,
- xfs_off_t first,
- xfs_off_t last,
- int fiopt)
-{
- /* can't toss partial tail pages, so mask them out */
- last &= ~(PAGE_SIZE - 1);
- truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
-}
-
-int
-xfs_flushinval_pages(
- xfs_inode_t *ip,
- xfs_off_t first,
- xfs_off_t last,
- int fiopt)
-{
- struct address_space *mapping = VFS_I(ip)->i_mapping;
- int ret = 0;
-
- trace_xfs_pagecache_inval(ip, first, last);
-
- xfs_iflags_clear(ip, XFS_ITRUNCATED);
- ret = filemap_write_and_wait_range(mapping, first,
- last == -1 ? LLONG_MAX : last);
- if (!ret)
- truncate_inode_pages_range(mapping, first, last);
- return -ret;
-}
-
-int
-xfs_flush_pages(
- xfs_inode_t *ip,
- xfs_off_t first,
- xfs_off_t last,
- uint64_t flags,
- int fiopt)
-{
- struct address_space *mapping = VFS_I(ip)->i_mapping;
- int ret = 0;
- int ret2;
-
- xfs_iflags_clear(ip, XFS_ITRUNCATED);
- ret = -filemap_fdatawrite_range(mapping, first,
- last == -1 ? LLONG_MAX : last);
- if (flags & XBF_ASYNC)
- return ret;
- ret2 = xfs_wait_on_pages(ip, first, last);
- if (!ret)
- ret = ret2;
- return ret;
-}
-
-int
-xfs_wait_on_pages(
- xfs_inode_t *ip,
- xfs_off_t first,
- xfs_off_t last)
-{
- struct address_space *mapping = VFS_I(ip)->i_mapping;
-
- if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
- return -filemap_fdatawait_range(mapping, first,
- last == -1 ? XFS_ISIZE(ip) - 1 : last);
- }
- return 0;
-}
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index c25b094efbf..94eaeedc549 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -97,7 +97,9 @@ xfs_fs_geometry(
(xfs_sb_version_haslazysbcount(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) |
(xfs_sb_version_hasattr2(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_ATTR2 : 0);
+ XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) |
+ (xfs_sb_version_hasprojid32bit(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_PROJID32 : 0);
geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
mp->m_sb.sb_logsectsize : BBSIZE;
geo->rtsectsize = mp->m_sb.sb_blocksize;
@@ -112,18 +114,40 @@ xfs_fs_geometry(
return 0;
}
+static struct xfs_buf *
+xfs_growfs_get_hdr_buf(
+ struct xfs_mount *mp,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ int flags,
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_buf *bp;
+
+ bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
+ if (!bp)
+ return NULL;
+
+ xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+ bp->b_bn = blkno;
+ bp->b_maps[0].bm_bn = blkno;
+ bp->b_ops = ops;
+
+ return bp;
+}
+
static int
xfs_growfs_data_private(
xfs_mount_t *mp, /* mount point for filesystem */
xfs_growfs_data_t *in) /* growfs data input struct */
{
xfs_agf_t *agf;
+ struct xfs_agfl *agfl;
xfs_agi_t *agi;
xfs_agnumber_t agno;
xfs_extlen_t agsize;
xfs_extlen_t tmpsize;
xfs_alloc_rec_t *arec;
- struct xfs_btree_block *block;
xfs_buf_t *bp;
int bucket;
int dpct;
@@ -146,9 +170,14 @@ xfs_growfs_data_private(
dpct = pct - mp->m_sb.sb_imax_pct;
bp = xfs_buf_read_uncached(mp->m_ddev_targp,
XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
- XFS_FSS_TO_BB(mp, 1), 0);
+ XFS_FSS_TO_BB(mp, 1), 0, NULL);
if (!bp)
return EIO;
+ if (bp->b_error) {
+ int error = bp->b_error;
+ xfs_buf_relse(bp);
+ return error;
+ }
xfs_buf_relse(bp);
new = nb; /* use new as a temporary here */
@@ -186,17 +215,18 @@ xfs_growfs_data_private(
nfree = 0;
for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
/*
- * AG freelist header block
+ * AG freespace header block
*/
- bp = xfs_buf_get(mp->m_ddev_targp,
- XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0);
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0,
+ &xfs_agf_buf_ops);
if (!bp) {
error = ENOMEM;
goto error0;
}
+
agf = XFS_BUF_TO_AGF(bp);
- memset(agf, 0, mp->m_sb.sb_sectsize);
agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
agf->agf_seqno = cpu_to_be32(agno);
@@ -223,17 +253,39 @@ xfs_growfs_data_private(
goto error0;
/*
+ * AG freelist header block
+ */
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0,
+ &xfs_agfl_buf_ops);
+ if (!bp) {
+ error = ENOMEM;
+ goto error0;
+ }
+
+ agfl = XFS_BUF_TO_AGFL(bp);
+ for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
+ agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
+
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ if (error)
+ goto error0;
+
+ /*
* AG inode header block
*/
- bp = xfs_buf_get(mp->m_ddev_targp,
- XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0);
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0,
+ &xfs_agi_buf_ops);
if (!bp) {
error = ENOMEM;
goto error0;
}
+
agi = XFS_BUF_TO_AGI(bp);
- memset(agi, 0, mp->m_sb.sb_sectsize);
agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
agi->agi_seqno = cpu_to_be32(agno);
@@ -254,24 +306,22 @@ xfs_growfs_data_private(
/*
* BNO btree root block
*/
- bp = xfs_buf_get(mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize), 0);
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
+ BTOBB(mp->m_sb.sb_blocksize), 0,
+ &xfs_allocbt_buf_ops);
+
if (!bp) {
error = ENOMEM;
goto error0;
}
- block = XFS_BUF_TO_BLOCK(bp);
- memset(block, 0, mp->m_sb.sb_blocksize);
- block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
- block->bb_level = 0;
- block->bb_numrecs = cpu_to_be16(1);
- block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
- block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
- arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
+
+ xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0);
+ arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
arec->ar_blockcount = cpu_to_be32(
agsize - be32_to_cpu(arec->ar_startblock));
+
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
if (error)
@@ -280,25 +330,22 @@ xfs_growfs_data_private(
/*
* CNT btree root block
*/
- bp = xfs_buf_get(mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize), 0);
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
+ BTOBB(mp->m_sb.sb_blocksize), 0,
+ &xfs_allocbt_buf_ops);
if (!bp) {
error = ENOMEM;
goto error0;
}
- block = XFS_BUF_TO_BLOCK(bp);
- memset(block, 0, mp->m_sb.sb_blocksize);
- block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
- block->bb_level = 0;
- block->bb_numrecs = cpu_to_be16(1);
- block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
- block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
- arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
+
+ xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0);
+ arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
arec->ar_blockcount = cpu_to_be32(
agsize - be32_to_cpu(arec->ar_startblock));
nfree += be32_to_cpu(arec->ar_blockcount);
+
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
if (error)
@@ -307,20 +354,17 @@ xfs_growfs_data_private(
/*
* INO btree root block
*/
- bp = xfs_buf_get(mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize), 0);
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
+ BTOBB(mp->m_sb.sb_blocksize), 0,
+ &xfs_inobt_buf_ops);
if (!bp) {
error = ENOMEM;
goto error0;
}
- block = XFS_BUF_TO_BLOCK(bp);
- memset(block, 0, mp->m_sb.sb_blocksize);
- block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
- block->bb_level = 0;
- block->bb_numrecs = 0;
- block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
- block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+
+ xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0);
+
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
if (error)
@@ -399,9 +443,28 @@ xfs_growfs_data_private(
/* update secondary superblocks. */
for (agno = 1; agno < nagcount; agno++) {
- error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+ error = 0;
+ /*
+ * New secondary superblocks need to be zeroed, not read from
+ * disk, as the contents of the new area we are growing into are
+ * completely unknown.
+ */
+ if (agno < oagcount) {
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &bp,
+ &xfs_sb_buf_ops);
+ } else {
+ bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp,
XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, &bp);
+ XFS_FSS_TO_BB(mp, 1), 0);
+ if (bp) {
+ bp->b_ops = &xfs_sb_buf_ops;
+ xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+ } else
+ error = ENOMEM;
+ }
+
if (error) {
xfs_warn(mp,
"error %d reading secondary superblock for ag %d",
@@ -409,6 +472,7 @@ xfs_growfs_data_private(
break;
}
xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS);
+
/*
* If we get an error writing out the alternate superblocks,
* just issue a warning and continue. The real work is
@@ -423,7 +487,7 @@ xfs_growfs_data_private(
break; /* no point in continuing */
}
}
- return 0;
+ return error;
error0:
xfs_trans_cancel(tp, XFS_TRANS_ABORT);
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 76e81cff70b..5399ef222dd 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -21,7 +21,8 @@
/*
* Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n,
* other XFS code uses these values. Times are measured in centisecs (i.e.
- * 100ths of a second).
+ * 100ths of a second) with the exception of eofb_timer, which is measured in
+ * seconds.
*/
xfs_param_t xfs_params = {
/* MIN DFLT MAX */
@@ -40,4 +41,5 @@ xfs_param_t xfs_params = {
.rotorstep = { 1, 1, 255 },
.inherit_nodfrg = { 0, 1, 1 },
.fstrm_timer = { 1, 30*100, 3600*100},
+ .eofb_timer = { 1, 300, 3600*24},
};
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 177a21a7ac4..a815412eab8 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -200,7 +200,8 @@ xfs_ialloc_inode_init(
*/
d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
- mp->m_bsize * blks_per_cluster, 0);
+ mp->m_bsize * blks_per_cluster,
+ XBF_UNMAPPED);
if (!fbuf)
return ENOMEM;
/*
@@ -210,6 +211,7 @@ xfs_ialloc_inode_init(
* to log a whole cluster of inodes instead of all the
* individual transactions causing a lot of log traffic.
*/
+ fbuf->b_ops = &xfs_inode_buf_ops;
xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
for (i = 0; i < ninodes; i++) {
int ioffset = i << mp->m_sb.sb_inodelog;
@@ -250,6 +252,7 @@ xfs_ialloc_ag_alloc(
/* boundary */
struct xfs_perag *pag;
+ memset(&args, 0, sizeof(args));
args.tp = tp;
args.mp = tp->t_mountp;
@@ -431,7 +434,7 @@ xfs_ialloc_next_ag(
spin_lock(&mp->m_agirotor_lock);
agno = mp->m_agirotor;
- if (++mp->m_agirotor == mp->m_maxagi)
+ if (++mp->m_agirotor >= mp->m_maxagi)
mp->m_agirotor = 0;
spin_unlock(&mp->m_agirotor_lock);
@@ -442,14 +445,13 @@ xfs_ialloc_next_ag(
* Select an allocation group to look for a free inode in, based on the parent
* inode and then mode. Return the allocation group buffer.
*/
-STATIC xfs_buf_t * /* allocation group buffer */
+STATIC xfs_agnumber_t
xfs_ialloc_ag_select(
xfs_trans_t *tp, /* transaction pointer */
xfs_ino_t parent, /* parent directory inode number */
umode_t mode, /* bits set to indicate file type */
int okalloc) /* ok to allocate more space */
{
- xfs_buf_t *agbp; /* allocation group header buffer */
xfs_agnumber_t agcount; /* number of ag's in the filesystem */
xfs_agnumber_t agno; /* current ag number */
int flags; /* alloc buffer locking flags */
@@ -459,6 +461,7 @@ xfs_ialloc_ag_select(
int needspace; /* file mode implies space allocated */
xfs_perag_t *pag; /* per allocation group data */
xfs_agnumber_t pagno; /* parent (starting) ag number */
+ int error;
/*
* Files of these types need at least one block if length > 0
@@ -474,7 +477,9 @@ xfs_ialloc_ag_select(
if (pagno >= agcount)
pagno = 0;
}
+
ASSERT(pagno < agcount);
+
/*
* Loop through allocation groups, looking for one with a little
* free space in it. Note we don't look for free inodes, exactly.
@@ -486,51 +491,45 @@ xfs_ialloc_ag_select(
flags = XFS_ALLOC_FLAG_TRYLOCK;
for (;;) {
pag = xfs_perag_get(mp, agno);
+ if (!pag->pagi_inodeok) {
+ xfs_ialloc_next_ag(mp);
+ goto nextag;
+ }
+
if (!pag->pagi_init) {
- if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
- agbp = NULL;
+ error = xfs_ialloc_pagi_init(mp, tp, agno);
+ if (error)
goto nextag;
- }
- } else
- agbp = NULL;
+ }
- if (!pag->pagi_inodeok) {
- xfs_ialloc_next_ag(mp);
- goto unlock_nextag;
+ if (pag->pagi_freecount) {
+ xfs_perag_put(pag);
+ return agno;
}
- /*
- * Is there enough free space for the file plus a block
- * of inodes (if we need to allocate some)?
- */
- ineed = pag->pagi_freecount ? 0 : XFS_IALLOC_BLOCKS(mp);
- if (ineed && !pag->pagf_init) {
- if (agbp == NULL &&
- xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
- agbp = NULL;
+ if (!okalloc)
+ goto nextag;
+
+ if (!pag->pagf_init) {
+ error = xfs_alloc_pagf_init(mp, tp, agno, flags);
+ if (error)
goto nextag;
- }
- (void)xfs_alloc_pagf_init(mp, tp, agno, flags);
}
- if (!ineed || pag->pagf_init) {
- if (ineed && !(longest = pag->pagf_longest))
- longest = pag->pagf_flcount > 0;
- if (!ineed ||
- (pag->pagf_freeblks >= needspace + ineed &&
- longest >= ineed &&
- okalloc)) {
- if (agbp == NULL &&
- xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
- agbp = NULL;
- goto nextag;
- }
- xfs_perag_put(pag);
- return agbp;
- }
+
+ /*
+ * Is there enough free space for the file plus a block of
+ * inodes (if we need to allocate some)?
+ */
+ ineed = XFS_IALLOC_BLOCKS(mp);
+ longest = pag->pagf_longest;
+ if (!longest)
+ longest = pag->pagf_flcount > 0;
+
+ if (pag->pagf_freeblks >= needspace + ineed &&
+ longest >= ineed) {
+ xfs_perag_put(pag);
+ return agno;
}
-unlock_nextag:
- if (agbp)
- xfs_trans_brelse(tp, agbp);
nextag:
xfs_perag_put(pag);
/*
@@ -538,13 +537,13 @@ nextag:
* down.
*/
if (XFS_FORCED_SHUTDOWN(mp))
- return NULL;
+ return NULLAGNUMBER;
agno++;
if (agno >= agcount)
agno = 0;
if (agno == pagno) {
if (flags == 0)
- return NULL;
+ return NULLAGNUMBER;
flags = 0;
}
}
@@ -607,195 +606,39 @@ xfs_ialloc_get_rec(
}
/*
- * Visible inode allocation functions.
- */
-/*
- * Find a free (set) bit in the inode bitmask.
- */
-static inline int xfs_ialloc_find_free(xfs_inofree_t *fp)
-{
- return xfs_lowbit64(*fp);
-}
-
-/*
- * Allocate an inode on disk.
- * Mode is used to tell whether the new inode will need space, and whether
- * it is a directory.
+ * Allocate an inode.
*
- * The arguments IO_agbp and alloc_done are defined to work within
- * the constraint of one allocation per transaction.
- * xfs_dialloc() is designed to be called twice if it has to do an
- * allocation to make more free inodes. On the first call,
- * IO_agbp should be set to NULL. If an inode is available,
- * i.e., xfs_dialloc() did not need to do an allocation, an inode
- * number is returned. In this case, IO_agbp would be set to the
- * current ag_buf and alloc_done set to false.
- * If an allocation needed to be done, xfs_dialloc would return
- * the current ag_buf in IO_agbp and set alloc_done to true.
- * The caller should then commit the current transaction, allocate a new
- * transaction, and call xfs_dialloc() again, passing in the previous
- * value of IO_agbp. IO_agbp should be held across the transactions.
- * Since the agbp is locked across the two calls, the second call is
- * guaranteed to have a free inode available.
- *
- * Once we successfully pick an inode its number is returned and the
- * on-disk data structures are updated. The inode itself is not read
- * in, since doing so would break ordering constraints with xfs_reclaim.
+ * The caller selected an AG for us, and made sure that free inodes are
+ * available.
*/
-int
-xfs_dialloc(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_ino_t parent, /* parent inode (directory) */
- umode_t mode, /* mode bits for new inode */
- int okalloc, /* ok to allocate more space */
- xfs_buf_t **IO_agbp, /* in/out ag header's buffer */
- boolean_t *alloc_done, /* true if we needed to replenish
- inode freelist */
- xfs_ino_t *inop) /* inode number allocated */
+STATIC int
+xfs_dialloc_ag(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_ino_t parent,
+ xfs_ino_t *inop)
{
- xfs_agnumber_t agcount; /* number of allocation groups */
- xfs_buf_t *agbp; /* allocation group header's buffer */
- xfs_agnumber_t agno; /* allocation group number */
- xfs_agi_t *agi; /* allocation group header structure */
- xfs_btree_cur_t *cur; /* inode allocation btree cursor */
- int error; /* error return value */
- int i; /* result code */
- int ialloced; /* inode allocation status */
- int noroom = 0; /* no space for inode blk allocation */
- xfs_ino_t ino; /* fs-relative inode to be returned */
- /* REFERENCED */
- int j; /* result code */
- xfs_mount_t *mp; /* file system mount structure */
- int offset; /* index of inode in chunk */
- xfs_agino_t pagino; /* parent's AG relative inode # */
- xfs_agnumber_t pagno; /* parent's AG number */
- xfs_inobt_rec_incore_t rec; /* inode allocation record */
- xfs_agnumber_t tagno; /* testing allocation group number */
- xfs_btree_cur_t *tcur; /* temp cursor */
- xfs_inobt_rec_incore_t trec; /* temp inode allocation record */
- struct xfs_perag *pag;
-
-
- if (*IO_agbp == NULL) {
- /*
- * We do not have an agbp, so select an initial allocation
- * group for inode allocation.
- */
- agbp = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
- /*
- * Couldn't find an allocation group satisfying the
- * criteria, give up.
- */
- if (!agbp) {
- *inop = NULLFSINO;
- return 0;
- }
- agi = XFS_BUF_TO_AGI(agbp);
- ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
- } else {
- /*
- * Continue where we left off before. In this case, we
- * know that the allocation group has free inodes.
- */
- agbp = *IO_agbp;
- agi = XFS_BUF_TO_AGI(agbp);
- ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
- ASSERT(be32_to_cpu(agi->agi_freecount) > 0);
- }
- mp = tp->t_mountp;
- agcount = mp->m_sb.sb_agcount;
- agno = be32_to_cpu(agi->agi_seqno);
- tagno = agno;
- pagno = XFS_INO_TO_AGNO(mp, parent);
- pagino = XFS_INO_TO_AGINO(mp, parent);
-
- /*
- * If we have already hit the ceiling of inode blocks then clear
- * okalloc so we scan all available agi structures for a free
- * inode.
- */
-
- if (mp->m_maxicount &&
- mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) {
- noroom = 1;
- okalloc = 0;
- }
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent);
+ xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent);
+ struct xfs_perag *pag;
+ struct xfs_btree_cur *cur, *tcur;
+ struct xfs_inobt_rec_incore rec, trec;
+ xfs_ino_t ino;
+ int error;
+ int offset;
+ int i, j;
- /*
- * Loop until we find an allocation group that either has free inodes
- * or in which we can allocate some inodes. Iterate through the
- * allocation groups upward, wrapping at the end.
- */
- *alloc_done = B_FALSE;
- while (!agi->agi_freecount) {
- /*
- * Don't do anything if we're not supposed to allocate
- * any blocks, just go on to the next ag.
- */
- if (okalloc) {
- /*
- * Try to allocate some new inodes in the allocation
- * group.
- */
- if ((error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced))) {
- xfs_trans_brelse(tp, agbp);
- if (error == ENOSPC) {
- *inop = NULLFSINO;
- return 0;
- } else
- return error;
- }
- if (ialloced) {
- /*
- * We successfully allocated some inodes, return
- * the current context to the caller so that it
- * can commit the current transaction and call
- * us again where we left off.
- */
- ASSERT(be32_to_cpu(agi->agi_freecount) > 0);
- *alloc_done = B_TRUE;
- *IO_agbp = agbp;
- *inop = NULLFSINO;
- return 0;
- }
- }
- /*
- * If it failed, give up on this ag.
- */
- xfs_trans_brelse(tp, agbp);
- /*
- * Go on to the next ag: get its ag header.
- */
-nextag:
- if (++tagno == agcount)
- tagno = 0;
- if (tagno == agno) {
- *inop = NULLFSINO;
- return noroom ? ENOSPC : 0;
- }
- pag = xfs_perag_get(mp, tagno);
- if (pag->pagi_inodeok == 0) {
- xfs_perag_put(pag);
- goto nextag;
- }
- error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp);
- xfs_perag_put(pag);
- if (error)
- goto nextag;
- agi = XFS_BUF_TO_AGI(agbp);
- ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
- }
- /*
- * Here with an allocation group that has a free inode.
- * Reset agno since we may have chosen a new ag in the
- * loop above.
- */
- agno = tagno;
- *IO_agbp = NULL;
pag = xfs_perag_get(mp, agno);
+ ASSERT(pag->pagi_init);
+ ASSERT(pag->pagi_inodeok);
+ ASSERT(pag->pagi_freecount > 0);
+
restart_pagno:
- cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
/*
* If pagino is 0 (this is the root inode allocation) use newino.
* This must work because we've just allocated some.
@@ -995,7 +838,7 @@ newino:
}
alloc_inode:
- offset = xfs_ialloc_find_free(&rec.ir_free);
+ offset = xfs_lowbit64(rec.ir_free);
ASSERT(offset >= 0);
ASSERT(offset < XFS_INODES_PER_CHUNK);
ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1028,6 +871,165 @@ error0:
}
/*
+ * Allocate an inode on disk.
+ *
+ * Mode is used to tell whether the new inode will need space, and whether it
+ * is a directory.
+ *
+ * This function is designed to be called twice if it has to do an allocation
+ * to make more free inodes. On the first call, *IO_agbp should be set to NULL.
+ * If an inode is available without having to perform an allocation, an inode
+ * number is returned. In this case, *IO_agbp is set to NULL. If an allocation
+ * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp.
+ * The caller should then commit the current transaction, allocate a
+ * new transaction, and call xfs_dialloc() again, passing in the previous value
+ * of *IO_agbp. IO_agbp should be held across the transactions. Since the AGI
+ * buffer is locked across the two calls, the second call is guaranteed to have
+ * a free inode available.
+ *
+ * Once we successfully pick an inode its number is returned and the on-disk
+ * data structures are updated. The inode itself is not read in, since doing so
+ * would break ordering constraints with xfs_reclaim.
+ */
+int
+xfs_dialloc(
+ struct xfs_trans *tp,
+ xfs_ino_t parent,
+ umode_t mode,
+ int okalloc,
+ struct xfs_buf **IO_agbp,
+ xfs_ino_t *inop)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_buf *agbp;
+ xfs_agnumber_t agno;
+ int error;
+ int ialloced;
+ int noroom = 0;
+ xfs_agnumber_t start_agno;
+ struct xfs_perag *pag;
+
+ if (*IO_agbp) {
+ /*
+ * If the caller passes in a pointer to the AGI buffer,
+ * continue where we left off before. In this case, we
+ * know that the allocation group has free inodes.
+ */
+ agbp = *IO_agbp;
+ goto out_alloc;
+ }
+
+ /*
+ * We do not have an agbp, so select an initial allocation
+ * group for inode allocation.
+ */
+ start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
+ if (start_agno == NULLAGNUMBER) {
+ *inop = NULLFSINO;
+ return 0;
+ }
+
+ /*
+ * If we have already hit the ceiling of inode blocks then clear
+ * okalloc so we scan all available agi structures for a free
+ * inode.
+ */
+ if (mp->m_maxicount &&
+ mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) {
+ noroom = 1;
+ okalloc = 0;
+ }
+
+ /*
+ * Loop until we find an allocation group that either has free inodes
+ * or in which we can allocate some inodes. Iterate through the
+ * allocation groups upward, wrapping at the end.
+ */
+ agno = start_agno;
+ for (;;) {
+ pag = xfs_perag_get(mp, agno);
+ if (!pag->pagi_inodeok) {
+ xfs_ialloc_next_ag(mp);
+ goto nextag;
+ }
+
+ if (!pag->pagi_init) {
+ error = xfs_ialloc_pagi_init(mp, tp, agno);
+ if (error)
+ goto out_error;
+ }
+
+ /*
+ * Do a first racy fast path check if this AG is usable.
+ */
+ if (!pag->pagi_freecount && !okalloc)
+ goto nextag;
+
+ /*
+ * Then read in the AGI buffer and recheck with the AGI buffer
+ * lock held.
+ */
+ error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ if (error)
+ goto out_error;
+
+ if (pag->pagi_freecount) {
+ xfs_perag_put(pag);
+ goto out_alloc;
+ }
+
+ if (!okalloc)
+ goto nextag_relse_buffer;
+
+
+ error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced);
+ if (error) {
+ xfs_trans_brelse(tp, agbp);
+
+ if (error != ENOSPC)
+ goto out_error;
+
+ xfs_perag_put(pag);
+ *inop = NULLFSINO;
+ return 0;
+ }
+
+ if (ialloced) {
+ /*
+ * We successfully allocated some inodes, return
+ * the current context to the caller so that it
+ * can commit the current transaction and call
+ * us again where we left off.
+ */
+ ASSERT(pag->pagi_freecount > 0);
+ xfs_perag_put(pag);
+
+ *IO_agbp = agbp;
+ *inop = NULLFSINO;
+ return 0;
+ }
+
+nextag_relse_buffer:
+ xfs_trans_brelse(tp, agbp);
+nextag:
+ xfs_perag_put(pag);
+ if (++agno == mp->m_sb.sb_agcount)
+ agno = 0;
+ if (agno == start_agno) {
+ *inop = NULLFSINO;
+ return noroom ? ENOSPC : 0;
+ }
+ }
+
+out_alloc:
+ *IO_agbp = NULL;
+ return xfs_dialloc_ag(tp, agbp, parent, inop);
+out_error:
+ xfs_perag_put(pag);
+ return XFS_ERROR(error);
+}
+
+/*
* Free disk inode. Carefully avoids touching the incore inode, all
* manipulations incore are the caller's responsibility.
* The on-disk inode is not changed by this operation, only the
@@ -1472,6 +1474,57 @@ xfs_check_agi_unlinked(
#define xfs_check_agi_unlinked(agi)
#endif
+static void
+xfs_agi_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
+ int agi_ok;
+
+ /*
+ * Validate the magic number of the agi block.
+ */
+ agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) &&
+ XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
+
+ /*
+ * during growfs operations, the perag is not fully initialised,
+ * so we can't use it for any useful checking. growfs ensures we can't
+ * use it by using uncached buffers that don't have the perag attached
+ * so we can detect and avoid this problem.
+ */
+ if (bp->b_pag)
+ agi_ok = agi_ok && be32_to_cpu(agi->agi_seqno) ==
+ bp->b_pag->pag_agno;
+
+ if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
+ XFS_RANDOM_IALLOC_READ_AGI))) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+ xfs_check_agi_unlinked(agi);
+}
+
+static void
+xfs_agi_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_agi_verify(bp);
+}
+
+static void
+xfs_agi_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_agi_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_agi_buf_ops = {
+ .verify_read = xfs_agi_read_verify,
+ .verify_write = xfs_agi_write_verify,
+};
+
/*
* Read in the allocation group header (inode allocation section)
*/
@@ -1482,38 +1535,18 @@ xfs_read_agi(
xfs_agnumber_t agno, /* allocation group number */
struct xfs_buf **bpp) /* allocation group hdr buf */
{
- struct xfs_agi *agi; /* allocation group header */
- int agi_ok; /* agi is consistent */
int error;
ASSERT(agno != NULLAGNUMBER);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, bpp);
+ XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
if (error)
return error;
ASSERT(!xfs_buf_geterror(*bpp));
- agi = XFS_BUF_TO_AGI(*bpp);
-
- /*
- * Validate the magic number of the agi block.
- */
- agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) &&
- XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) &&
- be32_to_cpu(agi->agi_seqno) == agno;
- if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
- XFS_RANDOM_IALLOC_READ_AGI))) {
- XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW,
- mp, agi);
- xfs_trans_brelse(tp, *bpp);
- return XFS_ERROR(EFSCORRUPTED);
- }
-
xfs_buf_set_ref(*bpp, XFS_AGI_REF);
-
- xfs_check_agi_unlinked(agi);
return 0;
}
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 65ac57c8063..c8da3df271e 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -75,8 +75,6 @@ xfs_dialloc(
umode_t mode, /* mode bits for new inode */
int okalloc, /* ok to allocate more space */
struct xfs_buf **agbp, /* buf for a.g. inode header */
- boolean_t *alloc_done, /* an allocation was done to replenish
- the free inodes */
xfs_ino_t *inop); /* inode number allocated */
/*
@@ -149,7 +147,9 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
/*
* Get the data from the pointed-to record.
*/
-extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
+int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
xfs_inobt_rec_incore_t *rec, int *stat);
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+
#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 2b8b7a37aa1..bec344b3650 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -33,6 +33,7 @@
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
+#include "xfs_trace.h"
STATIC int
@@ -181,6 +182,59 @@ xfs_inobt_key_diff(
cur->bc_rec.i.ir_startino;
}
+void
+xfs_inobt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ unsigned int level;
+ int sblock_ok; /* block passes checks */
+
+ /* magic number and level verification */
+ level = be16_to_cpu(block->bb_level);
+ sblock_ok = block->bb_magic == cpu_to_be32(XFS_IBT_MAGIC) &&
+ level < mp->m_in_maxlevels;
+
+ /* numrecs verification */
+ sblock_ok = sblock_ok &&
+ be16_to_cpu(block->bb_numrecs) <= mp->m_inobt_mxr[level != 0];
+
+ /* sibling pointer verification */
+ sblock_ok = sblock_ok &&
+ (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
+ be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
+ block->bb_u.s.bb_leftsib &&
+ (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
+ be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
+ block->bb_u.s.bb_rightsib;
+
+ if (!sblock_ok) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+}
+
+static void
+xfs_inobt_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_inobt_verify(bp);
+}
+
+static void
+xfs_inobt_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_inobt_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_inobt_buf_ops = {
+ .verify_read = xfs_inobt_read_verify,
+ .verify_write = xfs_inobt_write_verify,
+};
+
#ifdef DEBUG
STATIC int
xfs_inobt_keys_inorder(
@@ -218,6 +272,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
.init_rec_from_cur = xfs_inobt_init_rec_from_cur,
.init_ptr_from_cur = xfs_inobt_init_ptr_from_cur,
.key_diff = xfs_inobt_key_diff,
+ .buf_ops = &xfs_inobt_buf_ops,
#ifdef DEBUG
.keys_inorder = xfs_inobt_keys_inorder,
.recs_inorder = xfs_inobt_recs_inorder,
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index f782ad0c476..25c0239a8ea 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -109,4 +109,6 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t);
extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
+extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+
#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_icache.c
index 1e9ee064dbb..96e344e3e92 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_icache.c
@@ -19,6 +19,7 @@
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
@@ -35,11 +36,425 @@
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_fsops.h"
+#include "xfs_icache.h"
#include <linux/kthread.h>
#include <linux/freezer.h>
-struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
+STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
+ struct xfs_perag *pag, struct xfs_inode *ip);
+
+/*
+ * Allocate and initialise an xfs_inode.
+ */
+STATIC struct xfs_inode *
+xfs_inode_alloc(
+ struct xfs_mount *mp,
+ xfs_ino_t ino)
+{
+ struct xfs_inode *ip;
+
+ /*
+ * if this didn't occur in transactions, we could use
+ * KM_MAYFAIL and return NULL here on ENOMEM. Set the
+ * code up to do this anyway.
+ */
+ ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
+ if (!ip)
+ return NULL;
+ if (inode_init_always(mp->m_super, VFS_I(ip))) {
+ kmem_zone_free(xfs_inode_zone, ip);
+ return NULL;
+ }
+
+ ASSERT(atomic_read(&ip->i_pincount) == 0);
+ ASSERT(!spin_is_locked(&ip->i_flags_lock));
+ ASSERT(!xfs_isiflocked(ip));
+ ASSERT(ip->i_ino == 0);
+
+ mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+
+ /* initialise the xfs inode */
+ ip->i_ino = ino;
+ ip->i_mount = mp;
+ memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
+ ip->i_afp = NULL;
+ memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
+ ip->i_flags = 0;
+ ip->i_delayed_blks = 0;
+ memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
+
+ return ip;
+}
+
+STATIC void
+xfs_inode_free_callback(
+ struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ struct xfs_inode *ip = XFS_I(inode);
+
+ kmem_zone_free(xfs_inode_zone, ip);
+}
+
+STATIC void
+xfs_inode_free(
+ struct xfs_inode *ip)
+{
+ switch (ip->i_d.di_mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFDIR:
+ case S_IFLNK:
+ xfs_idestroy_fork(ip, XFS_DATA_FORK);
+ break;
+ }
+
+ if (ip->i_afp)
+ xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+
+ if (ip->i_itemp) {
+ ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
+ xfs_inode_item_destroy(ip);
+ ip->i_itemp = NULL;
+ }
+
+ /* asserts to verify all state is correct here */
+ ASSERT(atomic_read(&ip->i_pincount) == 0);
+ ASSERT(!spin_is_locked(&ip->i_flags_lock));
+ ASSERT(!xfs_isiflocked(ip));
+
+ /*
+ * Because we use RCU freeing we need to ensure the inode always
+ * appears to be reclaimed with an invalid inode number when in the
+ * free state. The ip->i_flags_lock provides the barrier against lookup
+ * races.
+ */
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags = XFS_IRECLAIM;
+ ip->i_ino = 0;
+ spin_unlock(&ip->i_flags_lock);
+
+ call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+}
+
+/*
+ * Check the validity of the inode we just found in the cache
+ */
+static int
+xfs_iget_cache_hit(
+ struct xfs_perag *pag,
+ struct xfs_inode *ip,
+ xfs_ino_t ino,
+ int flags,
+ int lock_flags) __releases(RCU)
+{
+ struct inode *inode = VFS_I(ip);
+ struct xfs_mount *mp = ip->i_mount;
+ int error;
+
+ /*
+ * check for re-use of an inode within an RCU grace period due to the
+ * radix tree nodes not being updated yet. We monitor for this by
+ * setting the inode number to zero before freeing the inode structure.
+ * If the inode has been reallocated and set up, then the inode number
+ * will not match, so check for that, too.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (ip->i_ino != ino) {
+ trace_xfs_iget_skip(ip);
+ XFS_STATS_INC(xs_ig_frecycle);
+ error = EAGAIN;
+ goto out_error;
+ }
+
+
+ /*
+ * If we are racing with another cache hit that is currently
+ * instantiating this inode or currently recycling it out of
+ * reclaimable state, wait for the initialisation to complete
+ * before continuing.
+ *
+ * XXX(hch): eventually we should do something equivalent to
+ * wait_on_inode to wait for these flags to be cleared
+ * instead of polling for it.
+ */
+ if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
+ trace_xfs_iget_skip(ip);
+ XFS_STATS_INC(xs_ig_frecycle);
+ error = EAGAIN;
+ goto out_error;
+ }
+
+ /*
+ * If lookup is racing with unlink return an error immediately.
+ */
+ if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+ error = ENOENT;
+ goto out_error;
+ }
+
+ /*
+ * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+ * Need to carefully get it back into useable state.
+ */
+ if (ip->i_flags & XFS_IRECLAIMABLE) {
+ trace_xfs_iget_reclaim(ip);
+
+ /*
+ * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
+ * from stomping over us while we recycle the inode. We can't
+ * clear the radix tree reclaimable tag yet as it requires
+ * pag_ici_lock to be held exclusive.
+ */
+ ip->i_flags |= XFS_IRECLAIM;
+
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+
+ error = -inode_init_always(mp->m_super, inode);
+ if (error) {
+ /*
+ * Re-initializing the inode failed, and we are in deep
+ * trouble. Try to re-add it to the reclaim list.
+ */
+ rcu_read_lock();
+ spin_lock(&ip->i_flags_lock);
+
+ ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
+ ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
+ trace_xfs_iget_reclaim_fail(ip);
+ goto out_error;
+ }
+
+ spin_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+
+ /*
+ * Clear the per-lifetime state in the inode as we are now
+ * effectively a new inode and need to return to the initial
+ * state before reuse occurs.
+ */
+ ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
+ ip->i_flags |= XFS_INEW;
+ __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+ inode->i_state = I_NEW;
+
+ ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+ mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+
+ spin_unlock(&ip->i_flags_lock);
+ spin_unlock(&pag->pag_ici_lock);
+ } else {
+ /* If the VFS inode is being torn down, pause and try again. */
+ if (!igrab(inode)) {
+ trace_xfs_iget_skip(ip);
+ error = EAGAIN;
+ goto out_error;
+ }
+
+ /* We've got a live one. */
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+ trace_xfs_iget_hit(ip);
+ }
+
+ if (lock_flags != 0)
+ xfs_ilock(ip, lock_flags);
+
+ xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
+ XFS_STATS_INC(xs_ig_found);
+
+ return 0;
+
+out_error:
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+ return error;
+}
+
+
+static int
+xfs_iget_cache_miss(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ xfs_trans_t *tp,
+ xfs_ino_t ino,
+ struct xfs_inode **ipp,
+ int flags,
+ int lock_flags)
+{
+ struct xfs_inode *ip;
+ int error;
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
+ int iflags;
+
+ ip = xfs_inode_alloc(mp, ino);
+ if (!ip)
+ return ENOMEM;
+
+ error = xfs_iread(mp, tp, ip, flags);
+ if (error)
+ goto out_destroy;
+
+ trace_xfs_iget_miss(ip);
+
+ if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
+ error = ENOENT;
+ goto out_destroy;
+ }
+
+ /*
+ * Preload the radix tree so we can insert safely under the
+ * write spinlock. Note that we cannot sleep inside the preload
+ * region. Since we can be called from transaction context, don't
+ * recurse into the file system.
+ */
+ if (radix_tree_preload(GFP_NOFS)) {
+ error = EAGAIN;
+ goto out_destroy;
+ }
+
+ /*
+ * Because the inode hasn't been added to the radix-tree yet it can't
+ * be found by another thread, so we can do the non-sleeping lock here.
+ */
+ if (lock_flags) {
+ if (!xfs_ilock_nowait(ip, lock_flags))
+ BUG();
+ }
+
+ /*
+ * These values must be set before inserting the inode into the radix
+ * tree as the moment it is inserted a concurrent lookup (allowed by the
+ * RCU locking mechanism) can find it and that lookup must see that this
+ * is an inode currently under construction (i.e. that XFS_INEW is set).
+ * The ip->i_flags_lock that protects the XFS_INEW flag forms the
+ * memory barrier that ensures this detection works correctly at lookup
+ * time.
+ */
+ iflags = XFS_INEW;
+ if (flags & XFS_IGET_DONTCACHE)
+ iflags |= XFS_IDONTCACHE;
+ ip->i_udquot = ip->i_gdquot = NULL;
+ xfs_iflags_set(ip, iflags);
+
+ /* insert the new inode */
+ spin_lock(&pag->pag_ici_lock);
+ error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
+ if (unlikely(error)) {
+ WARN_ON(error != -EEXIST);
+ XFS_STATS_INC(xs_ig_dup);
+ error = EAGAIN;
+ goto out_preload_end;
+ }
+ spin_unlock(&pag->pag_ici_lock);
+ radix_tree_preload_end();
+
+ *ipp = ip;
+ return 0;
+
+out_preload_end:
+ spin_unlock(&pag->pag_ici_lock);
+ radix_tree_preload_end();
+ if (lock_flags)
+ xfs_iunlock(ip, lock_flags);
+out_destroy:
+ __destroy_inode(VFS_I(ip));
+ xfs_inode_free(ip);
+ return error;
+}
+
+/*
+ * Look up an inode by number in the given file system.
+ * The inode is looked up in the cache held in each AG.
+ * If the inode is found in the cache, initialise the vfs inode
+ * if necessary.
+ *
+ * If it is not in core, read it in from the file system's device,
+ * add it to the cache and initialise the vfs inode.
+ *
+ * The inode is locked according to the value of the lock_flags parameter.
+ * This flag parameter indicates how and if the inode's IO lock and inode lock
+ * should be taken.
+ *
+ * mp -- the mount point structure for the current file system. It points
+ * to the inode hash table.
+ * tp -- a pointer to the current transaction if there is one. This is
+ * simply passed through to the xfs_iread() call.
+ * ino -- the number of the inode desired. This is the unique identifier
+ * within the file system for the inode being requested.
+ * lock_flags -- flags indicating how to lock the inode. See the comment
+ * for xfs_ilock() for a list of valid values.
+ */
+int
+xfs_iget(
+ xfs_mount_t *mp,
+ xfs_trans_t *tp,
+ xfs_ino_t ino,
+ uint flags,
+ uint lock_flags,
+ xfs_inode_t **ipp)
+{
+ xfs_inode_t *ip;
+ int error;
+ xfs_perag_t *pag;
+ xfs_agino_t agino;
+
+ /*
+ * xfs_reclaim_inode() uses the ILOCK to ensure an inode
+ * doesn't get freed while it's being referenced during a
+ * radix tree traversal here. It assumes this function
+ * aqcuires only the ILOCK (and therefore it has no need to
+ * involve the IOLOCK in this synchronization).
+ */
+ ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
+
+ /* reject inode numbers outside existing AGs */
+ if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+ return EINVAL;
+
+ /* get the perag structure and ensure that it's inode capable */
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
+ agino = XFS_INO_TO_AGINO(mp, ino);
+
+again:
+ error = 0;
+ rcu_read_lock();
+ ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+
+ if (ip) {
+ error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
+ if (error)
+ goto out_error_or_again;
+ } else {
+ rcu_read_unlock();
+ XFS_STATS_INC(xs_ig_missed);
+
+ error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
+ flags, lock_flags);
+ if (error)
+ goto out_error_or_again;
+ }
+ xfs_perag_put(pag);
+
+ *ipp = ip;
+
+ /*
+ * If we have a real type for an on-disk inode, we can set ops(&unlock)
+ * now. If it's a new inode being created, xfs_ialloc will handle it.
+ */
+ if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
+ xfs_setup_inode(ip);
+ return 0;
+
+out_error_or_again:
+ if (error == EAGAIN) {
+ delay(1);
+ goto again;
+ }
+ xfs_perag_put(pag);
+ return error;
+}
/*
* The inode lookup is done in batches to keep the amount of lock traffic and
@@ -101,8 +516,11 @@ xfs_inode_ag_walk(
struct xfs_mount *mp,
struct xfs_perag *pag,
int (*execute)(struct xfs_inode *ip,
- struct xfs_perag *pag, int flags),
- int flags)
+ struct xfs_perag *pag, int flags,
+ void *args),
+ int flags,
+ void *args,
+ int tag)
{
uint32_t first_index;
int last_error = 0;
@@ -121,9 +539,17 @@ restart:
int i;
rcu_read_lock();
- nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+
+ if (tag == -1)
+ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
(void **)batch, first_index,
XFS_LOOKUP_BATCH);
+ else
+ nr_found = radix_tree_gang_lookup_tag(
+ &pag->pag_ici_root,
+ (void **) batch, first_index,
+ XFS_LOOKUP_BATCH, tag);
+
if (!nr_found) {
rcu_read_unlock();
break;
@@ -164,7 +590,7 @@ restart:
for (i = 0; i < nr_found; i++) {
if (!batch[i])
continue;
- error = execute(batch[i], pag, flags);
+ error = execute(batch[i], pag, flags, args);
IRELE(batch[i]);
if (error == EAGAIN) {
skipped++;
@@ -189,12 +615,40 @@ restart:
return last_error;
}
+/*
+ * Background scanning to trim post-EOF preallocated space. This is queued
+ * based on the 'background_prealloc_discard_period' tunable (5m by default).
+ */
+STATIC void
+xfs_queue_eofblocks(
+ struct xfs_mount *mp)
+{
+ rcu_read_lock();
+ if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
+ queue_delayed_work(mp->m_eofblocks_workqueue,
+ &mp->m_eofblocks_work,
+ msecs_to_jiffies(xfs_eofb_secs * 1000));
+ rcu_read_unlock();
+}
+
+void
+xfs_eofblocks_worker(
+ struct work_struct *work)
+{
+ struct xfs_mount *mp = container_of(to_delayed_work(work),
+ struct xfs_mount, m_eofblocks_work);
+ xfs_icache_free_eofblocks(mp, NULL);
+ xfs_queue_eofblocks(mp);
+}
+
int
xfs_inode_ag_iterator(
struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip,
- struct xfs_perag *pag, int flags),
- int flags)
+ struct xfs_perag *pag, int flags,
+ void *args),
+ int flags,
+ void *args)
{
struct xfs_perag *pag;
int error = 0;
@@ -204,7 +658,7 @@ xfs_inode_ag_iterator(
ag = 0;
while ((pag = xfs_perag_get(mp, ag))) {
ag = pag->pag_agno + 1;
- error = xfs_inode_ag_walk(mp, pag, execute, flags);
+ error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1);
xfs_perag_put(pag);
if (error) {
last_error = error;
@@ -215,215 +669,50 @@ xfs_inode_ag_iterator(
return XFS_ERROR(last_error);
}
-STATIC int
-xfs_sync_inode_data(
- struct xfs_inode *ip,
- struct xfs_perag *pag,
- int flags)
-{
- struct inode *inode = VFS_I(ip);
- struct address_space *mapping = inode->i_mapping;
- int error = 0;
-
- if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
- return 0;
-
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
- if (flags & SYNC_TRYLOCK)
- return 0;
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- }
-
- error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
- 0 : XBF_ASYNC, FI_NONE);
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
- return error;
-}
-
-/*
- * Write out pagecache data for the whole filesystem.
- */
-STATIC int
-xfs_sync_data(
- struct xfs_mount *mp,
- int flags)
-{
- int error;
-
- ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
-
- error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
- if (error)
- return XFS_ERROR(error);
-
- xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
- return 0;
-}
-
-STATIC int
-xfs_sync_fsdata(
- struct xfs_mount *mp)
-{
- struct xfs_buf *bp;
- int error;
-
- /*
- * If the buffer is pinned then push on the log so we won't get stuck
- * waiting in the write for someone, maybe ourselves, to flush the log.
- *
- * Even though we just pushed the log above, we did not have the
- * superblock buffer locked at that point so it can become pinned in
- * between there and here.
- */
- bp = xfs_getsb(mp, 0);
- if (xfs_buf_ispinned(bp))
- xfs_log_force(mp, 0);
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- return error;
-}
-
-/*
- * When remounting a filesystem read-only or freezing the filesystem, we have
- * two phases to execute. This first phase is syncing the data before we
- * quiesce the filesystem, and the second is flushing all the inodes out after
- * we've waited for all the transactions created by the first phase to
- * complete. The second phase ensures that the inodes are written to their
- * location on disk rather than just existing in transactions in the log. This
- * means after a quiesce there is no log replay required to write the inodes to
- * disk (this is the main difference between a sync and a quiesce).
- */
-/*
- * First stage of freeze - no writers will make progress now we are here,
- * so we flush delwri and delalloc buffers here, then wait for all I/O to
- * complete. Data is frozen at that point. Metadata is not frozen,
- * transactions can still occur here so don't bother emptying the AIL
- * because it'll just get dirty again.
- */
int
-xfs_quiesce_data(
- struct xfs_mount *mp)
-{
- int error, error2 = 0;
-
- /* force out the log */
- xfs_log_force(mp, XFS_LOG_SYNC);
-
- /* write superblock and hoover up shutdown errors */
- error = xfs_sync_fsdata(mp);
-
- /* mark the log as covered if needed */
- if (xfs_log_need_covered(mp))
- error2 = xfs_fs_log_dummy(mp);
-
- return error ? error : error2;
-}
-
-/*
- * Second stage of a quiesce. The data is already synced, now we have to take
- * care of the metadata. New transactions are already blocked, so we need to
- * wait for any remaining transactions to drain out before proceeding.
- */
-void
-xfs_quiesce_attr(
- struct xfs_mount *mp)
-{
- int error = 0;
-
- /* wait for all modifications to complete */
- while (atomic_read(&mp->m_active_trans) > 0)
- delay(100);
-
- /* reclaim inodes to do any IO before the freeze completes */
- xfs_reclaim_inodes(mp, 0);
- xfs_reclaim_inodes(mp, SYNC_WAIT);
-
- /* flush all pending changes from the AIL */
- xfs_ail_push_all_sync(mp->m_ail);
-
- /*
- * Just warn here till VFS can correctly support
- * read-only remount without racing.
- */
- WARN_ON(atomic_read(&mp->m_active_trans) != 0);
-
- /* Push the superblock and write an unmount record */
- error = xfs_log_sbcount(mp);
- if (error)
- xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
- "Frozen image may not be consistent.");
- xfs_log_unmount_write(mp);
-
- /*
- * At this point we might have modified the superblock again and thus
- * added an item to the AIL, thus flush it again.
- */
- xfs_ail_push_all_sync(mp->m_ail);
-}
-
-static void
-xfs_syncd_queue_sync(
- struct xfs_mount *mp)
-{
- queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
- msecs_to_jiffies(xfs_syncd_centisecs * 10));
-}
-
-/*
- * Every sync period we need to unpin all items, reclaim inodes and sync
- * disk quotas. We might need to cover the log to indicate that the
- * filesystem is idle and not frozen.
- */
-STATIC void
-xfs_sync_worker(
- struct work_struct *work)
+xfs_inode_ag_iterator_tag(
+ struct xfs_mount *mp,
+ int (*execute)(struct xfs_inode *ip,
+ struct xfs_perag *pag, int flags,
+ void *args),
+ int flags,
+ void *args,
+ int tag)
{
- struct xfs_mount *mp = container_of(to_delayed_work(work),
- struct xfs_mount, m_sync_work);
- int error;
-
- /*
- * We shouldn't write/force the log if we are in the mount/unmount
- * process or on a read only filesystem. The workqueue still needs to be
- * active in both cases, however, because it is used for inode reclaim
- * during these times. Use the MS_ACTIVE flag to avoid doing anything
- * during mount. Doing work during unmount is avoided by calling
- * cancel_delayed_work_sync on this work queue before tearing down
- * the ail and the log in xfs_log_unmount.
- */
- if (!(mp->m_super->s_flags & MS_ACTIVE) &&
- !(mp->m_flags & XFS_MOUNT_RDONLY)) {
- /* dgc: errors ignored here */
- if (mp->m_super->s_frozen == SB_UNFROZEN &&
- xfs_log_need_covered(mp))
- error = xfs_fs_log_dummy(mp);
- else
- xfs_log_force(mp, 0);
+ struct xfs_perag *pag;
+ int error = 0;
+ int last_error = 0;
+ xfs_agnumber_t ag;
- /* start pushing all the metadata that is currently
- * dirty */
- xfs_ail_push_all(mp->m_ail);
+ ag = 0;
+ while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
+ ag = pag->pag_agno + 1;
+ error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag);
+ xfs_perag_put(pag);
+ if (error) {
+ last_error = error;
+ if (error == EFSCORRUPTED)
+ break;
+ }
}
-
- /* queue us up again */
- xfs_syncd_queue_sync(mp);
+ return XFS_ERROR(last_error);
}
/*
* Queue a new inode reclaim pass if there are reclaimable inodes and there
* isn't a reclaim pass already in progress. By default it runs every 5s based
- * on the xfs syncd work default of 30s. Perhaps this should have it's own
+ * on the xfs periodic sync default of 30s. Perhaps this should have its own
* tunable, but that can be done if this method proves to be ineffective or too
* aggressive.
*/
static void
-xfs_syncd_queue_reclaim(
+xfs_reclaim_work_queue(
struct xfs_mount *mp)
{
rcu_read_lock();
if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
- queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
+ queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
}
rcu_read_unlock();
@@ -436,7 +725,7 @@ xfs_syncd_queue_reclaim(
* goes low. It scans as quickly as possible avoiding locked inodes or those
* already being flushed, and once done schedules a future pass.
*/
-STATIC void
+void
xfs_reclaim_worker(
struct work_struct *work)
{
@@ -444,65 +733,10 @@ xfs_reclaim_worker(
struct xfs_mount, m_reclaim_work);
xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
- xfs_syncd_queue_reclaim(mp);
+ xfs_reclaim_work_queue(mp);
}
-/*
- * Flush delayed allocate data, attempting to free up reserved space
- * from existing allocations. At this point a new allocation attempt
- * has failed with ENOSPC and we are in the process of scratching our
- * heads, looking about for more room.
- *
- * Queue a new data flush if there isn't one already in progress and
- * wait for completion of the flush. This means that we only ever have one
- * inode flush in progress no matter how many ENOSPC events are occurring and
- * so will prevent the system from bogging down due to every concurrent
- * ENOSPC event scanning all the active inodes in the system for writeback.
- */
-void
-xfs_flush_inodes(
- struct xfs_inode *ip)
-{
- struct xfs_mount *mp = ip->i_mount;
-
- queue_work(xfs_syncd_wq, &mp->m_flush_work);
- flush_work_sync(&mp->m_flush_work);
-}
-
-STATIC void
-xfs_flush_worker(
- struct work_struct *work)
-{
- struct xfs_mount *mp = container_of(work,
- struct xfs_mount, m_flush_work);
-
- xfs_sync_data(mp, SYNC_TRYLOCK);
- xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
-}
-
-int
-xfs_syncd_init(
- struct xfs_mount *mp)
-{
- INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
- INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
- INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
-
- xfs_syncd_queue_sync(mp);
-
- return 0;
-}
-
-void
-xfs_syncd_stop(
- struct xfs_mount *mp)
-{
- cancel_delayed_work_sync(&mp->m_sync_work);
- cancel_delayed_work_sync(&mp->m_reclaim_work);
- cancel_work_sync(&mp->m_flush_work);
-}
-
-void
+static void
__xfs_inode_set_reclaim_tag(
struct xfs_perag *pag,
struct xfs_inode *ip)
@@ -520,7 +754,7 @@ __xfs_inode_set_reclaim_tag(
spin_unlock(&ip->i_mount->m_perag_lock);
/* schedule periodic background inode reclaim */
- xfs_syncd_queue_reclaim(ip->i_mount);
+ xfs_reclaim_work_queue(ip->i_mount);
trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
-1, _RET_IP_);
@@ -568,7 +802,7 @@ __xfs_inode_clear_reclaim(
}
}
-void
+STATIC void
__xfs_inode_clear_reclaim_tag(
xfs_mount_t *mp,
xfs_perag_t *pag,
@@ -712,8 +946,8 @@ restart:
* Note that xfs_iflush will never block on the inode buffer lock, as
* xfs_ifree_cluster() can lock the inode buffer before it locks the
* ip->i_lock, and we are doing the exact opposite here. As a result,
- * doing a blocking xfs_itobp() to get the cluster buffer would result
- * in an ABBA deadlock with xfs_ifree_cluster().
+ * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
+ * result in an ABBA deadlock with xfs_ifree_cluster().
*
* As xfs_ifree_cluser() must gather all inodes that are active in the
* cache to mark them stale, if we hit this case we don't actually want
@@ -778,9 +1012,9 @@ out:
/*
* We could return EAGAIN here to make reclaim rescan the inode tree in
* a short while. However, this just burns CPU time scanning the tree
- * waiting for IO to complete and xfssyncd never goes back to the idle
- * state. Instead, return 0 to let the next scheduled background reclaim
- * attempt to reclaim the inode again.
+ * waiting for IO to complete and the reclaim work never goes back to
+ * the idle state. Instead, return 0 to let the next scheduled
+ * background reclaim attempt to reclaim the inode again.
*/
return 0;
}
@@ -791,7 +1025,7 @@ out:
* then a shut down during filesystem unmount reclaim walk leak all the
* unreclaimed inodes.
*/
-int
+STATIC int
xfs_reclaim_inodes_ag(
struct xfs_mount *mp,
int flags,
@@ -936,7 +1170,7 @@ xfs_reclaim_inodes_nr(
int nr_to_scan)
{
/* kick background reclaimer and push the AIL */
- xfs_syncd_queue_reclaim(mp);
+ xfs_reclaim_work_queue(mp);
xfs_ail_push_all(mp->m_ail);
xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
@@ -962,3 +1196,146 @@ xfs_reclaim_inodes_count(
return reclaimable;
}
+STATIC int
+xfs_inode_match_id(
+ struct xfs_inode *ip,
+ struct xfs_eofblocks *eofb)
+{
+ if (eofb->eof_flags & XFS_EOF_FLAGS_UID &&
+ ip->i_d.di_uid != eofb->eof_uid)
+ return 0;
+
+ if (eofb->eof_flags & XFS_EOF_FLAGS_GID &&
+ ip->i_d.di_gid != eofb->eof_gid)
+ return 0;
+
+ if (eofb->eof_flags & XFS_EOF_FLAGS_PRID &&
+ xfs_get_projid(ip) != eofb->eof_prid)
+ return 0;
+
+ return 1;
+}
+
+STATIC int
+xfs_inode_free_eofblocks(
+ struct xfs_inode *ip,
+ struct xfs_perag *pag,
+ int flags,
+ void *args)
+{
+ int ret;
+ struct xfs_eofblocks *eofb = args;
+
+ if (!xfs_can_free_eofblocks(ip, false)) {
+ /* inode could be preallocated or append-only */
+ trace_xfs_inode_free_eofblocks_invalid(ip);
+ xfs_inode_clear_eofblocks_tag(ip);
+ return 0;
+ }
+
+ /*
+ * If the mapping is dirty the operation can block and wait for some
+ * time. Unless we are waiting, skip it.
+ */
+ if (!(flags & SYNC_WAIT) &&
+ mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
+ return 0;
+
+ if (eofb) {
+ if (!xfs_inode_match_id(ip, eofb))
+ return 0;
+
+ /* skip the inode if the file size is too small */
+ if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
+ XFS_ISIZE(ip) < eofb->eof_min_file_size)
+ return 0;
+ }
+
+ ret = xfs_free_eofblocks(ip->i_mount, ip, true);
+
+ /* don't revisit the inode if we're not waiting */
+ if (ret == EAGAIN && !(flags & SYNC_WAIT))
+ ret = 0;
+
+ return ret;
+}
+
+int
+xfs_icache_free_eofblocks(
+ struct xfs_mount *mp,
+ struct xfs_eofblocks *eofb)
+{
+ int flags = SYNC_TRYLOCK;
+
+ if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
+ flags = SYNC_WAIT;
+
+ return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags,
+ eofb, XFS_ICI_EOFBLOCKS_TAG);
+}
+
+void
+xfs_inode_set_eofblocks_tag(
+ xfs_inode_t *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+ int tagged;
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+ spin_lock(&pag->pag_ici_lock);
+ trace_xfs_inode_set_eofblocks_tag(ip);
+
+ tagged = radix_tree_tagged(&pag->pag_ici_root,
+ XFS_ICI_EOFBLOCKS_TAG);
+ radix_tree_tag_set(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ XFS_ICI_EOFBLOCKS_TAG);
+ if (!tagged) {
+ /* propagate the eofblocks tag up into the perag radix tree */
+ spin_lock(&ip->i_mount->m_perag_lock);
+ radix_tree_tag_set(&ip->i_mount->m_perag_tree,
+ XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+ XFS_ICI_EOFBLOCKS_TAG);
+ spin_unlock(&ip->i_mount->m_perag_lock);
+
+ /* kick off background trimming */
+ xfs_queue_eofblocks(ip->i_mount);
+
+ trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno,
+ -1, _RET_IP_);
+ }
+
+ spin_unlock(&pag->pag_ici_lock);
+ xfs_perag_put(pag);
+}
+
+void
+xfs_inode_clear_eofblocks_tag(
+ xfs_inode_t *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+ spin_lock(&pag->pag_ici_lock);
+ trace_xfs_inode_clear_eofblocks_tag(ip);
+
+ radix_tree_tag_clear(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ XFS_ICI_EOFBLOCKS_TAG);
+ if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) {
+ /* clear the eofblocks tag from the perag radix tree */
+ spin_lock(&ip->i_mount->m_perag_lock);
+ radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
+ XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+ XFS_ICI_EOFBLOCKS_TAG);
+ spin_unlock(&ip->i_mount->m_perag_lock);
+ trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno,
+ -1, _RET_IP_);
+ }
+
+ spin_unlock(&pag->pag_ici_lock);
+ xfs_perag_put(pag);
+}
+
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_icache.h
index 941202e7ac6..e0f138c70a2 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_icache.h
@@ -24,28 +24,30 @@ struct xfs_perag;
#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
-extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
+int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
+ uint flags, uint lock_flags, xfs_inode_t **ipp);
-int xfs_syncd_init(struct xfs_mount *mp);
-void xfs_syncd_stop(struct xfs_mount *mp);
-
-int xfs_quiesce_data(struct xfs_mount *mp);
-void xfs_quiesce_attr(struct xfs_mount *mp);
-
-void xfs_flush_inodes(struct xfs_inode *ip);
+void xfs_reclaim_worker(struct work_struct *work);
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
int xfs_reclaim_inodes_count(struct xfs_mount *mp);
void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
-void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
-void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
- struct xfs_inode *ip);
+
+void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
+void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
+int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
+void xfs_eofblocks_worker(struct work_struct *);
int xfs_sync_inode_grab(struct xfs_inode *ip);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
- int flags);
+ int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
+ int flags, void *args),
+ int flags, void *args);
+int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
+ int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
+ int flags, void *args),
+ int flags, void *args, int tag);
#endif
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
deleted file mode 100644
index 1bb4365e8c2..00000000000
--- a/fs/xfs/xfs_iget.c
+++ /dev/null
@@ -1,720 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_acl.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_quota.h"
-#include "xfs_utils.h"
-#include "xfs_trans_priv.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_trace.h"
-
-
-/*
- * Define xfs inode iolock lockdep classes. We need to ensure that all active
- * inodes are considered the same for lockdep purposes, including inodes that
- * are recycled through the XFS_IRECLAIMABLE state. This is the the only way to
- * guarantee the locks are considered the same when there are multiple lock
- * initialisation siteѕ. Also, define a reclaimable inode class so it is
- * obvious in lockdep reports which class the report is against.
- */
-static struct lock_class_key xfs_iolock_active;
-struct lock_class_key xfs_iolock_reclaimable;
-
-/*
- * Allocate and initialise an xfs_inode.
- */
-STATIC struct xfs_inode *
-xfs_inode_alloc(
- struct xfs_mount *mp,
- xfs_ino_t ino)
-{
- struct xfs_inode *ip;
-
- /*
- * if this didn't occur in transactions, we could use
- * KM_MAYFAIL and return NULL here on ENOMEM. Set the
- * code up to do this anyway.
- */
- ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
- if (!ip)
- return NULL;
- if (inode_init_always(mp->m_super, VFS_I(ip))) {
- kmem_zone_free(xfs_inode_zone, ip);
- return NULL;
- }
-
- ASSERT(atomic_read(&ip->i_pincount) == 0);
- ASSERT(!spin_is_locked(&ip->i_flags_lock));
- ASSERT(!xfs_isiflocked(ip));
- ASSERT(ip->i_ino == 0);
-
- mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
- lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
- &xfs_iolock_active, "xfs_iolock_active");
-
- /* initialise the xfs inode */
- ip->i_ino = ino;
- ip->i_mount = mp;
- memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
- ip->i_afp = NULL;
- memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
- ip->i_flags = 0;
- ip->i_delayed_blks = 0;
- memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
-
- return ip;
-}
-
-STATIC void
-xfs_inode_free_callback(
- struct rcu_head *head)
-{
- struct inode *inode = container_of(head, struct inode, i_rcu);
- struct xfs_inode *ip = XFS_I(inode);
-
- kmem_zone_free(xfs_inode_zone, ip);
-}
-
-void
-xfs_inode_free(
- struct xfs_inode *ip)
-{
- switch (ip->i_d.di_mode & S_IFMT) {
- case S_IFREG:
- case S_IFDIR:
- case S_IFLNK:
- xfs_idestroy_fork(ip, XFS_DATA_FORK);
- break;
- }
-
- if (ip->i_afp)
- xfs_idestroy_fork(ip, XFS_ATTR_FORK);
-
- if (ip->i_itemp) {
- ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
- xfs_inode_item_destroy(ip);
- ip->i_itemp = NULL;
- }
-
- /* asserts to verify all state is correct here */
- ASSERT(atomic_read(&ip->i_pincount) == 0);
- ASSERT(!spin_is_locked(&ip->i_flags_lock));
- ASSERT(!xfs_isiflocked(ip));
-
- /*
- * Because we use RCU freeing we need to ensure the inode always
- * appears to be reclaimed with an invalid inode number when in the
- * free state. The ip->i_flags_lock provides the barrier against lookup
- * races.
- */
- spin_lock(&ip->i_flags_lock);
- ip->i_flags = XFS_IRECLAIM;
- ip->i_ino = 0;
- spin_unlock(&ip->i_flags_lock);
-
- call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
-}
-
-/*
- * Check the validity of the inode we just found it the cache
- */
-static int
-xfs_iget_cache_hit(
- struct xfs_perag *pag,
- struct xfs_inode *ip,
- xfs_ino_t ino,
- int flags,
- int lock_flags) __releases(RCU)
-{
- struct inode *inode = VFS_I(ip);
- struct xfs_mount *mp = ip->i_mount;
- int error;
-
- /*
- * check for re-use of an inode within an RCU grace period due to the
- * radix tree nodes not being updated yet. We monitor for this by
- * setting the inode number to zero before freeing the inode structure.
- * If the inode has been reallocated and set up, then the inode number
- * will not match, so check for that, too.
- */
- spin_lock(&ip->i_flags_lock);
- if (ip->i_ino != ino) {
- trace_xfs_iget_skip(ip);
- XFS_STATS_INC(xs_ig_frecycle);
- error = EAGAIN;
- goto out_error;
- }
-
-
- /*
- * If we are racing with another cache hit that is currently
- * instantiating this inode or currently recycling it out of
- * reclaimabe state, wait for the initialisation to complete
- * before continuing.
- *
- * XXX(hch): eventually we should do something equivalent to
- * wait_on_inode to wait for these flags to be cleared
- * instead of polling for it.
- */
- if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
- trace_xfs_iget_skip(ip);
- XFS_STATS_INC(xs_ig_frecycle);
- error = EAGAIN;
- goto out_error;
- }
-
- /*
- * If lookup is racing with unlink return an error immediately.
- */
- if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
- error = ENOENT;
- goto out_error;
- }
-
- /*
- * If IRECLAIMABLE is set, we've torn down the VFS inode already.
- * Need to carefully get it back into useable state.
- */
- if (ip->i_flags & XFS_IRECLAIMABLE) {
- trace_xfs_iget_reclaim(ip);
-
- /*
- * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
- * from stomping over us while we recycle the inode. We can't
- * clear the radix tree reclaimable tag yet as it requires
- * pag_ici_lock to be held exclusive.
- */
- ip->i_flags |= XFS_IRECLAIM;
-
- spin_unlock(&ip->i_flags_lock);
- rcu_read_unlock();
-
- error = -inode_init_always(mp->m_super, inode);
- if (error) {
- /*
- * Re-initializing the inode failed, and we are in deep
- * trouble. Try to re-add it to the reclaim list.
- */
- rcu_read_lock();
- spin_lock(&ip->i_flags_lock);
-
- ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
- ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
- trace_xfs_iget_reclaim_fail(ip);
- goto out_error;
- }
-
- spin_lock(&pag->pag_ici_lock);
- spin_lock(&ip->i_flags_lock);
-
- /*
- * Clear the per-lifetime state in the inode as we are now
- * effectively a new inode and need to return to the initial
- * state before reuse occurs.
- */
- ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
- ip->i_flags |= XFS_INEW;
- __xfs_inode_clear_reclaim_tag(mp, pag, ip);
- inode->i_state = I_NEW;
-
- ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
- mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
- lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
- &xfs_iolock_active, "xfs_iolock_active");
-
- spin_unlock(&ip->i_flags_lock);
- spin_unlock(&pag->pag_ici_lock);
- } else {
- /* If the VFS inode is being torn down, pause and try again. */
- if (!igrab(inode)) {
- trace_xfs_iget_skip(ip);
- error = EAGAIN;
- goto out_error;
- }
-
- /* We've got a live one. */
- spin_unlock(&ip->i_flags_lock);
- rcu_read_unlock();
- trace_xfs_iget_hit(ip);
- }
-
- if (lock_flags != 0)
- xfs_ilock(ip, lock_flags);
-
- xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
- XFS_STATS_INC(xs_ig_found);
-
- return 0;
-
-out_error:
- spin_unlock(&ip->i_flags_lock);
- rcu_read_unlock();
- return error;
-}
-
-
-static int
-xfs_iget_cache_miss(
- struct xfs_mount *mp,
- struct xfs_perag *pag,
- xfs_trans_t *tp,
- xfs_ino_t ino,
- struct xfs_inode **ipp,
- int flags,
- int lock_flags)
-{
- struct xfs_inode *ip;
- int error;
- xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
- int iflags;
-
- ip = xfs_inode_alloc(mp, ino);
- if (!ip)
- return ENOMEM;
-
- error = xfs_iread(mp, tp, ip, flags);
- if (error)
- goto out_destroy;
-
- trace_xfs_iget_miss(ip);
-
- if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
- error = ENOENT;
- goto out_destroy;
- }
-
- /*
- * Preload the radix tree so we can insert safely under the
- * write spinlock. Note that we cannot sleep inside the preload
- * region. Since we can be called from transaction context, don't
- * recurse into the file system.
- */
- if (radix_tree_preload(GFP_NOFS)) {
- error = EAGAIN;
- goto out_destroy;
- }
-
- /*
- * Because the inode hasn't been added to the radix-tree yet it can't
- * be found by another thread, so we can do the non-sleeping lock here.
- */
- if (lock_flags) {
- if (!xfs_ilock_nowait(ip, lock_flags))
- BUG();
- }
-
- /*
- * These values must be set before inserting the inode into the radix
- * tree as the moment it is inserted a concurrent lookup (allowed by the
- * RCU locking mechanism) can find it and that lookup must see that this
- * is an inode currently under construction (i.e. that XFS_INEW is set).
- * The ip->i_flags_lock that protects the XFS_INEW flag forms the
- * memory barrier that ensures this detection works correctly at lookup
- * time.
- */
- iflags = XFS_INEW;
- if (flags & XFS_IGET_DONTCACHE)
- iflags |= XFS_IDONTCACHE;
- ip->i_udquot = ip->i_gdquot = NULL;
- xfs_iflags_set(ip, iflags);
-
- /* insert the new inode */
- spin_lock(&pag->pag_ici_lock);
- error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
- if (unlikely(error)) {
- WARN_ON(error != -EEXIST);
- XFS_STATS_INC(xs_ig_dup);
- error = EAGAIN;
- goto out_preload_end;
- }
- spin_unlock(&pag->pag_ici_lock);
- radix_tree_preload_end();
-
- *ipp = ip;
- return 0;
-
-out_preload_end:
- spin_unlock(&pag->pag_ici_lock);
- radix_tree_preload_end();
- if (lock_flags)
- xfs_iunlock(ip, lock_flags);
-out_destroy:
- __destroy_inode(VFS_I(ip));
- xfs_inode_free(ip);
- return error;
-}
-
-/*
- * Look up an inode by number in the given file system.
- * The inode is looked up in the cache held in each AG.
- * If the inode is found in the cache, initialise the vfs inode
- * if necessary.
- *
- * If it is not in core, read it in from the file system's device,
- * add it to the cache and initialise the vfs inode.
- *
- * The inode is locked according to the value of the lock_flags parameter.
- * This flag parameter indicates how and if the inode's IO lock and inode lock
- * should be taken.
- *
- * mp -- the mount point structure for the current file system. It points
- * to the inode hash table.
- * tp -- a pointer to the current transaction if there is one. This is
- * simply passed through to the xfs_iread() call.
- * ino -- the number of the inode desired. This is the unique identifier
- * within the file system for the inode being requested.
- * lock_flags -- flags indicating how to lock the inode. See the comment
- * for xfs_ilock() for a list of valid values.
- */
-int
-xfs_iget(
- xfs_mount_t *mp,
- xfs_trans_t *tp,
- xfs_ino_t ino,
- uint flags,
- uint lock_flags,
- xfs_inode_t **ipp)
-{
- xfs_inode_t *ip;
- int error;
- xfs_perag_t *pag;
- xfs_agino_t agino;
-
- /*
- * xfs_reclaim_inode() uses the ILOCK to ensure an inode
- * doesn't get freed while it's being referenced during a
- * radix tree traversal here. It assumes this function
- * aqcuires only the ILOCK (and therefore it has no need to
- * involve the IOLOCK in this synchronization).
- */
- ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
-
- /* reject inode numbers outside existing AGs */
- if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
- return EINVAL;
-
- /* get the perag structure and ensure that it's inode capable */
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
- agino = XFS_INO_TO_AGINO(mp, ino);
-
-again:
- error = 0;
- rcu_read_lock();
- ip = radix_tree_lookup(&pag->pag_ici_root, agino);
-
- if (ip) {
- error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
- if (error)
- goto out_error_or_again;
- } else {
- rcu_read_unlock();
- XFS_STATS_INC(xs_ig_missed);
-
- error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
- flags, lock_flags);
- if (error)
- goto out_error_or_again;
- }
- xfs_perag_put(pag);
-
- *ipp = ip;
-
- /*
- * If we have a real type for an on-disk inode, we can set ops(&unlock)
- * now. If it's a new inode being created, xfs_ialloc will handle it.
- */
- if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
- xfs_setup_inode(ip);
- return 0;
-
-out_error_or_again:
- if (error == EAGAIN) {
- delay(1);
- goto again;
- }
- xfs_perag_put(pag);
- return error;
-}
-
-/*
- * This is a wrapper routine around the xfs_ilock() routine
- * used to centralize some grungy code. It is used in places
- * that wish to lock the inode solely for reading the extents.
- * The reason these places can't just call xfs_ilock(SHARED)
- * is that the inode lock also guards to bringing in of the
- * extents from disk for a file in b-tree format. If the inode
- * is in b-tree format, then we need to lock the inode exclusively
- * until the extents are read in. Locking it exclusively all
- * the time would limit our parallelism unnecessarily, though.
- * What we do instead is check to see if the extents have been
- * read in yet, and only lock the inode exclusively if they
- * have not.
- *
- * The function returns a value which should be given to the
- * corresponding xfs_iunlock_map_shared(). This value is
- * the mode in which the lock was actually taken.
- */
-uint
-xfs_ilock_map_shared(
- xfs_inode_t *ip)
-{
- uint lock_mode;
-
- if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
- ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
- lock_mode = XFS_ILOCK_EXCL;
- } else {
- lock_mode = XFS_ILOCK_SHARED;
- }
-
- xfs_ilock(ip, lock_mode);
-
- return lock_mode;
-}
-
-/*
- * This is simply the unlock routine to go with xfs_ilock_map_shared().
- * All it does is call xfs_iunlock() with the given lock_mode.
- */
-void
-xfs_iunlock_map_shared(
- xfs_inode_t *ip,
- unsigned int lock_mode)
-{
- xfs_iunlock(ip, lock_mode);
-}
-
-/*
- * The xfs inode contains 2 locks: a multi-reader lock called the
- * i_iolock and a multi-reader lock called the i_lock. This routine
- * allows either or both of the locks to be obtained.
- *
- * The 2 locks should always be ordered so that the IO lock is
- * obtained first in order to prevent deadlock.
- *
- * ip -- the inode being locked
- * lock_flags -- this parameter indicates the inode's locks
- * to be locked. It can be:
- * XFS_IOLOCK_SHARED,
- * XFS_IOLOCK_EXCL,
- * XFS_ILOCK_SHARED,
- * XFS_ILOCK_EXCL,
- * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
- * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
- * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
- * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
- */
-void
-xfs_ilock(
- xfs_inode_t *ip,
- uint lock_flags)
-{
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
-
- if (lock_flags & XFS_IOLOCK_EXCL)
- mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
- else if (lock_flags & XFS_IOLOCK_SHARED)
- mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
-
- if (lock_flags & XFS_ILOCK_EXCL)
- mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
- else if (lock_flags & XFS_ILOCK_SHARED)
- mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
-
- trace_xfs_ilock(ip, lock_flags, _RET_IP_);
-}
-
-/*
- * This is just like xfs_ilock(), except that the caller
- * is guaranteed not to sleep. It returns 1 if it gets
- * the requested locks and 0 otherwise. If the IO lock is
- * obtained but the inode lock cannot be, then the IO lock
- * is dropped before returning.
- *
- * ip -- the inode being locked
- * lock_flags -- this parameter indicates the inode's locks to be
- * to be locked. See the comment for xfs_ilock() for a list
- * of valid values.
- */
-int
-xfs_ilock_nowait(
- xfs_inode_t *ip,
- uint lock_flags)
-{
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
-
- if (lock_flags & XFS_IOLOCK_EXCL) {
- if (!mrtryupdate(&ip->i_iolock))
- goto out;
- } else if (lock_flags & XFS_IOLOCK_SHARED) {
- if (!mrtryaccess(&ip->i_iolock))
- goto out;
- }
- if (lock_flags & XFS_ILOCK_EXCL) {
- if (!mrtryupdate(&ip->i_lock))
- goto out_undo_iolock;
- } else if (lock_flags & XFS_ILOCK_SHARED) {
- if (!mrtryaccess(&ip->i_lock))
- goto out_undo_iolock;
- }
- trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
- return 1;
-
- out_undo_iolock:
- if (lock_flags & XFS_IOLOCK_EXCL)
- mrunlock_excl(&ip->i_iolock);
- else if (lock_flags & XFS_IOLOCK_SHARED)
- mrunlock_shared(&ip->i_iolock);
- out:
- return 0;
-}
-
-/*
- * xfs_iunlock() is used to drop the inode locks acquired with
- * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
- * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
- * that we know which locks to drop.
- *
- * ip -- the inode being unlocked
- * lock_flags -- this parameter indicates the inode's locks to be
- * to be unlocked. See the comment for xfs_ilock() for a list
- * of valid values for this parameter.
- *
- */
-void
-xfs_iunlock(
- xfs_inode_t *ip,
- uint lock_flags)
-{
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
- ASSERT(lock_flags != 0);
-
- if (lock_flags & XFS_IOLOCK_EXCL)
- mrunlock_excl(&ip->i_iolock);
- else if (lock_flags & XFS_IOLOCK_SHARED)
- mrunlock_shared(&ip->i_iolock);
-
- if (lock_flags & XFS_ILOCK_EXCL)
- mrunlock_excl(&ip->i_lock);
- else if (lock_flags & XFS_ILOCK_SHARED)
- mrunlock_shared(&ip->i_lock);
-
- trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
-}
-
-/*
- * give up write locks. the i/o lock cannot be held nested
- * if it is being demoted.
- */
-void
-xfs_ilock_demote(
- xfs_inode_t *ip,
- uint lock_flags)
-{
- ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
-
- if (lock_flags & XFS_ILOCK_EXCL)
- mrdemote(&ip->i_lock);
- if (lock_flags & XFS_IOLOCK_EXCL)
- mrdemote(&ip->i_iolock);
-
- trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
-}
-
-#ifdef DEBUG
-int
-xfs_isilocked(
- xfs_inode_t *ip,
- uint lock_flags)
-{
- if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
- if (!(lock_flags & XFS_ILOCK_SHARED))
- return !!ip->i_lock.mr_writer;
- return rwsem_is_locked(&ip->i_lock.mr_lock);
- }
-
- if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
- if (!(lock_flags & XFS_IOLOCK_SHARED))
- return !!ip->i_iolock.mr_writer;
- return rwsem_is_locked(&ip->i_iolock.mr_lock);
- }
-
- ASSERT(0);
- return 0;
-}
-#endif
-
-void
-__xfs_iflock(
- struct xfs_inode *ip)
-{
- wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
- DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
-
- do {
- prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
- if (xfs_isiflocked(ip))
- io_schedule();
- } while (!xfs_iflock_nowait(ip));
-
- finish_wait(wq, &wait.wait);
-}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a59eea09930..66282dcb821 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -45,6 +45,7 @@
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
kmem_zone_t *xfs_ifork_zone;
kmem_zone_t *xfs_inode_zone;
@@ -74,6 +75,256 @@ xfs_get_extsz_hint(
return 0;
}
+/*
+ * This is a wrapper routine around the xfs_ilock() routine used to centralize
+ * some grungy code. It is used in places that wish to lock the inode solely
+ * for reading the extents. The reason these places can't just call
+ * xfs_ilock(SHARED) is that the inode lock also guards the bringing in of the
+ * extents from disk for a file in b-tree format. If the inode is in b-tree
+ * format, then we need to lock the inode exclusively until the extents are read
+ * in. Locking it exclusively all the time would limit our parallelism
+ * unnecessarily, though. What we do instead is check to see if the extents
+ * have been read in yet, and only lock the inode exclusively if they have not.
+ *
+ * The function returns a value which should be given to the corresponding
+ * xfs_iunlock_map_shared(). This value is the mode in which the lock was
+ * actually taken.
+ */
+uint
+xfs_ilock_map_shared(
+ xfs_inode_t *ip)
+{
+ uint lock_mode;
+
+ if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
+ ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
+ lock_mode = XFS_ILOCK_EXCL;
+ } else {
+ lock_mode = XFS_ILOCK_SHARED;
+ }
+
+ xfs_ilock(ip, lock_mode);
+
+ return lock_mode;
+}
+
+/*
+ * This is simply the unlock routine to go with xfs_ilock_map_shared().
+ * All it does is call xfs_iunlock() with the given lock_mode.
+ */
+void
+xfs_iunlock_map_shared(
+ xfs_inode_t *ip,
+ unsigned int lock_mode)
+{
+ xfs_iunlock(ip, lock_mode);
+}
+
+/*
+ * The xfs inode contains 2 locks: a multi-reader lock called the
+ * i_iolock and a multi-reader lock called the i_lock. This routine
+ * allows either or both of the locks to be obtained.
+ *
+ * The 2 locks should always be ordered so that the IO lock is
+ * obtained first in order to prevent deadlock.
+ *
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks
+ * to be locked. It can be:
+ * XFS_IOLOCK_SHARED,
+ * XFS_IOLOCK_EXCL,
+ * XFS_ILOCK_SHARED,
+ * XFS_ILOCK_EXCL,
+ * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
+ * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
+ * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
+ * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
+ */
+void
+xfs_ilock(
+ xfs_inode_t *ip,
+ uint lock_flags)
+{
+ trace_xfs_ilock(ip, lock_flags, _RET_IP_);
+
+ /*
+ * You can't set both SHARED and EXCL for the same lock,
+ * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+ * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+ */
+ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+ (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+ (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+
+ if (lock_flags & XFS_IOLOCK_EXCL)
+ mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+ else if (lock_flags & XFS_IOLOCK_SHARED)
+ mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+
+ if (lock_flags & XFS_ILOCK_EXCL)
+ mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+ else if (lock_flags & XFS_ILOCK_SHARED)
+ mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+}
+
+/*
+ * This is just like xfs_ilock(), except that the caller
+ * is guaranteed not to sleep. It returns 1 if it gets
+ * the requested locks and 0 otherwise. If the IO lock is
+ * obtained but the inode lock cannot be, then the IO lock
+ * is dropped before returning.
+ *
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks to be
+ * locked. See the comment for xfs_ilock() for a list
+ * of valid values.
+ */
+int
+xfs_ilock_nowait(
+ xfs_inode_t *ip,
+ uint lock_flags)
+{
+ trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
+
+ /*
+ * You can't set both SHARED and EXCL for the same lock,
+ * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+ * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+ */
+ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+ (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+ (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+
+ if (lock_flags & XFS_IOLOCK_EXCL) {
+ if (!mrtryupdate(&ip->i_iolock))
+ goto out;
+ } else if (lock_flags & XFS_IOLOCK_SHARED) {
+ if (!mrtryaccess(&ip->i_iolock))
+ goto out;
+ }
+ if (lock_flags & XFS_ILOCK_EXCL) {
+ if (!mrtryupdate(&ip->i_lock))
+ goto out_undo_iolock;
+ } else if (lock_flags & XFS_ILOCK_SHARED) {
+ if (!mrtryaccess(&ip->i_lock))
+ goto out_undo_iolock;
+ }
+ return 1;
+
+ out_undo_iolock:
+ if (lock_flags & XFS_IOLOCK_EXCL)
+ mrunlock_excl(&ip->i_iolock);
+ else if (lock_flags & XFS_IOLOCK_SHARED)
+ mrunlock_shared(&ip->i_iolock);
+ out:
+ return 0;
+}
+
+/*
+ * xfs_iunlock() is used to drop the inode locks acquired with
+ * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
+ * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
+ * that we know which locks to drop.
+ *
+ * ip -- the inode being unlocked
+ * lock_flags -- this parameter indicates the inode's locks to be
+ * unlocked. See the comment for xfs_ilock() for a list
+ * of valid values for this parameter.
+ *
+ */
+void
+xfs_iunlock(
+ xfs_inode_t *ip,
+ uint lock_flags)
+{
+ /*
+ * You can't set both SHARED and EXCL for the same lock,
+ * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+ * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+ */
+ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+ (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+ (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+ ASSERT(lock_flags != 0);
+
+ if (lock_flags & XFS_IOLOCK_EXCL)
+ mrunlock_excl(&ip->i_iolock);
+ else if (lock_flags & XFS_IOLOCK_SHARED)
+ mrunlock_shared(&ip->i_iolock);
+
+ if (lock_flags & XFS_ILOCK_EXCL)
+ mrunlock_excl(&ip->i_lock);
+ else if (lock_flags & XFS_ILOCK_SHARED)
+ mrunlock_shared(&ip->i_lock);
+
+ trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
+}
+
+/*
+ * give up write locks. the i/o lock cannot be held nested
+ * if it is being demoted.
+ */
+void
+xfs_ilock_demote(
+ xfs_inode_t *ip,
+ uint lock_flags)
+{
+ ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
+ ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
+
+ if (lock_flags & XFS_ILOCK_EXCL)
+ mrdemote(&ip->i_lock);
+ if (lock_flags & XFS_IOLOCK_EXCL)
+ mrdemote(&ip->i_iolock);
+
+ trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
+}
+
+#ifdef DEBUG
+int
+xfs_isilocked(
+ xfs_inode_t *ip,
+ uint lock_flags)
+{
+ if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
+ if (!(lock_flags & XFS_ILOCK_SHARED))
+ return !!ip->i_lock.mr_writer;
+ return rwsem_is_locked(&ip->i_lock.mr_lock);
+ }
+
+ if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
+ if (!(lock_flags & XFS_IOLOCK_SHARED))
+ return !!ip->i_iolock.mr_writer;
+ return rwsem_is_locked(&ip->i_iolock.mr_lock);
+ }
+
+ ASSERT(0);
+ return 0;
+}
+#endif
+
+void
+__xfs_iflock(
+ struct xfs_inode *ip)
+{
+ wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
+ DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
+
+ do {
+ prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ if (xfs_isiflocked(ip))
+ io_schedule();
+ } while (!xfs_iflock_nowait(ip));
+
+ finish_wait(wq, &wait.wait);
+}
+
#ifdef DEBUG
/*
* Make sure that the extents in the given memory buffer
@@ -131,169 +382,108 @@ xfs_inobp_check(
}
#endif
-/*
- * Find the buffer associated with the given inode map
- * We do basic validation checks on the buffer once it has been
- * retrieved from disk.
- */
-STATIC int
-xfs_imap_to_bp(
- xfs_mount_t *mp,
- xfs_trans_t *tp,
- struct xfs_imap *imap,
- xfs_buf_t **bpp,
- uint buf_flags,
- uint iget_flags)
+static void
+xfs_inode_buf_verify(
+ struct xfs_buf *bp)
{
- int error;
+ struct xfs_mount *mp = bp->b_target->bt_mount;
int i;
int ni;
- xfs_buf_t *bp;
-
- buf_flags |= XBF_UNMAPPED;
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
- (int)imap->im_len, buf_flags, &bp);
- if (error) {
- if (error != EAGAIN) {
- xfs_warn(mp,
- "%s: xfs_trans_read_buf() returned error %d.",
- __func__, error);
- } else {
- ASSERT(buf_flags & XBF_TRYLOCK);
- }
- return error;
- }
/*
* Validate the magic number and version of every inode in the buffer
- * (if DEBUG kernel) or the first inode in the buffer, otherwise.
*/
-#ifdef DEBUG
- ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
-#else /* usual case */
- ni = 1;
-#endif
-
+ ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
for (i = 0; i < ni; i++) {
int di_ok;
xfs_dinode_t *dip;
- dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+ dip = (struct xfs_dinode *)xfs_buf_offset(bp,
(i << mp->m_sb.sb_inodelog));
di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
XFS_DINODE_GOOD_VERSION(dip->di_version);
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
XFS_ERRTAG_ITOBP_INOTOBP,
XFS_RANDOM_ITOBP_INOTOBP))) {
- if (iget_flags & XFS_IGET_UNTRUSTED) {
- xfs_trans_brelse(tp, bp);
- return XFS_ERROR(EINVAL);
- }
- XFS_CORRUPTION_ERROR("xfs_imap_to_bp",
- XFS_ERRLEVEL_HIGH, mp, dip);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
+ mp, dip);
#ifdef DEBUG
xfs_emerg(mp,
"bad inode magic/vsn daddr %lld #%d (magic=%x)",
- (unsigned long long)imap->im_blkno, i,
+ (unsigned long long)bp->b_bn, i,
be16_to_cpu(dip->di_magic));
ASSERT(0);
#endif
- xfs_trans_brelse(tp, bp);
- return XFS_ERROR(EFSCORRUPTED);
}
}
-
xfs_inobp_check(mp, bp);
- *bpp = bp;
- return 0;
}
-/*
- * This routine is called to map an inode number within a file
- * system to the buffer containing the on-disk version of the
- * inode. It returns a pointer to the buffer containing the
- * on-disk inode in the bpp parameter, and in the dip parameter
- * it returns a pointer to the on-disk inode within that buffer.
- *
- * If a non-zero error is returned, then the contents of bpp and
- * dipp are undefined.
- *
- * Use xfs_imap() to determine the size and location of the
- * buffer to read from disk.
- */
-int
-xfs_inotobp(
- xfs_mount_t *mp,
- xfs_trans_t *tp,
- xfs_ino_t ino,
- xfs_dinode_t **dipp,
- xfs_buf_t **bpp,
- int *offset,
- uint imap_flags)
-{
- struct xfs_imap imap;
- xfs_buf_t *bp;
- int error;
-
- imap.im_blkno = 0;
- error = xfs_imap(mp, tp, ino, &imap, imap_flags);
- if (error)
- return error;
- error = xfs_imap_to_bp(mp, tp, &imap, &bp, 0, imap_flags);
- if (error)
- return error;
+static void
+xfs_inode_buf_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_inode_buf_verify(bp);
+}
- *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
- *bpp = bp;
- *offset = imap.im_boffset;
- return 0;
+static void
+xfs_inode_buf_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_inode_buf_verify(bp);
}
+const struct xfs_buf_ops xfs_inode_buf_ops = {
+ .verify_read = xfs_inode_buf_read_verify,
+ .verify_write = xfs_inode_buf_write_verify,
+};
+
/*
- * This routine is called to map an inode to the buffer containing
- * the on-disk version of the inode. It returns a pointer to the
- * buffer containing the on-disk inode in the bpp parameter, and in
- * the dip parameter it returns a pointer to the on-disk inode within
- * that buffer.
+ * This routine is called to map an inode to the buffer containing the on-disk
+ * version of the inode. It returns a pointer to the buffer containing the
+ * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
+ * pointer to the on-disk inode within that buffer.
*
- * If a non-zero error is returned, then the contents of bpp and
- * dipp are undefined.
- *
- * The inode is expected to already been mapped to its buffer and read
- * in once, thus we can use the mapping information stored in the inode
- * rather than calling xfs_imap(). This allows us to avoid the overhead
- * of looking at the inode btree for small block file systems
- * (see xfs_imap()).
+ * If a non-zero error is returned, then the contents of bpp and dipp are
+ * undefined.
*/
int
-xfs_itobp(
- xfs_mount_t *mp,
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- xfs_dinode_t **dipp,
- xfs_buf_t **bpp,
- uint buf_flags)
+xfs_imap_to_bp(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_imap *imap,
+ struct xfs_dinode **dipp,
+ struct xfs_buf **bpp,
+ uint buf_flags,
+ uint iget_flags)
{
- xfs_buf_t *bp;
- int error;
+ struct xfs_buf *bp;
+ int error;
+
+ buf_flags |= XBF_UNMAPPED;
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
+ (int)imap->im_len, buf_flags, &bp,
+ &xfs_inode_buf_ops);
+ if (error) {
+ if (error == EAGAIN) {
+ ASSERT(buf_flags & XBF_TRYLOCK);
+ return error;
+ }
- ASSERT(ip->i_imap.im_blkno != 0);
+ if (error == EFSCORRUPTED &&
+ (iget_flags & XFS_IGET_UNTRUSTED))
+ return XFS_ERROR(EINVAL);
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, buf_flags, 0);
- if (error)
+ xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
+ __func__, error);
return error;
-
- if (!bp) {
- ASSERT(buf_flags & XBF_TRYLOCK);
- ASSERT(tp == NULL);
- *bpp = NULL;
- return EAGAIN;
}
- *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
*bpp = bp;
+ *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
return 0;
}
@@ -796,10 +986,9 @@ xfs_iread(
/*
* Get pointers to the on-disk inode and the buffer containing it.
*/
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, 0, iget_flags);
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
if (error)
return error;
- dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
/*
* If we got something that isn't an inode it means someone
@@ -876,7 +1065,7 @@ xfs_iread(
/*
* Use xfs_trans_brelse() to release the buffer containing the
* on-disk inode, because it was acquired with xfs_trans_read_buf()
- * in xfs_itobp() above. If tp is NULL, this is just a normal
+ * in xfs_imap_to_bp() above. If tp is NULL, this is just a normal
* brelse(). If we're within a transaction, then xfs_trans_brelse()
* will only release the buffer if it is not dirty within the
* transaction. It will be OK to release the buffer in this case,
@@ -936,16 +1125,16 @@ xfs_iread_extents(
* set according to the contents of the given cred structure.
*
* Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
- * has a free inode available, call xfs_iget()
- * to obtain the in-core version of the allocated inode. Finally,
- * fill in the inode and log its initial contents. In this case,
- * ialloc_context would be set to NULL and call_again set to false.
+ * has a free inode available, call xfs_iget() to obtain the in-core
+ * version of the allocated inode. Finally, fill in the inode and
+ * log its initial contents. In this case, ialloc_context would be
+ * set to NULL.
*
- * If xfs_dialloc() does not have an available inode,
- * it will replenish its supply by doing an allocation. Since we can
- * only do one allocation within a transaction without deadlocks, we
- * must commit the current transaction before returning the inode itself.
- * In this case, therefore, we will set call_again to true and return.
+ * If xfs_dialloc() does not have an available inode, it will replenish
+ * its supply by doing an allocation. Since we can only do one
+ * allocation within a transaction without deadlocks, we must commit
+ * the current transaction before returning the inode itself.
+ * In this case, therefore, we will set ialloc_context and return.
* The caller should then commit the current transaction, start a new
* transaction, and call xfs_ialloc() again to actually get the inode.
*
@@ -970,7 +1159,6 @@ xfs_ialloc(
prid_t prid,
int okalloc,
xfs_buf_t **ialloc_context,
- boolean_t *call_again,
xfs_inode_t **ipp)
{
xfs_ino_t ino;
@@ -985,10 +1173,10 @@ xfs_ialloc(
* the on-disk inode to be allocated.
*/
error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
- ialloc_context, call_again, &ino);
+ ialloc_context, &ino);
if (error)
return error;
- if (*call_again || ino == NULLFSINO) {
+ if (*ialloc_context || ino == NULLFSINO) {
*ipp = NULL;
return 0;
}
@@ -1207,7 +1395,9 @@ xfs_itruncate_extents(
int error = 0;
int done = 0;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
+ xfs_isilocked(ip, XFS_IOLOCK_EXCL));
ASSERT(new_size <= XFS_ISIZE(ip));
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
ASSERT(ip->i_itemp != NULL);
@@ -1226,7 +1416,7 @@ xfs_itruncate_extents(
* then there is nothing to do.
*/
first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
- last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
+ last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
if (first_unmap_block == last_block)
return 0;
@@ -1355,7 +1545,8 @@ xfs_iunlink(
* Here we put the head pointer into our next pointer,
* and then we fall through to point the head at us.
*/
- error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
+ 0, 0);
if (error)
return error;
@@ -1429,16 +1620,16 @@ xfs_iunlink_remove(
if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
/*
- * We're at the head of the list. Get the inode's
- * on-disk buffer to see if there is anyone after us
- * on the list. Only modify our next pointer if it
- * is not already NULLAGINO. This saves us the overhead
- * of dealing with the buffer when there is no need to
- * change it.
+ * We're at the head of the list. Get the inode's on-disk
+ * buffer to see if there is anyone after us on the list.
+ * Only modify our next pointer if it is not already NULLAGINO.
+ * This saves us the overhead of dealing with the buffer when
+ * there is no need to change it.
*/
- error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
+ 0, 0);
if (error) {
- xfs_warn(mp, "%s: xfs_itobp() returned error %d.",
+ xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
__func__, error);
return error;
}
@@ -1472,34 +1663,45 @@ xfs_iunlink_remove(
next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
last_ibp = NULL;
while (next_agino != agino) {
- /*
- * If the last inode wasn't the one pointing to
- * us, then release its buffer since we're not
- * going to do anything with it.
- */
- if (last_ibp != NULL) {
+ struct xfs_imap imap;
+
+ if (last_ibp)
xfs_trans_brelse(tp, last_ibp);
- }
+
+ imap.im_blkno = 0;
next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
- error = xfs_inotobp(mp, tp, next_ino, &last_dip,
- &last_ibp, &last_offset, 0);
+
+ error = xfs_imap(mp, tp, next_ino, &imap, 0);
+ if (error) {
+ xfs_warn(mp,
+ "%s: xfs_imap returned error %d.",
+ __func__, error);
+ return error;
+ }
+
+ error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
+ &last_ibp, 0, 0);
if (error) {
xfs_warn(mp,
- "%s: xfs_inotobp() returned error %d.",
+ "%s: xfs_imap_to_bp returned error %d.",
__func__, error);
return error;
}
+
+ last_offset = imap.im_boffset;
next_agino = be32_to_cpu(last_dip->di_next_unlinked);
ASSERT(next_agino != NULLAGINO);
ASSERT(next_agino != 0);
}
+
/*
- * Now last_ibp points to the buffer previous to us on
- * the unlinked list. Pull us from the list.
+ * Now last_ibp points to the buffer previous to us on the
+ * unlinked list. Pull us from the list.
*/
- error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
+ 0, 0);
if (error) {
- xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.",
+ xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
__func__, error);
return error;
}
@@ -1579,10 +1781,23 @@ xfs_ifree_cluster(
* to mark all the active inodes on the buffer stale.
*/
bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
- mp->m_bsize * blks_per_cluster, 0);
+ mp->m_bsize * blks_per_cluster,
+ XBF_UNMAPPED);
if (!bp)
return ENOMEM;
+
+ /*
+ * This buffer may not have been correctly initialised as we
+ * didn't read it from disk. That's not important because we are
+ * only using it to mark the buffer as stale in the log, and to
+ * attach stale cached inodes on it. That means it will never be
+ * dispatched for IO. If it is, we want to know about it, and we
+ * want it to fail. We can achieve this by adding a write
+ * verifier to the buffer.
+ */
+ bp->b_ops = &xfs_inode_buf_ops;
+
/*
* Walk the inodes already attached to the buffer and mark them
* stale. These will all have the flush locks held, so an
@@ -1749,7 +1964,8 @@ xfs_ifree(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0);
+ error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &dip, &ibp,
+ 0, 0);
if (error)
return error;
@@ -2428,7 +2644,7 @@ xfs_iflush(
/*
* For stale inodes we cannot rely on the backing buffer remaining
* stale in cache for the remaining life of the stale inode and so
- * xfs_itobp() below may give us a buffer that no longer contains
+ * xfs_imap_to_bp() below may give us a buffer that no longer contains
* inodes below. We have to check this after ensuring the inode is
* unpinned so that it is safe to reclaim the stale inode after the
* flush call.
@@ -2454,7 +2670,8 @@ xfs_iflush(
/*
* Get the buffer containing the on-disk inode.
*/
- error = xfs_itobp(mp, NULL, ip, &dip, &bp, XBF_TRYLOCK);
+ error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
+ 0);
if (error || !bp) {
xfs_ifunlock(ip);
return error;
@@ -3728,3 +3945,40 @@ xfs_iext_irec_update_extoffs(
ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
}
}
+
+/*
+ * Test whether it is appropriate to check an inode for and free post EOF
+ * blocks. The 'force' parameter determines whether we should also consider
+ * regular files that are marked preallocated or append-only.
+ */
+bool
+xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
+{
+ /* prealloc/delalloc exists only on regular files */
+ if (!S_ISREG(ip->i_d.di_mode))
+ return false;
+
+ /*
+ * Zero sized files with no cached pages and delalloc blocks will not
+ * have speculative prealloc/delalloc blocks to remove.
+ */
+ if (VFS_I(ip)->i_size == 0 &&
+ VN_CACHED(VFS_I(ip)) == 0 &&
+ ip->i_delayed_blks == 0)
+ return false;
+
+ /* If we haven't read in the extent list, then don't do it now. */
+ if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
+ return false;
+
+ /*
+ * Do not free real preallocated or append-only files unless the file
+ * has delalloc blocks and we are forced to remove them.
+ */
+ if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
+ if (!force || ip->i_delayed_blks == 0)
+ return false;
+
+ return true;
+}
+
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 1efff36a75b..22baf6ea4fa 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -487,8 +487,6 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
-extern struct lock_class_key xfs_iolock_reclaimable;
-
/*
* For multiple groups support: if S_ISGID bit is set in the parent
* directory, group of new file is set to that of the parent, and
@@ -498,11 +496,10 @@ extern struct lock_class_key xfs_iolock_reclaimable;
(((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
((pip)->i_d.di_mode & S_ISGID))
+
/*
- * xfs_iget.c prototypes.
+ * xfs_inode.c prototypes.
*/
-int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
- uint, uint, xfs_inode_t **);
void xfs_ilock(xfs_inode_t *, uint);
int xfs_ilock_nowait(xfs_inode_t *, uint);
void xfs_iunlock(xfs_inode_t *, uint);
@@ -510,14 +507,9 @@ void xfs_ilock_demote(xfs_inode_t *, uint);
int xfs_isilocked(xfs_inode_t *, uint);
uint xfs_ilock_map_shared(xfs_inode_t *);
void xfs_iunlock_map_shared(xfs_inode_t *, uint);
-void xfs_inode_free(struct xfs_inode *ip);
-
-/*
- * xfs_inode.c prototypes.
- */
int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
xfs_nlink_t, xfs_dev_t, prid_t, int,
- struct xfs_buf **, boolean_t *, xfs_inode_t **);
+ struct xfs_buf **, xfs_inode_t **);
uint xfs_ip2xflags(struct xfs_inode *);
uint xfs_dic2xflags(struct xfs_dinode *);
@@ -557,12 +549,9 @@ do { \
#define XFS_IGET_UNTRUSTED 0x2
#define XFS_IGET_DONTCACHE 0x4
-int xfs_inotobp(struct xfs_mount *, struct xfs_trans *,
- xfs_ino_t, struct xfs_dinode **,
- struct xfs_buf **, int *, uint);
-int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
- struct xfs_inode *, struct xfs_dinode **,
- struct xfs_buf **, uint);
+int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
+ struct xfs_imap *, struct xfs_dinode **,
+ struct xfs_buf **, uint, uint);
int xfs_iread(struct xfs_mount *, struct xfs_trans *,
struct xfs_inode *, uint);
void xfs_dinode_to_disk(struct xfs_dinode *,
@@ -596,6 +585,7 @@ void xfs_iext_irec_compact(xfs_ifork_t *);
void xfs_iext_irec_compact_pages(xfs_ifork_t *);
void xfs_iext_irec_compact_full(xfs_ifork_t *);
void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int);
+bool xfs_can_free_eofblocks(struct xfs_inode *, bool);
#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
@@ -608,5 +598,6 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
extern struct kmem_zone *xfs_ifork_zone;
extern struct kmem_zone *xfs_inode_zone;
extern struct kmem_zone *xfs_ili_zone;
+extern const struct xfs_buf_ops xfs_inode_buf_ops;
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 3a05a41b5d7..c1c3ef88a26 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -42,6 +42,7 @@
#include "xfs_inode_item.h"
#include "xfs_export.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
#include <linux/capability.h>
#include <linux/dcache.h>
@@ -70,16 +71,16 @@ xfs_find_handle(
int hsize;
xfs_handle_t handle;
struct inode *inode;
- struct file *file = NULL;
+ struct fd f = {0};
struct path path;
int error;
struct xfs_inode *ip;
if (cmd == XFS_IOC_FD_TO_HANDLE) {
- file = fget(hreq->fd);
- if (!file)
+ f = fdget(hreq->fd);
+ if (!f.file)
return -EBADF;
- inode = file->f_path.dentry->d_inode;
+ inode = f.file->f_path.dentry->d_inode;
} else {
error = user_lpath((const char __user *)hreq->path, &path);
if (error)
@@ -134,7 +135,7 @@ xfs_find_handle(
out_put:
if (cmd == XFS_IOC_FD_TO_HANDLE)
- fput(file);
+ fdput(f);
else
path_put(&path);
return error;
@@ -208,6 +209,7 @@ xfs_open_by_handle(
struct inode *inode;
struct dentry *dentry;
fmode_t fmode;
+ struct path path;
if (!capable(CAP_SYS_ADMIN))
return -XFS_ERROR(EPERM);
@@ -252,8 +254,10 @@ xfs_open_by_handle(
goto out_dput;
}
- filp = dentry_open(dentry, mntget(parfilp->f_path.mnt),
- hreq->oflags, cred);
+ path.mnt = parfilp->f_path.mnt;
+ path.dentry = dentry;
+ filp = dentry_open(&path, hreq->oflags, cred);
+ dput(dentry);
if (IS_ERR(filp)) {
put_unused_fd(fd);
return PTR_ERR(filp);
@@ -361,9 +365,15 @@ xfs_fssetdm_by_handle(
if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
return -XFS_ERROR(EFAULT);
+ error = mnt_want_write_file(parfilp);
+ if (error)
+ return error;
+
dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq);
- if (IS_ERR(dentry))
+ if (IS_ERR(dentry)) {
+ mnt_drop_write_file(parfilp);
return PTR_ERR(dentry);
+ }
if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
error = -XFS_ERROR(EPERM);
@@ -379,6 +389,7 @@ xfs_fssetdm_by_handle(
fsd.fsd_dmstate);
out:
+ mnt_drop_write_file(parfilp);
dput(dentry);
return error;
}
@@ -631,7 +642,11 @@ xfs_ioc_space(
if (ioflags & IO_INVIS)
attr_flags |= XFS_ATTR_DMI;
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags);
+ mnt_drop_write_file(filp);
return -error;
}
@@ -1160,6 +1175,7 @@ xfs_ioc_fssetxattr(
{
struct fsxattr fa;
unsigned int mask;
+ int error;
if (copy_from_user(&fa, arg, sizeof(fa)))
return -EFAULT;
@@ -1168,7 +1184,12 @@ xfs_ioc_fssetxattr(
if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
mask |= FSX_NONBLOCK;
- return -xfs_ioctl_setattr(ip, &fa, mask);
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
+ error = xfs_ioctl_setattr(ip, &fa, mask);
+ mnt_drop_write_file(filp);
+ return -error;
}
STATIC int
@@ -1193,6 +1214,7 @@ xfs_ioc_setxflags(
struct fsxattr fa;
unsigned int flags;
unsigned int mask;
+ int error;
if (copy_from_user(&flags, arg, sizeof(flags)))
return -EFAULT;
@@ -1207,7 +1229,12 @@ xfs_ioc_setxflags(
mask |= FSX_NONBLOCK;
fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
- return -xfs_ioctl_setattr(ip, &fa, mask);
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
+ error = xfs_ioctl_setattr(ip, &fa, mask);
+ mnt_drop_write_file(filp);
+ return -error;
}
STATIC int
@@ -1382,8 +1409,13 @@ xfs_file_ioctl(
if (copy_from_user(&dmi, arg, sizeof(dmi)))
return -XFS_ERROR(EFAULT);
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
+
error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
dmi.fsd_dmstate);
+ mnt_drop_write_file(filp);
return -error;
}
@@ -1431,7 +1463,11 @@ xfs_file_ioctl(
if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t)))
return -XFS_ERROR(EFAULT);
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
error = xfs_swapext(&sxp);
+ mnt_drop_write_file(filp);
return -error;
}
@@ -1460,9 +1496,14 @@ xfs_file_ioctl(
if (copy_from_user(&inout, arg, sizeof(inout)))
return -XFS_ERROR(EFAULT);
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
+
/* input parameter is passed in resblks field of structure */
in = inout.resblks;
error = xfs_reserve_blocks(mp, &in, &inout);
+ mnt_drop_write_file(filp);
if (error)
return -error;
@@ -1493,7 +1534,11 @@ xfs_file_ioctl(
if (copy_from_user(&in, arg, sizeof(in)))
return -XFS_ERROR(EFAULT);
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
error = xfs_growfs_data(mp, &in);
+ mnt_drop_write_file(filp);
return -error;
}
@@ -1503,7 +1548,11 @@ xfs_file_ioctl(
if (copy_from_user(&in, arg, sizeof(in)))
return -XFS_ERROR(EFAULT);
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
error = xfs_growfs_log(mp, &in);
+ mnt_drop_write_file(filp);
return -error;
}
@@ -1513,7 +1562,11 @@ xfs_file_ioctl(
if (copy_from_user(&in, arg, sizeof(in)))
return -XFS_ERROR(EFAULT);
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
error = xfs_growfs_rt(mp, &in);
+ mnt_drop_write_file(filp);
return -error;
}
@@ -1550,6 +1603,26 @@ xfs_file_ioctl(
error = xfs_errortag_clearall(mp, 1);
return -error;
+ case XFS_IOC_FREE_EOFBLOCKS: {
+ struct xfs_eofblocks eofb;
+
+ if (copy_from_user(&eofb, arg, sizeof(eofb)))
+ return -XFS_ERROR(EFAULT);
+
+ if (eofb.eof_version != XFS_EOFBLOCKS_VERSION)
+ return -XFS_ERROR(EINVAL);
+
+ if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID)
+ return -XFS_ERROR(EINVAL);
+
+ if (memchr_inv(&eofb.pad32, 0, sizeof(eofb.pad32)) ||
+ memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64)))
+ return -XFS_ERROR(EINVAL);
+
+ error = xfs_icache_free_eofblocks(mp, &eofb);
+ return -error;
+ }
+
default:
return -ENOTTY;
}
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index c4f2da0d2bf..1244274a567 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -600,7 +600,11 @@ xfs_file_compat_ioctl(
if (xfs_compat_growfs_data_copyin(&in, arg))
return -XFS_ERROR(EFAULT);
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
error = xfs_growfs_data(mp, &in);
+ mnt_drop_write_file(filp);
return -error;
}
case XFS_IOC_FSGROWFSRT_32: {
@@ -608,7 +612,11 @@ xfs_file_compat_ioctl(
if (xfs_compat_growfs_rt_copyin(&in, arg))
return -XFS_ERROR(EFAULT);
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
error = xfs_growfs_rt(mp, &in);
+ mnt_drop_write_file(filp);
return -error;
}
#endif
@@ -627,7 +635,11 @@ xfs_file_compat_ioctl(
offsetof(struct xfs_swapext, sx_stat)) ||
xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat))
return -XFS_ERROR(EFAULT);
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
error = xfs_swapext(&sxp);
+ mnt_drop_write_file(filp);
return -error;
}
case XFS_IOC_FSBULKSTAT_32:
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index aadfce6681e..364818eef40 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -41,6 +41,7 @@
#include "xfs_utils.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
@@ -285,7 +286,7 @@ xfs_iomap_eof_want_preallocate(
* do any speculative allocation.
*/
start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1)));
- count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
+ count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
while (count_fsb > 0) {
imaps = nimaps;
firstblock = NULLFSBLOCK;
@@ -350,6 +351,15 @@ xfs_iomap_prealloc_size(
}
if (shift)
alloc_blocks >>= shift;
+
+ /*
+ * If we are still trying to allocate more space than is
+ * available, squash the prealloc hard. This can happen if we
+ * have a large file on a small filesystem and the above
+ * lowspace thresholds are smaller than MAXEXTLEN.
+ */
+ while (alloc_blocks >= freesp)
+ alloc_blocks >>= 4;
}
if (alloc_blocks < mp->m_writeio_blocks)
@@ -373,7 +383,7 @@ xfs_iomap_write_delay(
xfs_extlen_t extsz;
int nimaps;
xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
- int prealloc, flushed = 0;
+ int prealloc;
int error;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -416,8 +426,8 @@ retry:
* Make sure preallocation does not create extents beyond the range we
* actually support in this filesystem.
*/
- if (last_fsb > XFS_B_TO_FSB(mp, mp->m_maxioffset))
- last_fsb = XFS_B_TO_FSB(mp, mp->m_maxioffset);
+ if (last_fsb > XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes))
+ last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
ASSERT(last_fsb > offset_fsb);
@@ -434,31 +444,29 @@ retry:
}
/*
- * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For
- * ENOSPC, * flush all other inodes with delalloc blocks to free up
- * some of the excess reserved metadata space. For both cases, retry
+ * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
* without EOF preallocation.
*/
if (nimaps == 0) {
trace_xfs_delalloc_enospc(ip, offset, count);
- if (flushed)
- return XFS_ERROR(error ? error : ENOSPC);
-
- if (error == ENOSPC) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_flush_inodes(ip);
- xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (prealloc) {
+ prealloc = 0;
+ error = 0;
+ goto retry;
}
-
- flushed = 1;
- error = 0;
- prealloc = 0;
- goto retry;
+ return XFS_ERROR(error ? error : ENOSPC);
}
if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
return xfs_alert_fsblock_zero(ip, &imap[0]);
+ /*
+ * Tag the inode as speculatively preallocated so we can reclaim this
+ * space on demand, if necessary.
+ */
+ if (prealloc)
+ xfs_inode_set_eofblocks_tag(ip);
+
*ret_imap = imap[0];
return 0;
}
@@ -584,7 +592,9 @@ xfs_iomap_write_allocate(
* pointer that the caller gave to us.
*/
error = xfs_bmapi_write(tp, ip, map_start_fsb,
- count_fsb, 0, &first_block, 1,
+ count_fsb,
+ XFS_BMAPI_STACK_SWITCH,
+ &first_block, 1,
imap, &nimaps, &free_list);
if (error)
goto trans_cancel;
@@ -680,9 +690,9 @@ xfs_iomap_write_unwritten(
* the same inode that we complete here and might deadlock
* on the iolock.
*/
- xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
+ sb_start_intwrite(mp->m_super);
tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
- tp->t_flags |= XFS_TRANS_RESERVE;
+ tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT;
error = xfs_trans_reserve(tp, resblks,
XFS_WRITE_LOG_RES(mp), 0,
XFS_TRANS_PERM_LOG_RES,
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 1a25fd80279..d82efaa2ac7 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -38,6 +38,7 @@
#include "xfs_vnodeops.h"
#include "xfs_inode_item.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
#include <linux/capability.h>
#include <linux/xattr.h>
@@ -179,7 +180,7 @@ xfs_vn_create(
struct inode *dir,
struct dentry *dentry,
umode_t mode,
- struct nameidata *nd)
+ bool flags)
{
return xfs_vn_mknod(dir, dentry, mode, 0);
}
@@ -197,7 +198,7 @@ STATIC struct dentry *
xfs_vn_lookup(
struct inode *dir,
struct dentry *dentry,
- struct nameidata *nd)
+ unsigned int flags)
{
struct xfs_inode *cip;
struct xfs_name name;
@@ -222,7 +223,7 @@ STATIC struct dentry *
xfs_vn_ci_lookup(
struct inode *dir,
struct dentry *dentry,
- struct nameidata *nd)
+ unsigned int flags)
{
struct xfs_inode *ip;
struct xfs_name xname;
@@ -779,8 +780,8 @@ xfs_setattr_size(
* care about here.
*/
if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
- error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0,
- FI_NONE);
+ error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+ ip->i_d.di_size, newsize);
if (error)
goto out_unlock;
}
@@ -854,6 +855,9 @@ xfs_setattr_size(
* and do not wait the usual (long) time for writeout.
*/
xfs_iflags_set(ip, XFS_ITRUNCATED);
+
+ /* A truncate down always removes post-EOF blocks. */
+ xfs_inode_clear_eofblocks_tag(ip);
}
if (mask & ATTR_CTIME) {
@@ -897,6 +901,47 @@ xfs_vn_setattr(
return -xfs_setattr_nonsize(XFS_I(dentry->d_inode), iattr, 0);
}
+STATIC int
+xfs_vn_update_time(
+ struct inode *inode,
+ struct timespec *now,
+ int flags)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+
+ trace_xfs_update_time(ip);
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+ error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return -error;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (flags & S_CTIME) {
+ inode->i_ctime = *now;
+ ip->i_d.di_ctime.t_sec = (__int32_t)now->tv_sec;
+ ip->i_d.di_ctime.t_nsec = (__int32_t)now->tv_nsec;
+ }
+ if (flags & S_MTIME) {
+ inode->i_mtime = *now;
+ ip->i_d.di_mtime.t_sec = (__int32_t)now->tv_sec;
+ ip->i_d.di_mtime.t_nsec = (__int32_t)now->tv_nsec;
+ }
+ if (flags & S_ATIME) {
+ inode->i_atime = *now;
+ ip->i_d.di_atime.t_sec = (__int32_t)now->tv_sec;
+ ip->i_d.di_atime.t_nsec = (__int32_t)now->tv_nsec;
+ }
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
+ return -xfs_trans_commit(tp, 0);
+}
+
#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
/*
@@ -991,6 +1036,7 @@ static const struct inode_operations xfs_inode_operations = {
.removexattr = generic_removexattr,
.listxattr = xfs_vn_listxattr,
.fiemap = xfs_vn_fiemap,
+ .update_time = xfs_vn_update_time,
};
static const struct inode_operations xfs_dir_inode_operations = {
@@ -1016,6 +1062,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
.getxattr = generic_getxattr,
.removexattr = generic_removexattr,
.listxattr = xfs_vn_listxattr,
+ .update_time = xfs_vn_update_time,
};
static const struct inode_operations xfs_dir_ci_inode_operations = {
@@ -1041,6 +1088,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
.getxattr = generic_getxattr,
.removexattr = generic_removexattr,
.listxattr = xfs_vn_listxattr,
+ .update_time = xfs_vn_update_time,
};
static const struct inode_operations xfs_symlink_inode_operations = {
@@ -1054,6 +1102,7 @@ static const struct inode_operations xfs_symlink_inode_operations = {
.getxattr = generic_getxattr,
.removexattr = generic_removexattr,
.listxattr = xfs_vn_listxattr,
+ .update_time = xfs_vn_update_time,
};
STATIC void
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index eff577a9b67..2ea7d402188 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -34,6 +34,7 @@
#include "xfs_error.h"
#include "xfs_btree.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
STATIC int
xfs_internal_inum(
@@ -395,7 +396,8 @@ xfs_bulkstat(
if (xfs_inobt_maskn(chunkidx, nicluster)
& ~r.ir_free)
xfs_btree_reada_bufs(mp, agno,
- agbno, nbcluster);
+ agbno, nbcluster,
+ &xfs_inode_buf_ops);
}
irbp->ir_startino = r.ir_startino;
irbp->ir_freecount = r.ir_freecount;
@@ -555,7 +557,7 @@ xfs_bulkstat_single(
/*
* note that requesting valid inode numbers which are not allocated
- * to inodes will most likely cause xfs_itobp to generate warning
+ * to inodes will most likely cause xfs_imap_to_bp to generate warning
* messages about bad magic numbers. This is ok. The fact that
* the inode isn't actually an inode is handled by the
* error check below. Done this way to make the usual case faster
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 828662f70d6..fe7e4df85a7 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -44,6 +44,7 @@
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
+#include <linux/crc32c.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/file.h>
@@ -118,6 +119,7 @@
#define xfs_rotorstep xfs_params.rotorstep.val
#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val
#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val
+#define xfs_eofb_secs xfs_params.eofb_timer.val
#define current_cpu() (raw_smp_processor_id())
#define current_pid() (current->pid)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index d90d4a38860..46bd9d52ab5 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -34,6 +34,8 @@
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_trace.h"
+#include "xfs_fsops.h"
+#include "xfs_cksum.h"
kmem_zone_t *xfs_log_ticket_zone;
@@ -45,51 +47,85 @@ xlog_commit_record(
struct xlog_in_core **iclog,
xfs_lsn_t *commitlsnp);
-STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
- xfs_buftarg_t *log_target,
- xfs_daddr_t blk_offset,
- int num_bblks);
+STATIC struct xlog *
+xlog_alloc_log(
+ struct xfs_mount *mp,
+ struct xfs_buftarg *log_target,
+ xfs_daddr_t blk_offset,
+ int num_bblks);
STATIC int
xlog_space_left(
struct xlog *log,
atomic64_t *head);
-STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
-STATIC void xlog_dealloc_log(xlog_t *log);
+STATIC int
+xlog_sync(
+ struct xlog *log,
+ struct xlog_in_core *iclog);
+STATIC void
+xlog_dealloc_log(
+ struct xlog *log);
/* local state machine functions */
STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
-STATIC void xlog_state_do_callback(xlog_t *log,int aborted, xlog_in_core_t *iclog);
-STATIC int xlog_state_get_iclog_space(xlog_t *log,
- int len,
- xlog_in_core_t **iclog,
- xlog_ticket_t *ticket,
- int *continued_write,
- int *logoffsetp);
-STATIC int xlog_state_release_iclog(xlog_t *log,
- xlog_in_core_t *iclog);
-STATIC void xlog_state_switch_iclogs(xlog_t *log,
- xlog_in_core_t *iclog,
- int eventual_size);
-STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
+STATIC void
+xlog_state_do_callback(
+ struct xlog *log,
+ int aborted,
+ struct xlog_in_core *iclog);
+STATIC int
+xlog_state_get_iclog_space(
+ struct xlog *log,
+ int len,
+ struct xlog_in_core **iclog,
+ struct xlog_ticket *ticket,
+ int *continued_write,
+ int *logoffsetp);
+STATIC int
+xlog_state_release_iclog(
+ struct xlog *log,
+ struct xlog_in_core *iclog);
+STATIC void
+xlog_state_switch_iclogs(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ int eventual_size);
+STATIC void
+xlog_state_want_sync(
+ struct xlog *log,
+ struct xlog_in_core *iclog);
STATIC void
xlog_grant_push_ail(
- struct xlog *log,
- int need_bytes);
-STATIC void xlog_regrant_reserve_log_space(xlog_t *log,
- xlog_ticket_t *ticket);
-STATIC void xlog_ungrant_log_space(xlog_t *log,
- xlog_ticket_t *ticket);
+ struct xlog *log,
+ int need_bytes);
+STATIC void
+xlog_regrant_reserve_log_space(
+ struct xlog *log,
+ struct xlog_ticket *ticket);
+STATIC void
+xlog_ungrant_log_space(
+ struct xlog *log,
+ struct xlog_ticket *ticket);
#if defined(DEBUG)
-STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
+STATIC void
+xlog_verify_dest_ptr(
+ struct xlog *log,
+ char *ptr);
STATIC void
xlog_verify_grant_tail(
- struct xlog *log);
-STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
- int count, boolean_t syncing);
-STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
- xfs_lsn_t tail_lsn);
+ struct xlog *log);
+STATIC void
+xlog_verify_iclog(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ int count,
+ boolean_t syncing);
+STATIC void
+xlog_verify_tail_lsn(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ xfs_lsn_t tail_lsn);
#else
#define xlog_verify_dest_ptr(a,b)
#define xlog_verify_grant_tail(a)
@@ -97,7 +133,9 @@ STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
#define xlog_verify_tail_lsn(a,b,c)
#endif
-STATIC int xlog_iclogs_empty(xlog_t *log);
+STATIC int
+xlog_iclogs_empty(
+ struct xlog *log);
static void
xlog_grant_sub_space(
@@ -422,7 +460,8 @@ xfs_log_reserve(
tic->t_trans_type = t_type;
*ticp = tic;
- xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt);
+ xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
+ : tic->t_unit_res);
trace_xfs_log_reserve(log, tic);
@@ -643,25 +682,29 @@ out:
}
/*
- * Finish the recovery of the file system. This is separate from
- * the xfs_log_mount() call, because it depends on the code in
- * xfs_mountfs() to read in the root and real-time bitmap inodes
- * between calling xfs_log_mount() and here.
+ * Finish the recovery of the file system. This is separate from the
+ * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read
+ * in the root and real-time bitmap inodes between calling xfs_log_mount() and
+ * here.
*
- * mp - ubiquitous xfs mount point structure
+ * If we finish recovery successfully, start the background log work. If we are
+ * not doing recovery, then we have a RO filesystem and we don't need to start
+ * it.
*/
int
xfs_log_mount_finish(xfs_mount_t *mp)
{
- int error;
+ int error = 0;
- if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
+ if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
error = xlog_recover_finish(mp->m_log);
- else {
- error = 0;
+ if (!error)
+ xfs_log_work_queue(mp);
+ } else {
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
}
+
return error;
}
@@ -684,7 +727,7 @@ xfs_log_mount_finish(xfs_mount_t *mp)
int
xfs_log_unmount_write(xfs_mount_t *mp)
{
- xlog_t *log = mp->m_log;
+ struct xlog *log = mp->m_log;
xlog_in_core_t *iclog;
#ifdef DEBUG
xlog_in_core_t *first_iclog;
@@ -814,15 +857,49 @@ xfs_log_unmount_write(xfs_mount_t *mp)
} /* xfs_log_unmount_write */
/*
- * Deallocate log structures for unmount/relocation.
+ * Empty the log for unmount/freeze.
*
- * We need to stop the aild from running before we destroy
- * and deallocate the log as the aild references the log.
+ * To do this, we first need to shut down the background log work so it is not
+ * trying to cover the log as we clean up. We then need to unpin all objects in
+ * the log so we can then flush them out. Once they have completed their IO and
+ * run the callbacks removing themselves from the AIL, we can write the unmount
+ * record.
*/
void
-xfs_log_unmount(xfs_mount_t *mp)
+xfs_log_quiesce(
+ struct xfs_mount *mp)
{
- cancel_delayed_work_sync(&mp->m_sync_work);
+ cancel_delayed_work_sync(&mp->m_log->l_work);
+ xfs_log_force(mp, XFS_LOG_SYNC);
+
+ /*
+ * The superblock buffer is uncached and while xfs_ail_push_all_sync()
+ * will push it, xfs_wait_buftarg() will not wait for it. Further,
+ * xfs_buf_iowait() cannot be used because it was pushed with the
+ * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for
+ * the IO to complete.
+ */
+ xfs_ail_push_all_sync(mp->m_ail);
+ xfs_wait_buftarg(mp->m_ddev_targp);
+ xfs_buf_lock(mp->m_sb_bp);
+ xfs_buf_unlock(mp->m_sb_bp);
+
+ xfs_log_unmount_write(mp);
+}
+
+/*
+ * Shut down and release the AIL and Log.
+ *
+ * During unmount, we need to ensure we flush all the dirty metadata objects
+ * from the AIL so that the log is empty before we write the unmount record to
+ * the log. Once this is done, we can tear down the AIL and the log.
+ */
+void
+xfs_log_unmount(
+ struct xfs_mount *mp)
+{
+ xfs_log_quiesce(mp);
+
xfs_trans_ail_destroy(mp);
xlog_dealloc_log(mp->m_log);
}
@@ -893,7 +970,7 @@ int
xfs_log_need_covered(xfs_mount_t *mp)
{
int needed = 0;
- xlog_t *log = mp->m_log;
+ struct xlog *log = mp->m_log;
if (!xfs_fs_writable(mp))
return 0;
@@ -1024,9 +1101,9 @@ xlog_space_left(
void
xlog_iodone(xfs_buf_t *bp)
{
- xlog_in_core_t *iclog = bp->b_fspriv;
- xlog_t *l = iclog->ic_log;
- int aborted = 0;
+ struct xlog_in_core *iclog = bp->b_fspriv;
+ struct xlog *l = iclog->ic_log;
+ int aborted = 0;
/*
* Race to shutdown the filesystem if we see an error.
@@ -1054,8 +1131,7 @@ xlog_iodone(xfs_buf_t *bp)
* with it being freed after writing the unmount record to the
* log.
*/
-
-} /* xlog_iodone */
+}
/*
* Return size of each in-core log record buffer.
@@ -1067,8 +1143,9 @@ xlog_iodone(xfs_buf_t *bp)
*/
STATIC void
-xlog_get_iclog_buffer_size(xfs_mount_t *mp,
- xlog_t *log)
+xlog_get_iclog_buffer_size(
+ struct xfs_mount *mp,
+ struct xlog *log)
{
int size;
int xhdrs;
@@ -1124,18 +1201,53 @@ done:
} /* xlog_get_iclog_buffer_size */
+void
+xfs_log_work_queue(
+ struct xfs_mount *mp)
+{
+ queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work,
+ msecs_to_jiffies(xfs_syncd_centisecs * 10));
+}
+
+/*
+ * Every sync period we need to unpin all items in the AIL and push them to
+ * disk. If there is nothing dirty, then we might need to cover the log to
+ * indicate that the filesystem is idle.
+ */
+void
+xfs_log_worker(
+ struct work_struct *work)
+{
+ struct xlog *log = container_of(to_delayed_work(work),
+ struct xlog, l_work);
+ struct xfs_mount *mp = log->l_mp;
+
+ /* dgc: errors ignored - not fatal and nowhere to report them */
+ if (xfs_log_need_covered(mp))
+ xfs_fs_log_dummy(mp);
+ else
+ xfs_log_force(mp, 0);
+
+ /* start pushing all the metadata that is currently dirty */
+ xfs_ail_push_all(mp->m_ail);
+
+ /* queue us up again */
+ xfs_log_work_queue(mp);
+}
+
/*
* This routine initializes some of the log structure for a given mount point.
* Its primary purpose is to fill in enough, so recovery can occur. However,
* some other stuff may be filled in too.
*/
-STATIC xlog_t *
-xlog_alloc_log(xfs_mount_t *mp,
- xfs_buftarg_t *log_target,
- xfs_daddr_t blk_offset,
- int num_bblks)
+STATIC struct xlog *
+xlog_alloc_log(
+ struct xfs_mount *mp,
+ struct xfs_buftarg *log_target,
+ xfs_daddr_t blk_offset,
+ int num_bblks)
{
- xlog_t *log;
+ struct xlog *log;
xlog_rec_header_t *head;
xlog_in_core_t **iclogp;
xlog_in_core_t *iclog, *prev_iclog=NULL;
@@ -1144,7 +1256,7 @@ xlog_alloc_log(xfs_mount_t *mp,
int error = ENOMEM;
uint log2_size = 0;
- log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
+ log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL);
if (!log) {
xfs_warn(mp, "Log allocation failed: No memory!");
goto out;
@@ -1157,6 +1269,7 @@ xlog_alloc_log(xfs_mount_t *mp,
log->l_logBBsize = num_bblks;
log->l_covered_state = XLOG_STATE_COVER_IDLE;
log->l_flags |= XLOG_ACTIVE_RECOVERY;
+ INIT_DELAYED_WORK(&log->l_work, xfs_log_worker);
log->l_prev_block = -1;
/* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
@@ -1379,6 +1492,84 @@ xlog_grant_push_ail(
}
/*
+ * Stamp cycle number in every block
+ */
+STATIC void
+xlog_pack_data(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ int roundoff)
+{
+ int i, j, k;
+ int size = iclog->ic_offset + roundoff;
+ __be32 cycle_lsn;
+ xfs_caddr_t dp;
+
+ cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
+
+ dp = iclog->ic_datap;
+ for (i = 0; i < BTOBB(size); i++) {
+ if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE))
+ break;
+ iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
+ *(__be32 *)dp = cycle_lsn;
+ dp += BBSIZE;
+ }
+
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ xlog_in_core_2_t *xhdr = iclog->ic_data;
+
+ for ( ; i < BTOBB(size); i++) {
+ j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
+ *(__be32 *)dp = cycle_lsn;
+ dp += BBSIZE;
+ }
+
+ for (i = 1; i < log->l_iclog_heads; i++)
+ xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
+ }
+}
+
+/*
+ * Calculate the checksum for a log buffer.
+ *
+ * This is a little more complicated than it should be because the various
+ * headers and the actual data are non-contiguous.
+ */
+__le32
+xlog_cksum(
+ struct xlog *log,
+ struct xlog_rec_header *rhead,
+ char *dp,
+ int size)
+{
+ __uint32_t crc;
+
+ /* first generate the crc for the record header ... */
+ crc = xfs_start_cksum((char *)rhead,
+ sizeof(struct xlog_rec_header),
+ offsetof(struct xlog_rec_header, h_crc));
+
+ /* ... then for additional cycle data for v2 logs ... */
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead;
+ int i;
+
+ for (i = 1; i < log->l_iclog_heads; i++) {
+ crc = crc32c(crc, &xhdr[i].hic_xheader,
+ sizeof(struct xlog_rec_ext_header));
+ }
+ }
+
+ /* ... and finally for the payload */
+ crc = crc32c(crc, dp, size);
+
+ return xfs_end_cksum(crc);
+}
+
+/*
* The bdstrat callback function for log bufs. This gives us a central
* place to trap bufs in case we get hit by a log I/O error and need to
* shutdown. Actually, in practice, even when we didn't get a log error,
@@ -1434,10 +1625,10 @@ xlog_bdstrat(
*/
STATIC int
-xlog_sync(xlog_t *log,
- xlog_in_core_t *iclog)
+xlog_sync(
+ struct xlog *log,
+ struct xlog_in_core *iclog)
{
- xfs_caddr_t dptr; /* pointer to byte sized element */
xfs_buf_t *bp;
int i;
uint count; /* byte count of bwrite */
@@ -1446,6 +1637,7 @@ xlog_sync(xlog_t *log,
int split = 0; /* split write into two regions */
int error;
int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
+ int size;
XFS_STATS_INC(xs_log_writes);
ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
@@ -1476,13 +1668,10 @@ xlog_sync(xlog_t *log,
xlog_pack_data(log, iclog, roundoff);
/* real byte length */
- if (v2) {
- iclog->ic_header.h_len =
- cpu_to_be32(iclog->ic_offset + roundoff);
- } else {
- iclog->ic_header.h_len =
- cpu_to_be32(iclog->ic_offset);
- }
+ size = iclog->ic_offset;
+ if (v2)
+ size += roundoff;
+ iclog->ic_header.h_len = cpu_to_be32(size);
bp = iclog->ic_bp;
XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
@@ -1491,12 +1680,36 @@ xlog_sync(xlog_t *log,
/* Do we need to split this write into 2 parts? */
if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
+ char *dptr;
+
split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
- iclog->ic_bwritecnt = 2; /* split into 2 writes */
+ iclog->ic_bwritecnt = 2;
+
+ /*
+ * Bump the cycle numbers at the start of each block in the
+ * part of the iclog that ends up in the buffer that gets
+ * written to the start of the log.
+ *
+ * Watch out for the header magic number case, though.
+ */
+ dptr = (char *)&iclog->ic_header + count;
+ for (i = 0; i < split; i += BBSIZE) {
+ __uint32_t cycle = be32_to_cpu(*(__be32 *)dptr);
+ if (++cycle == XLOG_HEADER_MAGIC_NUM)
+ cycle++;
+ *(__be32 *)dptr = cpu_to_be32(cycle);
+
+ dptr += BBSIZE;
+ }
} else {
iclog->ic_bwritecnt = 1;
}
+
+ /* calculate the checksum */
+ iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
+ iclog->ic_datap, size);
+
bp->b_io_length = BTOBB(count);
bp->b_fspriv = iclog;
XFS_BUF_ZEROFLAGS(bp);
@@ -1550,19 +1763,6 @@ xlog_sync(xlog_t *log,
bp->b_flags |= XBF_SYNCIO;
if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
bp->b_flags |= XBF_FUA;
- dptr = bp->b_addr;
- /*
- * Bump the cycle numbers at the start of each block
- * since this part of the buffer is at the start of
- * a new cycle. Watch out for the header magic number
- * case, though.
- */
- for (i = 0; i < split; i += BBSIZE) {
- be32_add_cpu((__be32 *)dptr, 1);
- if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM)
- be32_add_cpu((__be32 *)dptr, 1);
- dptr += BBSIZE;
- }
ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
@@ -1579,12 +1779,12 @@ xlog_sync(xlog_t *log,
return 0;
} /* xlog_sync */
-
/*
* Deallocate a log structure
*/
STATIC void
-xlog_dealloc_log(xlog_t *log)
+xlog_dealloc_log(
+ struct xlog *log)
{
xlog_in_core_t *iclog, *next_iclog;
int i;
@@ -1616,10 +1816,11 @@ xlog_dealloc_log(xlog_t *log)
*/
/* ARGSUSED */
static inline void
-xlog_state_finish_copy(xlog_t *log,
- xlog_in_core_t *iclog,
- int record_cnt,
- int copy_bytes)
+xlog_state_finish_copy(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ int record_cnt,
+ int copy_bytes)
{
spin_lock(&log->l_icloglock);
@@ -2142,7 +2343,8 @@ xlog_write(
* State Change: DIRTY -> ACTIVE
*/
STATIC void
-xlog_state_clean_log(xlog_t *log)
+xlog_state_clean_log(
+ struct xlog *log)
{
xlog_in_core_t *iclog;
int changed = 0;
@@ -2222,7 +2424,7 @@ xlog_state_clean_log(xlog_t *log)
STATIC xfs_lsn_t
xlog_get_lowest_lsn(
- xlog_t *log)
+ struct xlog *log)
{
xlog_in_core_t *lsn_log;
xfs_lsn_t lowest_lsn, lsn;
@@ -2245,9 +2447,9 @@ xlog_get_lowest_lsn(
STATIC void
xlog_state_do_callback(
- xlog_t *log,
- int aborted,
- xlog_in_core_t *ciclog)
+ struct xlog *log,
+ int aborted,
+ struct xlog_in_core *ciclog)
{
xlog_in_core_t *iclog;
xlog_in_core_t *first_iclog; /* used to know when we've
@@ -2345,14 +2547,27 @@ xlog_state_do_callback(
/*
- * update the last_sync_lsn before we drop the
+ * Completion of an iclog IO does not imply that
+ * a transaction has completed, as transactions
+ * can be large enough to span many iclogs. We
+ * cannot change the tail of the log half way
+ * through a transaction as this may be the only
+ * transaction in the log and moving the tail to
+ * point to the middle of it will prevent
+ * recovery from finding the start of the
+ * transaction. Hence we should only update the
+ * last_sync_lsn if this iclog contains
+ * transaction completion callbacks on it.
+ *
+ * We have to do this before we drop the
* icloglock to ensure we are the only one that
* can update it.
*/
ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
- atomic64_set(&log->l_last_sync_lsn,
- be64_to_cpu(iclog->ic_header.h_lsn));
+ if (iclog->ic_callback)
+ atomic64_set(&log->l_last_sync_lsn,
+ be64_to_cpu(iclog->ic_header.h_lsn));
} else
ioerrors++;
@@ -2467,7 +2682,7 @@ xlog_state_done_syncing(
xlog_in_core_t *iclog,
int aborted)
{
- xlog_t *log = iclog->ic_log;
+ struct xlog *log = iclog->ic_log;
spin_lock(&log->l_icloglock);
@@ -2521,12 +2736,13 @@ xlog_state_done_syncing(
* is copied.
*/
STATIC int
-xlog_state_get_iclog_space(xlog_t *log,
- int len,
- xlog_in_core_t **iclogp,
- xlog_ticket_t *ticket,
- int *continued_write,
- int *logoffsetp)
+xlog_state_get_iclog_space(
+ struct xlog *log,
+ int len,
+ struct xlog_in_core **iclogp,
+ struct xlog_ticket *ticket,
+ int *continued_write,
+ int *logoffsetp)
{
int log_offset;
xlog_rec_header_t *head;
@@ -2631,8 +2847,9 @@ restart:
* move grant reservation head forward.
*/
STATIC void
-xlog_regrant_reserve_log_space(xlog_t *log,
- xlog_ticket_t *ticket)
+xlog_regrant_reserve_log_space(
+ struct xlog *log,
+ struct xlog_ticket *ticket)
{
trace_xfs_log_regrant_reserve_enter(log, ticket);
@@ -2677,8 +2894,9 @@ xlog_regrant_reserve_log_space(xlog_t *log,
* in the current reservation field.
*/
STATIC void
-xlog_ungrant_log_space(xlog_t *log,
- xlog_ticket_t *ticket)
+xlog_ungrant_log_space(
+ struct xlog *log,
+ struct xlog_ticket *ticket)
{
int bytes;
@@ -2717,8 +2935,8 @@ xlog_ungrant_log_space(xlog_t *log,
*/
STATIC int
xlog_state_release_iclog(
- xlog_t *log,
- xlog_in_core_t *iclog)
+ struct xlog *log,
+ struct xlog_in_core *iclog)
{
int sync = 0; /* do we sync? */
@@ -2768,9 +2986,10 @@ xlog_state_release_iclog(
* that every data block. We have run out of space in this log record.
*/
STATIC void
-xlog_state_switch_iclogs(xlog_t *log,
- xlog_in_core_t *iclog,
- int eventual_size)
+xlog_state_switch_iclogs(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ int eventual_size)
{
ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
if (!eventual_size)
@@ -3114,7 +3333,9 @@ xfs_log_force_lsn(
* disk.
*/
STATIC void
-xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
+xlog_state_want_sync(
+ struct xlog *log,
+ struct xlog_in_core *iclog)
{
assert_spin_locked(&log->l_icloglock);
@@ -3158,7 +3379,7 @@ xfs_log_ticket_get(
/*
* Allocate and initialise a new log ticket.
*/
-xlog_ticket_t *
+struct xlog_ticket *
xlog_ticket_alloc(
struct xlog *log,
int unit_bytes,
@@ -3346,9 +3567,10 @@ xlog_verify_grant_tail(
/* check if it will fit */
STATIC void
-xlog_verify_tail_lsn(xlog_t *log,
- xlog_in_core_t *iclog,
- xfs_lsn_t tail_lsn)
+xlog_verify_tail_lsn(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ xfs_lsn_t tail_lsn)
{
int blocks;
@@ -3385,10 +3607,11 @@ xlog_verify_tail_lsn(xlog_t *log,
* the cycle numbers agree with the current cycle number.
*/
STATIC void
-xlog_verify_iclog(xlog_t *log,
- xlog_in_core_t *iclog,
- int count,
- boolean_t syncing)
+xlog_verify_iclog(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ int count,
+ boolean_t syncing)
{
xlog_op_header_t *ophead;
xlog_in_core_t *icptr;
@@ -3482,7 +3705,7 @@ xlog_verify_iclog(xlog_t *log,
*/
STATIC int
xlog_state_ioerror(
- xlog_t *log)
+ struct xlog *log)
{
xlog_in_core_t *iclog, *ic;
@@ -3527,7 +3750,7 @@ xfs_log_force_umount(
struct xfs_mount *mp,
int logerror)
{
- xlog_t *log;
+ struct xlog *log;
int retval;
log = mp->m_log;
@@ -3634,7 +3857,8 @@ xfs_log_force_umount(
}
STATIC int
-xlog_iclogs_empty(xlog_t *log)
+xlog_iclogs_empty(
+ struct xlog *log)
{
xlog_in_core_t *iclog;
@@ -3649,3 +3873,4 @@ xlog_iclogs_empty(xlog_t *log)
} while (iclog != log->l_iclog);
return 1;
}
+
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 748d312850e..5caee96059d 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -181,5 +181,9 @@ int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_lsn_t *commit_lsn, int flags);
bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
+void xfs_log_work_queue(struct xfs_mount *mp);
+void xfs_log_worker(struct work_struct *work);
+void xfs_log_quiesce(struct xfs_mount *mp);
+
#endif
#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 72eba2201b1..16d8d12ea3b 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -139,7 +139,6 @@ static inline uint xlog_get_client_id(__be32 i)
/*
* Flags for log structure
*/
-#define XLOG_CHKSUM_MISMATCH 0x1 /* used only during recovery */
#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */
#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
@@ -291,7 +290,7 @@ typedef struct xlog_rec_header {
__be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */
__be64 h_lsn; /* lsn of this LR : 8 */
__be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */
- __be32 h_chksum; /* may not be used; non-zero if used : 4 */
+ __le32 h_crc; /* crc of log record : 4 */
__be32 h_prev_block; /* block number to previous LR : 4 */
__be32 h_num_logops; /* number of log operations in this LR : 4 */
__be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
@@ -487,7 +486,7 @@ struct xlog_grant_head {
* overflow 31 bits worth of byte offset, so using a byte number will mean
* that round off problems won't occur when releasing partial reservations.
*/
-typedef struct xlog {
+struct xlog {
/* The following fields don't need locking */
struct xfs_mount *l_mp; /* mount point */
struct xfs_ail *l_ailp; /* AIL log is working with */
@@ -495,6 +494,7 @@ typedef struct xlog {
struct xfs_buf *l_xbuf; /* extra buffer for log
* wrapping */
struct xfs_buftarg *l_targ; /* buftarg of log */
+ struct delayed_work l_work; /* background flush work */
uint l_flags;
uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
struct list_head *l_buf_cancel_table;
@@ -540,7 +540,7 @@ typedef struct xlog {
char *l_iclog_bak[XLOG_MAX_ICLOGS];
#endif
-} xlog_t;
+};
#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE))
@@ -548,9 +548,15 @@ typedef struct xlog {
#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
/* common routines */
-extern int xlog_recover(xlog_t *log);
-extern int xlog_recover_finish(xlog_t *log);
-extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
+extern int
+xlog_recover(
+ struct xlog *log);
+extern int
+xlog_recover_finish(
+ struct xlog *log);
+
+extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
+ char *dp, int size);
extern kmem_zone_t *xfs_log_ticket_zone;
struct xlog_ticket *
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index a7be98abd6a..96fcbb85ff8 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -41,12 +41,22 @@
#include "xfs_trans_priv.h"
#include "xfs_quota.h"
#include "xfs_utils.h"
+#include "xfs_cksum.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
-STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
-STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
+STATIC int
+xlog_find_zeroed(
+ struct xlog *,
+ xfs_daddr_t *);
+STATIC int
+xlog_clear_stale_blocks(
+ struct xlog *,
+ xfs_lsn_t);
#if defined(DEBUG)
-STATIC void xlog_recover_check_summary(xlog_t *);
+STATIC void
+xlog_recover_check_summary(
+ struct xlog *);
#else
#define xlog_recover_check_summary(log)
#endif
@@ -74,7 +84,7 @@ struct xfs_buf_cancel {
static inline int
xlog_buf_bbcount_valid(
- xlog_t *log,
+ struct xlog *log,
int bbcount)
{
return bbcount > 0 && bbcount <= log->l_logBBsize;
@@ -87,7 +97,7 @@ xlog_buf_bbcount_valid(
*/
STATIC xfs_buf_t *
xlog_get_bp(
- xlog_t *log,
+ struct xlog *log,
int nbblks)
{
struct xfs_buf *bp;
@@ -138,10 +148,10 @@ xlog_put_bp(
*/
STATIC xfs_caddr_t
xlog_align(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t blk_no,
int nbblks,
- xfs_buf_t *bp)
+ struct xfs_buf *bp)
{
xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
@@ -155,10 +165,10 @@ xlog_align(
*/
STATIC int
xlog_bread_noalign(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t blk_no,
int nbblks,
- xfs_buf_t *bp)
+ struct xfs_buf *bp)
{
int error;
@@ -189,10 +199,10 @@ xlog_bread_noalign(
STATIC int
xlog_bread(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t blk_no,
int nbblks,
- xfs_buf_t *bp,
+ struct xfs_buf *bp,
xfs_caddr_t *offset)
{
int error;
@@ -211,10 +221,10 @@ xlog_bread(
*/
STATIC int
xlog_bread_offset(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t blk_no, /* block to read from */
int nbblks, /* blocks to read */
- xfs_buf_t *bp,
+ struct xfs_buf *bp,
xfs_caddr_t offset)
{
xfs_caddr_t orig_offset = bp->b_addr;
@@ -241,10 +251,10 @@ xlog_bread_offset(
*/
STATIC int
xlog_bwrite(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t blk_no,
int nbblks,
- xfs_buf_t *bp)
+ struct xfs_buf *bp)
{
int error;
@@ -378,8 +388,8 @@ xlog_recover_iodone(
*/
STATIC int
xlog_find_cycle_start(
- xlog_t *log,
- xfs_buf_t *bp,
+ struct xlog *log,
+ struct xfs_buf *bp,
xfs_daddr_t first_blk,
xfs_daddr_t *last_blk,
uint cycle)
@@ -421,7 +431,7 @@ xlog_find_cycle_start(
*/
STATIC int
xlog_find_verify_cycle(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t start_blk,
int nbblks,
uint stop_on_cycle_no,
@@ -490,7 +500,7 @@ out:
*/
STATIC int
xlog_find_verify_log_record(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t start_blk,
xfs_daddr_t *last_blk,
int extra_bblks)
@@ -600,7 +610,7 @@ out:
*/
STATIC int
xlog_find_head(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t *return_head_blk)
{
xfs_buf_t *bp;
@@ -871,7 +881,7 @@ validate_head:
*/
STATIC int
xlog_find_tail(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t *head_blk,
xfs_daddr_t *tail_blk)
{
@@ -1080,7 +1090,7 @@ done:
*/
STATIC int
xlog_find_zeroed(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t *blk_no)
{
xfs_buf_t *bp;
@@ -1183,7 +1193,7 @@ bp_err:
*/
STATIC void
xlog_add_record(
- xlog_t *log,
+ struct xlog *log,
xfs_caddr_t buf,
int cycle,
int block,
@@ -1205,7 +1215,7 @@ xlog_add_record(
STATIC int
xlog_write_log_records(
- xlog_t *log,
+ struct xlog *log,
int cycle,
int start_block,
int blocks,
@@ -1305,7 +1315,7 @@ xlog_write_log_records(
*/
STATIC int
xlog_clear_stale_blocks(
- xlog_t *log,
+ struct xlog *log,
xfs_lsn_t tail_lsn)
{
int tail_cycle, head_cycle;
@@ -2050,11 +2060,11 @@ xfs_qm_dqcheck(
*/
STATIC void
xlog_recover_do_dquot_buffer(
- xfs_mount_t *mp,
- xlog_t *log,
- xlog_recover_item_t *item,
- xfs_buf_t *bp,
- xfs_buf_log_format_t *buf_f)
+ struct xfs_mount *mp,
+ struct xlog *log,
+ struct xlog_recover_item *item,
+ struct xfs_buf *bp,
+ struct xfs_buf_log_format *buf_f)
{
uint type;
@@ -2108,9 +2118,9 @@ xlog_recover_do_dquot_buffer(
*/
STATIC int
xlog_recover_buffer_pass2(
- xlog_t *log,
- struct list_head *buffer_list,
- xlog_recover_item_t *item)
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item)
{
xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
xfs_mount_t *mp = log->l_mp;
@@ -2135,7 +2145,7 @@ xlog_recover_buffer_pass2(
buf_flags |= XBF_UNMAPPED;
bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
- buf_flags);
+ buf_flags, NULL);
if (!bp)
return XFS_ERROR(ENOMEM);
error = bp->b_error;
@@ -2189,9 +2199,9 @@ xlog_recover_buffer_pass2(
STATIC int
xlog_recover_inode_pass2(
- xlog_t *log,
- struct list_head *buffer_list,
- xlog_recover_item_t *item)
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item)
{
xfs_inode_log_format_t *in_f;
xfs_mount_t *mp = log->l_mp;
@@ -2228,7 +2238,8 @@ xlog_recover_inode_pass2(
}
trace_xfs_log_recover_inode_recover(log, in_f);
- bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0);
+ bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
+ NULL);
if (!bp) {
error = ENOMEM;
goto error;
@@ -2452,14 +2463,14 @@ error:
}
/*
- * Recover QUOTAOFF records. We simply make a note of it in the xlog_t
+ * Recover QUOTAOFF records. We simply make a note of it in the xlog
* structure, so that we know not to do any dquot item or dquot buffer recovery,
* of that type.
*/
STATIC int
xlog_recover_quotaoff_pass1(
- xlog_t *log,
- xlog_recover_item_t *item)
+ struct xlog *log,
+ struct xlog_recover_item *item)
{
xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr;
ASSERT(qoff_f);
@@ -2483,9 +2494,9 @@ xlog_recover_quotaoff_pass1(
*/
STATIC int
xlog_recover_dquot_pass2(
- xlog_t *log,
- struct list_head *buffer_list,
- xlog_recover_item_t *item)
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item)
{
xfs_mount_t *mp = log->l_mp;
xfs_buf_t *bp;
@@ -2539,7 +2550,8 @@ xlog_recover_dquot_pass2(
ASSERT(dq_f->qlf_len == 1);
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
- XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp);
+ XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
+ NULL);
if (error)
return error;
@@ -2578,9 +2590,9 @@ xlog_recover_dquot_pass2(
*/
STATIC int
xlog_recover_efi_pass2(
- xlog_t *log,
- xlog_recover_item_t *item,
- xfs_lsn_t lsn)
+ struct xlog *log,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
{
int error;
xfs_mount_t *mp = log->l_mp;
@@ -2616,8 +2628,8 @@ xlog_recover_efi_pass2(
*/
STATIC int
xlog_recover_efd_pass2(
- xlog_t *log,
- xlog_recover_item_t *item)
+ struct xlog *log,
+ struct xlog_recover_item *item)
{
xfs_efd_log_format_t *efd_formatp;
xfs_efi_log_item_t *efip = NULL;
@@ -2812,9 +2824,9 @@ xlog_recover_unmount_trans(
*/
STATIC int
xlog_recover_process_data(
- xlog_t *log,
+ struct xlog *log,
struct hlist_head rhash[],
- xlog_rec_header_t *rhead,
+ struct xlog_rec_header *rhead,
xfs_caddr_t dp,
int pass)
{
@@ -2986,7 +2998,7 @@ abort_error:
*/
STATIC int
xlog_recover_process_efis(
- xlog_t *log)
+ struct xlog *log)
{
xfs_log_item_t *lip;
xfs_efi_log_item_t *efip;
@@ -3098,7 +3110,7 @@ xlog_recover_process_one_iunlink(
/*
* Get the on disk inode to find the next inode in the bucket.
*/
- error = xfs_itobp(mp, NULL, ip, &dip, &ibp, 0);
+ error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0);
if (error)
goto fail_iput;
@@ -3147,7 +3159,7 @@ xlog_recover_process_one_iunlink(
*/
STATIC void
xlog_recover_process_iunlinks(
- xlog_t *log)
+ struct xlog *log)
{
xfs_mount_t *mp;
xfs_agnumber_t agno;
@@ -3205,80 +3217,58 @@ xlog_recover_process_iunlinks(
mp->m_dmevmask = mp_dmevmask;
}
-
-#ifdef DEBUG
-STATIC void
-xlog_pack_data_checksum(
- xlog_t *log,
- xlog_in_core_t *iclog,
- int size)
-{
- int i;
- __be32 *up;
- uint chksum = 0;
-
- up = (__be32 *)iclog->ic_datap;
- /* divide length by 4 to get # words */
- for (i = 0; i < (size >> 2); i++) {
- chksum ^= be32_to_cpu(*up);
- up++;
- }
- iclog->ic_header.h_chksum = cpu_to_be32(chksum);
-}
-#else
-#define xlog_pack_data_checksum(log, iclog, size)
-#endif
-
/*
- * Stamp cycle number in every block
+ * Unpack the log buffer data and crc check it. If the check fails, issue a
+ * warning if and only if the CRC in the header is non-zero. This makes the
+ * check an advisory warning, and the zero CRC check will prevent failure
+ * warnings from being emitted when upgrading the kernel from one that does not
+ * add CRCs by default.
+ *
+ * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
+ * corruption failure
*/
-void
-xlog_pack_data(
- xlog_t *log,
- xlog_in_core_t *iclog,
- int roundoff)
+STATIC int
+xlog_unpack_data_crc(
+ struct xlog_rec_header *rhead,
+ xfs_caddr_t dp,
+ struct xlog *log)
{
- int i, j, k;
- int size = iclog->ic_offset + roundoff;
- __be32 cycle_lsn;
- xfs_caddr_t dp;
-
- xlog_pack_data_checksum(log, iclog, size);
-
- cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
-
- dp = iclog->ic_datap;
- for (i = 0; i < BTOBB(size) &&
- i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
- iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
- *(__be32 *)dp = cycle_lsn;
- dp += BBSIZE;
- }
-
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
- xlog_in_core_2_t *xhdr = iclog->ic_data;
-
- for ( ; i < BTOBB(size); i++) {
- j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
- k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
- xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
- *(__be32 *)dp = cycle_lsn;
- dp += BBSIZE;
+ __le32 crc;
+
+ crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
+ if (crc != rhead->h_crc) {
+ if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
+ xfs_alert(log->l_mp,
+ "log record CRC mismatch: found 0x%x, expected 0x%x.\n",
+ le32_to_cpu(rhead->h_crc),
+ le32_to_cpu(crc));
+ xfs_hex_dump(dp, 32);
}
- for (i = 1; i < log->l_iclog_heads; i++) {
- xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
- }
+ /*
+ * If we've detected a log record corruption, then we can't
+ * recover past this point. Abort recovery if we are enforcing
+ * CRC protection by punting an error back up the stack.
+ */
+ if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
+ return EFSCORRUPTED;
}
+
+ return 0;
}
-STATIC void
+STATIC int
xlog_unpack_data(
- xlog_rec_header_t *rhead,
+ struct xlog_rec_header *rhead,
xfs_caddr_t dp,
- xlog_t *log)
+ struct xlog *log)
{
int i, j, k;
+ int error;
+
+ error = xlog_unpack_data_crc(rhead, dp, log);
+ if (error)
+ return error;
for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
@@ -3295,12 +3285,14 @@ xlog_unpack_data(
dp += BBSIZE;
}
}
+
+ return 0;
}
STATIC int
xlog_valid_rec_header(
- xlog_t *log,
- xlog_rec_header_t *rhead,
+ struct xlog *log,
+ struct xlog_rec_header *rhead,
xfs_daddr_t blkno)
{
int hlen;
@@ -3343,7 +3335,7 @@ xlog_valid_rec_header(
*/
STATIC int
xlog_do_recovery_pass(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk,
int pass)
@@ -3426,9 +3418,13 @@ xlog_do_recovery_pass(
if (error)
goto bread_err2;
- xlog_unpack_data(rhead, offset, log);
- if ((error = xlog_recover_process_data(log,
- rhash, rhead, offset, pass)))
+ error = xlog_unpack_data(rhead, offset, log);
+ if (error)
+ goto bread_err2;
+
+ error = xlog_recover_process_data(log,
+ rhash, rhead, offset, pass);
+ if (error)
goto bread_err2;
blk_no += bblks + hblks;
}
@@ -3533,14 +3529,19 @@ xlog_do_recovery_pass(
* - order is important.
*/
error = xlog_bread_offset(log, 0,
- bblks - split_bblks, hbp,
+ bblks - split_bblks, dbp,
offset + BBTOB(split_bblks));
if (error)
goto bread_err2;
}
- xlog_unpack_data(rhead, offset, log);
- if ((error = xlog_recover_process_data(log, rhash,
- rhead, offset, pass)))
+
+ error = xlog_unpack_data(rhead, offset, log);
+ if (error)
+ goto bread_err2;
+
+ error = xlog_recover_process_data(log, rhash,
+ rhead, offset, pass);
+ if (error)
goto bread_err2;
blk_no += bblks;
}
@@ -3565,9 +3566,13 @@ xlog_do_recovery_pass(
if (error)
goto bread_err2;
- xlog_unpack_data(rhead, offset, log);
- if ((error = xlog_recover_process_data(log, rhash,
- rhead, offset, pass)))
+ error = xlog_unpack_data(rhead, offset, log);
+ if (error)
+ goto bread_err2;
+
+ error = xlog_recover_process_data(log, rhash,
+ rhead, offset, pass);
+ if (error)
goto bread_err2;
blk_no += bblks + hblks;
}
@@ -3595,7 +3600,7 @@ xlog_do_recovery_pass(
*/
STATIC int
xlog_do_log_recovery(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk)
{
@@ -3646,7 +3651,7 @@ xlog_do_log_recovery(
*/
STATIC int
xlog_do_recover(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk)
{
@@ -3681,13 +3686,14 @@ xlog_do_recover(
/*
* Now that we've finished replaying all buffer and inode
- * updates, re-read in the superblock.
+ * updates, re-read in the superblock and reverify it.
*/
bp = xfs_getsb(log->l_mp, 0);
XFS_BUF_UNDONE(bp);
ASSERT(!(XFS_BUF_ISWRITE(bp)));
XFS_BUF_READ(bp);
XFS_BUF_UNASYNC(bp);
+ bp->b_ops = &xfs_sb_buf_ops;
xfsbdstrat(log->l_mp, bp);
error = xfs_buf_iowait(bp);
if (error) {
@@ -3699,7 +3705,7 @@ xlog_do_recover(
/* Convert superblock from on-disk format */
sbp = &log->l_mp->m_sb;
- xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp));
+ xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
ASSERT(xfs_sb_good_version(sbp));
xfs_buf_relse(bp);
@@ -3721,7 +3727,7 @@ xlog_do_recover(
*/
int
xlog_recover(
- xlog_t *log)
+ struct xlog *log)
{
xfs_daddr_t head_blk, tail_blk;
int error;
@@ -3767,7 +3773,7 @@ xlog_recover(
*/
int
xlog_recover_finish(
- xlog_t *log)
+ struct xlog *log)
{
/*
* Now we're ready to do the transactions needed for the
@@ -3814,7 +3820,7 @@ xlog_recover_finish(
*/
void
xlog_recover_check_summary(
- xlog_t *log)
+ struct xlog *log)
{
xfs_mount_t *mp;
xfs_agf_t *agfp;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 536021fb3d4..7d6df7c00c3 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -42,6 +42,7 @@
#include "xfs_fsops.h"
#include "xfs_utils.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
#ifdef HAVE_PERCPU_SB
@@ -303,9 +304,8 @@ STATIC int
xfs_mount_validate_sb(
xfs_mount_t *mp,
xfs_sb_t *sbp,
- int flags)
+ bool check_inprogress)
{
- int loud = !(flags & XFS_MFSI_QUIET);
/*
* If the log device and data device have the
@@ -315,21 +315,18 @@ xfs_mount_validate_sb(
* a volume filesystem in a non-volume manner.
*/
if (sbp->sb_magicnum != XFS_SB_MAGIC) {
- if (loud)
- xfs_warn(mp, "bad magic number");
+ xfs_warn(mp, "bad magic number");
return XFS_ERROR(EWRONGFS);
}
if (!xfs_sb_good_version(sbp)) {
- if (loud)
- xfs_warn(mp, "bad version");
+ xfs_warn(mp, "bad version");
return XFS_ERROR(EWRONGFS);
}
if (unlikely(
sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
- if (loud)
- xfs_warn(mp,
+ xfs_warn(mp,
"filesystem is marked as having an external log; "
"specify logdev on the mount command line.");
return XFS_ERROR(EINVAL);
@@ -337,8 +334,7 @@ xfs_mount_validate_sb(
if (unlikely(
sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
- if (loud)
- xfs_warn(mp,
+ xfs_warn(mp,
"filesystem is marked as having an internal log; "
"do not specify logdev on the mount command line.");
return XFS_ERROR(EINVAL);
@@ -372,8 +368,7 @@ xfs_mount_validate_sb(
sbp->sb_dblocks == 0 ||
sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) ||
sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) {
- if (loud)
- XFS_CORRUPTION_ERROR("SB sanity check failed",
+ XFS_CORRUPTION_ERROR("SB sanity check failed",
XFS_ERRLEVEL_LOW, mp, sbp);
return XFS_ERROR(EFSCORRUPTED);
}
@@ -382,12 +377,10 @@ xfs_mount_validate_sb(
* Until this is fixed only page-sized or smaller data blocks work.
*/
if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
- if (loud) {
- xfs_warn(mp,
+ xfs_warn(mp,
"File system with blocksize %d bytes. "
"Only pagesize (%ld) or less will currently work.",
sbp->sb_blocksize, PAGE_SIZE);
- }
return XFS_ERROR(ENOSYS);
}
@@ -401,23 +394,20 @@ xfs_mount_validate_sb(
case 2048:
break;
default:
- if (loud)
- xfs_warn(mp, "inode size of %d bytes not supported",
+ xfs_warn(mp, "inode size of %d bytes not supported",
sbp->sb_inodesize);
return XFS_ERROR(ENOSYS);
}
if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
- if (loud)
- xfs_warn(mp,
+ xfs_warn(mp,
"file system too large to be mounted on this system.");
return XFS_ERROR(EFBIG);
}
- if (unlikely(sbp->sb_inprogress)) {
- if (loud)
- xfs_warn(mp, "file system busy");
+ if (check_inprogress && sbp->sb_inprogress) {
+ xfs_warn(mp, "Offline file system operation in progress!");
return XFS_ERROR(EFSCORRUPTED);
}
@@ -425,9 +415,7 @@ xfs_mount_validate_sb(
* Version 1 directory format has never worked on Linux.
*/
if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
- if (loud)
- xfs_warn(mp,
- "file system using version 1 directory format");
+ xfs_warn(mp, "file system using version 1 directory format");
return XFS_ERROR(ENOSYS);
}
@@ -440,7 +428,7 @@ xfs_initialize_perag(
xfs_agnumber_t agcount,
xfs_agnumber_t *maxagi)
{
- xfs_agnumber_t index, max_metadata;
+ xfs_agnumber_t index;
xfs_agnumber_t first_initialised = 0;
xfs_perag_t *pag;
xfs_agino_t agino;
@@ -500,43 +488,10 @@ xfs_initialize_perag(
else
mp->m_flags &= ~XFS_MOUNT_32BITINODES;
- if (mp->m_flags & XFS_MOUNT_32BITINODES) {
- /*
- * Calculate how much should be reserved for inodes to meet
- * the max inode percentage.
- */
- if (mp->m_maxicount) {
- __uint64_t icount;
-
- icount = sbp->sb_dblocks * sbp->sb_imax_pct;
- do_div(icount, 100);
- icount += sbp->sb_agblocks - 1;
- do_div(icount, sbp->sb_agblocks);
- max_metadata = icount;
- } else {
- max_metadata = agcount;
- }
-
- for (index = 0; index < agcount; index++) {
- ino = XFS_AGINO_TO_INO(mp, index, agino);
- if (ino > XFS_MAXINUMBER_32) {
- index++;
- break;
- }
-
- pag = xfs_perag_get(mp, index);
- pag->pagi_inodeok = 1;
- if (index < max_metadata)
- pag->pagf_metadata = 1;
- xfs_perag_put(pag);
- }
- } else {
- for (index = 0; index < agcount; index++) {
- pag = xfs_perag_get(mp, index);
- pag->pagi_inodeok = 1;
- xfs_perag_put(pag);
- }
- }
+ if (mp->m_flags & XFS_MOUNT_32BITINODES)
+ index = xfs_set_inode32(mp);
+ else
+ index = xfs_set_inode64(mp);
if (maxagi)
*maxagi = index;
@@ -553,11 +508,9 @@ out_unwind:
void
xfs_sb_from_disk(
- struct xfs_mount *mp,
+ struct xfs_sb *to,
xfs_dsb_t *from)
{
- struct xfs_sb *to = &mp->m_sb;
-
to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
@@ -659,6 +612,72 @@ xfs_sb_to_disk(
}
}
+static void
+xfs_sb_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_sb sb;
+ int error;
+
+ xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
+
+ /*
+ * Only check the in progress field for the primary superblock as
+ * mkfs.xfs doesn't clear it from secondary superblocks.
+ */
+ error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR);
+ if (error)
+ xfs_buf_ioerror(bp, error);
+}
+
+static void
+xfs_sb_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_sb_verify(bp);
+}
+
+/*
+ * We may be probed for a filesystem match, so we may not want to emit
+ * messages when the superblock buffer is not actually an XFS superblock.
+ * If we find an XFS superblock, then run a normal, noisy mount because we are
+ * really going to mount it and want to know about errors.
+ */
+static void
+xfs_sb_quiet_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_sb sb;
+
+ xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
+
+ if (sb.sb_magicnum == XFS_SB_MAGIC) {
+ /* XFS filesystem, verify noisily! */
+ xfs_sb_read_verify(bp);
+ return;
+ }
+ /* quietly fail */
+ xfs_buf_ioerror(bp, EWRONGFS);
+}
+
+static void
+xfs_sb_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_sb_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_sb_buf_ops = {
+ .verify_read = xfs_sb_read_verify,
+ .verify_write = xfs_sb_write_verify,
+};
+
+static const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
+ .verify_read = xfs_sb_quiet_read_verify,
+ .verify_write = xfs_sb_write_verify,
+};
+
/*
* xfs_readsb
*
@@ -684,26 +703,27 @@ xfs_readsb(xfs_mount_t *mp, int flags)
reread:
bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
- BTOBB(sector_size), 0);
+ BTOBB(sector_size), 0,
+ loud ? &xfs_sb_buf_ops
+ : &xfs_sb_quiet_buf_ops);
if (!bp) {
if (loud)
xfs_warn(mp, "SB buffer read failed");
return EIO;
}
-
- /*
- * Initialize the mount structure from the superblock.
- * But first do some basic consistency checking.
- */
- xfs_sb_from_disk(mp, XFS_BUF_TO_SBP(bp));
- error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
- if (error) {
+ if (bp->b_error) {
+ error = bp->b_error;
if (loud)
xfs_warn(mp, "SB validate failed");
goto release_buf;
}
/*
+ * Initialize the mount structure from the superblock.
+ */
+ xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
+
+ /*
* We must be able to do sector-sized and sector-aligned IO.
*/
if (sector_size > mp->m_sb.sb_sectsize) {
@@ -1034,7 +1054,7 @@ xfs_check_sizes(xfs_mount_t *mp)
}
bp = xfs_buf_read_uncached(mp->m_ddev_targp,
d - XFS_FSS_TO_BB(mp, 1),
- XFS_FSS_TO_BB(mp, 1), 0);
+ XFS_FSS_TO_BB(mp, 1), 0, NULL);
if (!bp) {
xfs_warn(mp, "last sector read failed");
return EIO;
@@ -1049,7 +1069,7 @@ xfs_check_sizes(xfs_mount_t *mp)
}
bp = xfs_buf_read_uncached(mp->m_logdev_targp,
d - XFS_FSB_TO_BB(mp, 1),
- XFS_FSB_TO_BB(mp, 1), 0);
+ XFS_FSB_TO_BB(mp, 1), 0, NULL);
if (!bp) {
xfs_warn(mp, "log device read failed");
return EIO;
@@ -1200,8 +1220,6 @@ xfs_mountfs(
xfs_set_maxicount(mp);
- mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog);
-
error = xfs_uuid_mount(mp);
if (error)
goto out;
@@ -1462,6 +1480,8 @@ xfs_unmountfs(
__uint64_t resblks;
int error;
+ cancel_delayed_work_sync(&mp->m_eofblocks_work);
+
xfs_qm_unmount_quotas(mp);
xfs_rtunmount_inodes(mp);
IRELE(mp->m_rootip);
@@ -1485,21 +1505,16 @@ xfs_unmountfs(
/*
* And reclaim all inodes. At this point there should be no dirty
- * inode, and none should be pinned or locked, but use synchronous
- * reclaim just to be sure.
+ * inodes and none should be pinned or locked, but use synchronous
+ * reclaim just to be sure. We can stop background inode reclaim
+ * here as well if it is still running.
*/
+ cancel_delayed_work_sync(&mp->m_reclaim_work);
xfs_reclaim_inodes(mp, SYNC_WAIT);
xfs_qm_unmount(mp);
/*
- * Flush out the log synchronously so that we know for sure
- * that nothing is pinned. This is important because bflush()
- * will skip pinned buffers.
- */
- xfs_log_force(mp, XFS_LOG_SYNC);
-
- /*
* Unreserve any blocks we have so that when we unmount we don't account
* the reserved free space as used. This is really only necessary for
* lazy superblock counting because it trusts the incore superblock
@@ -1524,14 +1539,6 @@ xfs_unmountfs(
xfs_warn(mp, "Unable to update superblock counters. "
"Freespace may not be correct on next mount.");
- /*
- * At this point we might have modified the superblock again and thus
- * added an item to the AIL, thus flush it again.
- */
- xfs_ail_push_all_sync(mp->m_ail);
- xfs_wait_buftarg(mp->m_ddev_targp);
-
- xfs_log_unmount_write(mp);
xfs_log_unmount(mp);
xfs_uuid_unmount(mp);
@@ -1544,7 +1551,7 @@ xfs_unmountfs(
int
xfs_fs_writable(xfs_mount_t *mp)
{
- return !(xfs_test_for_freeze(mp) || XFS_FORCED_SHUTDOWN(mp) ||
+ return !(mp->m_super->s_writers.frozen || XFS_FORCED_SHUTDOWN(mp) ||
(mp->m_flags & XFS_MOUNT_RDONLY));
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 90c1fc9eaea..bab8314507e 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -51,15 +51,8 @@ typedef struct xfs_trans_reservations {
#else /* __KERNEL__ */
-#include "xfs_sync.h"
-
struct xlog;
-struct xfs_mount_args;
struct xfs_inode;
-struct xfs_bmbt_irec;
-struct xfs_bmap_free;
-struct xfs_extdelta;
-struct xfs_swapext;
struct xfs_mru_cache;
struct xfs_nameops;
struct xfs_ail;
@@ -176,7 +169,6 @@ typedef struct xfs_mount {
uint m_qflags; /* quota status flags */
xfs_trans_reservations_t m_reservations;/* precomputed res values */
__uint64_t m_maxicount; /* maximum inode count */
- __uint64_t m_maxioffset; /* maximum inode offset */
__uint64_t m_resblks; /* total reserved blocks */
__uint64_t m_resblks_avail;/* available reserved blocks */
__uint64_t m_resblks_save; /* reserved blks @ remount,ro */
@@ -203,9 +195,9 @@ typedef struct xfs_mount {
struct mutex m_icsb_mutex; /* balancer sync lock */
#endif
struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
- struct delayed_work m_sync_work; /* background sync work */
struct delayed_work m_reclaim_work; /* background inode reclaim */
- struct work_struct m_flush_work; /* background inode flush */
+ struct delayed_work m_eofblocks_work; /* background eof blocks
+ trimming */
__int64_t m_update_flags; /* sb flags we need to update
on the next remount,rw */
struct shrinker m_inode_shrink; /* inode reclaim shrinker */
@@ -215,6 +207,9 @@ typedef struct xfs_mount {
struct workqueue_struct *m_data_workqueue;
struct workqueue_struct *m_unwritten_workqueue;
struct workqueue_struct *m_cil_workqueue;
+ struct workqueue_struct *m_reclaim_workqueue;
+ struct workqueue_struct *m_log_workqueue;
+ struct workqueue_struct *m_eofblocks_workqueue;
} xfs_mount_t;
/*
@@ -297,8 +292,6 @@ xfs_preferred_iosize(xfs_mount_t *mp)
PAGE_CACHE_SIZE));
}
-#define XFS_MAXIOFFSET(mp) ((mp)->m_maxioffset)
-
#define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \
((mp)->m_flags & XFS_MOUNT_WAS_CLEAN)
#define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN)
@@ -314,9 +307,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */
#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */
-#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen)
-#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l))
-
/*
* Flags for xfs_mountfs
*/
@@ -398,7 +388,9 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *);
extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
xfs_agnumber_t *);
-extern void xfs_sb_from_disk(struct xfs_mount *, struct xfs_dsb *);
+extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
+extern const struct xfs_buf_ops xfs_sb_buf_ops;
+
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 249db198776..60eff476315 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -40,6 +40,7 @@
#include "xfs_utils.h"
#include "xfs_qm.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
/*
* The global quota manager. There is only one of these for the entire
@@ -891,7 +892,8 @@ xfs_qm_dqiter_bufs(
while (blkcnt--) {
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, bno),
- mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp,
+ &xfs_dquot_buf_ops);
if (error)
break;
@@ -940,7 +942,7 @@ xfs_qm_dqiterate(
map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP);
lblkno = 0;
- maxlblkcnt = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
+ maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
do {
nmaps = XFS_DQITER_MAP_SIZE;
/*
@@ -978,7 +980,8 @@ xfs_qm_dqiterate(
while (rablkcnt--) {
xfs_buf_readahead(mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, rablkno),
- mp->m_quotainfo->qi_dqchunklen);
+ mp->m_quotainfo->qi_dqchunklen,
+ NULL);
rablkno++;
}
}
@@ -1453,7 +1456,7 @@ xfs_qm_dqreclaim_one(
int error;
if (!xfs_dqlock_nowait(dqp))
- goto out_busy;
+ goto out_move_tail;
/*
* This dquot has acquired a reference in the meantime remove it from
@@ -1476,7 +1479,7 @@ xfs_qm_dqreclaim_one(
* getting flushed to disk, we don't want to reclaim it.
*/
if (!xfs_dqflock_nowait(dqp))
- goto out_busy;
+ goto out_unlock_move_tail;
if (XFS_DQ_IS_DIRTY(dqp)) {
struct xfs_buf *bp = NULL;
@@ -1487,7 +1490,7 @@ xfs_qm_dqreclaim_one(
if (error) {
xfs_warn(mp, "%s: dquot %p flush failed",
__func__, dqp);
- goto out_busy;
+ goto out_unlock_move_tail;
}
xfs_buf_delwri_queue(bp, buffer_list);
@@ -1496,7 +1499,7 @@ xfs_qm_dqreclaim_one(
* Give the dquot another try on the freelist, as the
* flushing will take some time.
*/
- goto out_busy;
+ goto out_unlock_move_tail;
}
xfs_dqfunlock(dqp);
@@ -1515,14 +1518,13 @@ xfs_qm_dqreclaim_one(
XFS_STATS_INC(xs_qm_dqreclaims);
return;
-out_busy:
- xfs_dqunlock(dqp);
-
/*
* Move the dquot to the tail of the list so that we don't spin on it.
*/
+out_unlock_move_tail:
+ xfs_dqunlock(dqp);
+out_move_tail:
list_move_tail(&dqp->q_lru, &qi->qi_lru_list);
-
trace_xfs_dqreclaim_busy(dqp);
XFS_STATS_INC(xs_qm_dqreclaim_misses);
}
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 858a3b18611..8a59f854655 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -40,6 +40,7 @@
#include "xfs_utils.h"
#include "xfs_qm.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
@@ -783,11 +784,11 @@ xfs_qm_scall_getquota(
(XFS_IS_OQUOTA_ENFORCED(mp) &&
(dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
dst->d_id != 0) {
- if (((int) dst->d_bcount > (int) dst->d_blk_softlimit) &&
+ if ((dst->d_bcount > dst->d_blk_softlimit) &&
(dst->d_blk_softlimit > 0)) {
ASSERT(dst->d_btimer != 0);
}
- if (((int) dst->d_icount > (int) dst->d_ino_softlimit) &&
+ if ((dst->d_icount > dst->d_ino_softlimit) &&
(dst->d_ino_softlimit > 0)) {
ASSERT(dst->d_itimer != 0);
}
@@ -845,7 +846,8 @@ STATIC int
xfs_dqrele_inode(
struct xfs_inode *ip,
struct xfs_perag *pag,
- int flags)
+ int flags,
+ void *args)
{
/* skip quota inodes */
if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
@@ -881,5 +883,5 @@ xfs_qm_dqrele_all_inodes(
uint flags)
{
ASSERT(mp->m_quotainfo);
- xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags);
+ xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL);
}
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index fed504fc299..71926d63052 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -97,8 +97,7 @@ xfs_fs_set_xstate(
STATIC int
xfs_fs_get_dqblk(
struct super_block *sb,
- int type,
- qid_t id,
+ struct kqid qid,
struct fs_disk_quota *fdq)
{
struct xfs_mount *mp = XFS_M(sb);
@@ -108,14 +107,14 @@ xfs_fs_get_dqblk(
if (!XFS_IS_QUOTA_ON(mp))
return -ESRCH;
- return -xfs_qm_scall_getquota(mp, id, xfs_quota_type(type), fdq);
+ return -xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid),
+ xfs_quota_type(qid.type), fdq);
}
STATIC int
xfs_fs_set_dqblk(
struct super_block *sb,
- int type,
- qid_t id,
+ struct kqid qid,
struct fs_disk_quota *fdq)
{
struct xfs_mount *mp = XFS_M(sb);
@@ -127,7 +126,8 @@ xfs_fs_set_dqblk(
if (!XFS_IS_QUOTA_ON(mp))
return -ESRCH;
- return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
+ return -xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid),
+ xfs_quota_type(qid.type), fdq);
}
const struct quotactl_ops xfs_quotactl_operations = {
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 92d4331cd4f..98dc670d3ee 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -38,6 +38,7 @@
#include "xfs_utils.h"
#include "xfs_trace.h"
#include "xfs_buf.h"
+#include "xfs_icache.h"
/*
@@ -857,7 +858,7 @@ xfs_rtbuf_get(
xfs_buf_t *bp; /* block buffer, result */
xfs_inode_t *ip; /* bitmap or summary inode */
xfs_bmbt_irec_t map;
- int nmap;
+ int nmap = 1;
int error; /* error value */
ip = issum ? mp->m_rsumip : mp->m_rbmip;
@@ -869,7 +870,7 @@ xfs_rtbuf_get(
ASSERT(map.br_startblock != NULLFSBLOCK);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, map.br_startblock),
- mp->m_bsize, 0, &bp);
+ mp->m_bsize, 0, &bp, NULL);
if (error)
return error;
ASSERT(!xfs_buf_geterror(bp));
@@ -1872,9 +1873,14 @@ xfs_growfs_rt(
*/
bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
XFS_FSB_TO_BB(mp, nrblocks - 1),
- XFS_FSB_TO_BB(mp, 1), 0);
+ XFS_FSB_TO_BB(mp, 1), 0, NULL);
if (!bp)
return EIO;
+ if (bp->b_error) {
+ error = bp->b_error;
+ xfs_buf_relse(bp);
+ return error;
+ }
xfs_buf_relse(bp);
/*
@@ -2219,9 +2225,11 @@ xfs_rtmount_init(
}
bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
d - XFS_FSB_TO_BB(mp, 1),
- XFS_FSB_TO_BB(mp, 1), 0);
- if (!bp) {
+ XFS_FSB_TO_BB(mp, 1), 0, NULL);
+ if (!bp || bp->b_error) {
xfs_warn(mp, "realtime device size check failed");
+ if (bp)
+ xfs_buf_relse(bp);
return EIO;
}
xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index f429d9d5d32..a05b45175fb 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -81,6 +81,7 @@ struct xfs_mount;
#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
+#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */
#define XFS_SB_VERSION2_OKREALFBITS \
(XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
@@ -503,6 +504,12 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
(sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT);
}
+/*
+ * Returns non-zero when xfs_sb_version_hasmorebits() holds for the superblock
+ * and XFS_SB_VERSION2_CRCBIT (metadata CRCs) is set in sb_features2.
+ */
+static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp)
+{
+ return (xfs_sb_version_hasmorebits(sbp) &&
+ (sbp->sb_features2 & XFS_SB_VERSION2_CRCBIT));
+}
+
/*
* end of superblock version macros
*/
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 0d9de41a715..ab8839b2627 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -49,7 +49,7 @@
#include "xfs_extfree_item.h"
#include "xfs_mru_cache.h"
#include "xfs_inode_item.h"
-#include "xfs_sync.h"
+#include "xfs_icache.h"
#include "xfs_trace.h"
#include <linux/namei.h>
@@ -88,6 +88,8 @@ mempool_t *xfs_ioend_pool;
* unwritten extent conversion */
#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */
#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */
+#define MNTOPT_32BITINODE "inode32" /* inode allocation limited to
+ * XFS_MAXINUMBER_32 */
#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */
#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */
#define MNTOPT_LARGEIO "largeio" /* report large I/O sizes in stat() */
@@ -120,12 +122,18 @@ mempool_t *xfs_ioend_pool;
* in the future, too.
*/
enum {
- Opt_barrier, Opt_nobarrier, Opt_err
+ Opt_barrier,
+ Opt_nobarrier,
+ Opt_inode64,
+ Opt_inode32,
+ Opt_err
};
static const match_table_t tokens = {
{Opt_barrier, "barrier"},
{Opt_nobarrier, "nobarrier"},
+ {Opt_inode64, "inode64"},
+ {Opt_inode32, "inode32"},
{Opt_err, NULL}
};
@@ -197,7 +205,9 @@ xfs_parseargs(
*/
mp->m_flags |= XFS_MOUNT_BARRIER;
mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
+#if !XFS_BIG_INUMS
mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
+#endif
/*
* These can be overridden by the mount option parsing.
@@ -294,6 +304,8 @@ xfs_parseargs(
return EINVAL;
}
dswidth = simple_strtoul(value, &eov, 10);
+ } else if (!strcmp(this_char, MNTOPT_32BITINODE)) {
+ mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
} else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
#if !XFS_BIG_INUMS
@@ -492,6 +504,7 @@ xfs_showargs(
{ XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
{ XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
{ XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD },
+ { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE },
{ 0, NULL }
};
static struct proc_xfs_info xfs_info_unset[] = {
@@ -591,6 +604,80 @@ xfs_max_file_offset(
return (((__uint64_t)pagefactor) << bitshift) - 1;
}
+/*
+ * Restrict inode allocation to the AGs whose inode numbers fit in 32 bits.
+ *
+ * For each AG the inode number of the first inode in the AG's last block is
+ * compared against XFS_MAXINUMBER_32. AGs above the limit get pagi_inodeok
+ * and pagf_metadata cleared. AGs within the limit get pagi_inodeok set, and
+ * pagf_metadata is set for the first max_metadata AGs and cleared for the
+ * rest, so no stale value survives from an earlier call.
+ *
+ * Sets XFS_MOUNT_32BITINODES and XFS_MOUNT_SMALL_INUMS in mp->m_flags.
+ *
+ * Returns the number of AGs in which inodes may be allocated.
+ */
+xfs_agnumber_t
+xfs_set_inode32(struct xfs_mount *mp)
+{
+	xfs_agnumber_t	index;
+	xfs_agnumber_t	maxagi = 0;
+	xfs_sb_t	*sbp = &mp->m_sb;
+	xfs_agnumber_t	max_metadata;
+	xfs_agino_t	agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
+	xfs_ino_t	ino;
+	xfs_perag_t	*pag;
+
+	/*
+	 * Calculate how many AGs should be reserved for inodes to meet
+	 * the max inode percentage: sb_imax_pct percent of sb_dblocks,
+	 * rounded up to whole AGs.
+	 */
+	if (mp->m_maxicount) {
+		__uint64_t	icount;
+
+		icount = sbp->sb_dblocks * sbp->sb_imax_pct;
+		do_div(icount, 100);
+		icount += sbp->sb_agblocks - 1;
+		do_div(icount, sbp->sb_agblocks);
+		max_metadata = icount;
+	} else {
+		max_metadata = sbp->sb_agcount;
+	}
+
+	for (index = 0; index < sbp->sb_agcount; index++) {
+		ino = XFS_AGINO_TO_INO(mp, index, agino);
+
+		pag = xfs_perag_get(mp, index);
+		if (ino > XFS_MAXINUMBER_32) {
+			pag->pagi_inodeok = 0;
+			pag->pagf_metadata = 0;
+		} else {
+			pag->pagi_inodeok = 1;
+			maxagi++;
+			/* always rewrite the flag so a stale 1 cannot linger */
+			if (index < max_metadata)
+				pag->pagf_metadata = 1;
+			else
+				pag->pagf_metadata = 0;
+		}
+		xfs_perag_put(pag);
+	}
+	mp->m_flags |= (XFS_MOUNT_32BITINODES |
+			XFS_MOUNT_SMALL_INUMS);
+
+	return maxagi;
+}
+
+/*
+ * Allow inode allocation in every AG: sets pagi_inodeok and clears
+ * pagf_metadata on each per-AG structure, then clears XFS_MOUNT_32BITINODES
+ * and XFS_MOUNT_SMALL_INUMS in mp->m_flags.
+ *
+ * Returns the loop index after the last AG, i.e. mp->m_sb.sb_agcount.
+ */
+xfs_agnumber_t
+xfs_set_inode64(struct xfs_mount *mp)
+{
+ xfs_agnumber_t index = 0;
+
+ for (index = 0; index < mp->m_sb.sb_agcount; index++) {
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, index);
+ pag->pagi_inodeok = 1;
+ pag->pagf_metadata = 0;
+ xfs_perag_put(pag);
+ }
+
+ /* There is no need for lock protection on m_flags,
+ * the rw_semaphore of the VFS superblock is locked
+ * during mount/umount/remount operations, so this is
+ * enough to avoid concurrency on the m_flags field
+ */
+ mp->m_flags &= ~(XFS_MOUNT_32BITINODES |
+ XFS_MOUNT_SMALL_INUMS);
+ return index;
+}
+
STATIC int
xfs_blkdev_get(
xfs_mount_t *mp,
@@ -776,8 +863,30 @@ xfs_init_mount_workqueues(
WQ_MEM_RECLAIM, 0, mp->m_fsname);
if (!mp->m_cil_workqueue)
goto out_destroy_unwritten;
+
+ mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
+ WQ_NON_REENTRANT, 0, mp->m_fsname);
+ if (!mp->m_reclaim_workqueue)
+ goto out_destroy_cil;
+
+ mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
+ WQ_NON_REENTRANT, 0, mp->m_fsname);
+ if (!mp->m_log_workqueue)
+ goto out_destroy_reclaim;
+
+ mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
+ WQ_NON_REENTRANT, 0, mp->m_fsname);
+ if (!mp->m_eofblocks_workqueue)
+ goto out_destroy_log;
+
return 0;
+out_destroy_log:
+ destroy_workqueue(mp->m_log_workqueue);
+out_destroy_reclaim:
+ destroy_workqueue(mp->m_reclaim_workqueue);
+out_destroy_cil:
+ destroy_workqueue(mp->m_cil_workqueue);
out_destroy_unwritten:
destroy_workqueue(mp->m_unwritten_workqueue);
out_destroy_data_iodone_queue:
@@ -790,11 +899,32 @@ STATIC void
xfs_destroy_mount_workqueues(
struct xfs_mount *mp)
{
+ destroy_workqueue(mp->m_eofblocks_workqueue);
+ destroy_workqueue(mp->m_log_workqueue);
+ destroy_workqueue(mp->m_reclaim_workqueue);
destroy_workqueue(mp->m_cil_workqueue);
destroy_workqueue(mp->m_data_workqueue);
destroy_workqueue(mp->m_unwritten_workqueue);
}
+/*
+ * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
+ * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting
+ * for IO to complete so that we effectively throttle multiple callers to the
+ * rate at which IO is completing.
+ *
+ * If s_umount cannot be taken for read without blocking, nothing is flushed
+ * and the function returns silently.
+ */
+void
+xfs_flush_inodes(
+ struct xfs_mount *mp)
+{
+ struct super_block *sb = mp->m_super;
+
+ /* trylock only: never wait on s_umount here, just skip the flush */
+ if (down_read_trylock(&sb->s_umount)) {
+ sync_inodes_sb(sb);
+ up_read(&sb->s_umount);
+ }
+}
+
+
/* Catch misguided souls that try to use this interface on XFS */
STATIC struct inode *
xfs_fs_alloc_inode(
@@ -868,67 +998,14 @@ xfs_fs_inode_init_once(
"xfsino", ip->i_ino);
}
-/*
- * This is called by the VFS when dirtying inode metadata. This can happen
- * for a few reasons, but we only care about timestamp updates, given that
- * we handled the rest ourselves. In theory no other calls should happen,
- * but for example generic_write_end() keeps dirtying the inode after
- * updating i_size. Thus we check that the flags are exactly I_DIRTY_SYNC,
- * and skip this call otherwise.
- *
- * We'll hopefull get a different method just for updating timestamps soon,
- * at which point this hack can go away, and maybe we'll also get real
- * error handling here.
- */
-STATIC void
-xfs_fs_dirty_inode(
- struct inode *inode,
- int flags)
-{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_trans *tp;
- int error;
-
- if (flags != I_DIRTY_SYNC)
- return;
-
- trace_xfs_dirty_inode(ip);
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
- error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
- if (error) {
- xfs_trans_cancel(tp, 0);
- goto trouble;
- }
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- /*
- * Grab all the latest timestamps from the Linux inode.
- */
- ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
- ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
- ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec;
- ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec;
- ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec;
- ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec;
-
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
- error = xfs_trans_commit(tp, 0);
- if (error)
- goto trouble;
- return;
-
-trouble:
- xfs_warn(mp, "failed to update timestamps for inode 0x%llx", ip->i_ino);
-}
-
STATIC void
xfs_fs_evict_inode(
struct inode *inode)
{
xfs_inode_t *ip = XFS_I(inode);
+ ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+
trace_xfs_evict_inode(ip);
truncate_inode_pages(&inode->i_data, 0);
@@ -937,22 +1014,6 @@ xfs_fs_evict_inode(
XFS_STATS_INC(vn_remove);
XFS_STATS_DEC(vn_active);
- /*
- * The iolock is used by the file system to coordinate reads,
- * writes, and block truncates. Up to this point the lock
- * protected concurrent accesses by users of the inode. But
- * from here forward we're doing some final processing of the
- * inode because we're done with it, and although we reuse the
- * iolock for protection it is really a distinct lock class
- * (in the lockdep sense) from before. To keep lockdep happy
- * (and basically indicate what we are doing), we explicitly
- * re-init the iolock here.
- */
- ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
- mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
- lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
- &xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
-
xfs_inactive(ip);
}
@@ -989,7 +1050,7 @@ xfs_fs_put_super(
xfs_filestream_unmount(mp);
xfs_unmountfs(mp);
- xfs_syncd_stop(mp);
+
xfs_freesb(mp);
xfs_icsb_destroy_counters(mp);
xfs_destroy_mount_workqueues(mp);
@@ -1004,7 +1065,6 @@ xfs_fs_sync_fs(
int wait)
{
struct xfs_mount *mp = XFS_M(sb);
- int error;
/*
* Doing anything during the async pass would be counterproductive.
@@ -1012,17 +1072,14 @@ xfs_fs_sync_fs(
if (!wait)
return 0;
- error = xfs_quiesce_data(mp);
- if (error)
- return -error;
-
+ xfs_log_force(mp, XFS_LOG_SYNC);
if (laptop_mode) {
/*
* The disk must be active because we're syncing.
- * We schedule xfssyncd now (now that the disk is
+ * We schedule log work now (now that the disk is
* active) instead of later (when it might not be).
*/
- flush_delayed_work_sync(&mp->m_sync_work);
+ flush_delayed_work(&mp->m_log->l_work);
}
return 0;
@@ -1099,6 +1156,48 @@ xfs_restore_resvblks(struct xfs_mount *mp)
xfs_reserve_blocks(mp, &resblks, NULL);
}
+/*
+ * Trigger writeback of all the dirty metadata in the file system.
+ *
+ * This ensures that the metadata is written to their location on disk rather
+ * than just existing in transactions in the log. This means after a quiesce
+ * there is no log replay required to write the inodes to disk - this is the
+ * primary difference between a sync and a quiesce.
+ *
+ * Note: xfs_log_quiesce() stops background log work - the callers must ensure
+ * it is started again when appropriate.
+ *
+ * Nothing is returned: a failure of xfs_log_sbcount() is only reported with a
+ * warning and the quiesce continues.
+ */
+void
+xfs_quiesce_attr(
+ struct xfs_mount *mp)
+{
+ int error = 0;
+
+ /*
+ * wait for all modifications to complete: poll m_active_trans, calling
+ * delay(100) between checks, until it is no longer positive
+ */
+ while (atomic_read(&mp->m_active_trans) > 0)
+ delay(100);
+
+ /* force the log to unpin objects from the now complete transactions */
+ xfs_log_force(mp, XFS_LOG_SYNC);
+
+ /*
+ * reclaim inodes to do any IO before the freeze completes: one pass
+ * with no flags, then a second pass with SYNC_WAIT
+ */
+ xfs_reclaim_inodes(mp, 0);
+ xfs_reclaim_inodes(mp, SYNC_WAIT);
+
+ /* Push the superblock and write an unmount record */
+ error = xfs_log_sbcount(mp);
+ if (error)
+ xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
+ "Frozen image may not be consistent.");
+ /*
+ * Just warn here till VFS can correctly support
+ * read-only remount without racing.
+ */
+ WARN_ON(atomic_read(&mp->m_active_trans) != 0);
+
+ xfs_log_quiesce(mp);
+}
+
+
STATIC int
xfs_fs_remount(
struct super_block *sb,
@@ -1124,6 +1223,12 @@ xfs_fs_remount(
case Opt_nobarrier:
mp->m_flags &= ~XFS_MOUNT_BARRIER;
break;
+ case Opt_inode64:
+ mp->m_maxagi = xfs_set_inode64(mp);
+ break;
+ case Opt_inode32:
+ mp->m_maxagi = xfs_set_inode32(mp);
+ break;
default:
/*
* Logically we would return an error here to prevent
@@ -1173,20 +1278,18 @@ xfs_fs_remount(
* value if it is non-zero, otherwise go with the default.
*/
xfs_restore_resvblks(mp);
+ xfs_log_work_queue(mp);
}
/* rw -> ro */
if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
/*
- * After we have synced the data but before we sync the
- * metadata, we need to free up the reserve block pool so that
- * the used block count in the superblock on disk is correct at
- * the end of the remount. Stash the current reserve pool size
- * so that if we get remounted rw, we can return it to the same
- * size.
+ * Before we sync the metadata, we need to free up the reserve
+ * block pool so that the used block count in the superblock on
+ * disk is correct at the end of the remount. Stash the current
+ * reserve pool size so that if we get remounted rw, we can
+ * return it to the same size.
*/
-
- xfs_quiesce_data(mp);
xfs_save_resvblks(mp);
xfs_quiesce_attr(mp);
mp->m_flags |= XFS_MOUNT_RDONLY;
@@ -1218,6 +1321,7 @@ xfs_fs_unfreeze(
struct xfs_mount *mp = XFS_M(sb);
xfs_restore_resvblks(mp);
+ xfs_log_work_queue(mp);
return 0;
}
@@ -1296,6 +1400,8 @@ xfs_fs_fill_super(
spin_lock_init(&mp->m_sb_lock);
mutex_init(&mp->m_growlock);
atomic_set(&mp->m_active_trans, 0);
+ INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+ INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
mp->m_super = sb;
sb->s_fs_info = mp;
@@ -1346,10 +1452,6 @@ xfs_fs_fill_super(
/*
* we must configure the block size in the superblock before we run the
* full mount process as the mount process can lookup and cache inodes.
- * For the same reason we must also initialise the syncd and register
- * the inode cache shrinker so that inodes can be reclaimed during
- * operations like a quotacheck that iterate all inodes in the
- * filesystem.
*/
sb->s_magic = XFS_SB_MAGIC;
sb->s_blocksize = mp->m_sb.sb_blocksize;
@@ -1359,13 +1461,9 @@ xfs_fs_fill_super(
sb->s_time_gran = 1;
set_posix_acl_flag(sb);
- error = xfs_syncd_init(mp);
- if (error)
- goto out_filestream_unmount;
-
error = xfs_mountfs(mp);
if (error)
- goto out_syncd_stop;
+ goto out_filestream_unmount;
root = igrab(VFS_I(mp->m_rootip));
if (!root) {
@@ -1383,8 +1481,7 @@ xfs_fs_fill_super(
}
return 0;
- out_syncd_stop:
- xfs_syncd_stop(mp);
+
out_filestream_unmount:
xfs_filestream_unmount(mp);
out_free_sb:
@@ -1404,7 +1501,6 @@ out_destroy_workqueues:
out_unmount:
xfs_filestream_unmount(mp);
xfs_unmountfs(mp);
- xfs_syncd_stop(mp);
goto out_free_sb;
}
@@ -1436,7 +1532,6 @@ xfs_fs_free_cached_objects(
static const struct super_operations xfs_super_operations = {
.alloc_inode = xfs_fs_alloc_inode,
.destroy_inode = xfs_fs_destroy_inode,
- .dirty_inode = xfs_fs_dirty_inode,
.evict_inode = xfs_fs_evict_inode,
.drop_inode = xfs_fs_drop_inode,
.put_super = xfs_fs_put_super,
@@ -1491,13 +1586,9 @@ xfs_init_zones(void)
if (!xfs_da_state_zone)
goto out_destroy_btree_cur_zone;
- xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf");
- if (!xfs_dabuf_zone)
- goto out_destroy_da_state_zone;
-
xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
if (!xfs_ifork_zone)
- goto out_destroy_dabuf_zone;
+ goto out_destroy_da_state_zone;
xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
if (!xfs_trans_zone)
@@ -1514,9 +1605,8 @@ xfs_init_zones(void)
* size possible under XFS. This wastes a little bit of memory,
* but it is much faster.
*/
- xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) +
- (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
- NBWORD) * sizeof(int))), "xfs_buf_item");
+ xfs_buf_item_zone = kmem_zone_init(sizeof(struct xfs_buf_log_item),
+ "xfs_buf_item");
if (!xfs_buf_item_zone)
goto out_destroy_log_item_desc_zone;
@@ -1561,8 +1651,6 @@ xfs_init_zones(void)
kmem_zone_destroy(xfs_trans_zone);
out_destroy_ifork_zone:
kmem_zone_destroy(xfs_ifork_zone);
- out_destroy_dabuf_zone:
- kmem_zone_destroy(xfs_dabuf_zone);
out_destroy_da_state_zone:
kmem_zone_destroy(xfs_da_state_zone);
out_destroy_btree_cur_zone:
@@ -1582,6 +1670,11 @@ xfs_init_zones(void)
STATIC void
xfs_destroy_zones(void)
{
+ /*
+ * Make sure all delayed rcu free are flushed before we
+ * destroy caches.
+ */
+ rcu_barrier();
kmem_zone_destroy(xfs_ili_zone);
kmem_zone_destroy(xfs_inode_zone);
kmem_zone_destroy(xfs_efi_zone);
@@ -1590,7 +1683,6 @@ xfs_destroy_zones(void)
kmem_zone_destroy(xfs_log_item_desc_zone);
kmem_zone_destroy(xfs_trans_zone);
kmem_zone_destroy(xfs_ifork_zone);
- kmem_zone_destroy(xfs_dabuf_zone);
kmem_zone_destroy(xfs_da_state_zone);
kmem_zone_destroy(xfs_btree_cur_zone);
kmem_zone_destroy(xfs_bmap_free_item_zone);
@@ -1604,16 +1696,6 @@ STATIC int __init
xfs_init_workqueues(void)
{
/*
- * We never want to the same work item to run twice, reclaiming inodes
- * or idling the log is not going to get any faster by multiple CPUs
- * competing for ressources. Use the default large max_active value
- * so that even lots of filesystems can perform these task in parallel.
- */
- xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0);
- if (!xfs_syncd_wq)
- return -ENOMEM;
-
- /*
* The allocation workqueue can be used in memory reclaim situations
* (writepage path), and parallelism is only limited by the number of
* AGs in all the filesystems mounted. Hence use the default large
@@ -1621,20 +1703,15 @@ xfs_init_workqueues(void)
*/
xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0);
if (!xfs_alloc_wq)
- goto out_destroy_syncd;
+ return -ENOMEM;
return 0;
-
-out_destroy_syncd:
- destroy_workqueue(xfs_syncd_wq);
- return -ENOMEM;
}
STATIC void
xfs_destroy_workqueues(void)
{
destroy_workqueue(xfs_alloc_wq);
- destroy_workqueue(xfs_syncd_wq);
}
STATIC int __init
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 09b0c26b224..bbe3d15a790 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -74,7 +74,10 @@ struct block_device;
extern __uint64_t xfs_max_file_offset(unsigned int);
+extern void xfs_flush_inodes(struct xfs_mount *mp);
extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
+extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *);
+extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *);
extern const struct export_operations xfs_export_operations;
extern const struct xattr_handler *xfs_xattr_handlers[];
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index ee2d2adaa43..2801b5ce6cd 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -202,6 +202,15 @@ static ctl_table xfs_table[] =