aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs
diff options
context:
space:
mode:
authorPatrick McHardy <kaber@trash.net>2012-08-08 21:03:47 +0200
committerPatrick McHardy <kaber@trash.net>2012-08-08 21:03:47 +0200
commitd53b4ed072d9779cdf53582c46436dec06d0961f (patch)
treeac95ecab33e31cd79aae69c475e8348adac51230 /fs/xfs
parent5d4dff7f1011a81a693a9c7b1f6a0b9c842eb60c (diff)
parent28a33cbc24e4256c143dce96c7d93bf423229f92 (diff)
Merge tag 'v3.5' of 192.168.0.154:/repos/git/linux-2.6
Conflicts: drivers/Kconfig Signed-off-by: Patrick McHardy <kaber@trash.net>
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/Makefile5
-rw-r--r--fs/xfs/kmem.c10
-rw-r--r--fs/xfs/kmem.h27
-rw-r--r--fs/xfs/xfs_acl.c2
-rw-r--r--fs/xfs/xfs_ag.h18
-rw-r--r--fs/xfs/xfs_alloc.c633
-rw-r--r--fs/xfs/xfs_alloc.h40
-rw-r--r--fs/xfs/xfs_alloc_btree.c9
-rw-r--r--fs/xfs/xfs_aops.c429
-rw-r--r--fs/xfs/xfs_aops.h4
-rw-r--r--fs/xfs/xfs_attr.c45
-rw-r--r--fs/xfs/xfs_attr_leaf.c52
-rw-r--r--fs/xfs/xfs_bmap.c170
-rw-r--r--fs/xfs/xfs_bmap.h3
-rw-r--r--fs/xfs/xfs_bmap_btree.c1
-rw-r--r--fs/xfs/xfs_btree.c1
-rw-r--r--fs/xfs/xfs_buf.c661
-rw-r--r--fs/xfs/xfs_buf.h99
-rw-r--r--fs/xfs/xfs_buf_item.c123
-rw-r--r--fs/xfs/xfs_da_btree.c49
-rw-r--r--fs/xfs/xfs_dfrag.c69
-rw-r--r--fs/xfs/xfs_dir2.c1
-rw-r--r--fs/xfs/xfs_dir2_block.c2
-rw-r--r--fs/xfs/xfs_dir2_data.c1
-rw-r--r--fs/xfs/xfs_dir2_leaf.c1
-rw-r--r--fs/xfs/xfs_dir2_node.c1
-rw-r--r--fs/xfs/xfs_dir2_sf.c1
-rw-r--r--fs/xfs/xfs_discard.c71
-rw-r--r--fs/xfs/xfs_dquot.c930
-rw-r--r--fs/xfs/xfs_dquot.h83
-rw-r--r--fs/xfs/xfs_dquot_item.c165
-rw-r--r--fs/xfs/xfs_error.c1
-rw-r--r--fs/xfs/xfs_export.c24
-rw-r--r--fs/xfs/xfs_extent_busy.c603
-rw-r--r--fs/xfs/xfs_extent_busy.h69
-rw-r--r--fs/xfs/xfs_extfree_item.c59
-rw-r--r--fs/xfs/xfs_file.c580
-rw-r--r--fs/xfs/xfs_fs_subr.c2
-rw-r--r--fs/xfs/xfs_fsops.c82
-rw-r--r--fs/xfs/xfs_ialloc.c14
-rw-r--r--fs/xfs/xfs_ialloc.h11
-rw-r--r--fs/xfs/xfs_ialloc_btree.c1
-rw-r--r--fs/xfs/xfs_iget.c96
-rw-r--r--fs/xfs/xfs_inode.c421
-rw-r--r--fs/xfs/xfs_inode.h146
-rw-r--r--fs/xfs/xfs_inode_item.c480
-rw-r--r--fs/xfs/xfs_inode_item.h18
-rw-r--r--fs/xfs/xfs_inum.h5
-rw-r--r--fs/xfs/xfs_ioctl.c38
-rw-r--r--fs/xfs/xfs_ioctl32.c12
-rw-r--r--fs/xfs/xfs_iomap.c124
-rw-r--r--fs/xfs/xfs_iops.c140
-rw-r--r--fs/xfs/xfs_itable.c25
-rw-r--r--fs/xfs/xfs_log.c800
-rw-r--r--fs/xfs/xfs_log.h25
-rw-r--r--fs/xfs/xfs_log_cil.c361
-rw-r--r--fs/xfs/xfs_log_priv.h74
-rw-r--r--fs/xfs/xfs_log_recover.c186
-rw-r--r--fs/xfs/xfs_message.c1
-rw-r--r--fs/xfs/xfs_mount.c85
-rw-r--r--fs/xfs/xfs_mount.h12
-rw-r--r--fs/xfs/xfs_qm.c1283
-rw-r--r--fs/xfs/xfs_qm.h69
-rw-r--r--fs/xfs/xfs_qm_bhv.c44
-rw-r--r--fs/xfs/xfs_qm_stats.c105
-rw-r--r--fs/xfs/xfs_qm_stats.h53
-rw-r--r--fs/xfs/xfs_qm_syscalls.c143
-rw-r--r--fs/xfs/xfs_quota.h14
-rw-r--r--fs/xfs/xfs_quota_priv.h11
-rw-r--r--fs/xfs/xfs_quotaops.c1
-rw-r--r--fs/xfs/xfs_rename.c12
-rw-r--r--fs/xfs/xfs_rtalloc.c19
-rw-r--r--fs/xfs/xfs_rw.c156
-rw-r--r--fs/xfs/xfs_rw.h47
-rw-r--r--fs/xfs/xfs_sb.h1
-rw-r--r--fs/xfs/xfs_stats.c99
-rw-r--r--fs/xfs/xfs_stats.h10
-rw-r--r--fs/xfs/xfs_super.c301
-rw-r--r--fs/xfs/xfs_super.h8
-rw-r--r--fs/xfs/xfs_sync.c322
-rw-r--r--fs/xfs/xfs_sync.h2
-rw-r--r--fs/xfs/xfs_trace.c2
-rw-r--r--fs/xfs/xfs_trace.h213
-rw-r--r--fs/xfs/xfs_trans.c519
-rw-r--r--fs/xfs/xfs_trans.h23
-rw-r--r--fs/xfs/xfs_trans_ail.c258
-rw-r--r--fs/xfs/xfs_trans_buf.c151
-rw-r--r--fs/xfs/xfs_trans_dquot.c21
-rw-r--r--fs/xfs/xfs_trans_extfree.c1
-rw-r--r--fs/xfs/xfs_trans_inode.c10
-rw-r--r--fs/xfs/xfs_trans_priv.h15
-rw-r--r--fs/xfs/xfs_types.h5
-rw-r--r--fs/xfs/xfs_utils.c6
-rw-r--r--fs/xfs/xfs_utils.h2
-rw-r--r--fs/xfs/xfs_vnode.h1
-rw-r--r--fs/xfs/xfs_vnodeops.c98
-rw-r--r--fs/xfs/xfs_vnodeops.h7
97 files changed, 4985 insertions, 7248 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 427a4e82a58..d2bf974b1a2 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -33,6 +33,7 @@ xfs-y += xfs_aops.o \
xfs_discard.o \
xfs_error.o \
xfs_export.o \
+ xfs_extent_busy.o \
xfs_file.o \
xfs_filestream.o \
xfs_fsops.o \
@@ -49,7 +50,6 @@ xfs-y += xfs_aops.o \
xfs_sync.o \
xfs_xattr.o \
xfs_rename.o \
- xfs_rw.o \
xfs_utils.o \
xfs_vnodeops.o \
kmem.o \
@@ -96,9 +96,6 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
xfs_qm_bhv.o \
xfs_qm.o \
xfs_quotaops.o
-ifeq ($(CONFIG_XFS_QUOTA),y)
-xfs-$(CONFIG_PROC_FS) += xfs_qm_stats.o
-endif
xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_PROC_FS) += xfs_stats.o
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index a907de565db..4a7286c1dc8 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -46,7 +46,7 @@ kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
}
void *
-kmem_alloc(size_t size, unsigned int __nocast flags)
+kmem_alloc(size_t size, xfs_km_flags_t flags)
{
int retries = 0;
gfp_t lflags = kmem_flags_convert(flags);
@@ -65,7 +65,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
}
void *
-kmem_zalloc(size_t size, unsigned int __nocast flags)
+kmem_zalloc(size_t size, xfs_km_flags_t flags)
{
void *ptr;
@@ -87,7 +87,7 @@ kmem_free(const void *ptr)
void *
kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
- unsigned int __nocast flags)
+ xfs_km_flags_t flags)
{
void *new;
@@ -102,7 +102,7 @@ kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
}
void *
-kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
+kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
{
int retries = 0;
gfp_t lflags = kmem_flags_convert(flags);
@@ -121,7 +121,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
}
void *
-kmem_zone_zalloc(kmem_zone_t *zone, unsigned int __nocast flags)
+kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags)
{
void *ptr;
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 292eff19803..b2f2620f9a8 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -27,10 +27,11 @@
* General memory allocation interfaces
*/
-#define KM_SLEEP 0x0001u
-#define KM_NOSLEEP 0x0002u
-#define KM_NOFS 0x0004u
-#define KM_MAYFAIL 0x0008u
+typedef unsigned __bitwise xfs_km_flags_t;
+#define KM_SLEEP ((__force xfs_km_flags_t)0x0001u)
+#define KM_NOSLEEP ((__force xfs_km_flags_t)0x0002u)
+#define KM_NOFS ((__force xfs_km_flags_t)0x0004u)
+#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u)
/*
* We use a special process flag to avoid recursive callbacks into
@@ -38,7 +39,7 @@
* warnings, so we explicitly skip any generic ones (silly of us).
*/
static inline gfp_t
-kmem_flags_convert(unsigned int __nocast flags)
+kmem_flags_convert(xfs_km_flags_t flags)
{
gfp_t lflags;
@@ -54,9 +55,9 @@ kmem_flags_convert(unsigned int __nocast flags)
return lflags;
}
-extern void *kmem_alloc(size_t, unsigned int __nocast);
-extern void *kmem_zalloc(size_t, unsigned int __nocast);
-extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast);
+extern void *kmem_alloc(size_t, xfs_km_flags_t);
+extern void *kmem_zalloc(size_t, xfs_km_flags_t);
+extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
extern void kmem_free(const void *);
static inline void *kmem_zalloc_large(size_t size)
@@ -107,13 +108,7 @@ kmem_zone_destroy(kmem_zone_t *zone)
kmem_cache_destroy(zone);
}
-extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast);
-extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
-
-static inline int
-kmem_shake_allow(gfp_t gfp_mask)
-{
- return ((gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS));
-}
+extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t);
+extern void *kmem_zone_zalloc(kmem_zone_t *, xfs_km_flags_t);
#endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 76e4266d2e7..ac702a6eab9 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -39,7 +39,7 @@ xfs_acl_from_disk(struct xfs_acl *aclp)
struct posix_acl_entry *acl_e;
struct posix_acl *acl;
struct xfs_acl_entry *ace;
- int count, i;
+ unsigned int count, i;
count = be32_to_cpu(aclp->acl_cnt);
if (count > XFS_ACL_MAX_ENTRIES)
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 4805f009f92..44d65c1533c 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -175,24 +175,6 @@ typedef struct xfs_agfl {
} xfs_agfl_t;
/*
- * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
- * have been freed but whose transactions aren't committed to disk yet.
- *
- * Note that we use the transaction ID to record the transaction, not the
- * transaction structure itself. See xfs_alloc_busy_insert() for details.
- */
-struct xfs_busy_extent {
- struct rb_node rb_node; /* ag by-bno indexed search tree */
- struct list_head list; /* transaction busy extent list */
- xfs_agnumber_t agno;
- xfs_agblock_t bno;
- xfs_extlen_t length;
- unsigned int flags;
-#define XFS_ALLOC_BUSY_DISCARDED 0x01 /* undergoing a discard op. */
-#define XFS_ALLOC_BUSY_SKIP_DISCARD 0x02 /* do not discard */
-};
-
-/*
* Per-ag incore structure, copies of information in agf and agi,
* to improve the performance of allocation group selection.
*/
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index ce84ffd0264..4f33c32affe 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -20,7 +20,6 @@
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -32,9 +31,11 @@
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+struct workqueue_struct *xfs_alloc_wq;
#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
@@ -46,8 +47,6 @@ STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
-STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *,
- xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *);
/*
* Lookup the record equal to [bno, len] in the btree given by cur.
@@ -68,7 +67,7 @@ xfs_alloc_lookup_eq(
* Lookup the first record greater than or equal to [bno, len]
* in the btree given by cur.
*/
-STATIC int /* error */
+int /* error */
xfs_alloc_lookup_ge(
struct xfs_btree_cur *cur, /* btree cursor */
xfs_agblock_t bno, /* starting block of extent */
@@ -151,7 +150,7 @@ xfs_alloc_compute_aligned(
xfs_extlen_t len;
/* Trim busy sections out of found extent */
- xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len);
+ xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
if (args->alignment > 1 && len >= args->minlen) {
xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
@@ -535,7 +534,7 @@ xfs_alloc_ag_vextent(
if (error)
return error;
- ASSERT(!xfs_alloc_busy_search(args->mp, args->agno,
+ ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
args->agbno, args->len));
}
@@ -602,7 +601,7 @@ xfs_alloc_ag_vextent_exact(
/*
* Check for overlapping busy extents.
*/
- xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen);
+ xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen);
/*
* Give up if the start of the extent is busy, or the freespace isn't
@@ -1075,12 +1074,13 @@ restart:
* If we couldn't get anything, give up.
*/
if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+
if (!forced++) {
trace_xfs_alloc_near_busy(args);
xfs_log_force(args->mp, XFS_LOG_SYNC);
goto restart;
}
-
trace_xfs_alloc_size_neither(args);
args->agbno = NULLAGBLOCK;
return 0;
@@ -1390,7 +1390,7 @@ xfs_alloc_ag_vextent_small(
if (error)
goto error0;
if (fbno != NULLAGBLOCK) {
- xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1,
+ xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
args->userdata);
if (args->userdata) {
@@ -2207,7 +2207,7 @@ xfs_alloc_read_agf(
* group or loop over the allocation groups to find the result.
*/
int /* error */
-xfs_alloc_vextent(
+__xfs_alloc_vextent(
xfs_alloc_arg_t *args) /* allocation argument structure */
{
xfs_agblock_t agsize; /* allocation group size */
@@ -2417,6 +2417,46 @@ error0:
return error;
}
+static void
+xfs_alloc_vextent_worker(
+ struct work_struct *work)
+{
+ struct xfs_alloc_arg *args = container_of(work,
+ struct xfs_alloc_arg, work);
+ unsigned long pflags;
+
+ /* we are in a transaction context here */
+ current_set_flags_nested(&pflags, PF_FSTRANS);
+
+ args->result = __xfs_alloc_vextent(args);
+ complete(args->done);
+
+ current_restore_flags_nested(&pflags, PF_FSTRANS);
+}
+
+/*
+ * Data allocation requests often come in with little stack to work on. Push
+ * them off to a worker thread so there is lots of stack to use. Metadata
+ * requests, OTOH, are generally from low stack usage paths, so avoid the
+ * context switch overhead here.
+ */
+int
+xfs_alloc_vextent(
+ struct xfs_alloc_arg *args)
+{
+ DECLARE_COMPLETION_ONSTACK(done);
+
+ if (!args->userdata)
+ return __xfs_alloc_vextent(args);
+
+
+ args->done = &done;
+ INIT_WORK_ONSTACK(&args->work, xfs_alloc_vextent_worker);
+ queue_work(xfs_alloc_wq, &args->work);
+ wait_for_completion(&done);
+ return args->result;
+}
+
/*
* Free an extent.
* Just break up the extent address and hand off to xfs_free_ag_extent
@@ -2464,579 +2504,8 @@ xfs_free_extent(
error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
if (!error)
- xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0);
+ xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0);
error0:
xfs_perag_put(args.pag);
return error;
}
-
-void
-xfs_alloc_busy_insert(
- struct xfs_trans *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- unsigned int flags)
-{
- struct xfs_busy_extent *new;
- struct xfs_busy_extent *busyp;
- struct xfs_perag *pag;
- struct rb_node **rbp;
- struct rb_node *parent = NULL;
-
- new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
- if (!new) {
- /*
- * No Memory! Since it is now not possible to track the free
- * block, make this a synchronous transaction to insure that
- * the block is not reused before this transaction commits.
- */
- trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len);
- xfs_trans_set_sync(tp);
- return;
- }
-
- new->agno = agno;
- new->bno = bno;
- new->length = len;
- INIT_LIST_HEAD(&new->list);
- new->flags = flags;
-
- /* trace before insert to be able to see failed inserts */
- trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len);
-
- pag = xfs_perag_get(tp->t_mountp, new->agno);
- spin_lock(&pag->pagb_lock);
- rbp = &pag->pagb_tree.rb_node;
- while (*rbp) {
- parent = *rbp;
- busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
-
- if (new->bno < busyp->bno) {
- rbp = &(*rbp)->rb_left;
- ASSERT(new->bno + new->length <= busyp->bno);
- } else if (new->bno > busyp->bno) {
- rbp = &(*rbp)->rb_right;
- ASSERT(bno >= busyp->bno + busyp->length);
- } else {
- ASSERT(0);
- }
- }
-
- rb_link_node(&new->rb_node, parent, rbp);
- rb_insert_color(&new->rb_node, &pag->pagb_tree);
-
- list_add(&new->list, &tp->t_busy);
- spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
-}
-
-/*
- * Search for a busy extent within the range of the extent we are about to
- * allocate. You need to be holding the busy extent tree lock when calling
- * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy
- * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
- * match. This is done so that a non-zero return indicates an overlap that
- * will require a synchronous transaction, but it can still be
- * used to distinguish between a partial or exact match.
- */
-int
-xfs_alloc_busy_search(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len)
-{
- struct xfs_perag *pag;
- struct rb_node *rbp;
- struct xfs_busy_extent *busyp;
- int match = 0;
-
- pag = xfs_perag_get(mp, agno);
- spin_lock(&pag->pagb_lock);
-
- rbp = pag->pagb_tree.rb_node;
-
- /* find closest start bno overlap */
- while (rbp) {
- busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node);
- if (bno < busyp->bno) {
- /* may overlap, but exact start block is lower */
- if (bno + len > busyp->bno)
- match = -1;
- rbp = rbp->rb_left;
- } else if (bno > busyp->bno) {
- /* may overlap, but exact start block is higher */
- if (bno < busyp->bno + busyp->length)
- match = -1;
- rbp = rbp->rb_right;
- } else {
- /* bno matches busyp, length determines exact match */
- match = (busyp->length == len) ? 1 : -1;
- break;
- }
- }
- spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
- return match;
-}
-
-/*
- * The found free extent [fbno, fend] overlaps part or all of the given busy
- * extent. If the overlap covers the beginning, the end, or all of the busy
- * extent, the overlapping portion can be made unbusy and used for the
- * allocation. We can't split a busy extent because we can't modify a
- * transaction/CIL context busy list, but we can update an entries block
- * number or length.
- *
- * Returns true if the extent can safely be reused, or false if the search
- * needs to be restarted.
- */
-STATIC bool
-xfs_alloc_busy_update_extent(
- struct xfs_mount *mp,
- struct xfs_perag *pag,
- struct xfs_busy_extent *busyp,
- xfs_agblock_t fbno,
- xfs_extlen_t flen,
- bool userdata)
-{
- xfs_agblock_t fend = fbno + flen;
- xfs_agblock_t bbno = busyp->bno;
- xfs_agblock_t bend = bbno + busyp->length;
-
- /*
- * This extent is currently being discarded. Give the thread
- * performing the discard a chance to mark the extent unbusy
- * and retry.
- */
- if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) {
- spin_unlock(&pag->pagb_lock);
- delay(1);
- spin_lock(&pag->pagb_lock);
- return false;
- }
-
- /*
- * If there is a busy extent overlapping a user allocation, we have
- * no choice but to force the log and retry the search.
- *
- * Fortunately this does not happen during normal operation, but
- * only if the filesystem is very low on space and has to dip into
- * the AGFL for normal allocations.
- */
- if (userdata)
- goto out_force_log;
-
- if (bbno < fbno && bend > fend) {
- /*
- * Case 1:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +---------+
- * fbno fend
- */
-
- /*
- * We would have to split the busy extent to be able to track
- * it correct, which we cannot do because we would have to
- * modify the list of busy extents attached to the transaction
- * or CIL context, which is immutable.
- *
- * Force out the log to clear the busy extent and retry the
- * search.
- */
- goto out_force_log;
- } else if (bbno >= fbno && bend <= fend) {
- /*
- * Case 2:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +-----------------+
- * fbno fend
- *
- * Case 3:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +--------------------------+
- * fbno fend
- *
- * Case 4:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +--------------------------+
- * fbno fend
- *
- * Case 5:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +-----------------------------------+
- * fbno fend
- *
- */
-
- /*
- * The busy extent is fully covered by the extent we are
- * allocating, and can simply be removed from the rbtree.
- * However we cannot remove it from the immutable list
- * tracking busy extents in the transaction or CIL context,
- * so set the length to zero to mark it invalid.
- *
- * We also need to restart the busy extent search from the
- * tree root, because erasing the node can rearrange the
- * tree topology.
- */
- rb_erase(&busyp->rb_node, &pag->pagb_tree);
- busyp->length = 0;
- return false;
- } else if (fend < bend) {
- /*
- * Case 6:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +---------+
- * fbno fend
- *
- * Case 7:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +------------------+
- * fbno fend
- *
- */
- busyp->bno = fend;
- } else if (bbno < fbno) {
- /*
- * Case 8:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +-------------+
- * fbno fend
- *
- * Case 9:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +----------------------+
- * fbno fend
- */
- busyp->length = fbno - busyp->bno;
- } else {
- ASSERT(0);
- }
-
- trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen);
- return true;
-
-out_force_log:
- spin_unlock(&pag->pagb_lock);
- xfs_log_force(mp, XFS_LOG_SYNC);
- trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen);
- spin_lock(&pag->pagb_lock);
- return false;
-}
-
-
-/*
- * For a given extent [fbno, flen], make sure we can reuse it safely.
- */
-void
-xfs_alloc_busy_reuse(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
- xfs_agblock_t fbno,
- xfs_extlen_t flen,
- bool userdata)
-{
- struct xfs_perag *pag;
- struct rb_node *rbp;
-
- ASSERT(flen > 0);
-
- pag = xfs_perag_get(mp, agno);
- spin_lock(&pag->pagb_lock);
-restart:
- rbp = pag->pagb_tree.rb_node;
- while (rbp) {
- struct xfs_busy_extent *busyp =
- rb_entry(rbp, struct xfs_busy_extent, rb_node);
- xfs_agblock_t bbno = busyp->bno;
- xfs_agblock_t bend = bbno + busyp->length;
-
- if (fbno + flen <= bbno) {
- rbp = rbp->rb_left;
- continue;
- } else if (fbno >= bend) {
- rbp = rbp->rb_right;
- continue;
- }
-
- if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen,
- userdata))
- goto restart;
- }
- spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
-}
-
-/*
- * For a given extent [fbno, flen], search the busy extent list to find a
- * subset of the extent that is not busy. If *rlen is smaller than
- * args->minlen no suitable extent could be found, and the higher level
- * code needs to force out the log and retry the allocation.
- */
-STATIC void
-xfs_alloc_busy_trim(
- struct xfs_alloc_arg *args,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- xfs_agblock_t *rbno,
- xfs_extlen_t *rlen)
-{
- xfs_agblock_t fbno;
- xfs_extlen_t flen;
- struct rb_node *rbp;
-
- ASSERT(len > 0);
-
- spin_lock(&args->pag->pagb_lock);
-restart:
- fbno = bno;
- flen = len;
- rbp = args->pag->pagb_tree.rb_node;
- while (rbp && flen >= args->minlen) {
- struct xfs_busy_extent *busyp =
- rb_entry(rbp, struct xfs_busy_extent, rb_node);
- xfs_agblock_t fend = fbno + flen;
- xfs_agblock_t bbno = busyp->bno;
- xfs_agblock_t bend = bbno + busyp->length;
-
- if (fend <= bbno) {
- rbp = rbp->rb_left;
- continue;
- } else if (fbno >= bend) {
- rbp = rbp->rb_right;
- continue;
- }
-
- /*
- * If this is a metadata allocation, try to reuse the busy
- * extent instead of trimming the allocation.
- */
- if (!args->userdata &&
- !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) {
- if (!xfs_alloc_busy_update_extent(args->mp, args->pag,
- busyp, fbno, flen,
- false))
- goto restart;
- continue;
- }
-
- if (bbno <= fbno) {
- /* start overlap */
-
- /*
- * Case 1:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +---------+
- * fbno fend
- *
- * Case 2:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +-------------+
- * fbno fend
- *
- * Case 3:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +-------------+
- * fbno fend
- *
- * Case 4:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +-----------------+
- * fbno fend
- *
- * No unbusy region in extent, return failure.
- */
- if (fend <= bend)
- goto fail;
-
- /*
- * Case 5:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +----------------------+
- * fbno fend
- *
- * Case 6:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +--------------------------+
- * fbno fend
- *
- * Needs to be trimmed to:
- * +-------+
- * fbno fend
- */
- fbno = bend;
- } else if (bend >= fend) {
- /* end overlap */
-
- /*
- * Case 7:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +------------------+
- * fbno fend
- *
- * Case 8:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +--------------------------+
- * fbno fend
- *
- * Needs to be trimmed to:
- * +-------+
- * fbno fend
- */
- fend = bbno;
- } else {
- /* middle overlap */
-
- /*
- * Case 9:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +-----------------------------------+
- * fbno fend
- *
- * Can be trimmed to:
- * +-------+ OR +-------+
- * fbno fend fbno fend
- *
- * Backward allocation leads to significant
- * fragmentation of directories, which degrades
- * directory performance, therefore we always want to
- * choose the option that produces forward allocation
- * patterns.
- * Preferring the lower bno extent will make the next
- * request use "fend" as the start of the next
- * allocation; if the segment is no longer busy at
- * that point, we'll get a contiguous allocation, but
- * even if it is still busy, we will get a forward
- * allocation.
- * We try to avoid choosing the segment at "bend",
- * because that can lead to the next allocation
- * taking the segment at "fbno", which would be a
- * backward allocation. We only use the segment at
- * "fbno" if it is much larger than the current
- * requested size, because in that case there's a
- * good chance subsequent allocations will be
- * contiguous.
- */
- if (bbno - fbno >= args->maxlen) {
- /* left candidate fits perfect */
- fend = bbno;
- } else if (fend - bend >= args->maxlen * 4) {
- /* right candidate has enough free space */
- fbno = bend;
- } else if (bbno - fbno >= args->minlen) {
- /* left candidate fits minimum requirement */
- fend = bbno;
- } else {
- goto fail;
- }
- }
-
- flen = fend - fbno;
- }
- spin_unlock(&args->pag->pagb_lock);
-
- if (fbno != bno || flen != len) {
- trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len,
- fbno, flen);
- }
- *rbno = fbno;
- *rlen = flen;
- return;
-fail:
- /*
- * Return a zero extent length as failure indications. All callers
- * re-check if the trimmed extent satisfies the minlen requirement.
- */
- spin_unlock(&args->pag->pagb_lock);
- trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
- *rbno = fbno;
- *rlen = 0;
-}
-
-static void
-xfs_alloc_busy_clear_one(
- struct xfs_mount *mp,
- struct xfs_perag *pag,
- struct xfs_busy_extent *busyp)
-{
- if (busyp->length) {
- trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno,
- busyp->length);
- rb_erase(&busyp->rb_node, &pag->pagb_tree);
- }
-
- list_del_init(&busyp->list);
- kmem_free(busyp);
-}
-
-/*
- * Remove all extents on the passed in list from the busy extents tree.
- * If do_discard is set skip extents that need to be discarded, and mark
- * these as undergoing a discard operation instead.
- */
-void
-xfs_alloc_busy_clear(
- struct xfs_mount *mp,
- struct list_head *list,
- bool do_discard)
-{
- struct xfs_busy_extent *busyp, *n;
- struct xfs_perag *pag = NULL;
- xfs_agnumber_t agno = NULLAGNUMBER;
-
- list_for_each_entry_safe(busyp, n, list, list) {
- if (busyp->agno != agno) {
- if (pag) {
- spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
- }
- pag = xfs_perag_get(mp, busyp->agno);
- spin_lock(&pag->pagb_lock);
- agno = busyp->agno;
- }
-
- if (do_discard && busyp->length &&
- !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD))
- busyp->flags = XFS_ALLOC_BUSY_DISCARDED;
- else
- xfs_alloc_busy_clear_one(mp, pag, busyp);
- }
-
- if (pag) {
- spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
- }
-}
-
-/*
- * Callback for list_sort to sort busy extents by the AG they reside in.
- */
-int
-xfs_busy_extent_ag_cmp(
- void *priv,
- struct list_head *a,
- struct list_head *b)
-{
- return container_of(a, struct xfs_busy_extent, list)->agno -
- container_of(b, struct xfs_busy_extent, list)->agno;
-}
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 2f52b924be7..93be4a667ca 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -23,7 +23,8 @@ struct xfs_btree_cur;
struct xfs_mount;
struct xfs_perag;
struct xfs_trans;
-struct xfs_busy_extent;
+
+extern struct workqueue_struct *xfs_alloc_wq;
/*
* Freespace allocation types. Argument to xfs_alloc_[v]extent.
@@ -119,6 +120,9 @@ typedef struct xfs_alloc_arg {
char isfl; /* set if is freelist blocks - !acctg */
char userdata; /* set if this is user data */
xfs_fsblock_t firstblock; /* io first block allocated */
+ struct completion *done;
+ struct work_struct work;
+ int result;
} xfs_alloc_arg_t;
/*
@@ -134,33 +138,6 @@ xfs_extlen_t
xfs_alloc_longest_free_extent(struct xfs_mount *mp,
struct xfs_perag *pag);
-#ifdef __KERNEL__
-void
-xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
- xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
-
-void
-xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list,
- bool do_discard);
-
-int
-xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agblock_t bno, xfs_extlen_t len);
-
-void
-xfs_alloc_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
-
-int
-xfs_busy_extent_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
-
-static inline void xfs_alloc_busy_sort(struct list_head *list)
-{
- list_sort(NULL, list, xfs_busy_extent_ag_cmp);
-}
-
-#endif /* __KERNEL__ */
-
/*
* Compute and fill in value of m_ag_maxlevels.
*/
@@ -243,6 +220,13 @@ xfs_alloc_lookup_le(
xfs_extlen_t len, /* length of extent */
int *stat); /* success/failure */
+int /* error */
+xfs_alloc_lookup_ge(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat); /* success/failure */
+
int /* error */
xfs_alloc_get_rec(
struct xfs_btree_cur *cur, /* btree cursor */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index ffb3386e45c..f1647caace8 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -18,9 +18,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -32,6 +30,7 @@
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
#include "xfs_error.h"
#include "xfs_trace.h"
@@ -94,7 +93,7 @@ xfs_allocbt_alloc_block(
return 0;
}
- xfs_alloc_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
+ xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
xfs_trans_agbtree_delta(cur->bc_tp, 1);
new->s = cpu_to_be32(bno);
@@ -119,8 +118,8 @@ xfs_allocbt_free_block(
if (error)
return error;
- xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
- XFS_ALLOC_BUSY_SKIP_DISCARD);
+ xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
+ XFS_EXTENT_BUSY_SKIP_DISCARD);
xfs_trans_agbtree_delta(cur->bc_tp, -1);
return 0;
}
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 574d4ee9b62..8dad722c004 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -16,9 +16,7 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_trans.h"
@@ -26,9 +24,9 @@
#include "xfs_bmap_btree.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
-#include "xfs_rw.h"
#include "xfs_iomap.h"
#include "xfs_vnodeops.h"
#include "xfs_trace.h"
@@ -99,24 +97,6 @@ xfs_destroy_ioend(
}
/*
- * If the end of the current ioend is beyond the current EOF,
- * return the new EOF value, otherwise zero.
- */
-STATIC xfs_fsize_t
-xfs_ioend_new_eof(
- xfs_ioend_t *ioend)
-{
- xfs_inode_t *ip = XFS_I(ioend->io_inode);
- xfs_fsize_t isize;
- xfs_fsize_t bsize;
-
- bsize = ioend->io_offset + ioend->io_size;
- isize = MAX(ip->i_size, ip->i_new_size);
- isize = MIN(isize, bsize);
- return isize > ip->i_d.di_size ? isize : 0;
-}
-
-/*
* Fast and loose check if this write could update the on-disk inode size.
*/
static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
@@ -125,36 +105,65 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
XFS_I(ioend->io_inode)->i_d.di_size;
}
+STATIC int
+xfs_setfilesize_trans_alloc(
+ struct xfs_ioend *ioend)
+{
+ struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
+ struct xfs_trans *tp;
+ int error;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+
+ error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+
+ ioend->io_append_trans = tp;
+
+ /*
+ * We hand off the transaction to the completion thread now, so
+ * clear the flag here.
+ */
+ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ return 0;
+}
+
/*
- * Update on-disk file size now that data has been written to disk. The
- * current in-memory file size is i_size. If a write is beyond eof i_new_size
- * will be the intended file size until i_size is updated. If this write does
- * not extend all the way to the valid file size then restrict this update to
- * the end of the write.
- *
- * This function does not block as blocking on the inode lock in IO completion
- * can lead to IO completion order dependency deadlocks.. If it can't get the
- * inode ilock it will return EAGAIN. Callers must handle this.
+ * Update on-disk file size now that data has been written to disk.
*/
STATIC int
xfs_setfilesize(
- xfs_ioend_t *ioend)
+ struct xfs_ioend *ioend)
{
- xfs_inode_t *ip = XFS_I(ioend->io_inode);
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_trans *tp = ioend->io_append_trans;
xfs_fsize_t isize;
- if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
- return EAGAIN;
+ /*
+ * The transaction was allocated in the I/O submission thread,
+ * thus we need to mark ourselves as beeing in a transaction
+ * manually.
+ */
+ current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
- isize = xfs_ioend_new_eof(ioend);
- if (isize) {
- trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
- ip->i_d.di_size = isize;
- xfs_mark_inode_dirty(ip);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
+ if (!isize) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_cancel(tp, 0);
+ return 0;
}
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return 0;
+ trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
+
+ ip->i_d.di_size = isize;
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ return xfs_trans_commit(tp, 0);
}
/*
@@ -168,10 +177,12 @@ xfs_finish_ioend(
struct xfs_ioend *ioend)
{
if (atomic_dec_and_test(&ioend->io_remaining)) {
+ struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
+
if (ioend->io_type == IO_UNWRITTEN)
- queue_work(xfsconvertd_workqueue, &ioend->io_work);
- else if (xfs_ioend_is_append(ioend))
- queue_work(xfsdatad_workqueue, &ioend->io_work);
+ queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
+ else if (ioend->io_append_trans)
+ queue_work(mp->m_data_workqueue, &ioend->io_work);
else
xfs_destroy_ioend(ioend);
}
@@ -200,35 +211,36 @@ xfs_end_io(
* range to normal written extens after the data I/O has finished.
*/
if (ioend->io_type == IO_UNWRITTEN) {
+ /*
+ * For buffered I/O we never preallocate a transaction when
+ * doing the unwritten extent conversion, but for direct I/O
+ * we do not know if we are converting an unwritten extent
+ * or not at the point where we preallocate the transaction.
+ */
+ if (ioend->io_append_trans) {
+ ASSERT(ioend->io_isdirect);
+
+ current_set_flags_nested(
+ &ioend->io_append_trans->t_pflags, PF_FSTRANS);
+ xfs_trans_cancel(ioend->io_append_trans, 0);
+ }
+
error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
ioend->io_size);
if (error) {
ioend->io_error = -error;
goto done;
}
+ } else if (ioend->io_append_trans) {
+ error = xfs_setfilesize(ioend);
+ if (error)
+ ioend->io_error = -error;
+ } else {
+ ASSERT(!xfs_ioend_is_append(ioend));
}
- /*
- * We might have to update the on-disk file size after extending
- * writes.
- */
- error = xfs_setfilesize(ioend);
- ASSERT(!error || error == EAGAIN);
-
done:
- /*
- * If we didn't complete processing of the ioend, requeue it to the
- * tail of the workqueue for another attempt later. Otherwise destroy
- * it.
- */
- if (error == EAGAIN) {
- atomic_inc(&ioend->io_remaining);
- xfs_finish_ioend(ioend);
- /* ensure we don't spin on blocked ioends */
- delay(1);
- } else {
- xfs_destroy_ioend(ioend);
- }
+ xfs_destroy_ioend(ioend);
}
/*
@@ -264,6 +276,7 @@ xfs_alloc_ioend(
*/
atomic_set(&ioend->io_remaining, 1);
ioend->io_isasync = 0;
+ ioend->io_isdirect = 0;
ioend->io_error = 0;
ioend->io_list = NULL;
ioend->io_type = type;
@@ -274,6 +287,7 @@ xfs_alloc_ioend(
ioend->io_size = 0;
ioend->io_iocb = NULL;
ioend->io_result = 0;
+ ioend->io_append_trans = NULL;
INIT_WORK(&ioend->io_work, xfs_end_io);
return ioend;
@@ -384,14 +398,6 @@ xfs_submit_ioend_bio(
atomic_inc(&ioend->io_remaining);
bio->bi_private = ioend;
bio->bi_end_io = xfs_end_bio;
-
- /*
- * If the I/O is beyond EOF we mark the inode dirty immediately
- * but don't update the inode size until I/O completion.
- */
- if (xfs_ioend_new_eof(ioend))
- xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
-
submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
}
@@ -614,7 +620,7 @@ xfs_map_at_offset(
* or delayed allocate extent.
*/
STATIC int
-xfs_is_delayed_page(
+xfs_check_page_type(
struct page *page,
unsigned int type)
{
@@ -628,11 +634,11 @@ xfs_is_delayed_page(
bh = head = page_buffers(page);
do {
if (buffer_unwritten(bh))
- acceptable = (type == IO_UNWRITTEN);
+ acceptable += (type == IO_UNWRITTEN);
else if (buffer_delay(bh))
- acceptable = (type == IO_DELALLOC);
+ acceptable += (type == IO_DELALLOC);
else if (buffer_dirty(bh) && buffer_mapped(bh))
- acceptable = (type == IO_OVERWRITE);
+ acceptable += (type == IO_OVERWRITE);
else
break;
} while ((bh = bh->b_this_page) != head);
@@ -675,7 +681,7 @@ xfs_convert_page(
goto fail_unlock_page;
if (page->mapping != inode->i_mapping)
goto fail_unlock_page;
- if (!xfs_is_delayed_page(page, (*ioendp)->io_type))
+ if (!xfs_check_page_type(page, (*ioendp)->io_type))
goto fail_unlock_page;
/*
@@ -825,7 +831,7 @@ xfs_aops_discard_page(
struct buffer_head *bh, *head;
loff_t offset = page_offset(page);
- if (!xfs_is_delayed_page(page, IO_DELALLOC))
+ if (!xfs_check_page_type(page, IO_DELALLOC))
goto out_invalidate;
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -975,10 +981,15 @@ xfs_vm_writepage(
imap_valid = 0;
}
} else {
- if (PageUptodate(page)) {
+ if (PageUptodate(page))
ASSERT(buffer_mapped(bh));
- imap_valid = 0;
- }
+ /*
+ * This buffer is not uptodate and will not be
+ * written to disk. Ensure that we will put any
+ * subsequent writeable buffers into a new
+ * ioend.
+ */
+ imap_valid = 0;
continue;
}
@@ -1038,8 +1049,20 @@ xfs_vm_writepage(
wbc, end_index);
}
- if (iohead)
+ if (iohead) {
+ /*
+ * Reserve log space if we might write beyond the on-disk
+ * inode size.
+ */
+ if (ioend->io_type != IO_UNWRITTEN &&
+ xfs_ioend_is_append(ioend)) {
+ err = xfs_setfilesize_trans_alloc(ioend);
+ if (err)
+ goto error;
+ }
+
xfs_submit_ioend(wbc, iohead);
+ }
return 0;
@@ -1125,7 +1148,14 @@ __xfs_get_blocks(
if (!create && direct && offset >= i_size_read(inode))
return 0;
- if (create) {
+ /*
+ * Direct I/O is usually done on preallocated files, so try getting
+ * a block mapping without an exclusive lock first. For buffered
+ * writes we already have the exclusive iolock anyway, so avoiding
+ * a lock roundtrip here by taking the ilock exclusive from the
+ * beginning is a useful micro optimization.
+ */
+ if (create && !direct) {
lockmode = XFS_ILOCK_EXCL;
xfs_ilock(ip, lockmode);
} else {
@@ -1147,23 +1177,45 @@ __xfs_get_blocks(
(!nimaps ||
(imap.br_startblock == HOLESTARTBLOCK ||
imap.br_startblock == DELAYSTARTBLOCK))) {
- if (direct) {
+ if (direct || xfs_get_extsz_hint(ip)) {
+ /*
+ * Drop the ilock in preparation for starting the block
+ * allocation transaction. It will be retaken
+ * exclusively inside xfs_iomap_write_direct for the
+ * actual allocation.
+ */
+ xfs_iunlock(ip, lockmode);
error = xfs_iomap_write_direct(ip, offset, size,
&imap, nimaps);
+ if (error)
+ return -error;
+ new = 1;
} else {
+ /*
+ * Delalloc reservations do not require a transaction,
+ * we can go on without dropping the lock here. If we
+ * are allocating a new delalloc block, make sure that
+ * we set the new flag so that we mark the buffer new so
+ * that we know that it is newly allocated if the write
+ * fails.
+ */
+ if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
+ new = 1;
error = xfs_iomap_write_delay(ip, offset, size, &imap);
+ if (error)
+ goto out_unlock;
+
+ xfs_iunlock(ip, lockmode);
}
- if (error)
- goto out_unlock;
trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
} else if (nimaps) {
trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
+ xfs_iunlock(ip, lockmode);
} else {
trace_xfs_get_blocks_notfound(ip, offset, size);
goto out_unlock;
}
- xfs_iunlock(ip, lockmode);
if (imap.br_startblock != HOLESTARTBLOCK &&
imap.br_startblock != DELAYSTARTBLOCK) {
@@ -1279,6 +1331,15 @@ xfs_end_io_direct_write(
struct xfs_ioend *ioend = iocb->private;
/*
+ * While the generic direct I/O code updates the inode size, it does
+ * so only after the end_io handler is called, which means our
+ * end_io handler thinks the on-disk size is outside the in-core
+ * size. To prevent this just update it a little bit earlier here.
+ */
+ if (offset + size > i_size_read(ioend->io_inode))
+ i_size_write(ioend->io_inode, offset + size);
+
+ /*
* blockdev_direct_IO can return an error even after the I/O
* completion handler was called. Thus we need to protect
* against double-freeing.
@@ -1310,17 +1371,32 @@ xfs_vm_direct_IO(
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
struct block_device *bdev = xfs_find_bdev_for_inode(inode);
+ struct xfs_ioend *ioend = NULL;
ssize_t ret;
if (rw & WRITE) {
- iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
+ size_t size = iov_length(iov, nr_segs);
+
+ /*
+ * We need to preallocate a transaction for a size update
+ * here. In the case that this write both updates the size
+ * and converts at least on unwritten extent we will cancel
+ * the still clean transaction after the I/O has finished.
+ */
+ iocb->private = ioend = xfs_alloc_ioend(inode, IO_DIRECT);
+ if (offset + size > XFS_I(inode)->i_d.di_size) {
+ ret = xfs_setfilesize_trans_alloc(ioend);
+ if (ret)
+ goto out_destroy_ioend;
+ ioend->io_isdirect = 1;
+ }
ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
offset, nr_segs,
xfs_get_blocks_direct,
xfs_end_io_direct_write, NULL, 0);
if (ret != -EIOCBQUEUED && iocb->private)
- xfs_destroy_ioend(iocb->private);
+ goto out_trans_cancel;
} else {
ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
offset, nr_segs,
@@ -1329,55 +1405,103 @@ xfs_vm_direct_IO(
}
return ret;
+
+out_trans_cancel:
+ if (ioend->io_append_trans) {
+ current_set_flags_nested(&ioend->io_append_trans->t_pflags,
+ PF_FSTRANS);
+ xfs_trans_cancel(ioend->io_append_trans, 0);
+ }
+out_destroy_ioend:
+ xfs_destroy_ioend(ioend);
+ return ret;
+}
+
+/*
+ * Punch out the delalloc blocks we have already allocated.
+ *
+ * Don't bother with xfs_setattr given that nothing can have made it to disk yet
+ * as the page is still locked at this point.
+ */
+STATIC void
+xfs_vm_kill_delalloc_range(
+ struct inode *inode,
+ loff_t start,
+ loff_t end)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ xfs_fileoff_t start_fsb;
+ xfs_fileoff_t end_fsb;
+ int error;
+
+ start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
+ end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
+ if (end_fsb <= start_fsb)
+ return;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+ end_fsb - start_fsb);
+ if (error) {
+ /* something screwed, just bail */
+ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ xfs_alert(ip->i_mount,
+ "xfs_vm_write_failed: unable to clean up ino %lld",
+ ip->i_ino);
+ }
+ }
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
STATIC void
xfs_vm_write_failed(
- struct address_space *mapping,
- loff_t to)
+ struct inode *inode,
+ struct page *page,
+ loff_t pos,
+ unsigned len)
{
- struct inode *inode = mapping->host;
+ loff_t block_offset = pos & PAGE_MASK;
+ loff_t block_start;
+ loff_t block_end;
+ loff_t from = pos & (PAGE_CACHE_SIZE - 1);
+ loff_t to = from + len;
+ struct buffer_head *bh, *head;
- if (to > inode->i_size) {
- /*
- * punch out the delalloc blocks we have already allocated. We
- * don't call xfs_setattr() to do this as we may be in the
- * middle of a multi-iovec write and so the vfs inode->i_size
- * will not match the xfs ip->i_size and so it will zero too
- * much. Hence we jus truncate the page cache to zero what is
- * necessary and punch the delalloc blocks directly.
- */
- struct xfs_inode *ip = XFS_I(inode);
- xfs_fileoff_t start_fsb;
- xfs_fileoff_t end_fsb;
- int error;
+ ASSERT(block_offset + from == pos);
- truncate_pagecache(inode, to, inode->i_size);
+ head = page_buffers(page);
+ block_start = 0;
+ for (bh = head; bh != head || !block_start;
+ bh = bh->b_this_page, block_start = block_end,
+ block_offset += bh->b_size) {
+ block_end = block_start + bh->b_size;
- /*
- * Check if there are any blocks that are outside of i_size
- * that need to be trimmed back.
- */
- start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1;
- end_fsb = XFS_B_TO_FSB(ip->i_mount, to);
- if (end_fsb <= start_fsb)
- return;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
- end_fsb - start_fsb);
- if (error) {
- /* something screwed, just bail */
- if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- xfs_alert(ip->i_mount,
- "xfs_vm_write_failed: unable to clean up ino %lld",
- ip->i_ino);
- }
- }
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ /* skip buffers before the write */
+ if (block_end <= from)
+ continue;
+
+ /* if the buffer is after the write, we're done */
+ if (block_start >= to)
+ break;
+
+ if (!buffer_delay(bh))
+ continue;
+
+ if (!buffer_new(bh) && block_offset < i_size_read(inode))
+ continue;
+
+ xfs_vm_kill_delalloc_range(inode, block_offset,
+ block_offset + bh->b_size);
}
+
}
+/*
+ * This used to call block_write_begin(), but it unlocks and releases the page
+ * on error, and we need that page to be able to punch stale delalloc blocks out
+ * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
+ * the appropriate point.
+ */
STATIC int
xfs_vm_write_begin(
struct file *file,
@@ -1388,15 +1512,40 @@ xfs_vm_write_begin(
struct page **pagep,
void **fsdata)
{
- int ret;
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ struct page *page;
+ int status;
- ret = block_write_begin(mapping, pos, len, flags | AOP_FLAG_NOFS,
- pagep, xfs_get_blocks);
- if (unlikely(ret))
- xfs_vm_write_failed(mapping, pos + len);
- return ret;
+ ASSERT(len <= PAGE_CACHE_SIZE);
+
+ page = grab_cache_page_write_begin(mapping, index,
+ flags | AOP_FLAG_NOFS);
+ if (!page)
+ return -ENOMEM;
+
+ status = __block_write_begin(page, pos, len, xfs_get_blocks);
+ if (unlikely(status)) {
+ struct inode *inode = mapping->host;
+
+ xfs_vm_write_failed(inode, page, pos, len);
+ unlock_page(page);
+
+ if (pos + len > i_size_read(inode))
+ truncate_pagecache(inode, pos + len, i_size_read(inode));
+
+ page_cache_release(page);
+ page = NULL;
+ }
+
+ *pagep = page;
+ return status;
}
+/*
+ * On failure, we only need to kill delalloc blocks beyond EOF because they
+ * will never be written. For blocks within EOF, generic_write_end() zeros them
+ * so they are safe to leave alone and be written with all the other valid data.
+ */
STATIC int
xfs_vm_write_end(
struct file *file,
@@ -1409,9 +1558,19 @@ xfs_vm_write_end(
{
int ret;
+ ASSERT(len <= PAGE_CACHE_SIZE);
+
ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
- if (unlikely(ret < len))
- xfs_vm_write_failed(mapping, pos + len);
+ if (unlikely(ret < len)) {
+ struct inode *inode = mapping->host;
+ size_t isize = i_size_read(inode);
+ loff_t to = pos + len;
+
+ if (to > isize) {
+ truncate_pagecache(inode, to, isize);
+ xfs_vm_kill_delalloc_range(inode, isize, to);
+ }
+ }
return ret;
}
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 116dd5c3703..84eafbcb0d9 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -18,8 +18,6 @@
#ifndef __XFS_AOPS_H__
#define __XFS_AOPS_H__
-extern struct workqueue_struct *xfsdatad_workqueue;
-extern struct workqueue_struct *xfsconvertd_workqueue;
extern mempool_t *xfs_ioend_pool;
/*
@@ -48,12 +46,14 @@ typedef struct xfs_ioend {
int io_error; /* I/O error code */
atomic_t io_remaining; /* hold count */
unsigned int io_isasync : 1; /* needs aio_complete */
+ unsigned int io_isdirect : 1;/* direct I/O */
struct inode *io_inode; /* file being written to */
struct buffer_head *io_buffer_head;/* buffer linked list head */
struct buffer_head *io_buffer_tail;/* buffer linked list tail */
size_t io_size; /* size of the extent */
xfs_off_t io_offset; /* offset in the file */
struct work_struct io_work; /* xfsdatad work queue */
+ struct xfs_trans *io_append_trans;/* xact. for size update */
struct kiocb *io_iocb;
int io_result;
} xfs_ioend_t;
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 1e5d97f86ea..a17ff01b5ad 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -21,7 +21,6 @@
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -39,7 +38,6 @@
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trans_space.h"
-#include "xfs_rw.h"
#include "xfs_vnodeops.h"
#include "xfs_trace.h"
@@ -827,10 +825,6 @@ xfs_attr_inactive(xfs_inode_t *dp)
if (error)
goto out;
- /*
- * Commit the last in the sequence of transactions.
- */
- xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
@@ -857,6 +851,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
{
int newsize, forkoff, retval;
+ trace_xfs_attr_sf_addname(args);
+
retval = xfs_attr_shortform_lookup(args);
if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
return(retval);
@@ -900,6 +896,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
xfs_dabuf_t *bp;
int retval, error, committed, forkoff;
+ trace_xfs_attr_leaf_addname(args);
+
/*
* Read the (only) block in the attribute list in.
*/
@@ -924,6 +922,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
xfs_da_brelse(args->trans, bp);
return(retval);
}
+
+ trace_xfs_attr_leaf_replace(args);
+
args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */
args->blkno2 = args->blkno; /* set 2nd entry info*/
args->index2 = args->index;
@@ -1094,6 +1095,8 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
xfs_dabuf_t *bp;
int error, committed, forkoff;
+ trace_xfs_attr_leaf_removename(args);
+
/*
* Remove the attribute.
*/
@@ -1227,6 +1230,8 @@ xfs_attr_node_addname(xfs_da_args_t *args)
xfs_mount_t *mp;
int committed, retval, error;
+ trace_xfs_attr_node_addname(args);
+
/*
* Fill in bucket of arguments/results/context to carry around.
*/
@@ -1253,6 +1258,9 @@ restart:
} else if (retval == EEXIST) {
if (args->flags & ATTR_CREATE)
goto out;
+
+ trace_xfs_attr_node_replace(args);
+
args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */
args->blkno2 = args->blkno; /* set 2nd entry info*/
args->index2 = args->index;
@@ -1484,6 +1492,8 @@ xfs_attr_node_removename(xfs_da_args_t *args)
xfs_dabuf_t *bp;
int retval, error, committed, forkoff;
+ trace_xfs_attr_node_removename(args);
+
/*
* Tie a string around our finger to remind us where we are.
*/
@@ -1975,14 +1985,12 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
(map[i].br_startblock != HOLESTARTBLOCK));
dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
- error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno,
- blkcnt, XBF_LOCK | XBF_DONT_BLOCK,
- &bp);
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+ dblkno, blkcnt, 0, &bp);
if (error)
return(error);
- tmp = (valuelen < XFS_BUF_SIZE(bp))
- ? valuelen : XFS_BUF_SIZE(bp);
+ tmp = min_t(int, valuelen, BBTOB(bp->b_length));
xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ);
xfs_buf_relse(bp);
dst += tmp;
@@ -2085,6 +2093,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
lblkno = args->rmtblkno;
valuelen = args->valuelen;
while (valuelen > 0) {
+ int buflen;
+
/*
* Try to remember where we decided to put the value.
*/
@@ -2102,15 +2112,16 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
- bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt,
- XBF_LOCK | XBF_DONT_BLOCK);
+ bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0);
if (!bp)
return ENOMEM;
- tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
- XFS_BUF_SIZE(bp);
+
+ buflen = BBTOB(bp->b_length);
+ tmp = min_t(int, valuelen, buflen);
xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE);
- if (tmp < XFS_BUF_SIZE(bp))
- xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
+ if (tmp < buflen)
+ xfs_buf_zero(bp, tmp, buflen - tmp);
+
error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */
xfs_buf_relse(bp);
if (error)
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index c1b55e59655..7d89d800f51 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -20,7 +20,6 @@
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -235,6 +234,8 @@ xfs_attr_shortform_create(xfs_da_args_t *args)
xfs_inode_t *dp;
xfs_ifork_t *ifp;
+ trace_xfs_attr_sf_create(args);
+
dp = args->dp;
ASSERT(dp != NULL);
ifp = dp->i_afp;
@@ -268,13 +269,11 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
xfs_inode_t *dp;
xfs_ifork_t *ifp;
+ trace_xfs_attr_sf_add(args);
+
dp = args->dp;
mp = dp->i_mount;
dp->i_d.di_forkoff = forkoff;
- dp->i_df.if_ext_max =
- XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
- dp->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
ifp = dp->i_afp;
ASSERT(ifp->if_flags & XFS_IFINLINE);
@@ -326,7 +325,6 @@ xfs_attr_fork_reset(
ASSERT(ip->i_d.di_anextents == 0);
ASSERT(ip->i_afp == NULL);
- ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
@@ -342,6 +340,8 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
xfs_mount_t *mp;
xfs_inode_t *dp;
+ trace_xfs_attr_sf_remove(args);
+
dp = args->dp;
mp = dp->i_mount;
base = sizeof(xfs_attr_sf_hdr_t);
@@ -389,10 +389,6 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
(args->op_flags & XFS_DA_OP_ADDNAME) ||
!(mp->m_flags & XFS_MOUNT_ATTR2) ||
dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
- dp->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
- dp->i_df.if_ext_max =
- XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
xfs_trans_log_inode(args->trans, dp,
XFS_ILOG_CORE | XFS_ILOG_ADATA);
}
@@ -414,6 +410,8 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
int i;
xfs_ifork_t *ifp;
+ trace_xfs_attr_sf_lookup(args);
+
ifp = args->dp->i_afp;
ASSERT(ifp->if_flags & XFS_IFINLINE);
sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
@@ -485,6 +483,8 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
xfs_dabuf_t *bp;
xfs_ifork_t *ifp;
+ trace_xfs_attr_sf_to_leaf(args);
+
dp = args->dp;
ifp = dp->i_afp;
sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
@@ -784,6 +784,8 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
char *tmpbuffer;
int error, i;
+ trace_xfs_attr_leaf_to_sf(args);
+
dp = args->dp;
tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP);
ASSERT(tmpbuffer != NULL);
@@ -857,6 +859,8 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args)
xfs_dablk_t blkno;
int error;
+ trace_xfs_attr_leaf_to_node(args);
+
dp = args->dp;
bp1 = bp2 = NULL;
error = xfs_da_grow_inode(args, &blkno);
@@ -920,6 +924,8 @@ xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
xfs_dabuf_t *bp;
int error;
+ trace_xfs_attr_leaf_create(args);
+
dp = args->dp;
ASSERT(dp != NULL);
error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
@@ -957,6 +963,8 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
xfs_dablk_t blkno;
int error;
+ trace_xfs_attr_leaf_split(state->args);
+
/*
* Allocate space for a new leaf node.
*/
@@ -986,10 +994,13 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
*
* Insert the "new" entry in the correct block.
*/
- if (state->inleaf)
+ if (state->inleaf) {
+ trace_xfs_attr_leaf_add_old(state->args);
error = xfs_attr_leaf_add(oldblk->bp, state->args);
- else
+ } else {
+ trace_xfs_attr_leaf_add_new(state->args);
error = xfs_attr_leaf_add(newblk->bp, state->args);
+ }
/*
* Update last hashval in each block since we added the name.
@@ -1010,6 +1021,8 @@ xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args)
xfs_attr_leaf_map_t *map;
int tablesize, entsize, sum, tmp, i;
+ trace_xfs_attr_leaf_add(args);
+
leaf = bp->data;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
ASSERT((args->index >= 0)
@@ -1137,8 +1150,6 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
(be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval)));
/*
- * Copy the attribute name and value into the new space.
- *
* For "remote" attribute values, simply note that we need to
* allocate space for the "remote" value. We can't actually
* allocate the extents in this transaction, and we can't decide
@@ -1274,6 +1285,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
args = state->args;
+ trace_xfs_attr_leaf_rebalance(args);
+
/*
* Check ordering of blocks, reverse if it makes things simpler.
*
@@ -1819,6 +1832,8 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
xfs_mount_t *mp;
char *tmpbuffer;
+ trace_xfs_attr_leaf_unbalance(state->args);
+
/*
* Set up environment.
*/
@@ -1928,6 +1943,8 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
int probe, span;
xfs_dahash_t hashval;
+ trace_xfs_attr_leaf_lookup(args);
+
leaf = bp->data;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
ASSERT(be16_to_cpu(leaf->hdr.count)
@@ -2454,6 +2471,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
char *name;
#endif /* DEBUG */
+ trace_xfs_attr_leaf_clearflag(args);
/*
* Set up the operation.
*/
@@ -2518,6 +2536,8 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
xfs_dabuf_t *bp;
int error;
+ trace_xfs_attr_leaf_setflag(args);
+
/*
* Set up the operation.
*/
@@ -2574,6 +2594,8 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
char *name1, *name2;
#endif /* DEBUG */
+ trace_xfs_attr_leaf_flipflags(args);
+
/*
* Read the block containing the "old" attr
*/
@@ -2960,7 +2982,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
map.br_blockcount);
bp = xfs_trans_get_buf(*trans,
dp->i_mount->m_ddev_targp,
- dblkno, dblkcnt, XBF_LOCK);
+ dblkno, dblkcnt, 0);
if (!bp)
return ENOMEM;
xfs_trans_binval(*trans, bp);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index d0ab7883705..58b815ec8c9 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -41,7 +41,6 @@
#include "xfs_rtalloc.h"
#include "xfs_error.h"
#include "xfs_attr_leaf.h"
-#include "xfs_rw.h"
#include "xfs_quota.h"
#include "xfs_trans_space.h"
#include "xfs_buf_item.h"
@@ -249,7 +248,27 @@ xfs_bmbt_lookup_ge(
}
/*
-* Update the record referred to by cur to the value given
+ * Check if the inode needs to be converted to btree format.
+ */
+static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
+{
+ return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) >
+ XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Check if the inode should be converted to extent format.
+ */
+static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
+{
+ return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) <=
+ XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Update the record referred to by cur to the value given
* by [off, bno, len, state].
* This either works (return 0) or gets an EFSCORRUPTED error.
*/
@@ -683,8 +702,8 @@ xfs_bmap_add_extent_delay_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+
+ if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist,
&bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
@@ -767,8 +786,8 @@ xfs_bmap_add_extent_delay_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+
+ if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist, &bma->cur, 1,
&tmp_rval, XFS_DATA_FORK);
@@ -836,8 +855,8 @@ xfs_bmap_add_extent_delay_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+
+ if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist, &bma->cur,
1, &tmp_rval, XFS_DATA_FORK);
@@ -884,8 +903,7 @@ xfs_bmap_add_extent_delay_real(
}
/* convert to a btree if necessary */
- if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) {
+ if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
int tmp_logflags; /* partial log flag return val */
ASSERT(bma->cur == NULL);
@@ -1421,8 +1439,7 @@ xfs_bmap_add_extent_unwritten_real(
}
/* convert to a btree if necessary */
- if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) {
+ if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
int tmp_logflags; /* partial log flag return val */
ASSERT(cur == NULL);
@@ -1812,8 +1829,7 @@ xfs_bmap_add_extent_hole_real(
}
/* convert to a btree if necessary */
- if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) {
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
int tmp_logflags; /* partial log flag return val */
ASSERT(bma->cur == NULL);
@@ -3037,8 +3053,7 @@ xfs_bmap_extents_to_btree(
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
/*
* Make space in the inode incore.
*/
@@ -3184,13 +3199,8 @@ xfs_bmap_forkoff_reset(
ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
uint dfl_forkoff = xfs_default_attroffset(ip) >> 3;
- if (dfl_forkoff > ip->i_d.di_forkoff) {
+ if (dfl_forkoff > ip->i_d.di_forkoff)
ip->i_d.di_forkoff = dfl_forkoff;
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
- ip->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t);
- }
}
}
@@ -3430,8 +3440,6 @@ xfs_bmap_add_attrfork(
int error; /* error return value */
ASSERT(XFS_IFORK_Q(ip) == 0);
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
mp = ip->i_mount;
ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
@@ -3486,12 +3494,9 @@ xfs_bmap_add_attrfork(
error = XFS_ERROR(EINVAL);
goto error1;
}
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+
ASSERT(ip->i_afp == NULL);
ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
- ip->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
ip->i_afp->if_flags = XFS_IFEXTENTS;
logflags = 0;
xfs_bmap_init(&flist, &firstblock);
@@ -3535,20 +3540,17 @@ xfs_bmap_add_attrfork(
} else
spin_unlock(&mp->m_sb_lock);
}
- if ((error = xfs_bmap_finish(&tp, &flist, &committed)))
+
+ error = xfs_bmap_finish(&tp, &flist, &committed);
+ if (error)
goto error2;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
- return error;
+ return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
error2:
xfs_bmap_cancel(&flist);
error1:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
error0:
xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
return error;
}
@@ -3994,11 +3996,8 @@ xfs_bmap_one_block(
xfs_bmbt_irec_t s; /* internal version of extent */
#ifndef DEBUG
- if (whichfork == XFS_DATA_FORK) {
- return S_ISREG(ip->i_d.di_mode) ?
- (ip->i_size == ip->i_mount->m_sb.sb_blocksize) :
- (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
- }
+ if (whichfork == XFS_DATA_FORK)
+ return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
#endif /* !DEBUG */
if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
return 0;
@@ -4010,7 +4009,7 @@ xfs_bmap_one_block(
xfs_bmbt_get_all(ep, &s);
rval = s.br_startoff == 0 && s.br_blockcount == 1;
if (rval && whichfork == XFS_DATA_FORK)
- ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize);
+ ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
return rval;
}
@@ -4379,8 +4378,6 @@ xfs_bmapi_read(
XFS_STATS_INC(xs_blk_mapr);
ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(NULL, ip, whichfork);
@@ -4529,7 +4526,7 @@ out_unreserve_blocks:
xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0);
out_unreserve_quota:
if (XFS_IS_QUOTA_ON(mp))
- xfs_trans_unreserve_quota_nblks(NULL, ip, alen, 0, rt ?
+ xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ?
XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
return error;
}
@@ -4871,8 +4868,6 @@ xfs_bmapi_write(
return XFS_ERROR(EIO);
ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
XFS_STATS_INC(xs_blk_mapw);
@@ -4981,8 +4976,7 @@ xfs_bmapi_write(
/*
* Transform from btree to extents, give it cur.
*/
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
+ if (xfs_bmap_wants_extents(ip, whichfork)) {
int tmp_logflags = 0;
ASSERT(bma.cur);
@@ -4992,10 +4986,10 @@ xfs_bmapi_write(
if (error)
goto error0;
}
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
- XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max);
+ XFS_IFORK_NEXTENTS(ip, whichfork) >
+ XFS_IFORK_MAXEXT(ip, whichfork));
error = 0;
error0:
/*
@@ -5095,8 +5089,7 @@ xfs_bunmapi(
ASSERT(len > 0);
ASSERT(nexts >= 0);
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
if (!(ifp->if_flags & XFS_IFEXTENTS) &&
(error = xfs_iread_extents(tp, ip, whichfork)))
return error;
@@ -5130,6 +5123,15 @@ xfs_bunmapi(
cur->bc_private.b.flags = 0;
} else
cur = NULL;
+
+ if (isrt) {
+ /*
+ * Synchronize by locking the bitmap inode.
+ */
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+ }
+
extno = 0;
while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
(nexts == 0 || extno < nexts)) {
@@ -5322,7 +5324,8 @@ xfs_bunmapi(
*/
if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
+ XFS_IFORK_MAXEXT(ip, whichfork) &&
del.br_startoff > got.br_startoff &&
del.br_startoff + del.br_blockcount <
got.br_startoff + got.br_blockcount) {
@@ -5353,13 +5356,11 @@ nodelete:
}
}
*done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
/*
* Convert to a btree if necessary.
*/
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
+ if (xfs_bmap_needs_btree(ip, whichfork)) {
ASSERT(cur == NULL);
error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
&cur, 0, &tmp_logflags, whichfork);
@@ -5370,8 +5371,7 @@ nodelete:
/*
* transform from btree to extents, give it cur
*/
- else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
+ else if (xfs_bmap_wants_extents(ip, whichfork)) {
ASSERT(cur != NULL);
error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
whichfork);
@@ -5382,8 +5382,6 @@ nodelete:
/*
* transform from extents to local?
*/
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
error = 0;
error0:
/*
@@ -5434,7 +5432,7 @@ xfs_getbmapx_fix_eof_hole(
if (startblock == HOLESTARTBLOCK) {
mp = ip->i_mount;
out->bmv_block = -1;
- fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, ip->i_size));
+ fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
fixlen -= out->bmv_offset;
if (prealloced && out->bmv_offset + out->bmv_length == end) {
/* Came to hole at EOF. Trim it. */
@@ -5522,7 +5520,7 @@ xfs_getbmap(
fixlen = XFS_MAXIOFFSET(mp);
} else {
prealloced = 0;
- fixlen = ip->i_size;
+ fixlen = XFS_ISIZE(ip);
}
}
@@ -5546,12 +5544,16 @@ xfs_getbmap(
if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
return XFS_ERROR(ENOMEM);
out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL);
- if (!out)
- return XFS_ERROR(ENOMEM);
+ if (!out) {
+ out = kmem_zalloc_large(bmv->bmv_count *
+ sizeof(struct getbmapx));
+ if (!out)
+ return XFS_ERROR(ENOMEM);
+ }
xfs_ilock(ip, XFS_IOLOCK_SHARED);
if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
- if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) {
+ if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF);
if (error)
goto out_unlock_iolock;
@@ -5618,8 +5620,20 @@ xfs_getbmap(
XFS_FSB_TO_BB(mp, map[i].br_blockcount);
out[cur_ext].bmv_unused1 = 0;
out[cur_ext].bmv_unused2 = 0;
- ASSERT(((iflags & BMV_IF_DELALLOC) != 0) ||
- (map[i].br_startblock != DELAYSTARTBLOCK));
+
+ /*
+ * delayed allocation extents that start beyond EOF can
+ * occur due to speculative EOF allocation when the
+ * delalloc extent is larger than the largest freespace
+ * extent at conversion time. These extents cannot be
+ * converted by data writeback, so can exist here even
+ * if we are not supposed to be finding delalloc
+ * extents.
+ */
+ if (map[i].br_startblock == DELAYSTARTBLOCK &&
+ map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
+ ASSERT((iflags & BMV_IF_DELALLOC) != 0);
+
if (map[i].br_startblock == HOLESTARTBLOCK &&
whichfork == XFS_ATTR_FORK) {
/* came to the end of attribute fork */
@@ -5671,7 +5685,10 @@ xfs_getbmap(
break;
}
- kmem_free(out);
+ if (is_vmalloc_addr(out))
+ kmem_free_large(out);
+ else
+ kmem_free(out);
return error;
}
@@ -6151,3 +6168,16 @@ next_block:
return error;
}
+
+/*
+ * Convert the given file system block to a disk block. We have to treat it
+ * differently based on whether the file is a real time file or not, because the
+ * bmap code does.
+ */
+xfs_daddr_t
+xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
+{
+ return (XFS_IS_REALTIME_INODE(ip) ? \
+ (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
+ XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
+}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 89ee672d378..803b56d7ce1 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -211,6 +211,9 @@ int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
int whichfork, int *count);
int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
xfs_fileoff_t start_fsb, xfs_fileoff_t length);
+
+xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
+
#endif /* __KERNEL__ */
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index e2f5d59cbea..862084a47a7 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -20,7 +20,6 @@
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 1f19f03af9d..e53e317b158 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -20,7 +20,6 @@
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index cf0ac056815..269b35c084d 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -35,18 +35,14 @@
#include <linux/freezer.h>
#include "xfs_sb.h"
-#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_trace.h"
static kmem_zone_t *xfs_buf_zone;
-STATIC int xfsbufd(void *);
static struct workqueue_struct *xfslogd_workqueue;
-struct workqueue_struct *xfsdatad_workqueue;
-struct workqueue_struct *xfsconvertd_workqueue;
#ifdef XFS_BUF_LOCK_TRACKING
# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid)
@@ -59,11 +55,7 @@ struct workqueue_struct *xfsconvertd_workqueue;
#endif
#define xb_to_gfp(flags) \
- ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \
- ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)
-
-#define xb_to_km(flags) \
- (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
+ ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)
static inline int
@@ -73,11 +65,11 @@ xfs_buf_is_vmapped(
/*
* Return true if the buffer is vmapped.
*
- * The XBF_MAPPED flag is set if the buffer should be mapped, but the
- * code is clever enough to know it doesn't have to map a single page,
- * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1.
+ * b_addr is null if the buffer is not mapped, but the code is clever
+ * enough to know it doesn't have to map a single page, so the check has
+ * to be both for b_addr and bp->b_page_count > 1.
*/
- return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1;
+ return bp->b_addr && bp->b_page_count > 1;
}
static inline int
@@ -146,8 +138,17 @@ void
xfs_buf_stale(
struct xfs_buf *bp)
{
+ ASSERT(xfs_buf_islocked(bp));
+
bp->b_flags |= XBF_STALE;
- xfs_buf_delwri_dequeue(bp);
+
+ /*
+ * Clear the delwri status so that a delwri queue walker will not
+ * flush this buffer to disk now that it is stale. The delwri queue has
+ * a reference to the buffer, so this is safe to do.
+ */
+ bp->b_flags &= ~_XBF_DELWRI_Q;
+
atomic_set(&(bp)->b_lru_ref, 0);
if (!list_empty(&bp->b_lru)) {
struct xfs_buftarg *btp = bp->b_target;
@@ -166,22 +167,22 @@ xfs_buf_stale(
struct xfs_buf *
xfs_buf_alloc(
struct xfs_buftarg *target,
- xfs_off_t range_base,
- size_t range_length,
+ xfs_daddr_t blkno,
+ size_t numblks,
xfs_buf_flags_t flags)
{
struct xfs_buf *bp;
- bp = kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags));
+ bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
if (unlikely(!bp))
return NULL;
/*
- * We don't want certain flags to appear in b_flags.
+ * We don't want certain flags to appear in b_flags unless they are
+ * specifically set by later operations on the buffer.
*/
- flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD);
+ flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
- memset(bp, 0, sizeof(xfs_buf_t));
atomic_set(&bp->b_hold, 1);
atomic_set(&bp->b_lru_ref, 1);
init_completion(&bp->b_iowait);
@@ -191,15 +192,16 @@ xfs_buf_alloc(
sema_init(&bp->b_sema, 0); /* held, no waiters */
XB_SET_OWNER(bp);
bp->b_target = target;
- bp->b_file_offset = range_base;
+
/*
- * Set buffer_length and count_desired to the same value initially.
- * I/O routines should use count_desired, which will be the same in
+ * Set length and io_length to the same value initially.
+ * I/O routines should use io_length, which will be the same in
* most cases but may be reset (e.g. XFS recovery).
*/
- bp->b_buffer_length = bp->b_count_desired = range_length;
+ bp->b_length = numblks;
+ bp->b_io_length = numblks;
bp->b_flags = flags;
- bp->b_bn = XFS_BUF_DADDR_NULL;
+ bp->b_bn = blkno;
atomic_set(&bp->b_pin_count, 0);
init_waitqueue_head(&bp->b_waiters);
@@ -221,13 +223,12 @@ _xfs_buf_get_pages(
{
/* Make sure that we have a page list */
if (bp->b_pages == NULL) {
- bp->b_offset = xfs_buf_poff(bp->b_file_offset);
bp->b_page_count = page_count;
if (page_count <= XB_PAGES) {
bp->b_pages = bp->b_page_array;
} else {
bp->b_pages = kmem_alloc(sizeof(struct page *) *
- page_count, xb_to_km(flags));
+ page_count, KM_NOFS);
if (bp->b_pages == NULL)
return -ENOMEM;
}
@@ -290,11 +291,11 @@ xfs_buf_allocate_memory(
xfs_buf_t *bp,
uint flags)
{
- size_t size = bp->b_count_desired;
+ size_t size;
size_t nbytes, offset;
gfp_t gfp_mask = xb_to_gfp(flags);
unsigned short page_count, i;
- xfs_off_t end;
+ xfs_off_t start, end;
int error;
/*
@@ -302,15 +303,15 @@ xfs_buf_allocate_memory(
* the memory from the heap - there's no need for the complexity of
* page arrays to keep allocation down to order 0.
*/
- if (bp->b_buffer_length < PAGE_SIZE) {
- bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
+ size = BBTOB(bp->b_length);
+ if (size < PAGE_SIZE) {
+ bp->b_addr = kmem_alloc(size, KM_NOFS);
if (!bp->b_addr) {
/* low memory - use alloc_page loop instead */
goto use_alloc_page;
}
- if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) &
- PAGE_MASK) !=
+ if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
((unsigned long)bp->b_addr & PAGE_MASK)) {
/* b_addr spans two pages - use alloc_page instead */
kmem_free(bp->b_addr);
@@ -321,13 +322,14 @@ xfs_buf_allocate_memory(
bp->b_pages = bp->b_page_array;
bp->b_pages[0] = virt_to_page(bp->b_addr);
bp->b_page_count = 1;
- bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
+ bp->b_flags |= _XBF_KMEM;
return 0;
}
use_alloc_page:
- end = bp->b_file_offset + bp->b_buffer_length;
- page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
+ start = BBTOB(bp->b_bn) >> PAGE_SHIFT;
+ end = (BBTOB(bp->b_bn + bp->b_length) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ page_count = end - start;
error = _xfs_buf_get_pages(bp, page_count, flags);
if (unlikely(error))
return error;
@@ -390,8 +392,9 @@ _xfs_buf_map_pages(
if (bp->b_page_count == 1) {
/* A single page buffer is always mappable */
bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
- bp->b_flags |= XBF_MAPPED;
- } else if (flags & XBF_MAPPED) {
+ } else if (flags & XBF_UNMAPPED) {
+ bp->b_addr = NULL;
+ } else {
int retried = 0;
do {
@@ -405,7 +408,6 @@ _xfs_buf_map_pages(
if (!bp->b_addr)
return -ENOMEM;
bp->b_addr += bp->b_offset;
- bp->b_flags |= XBF_MAPPED;
}
return 0;
@@ -422,29 +424,27 @@ _xfs_buf_map_pages(
*/
xfs_buf_t *
_xfs_buf_find(
- xfs_buftarg_t *btp, /* block device target */
- xfs_off_t ioff, /* starting offset of range */
- size_t isize, /* length of range */
+ struct xfs_buftarg *btp,
+ xfs_daddr_t blkno,
+ size_t numblks,
xfs_buf_flags_t flags,
xfs_buf_t *new_bp)
{
- xfs_off_t range_base;
- size_t range_length;
+ size_t numbytes;
struct xfs_perag *pag;
struct rb_node **rbp;
struct rb_node *parent;
xfs_buf_t *bp;
- range_base = (ioff << BBSHIFT);
- range_length = (isize << BBSHIFT);
+ numbytes = BBTOB(numblks);
/* Check for IOs smaller than the sector size / not sector aligned */
- ASSERT(!(range_length < (1 << btp->bt_sshift)));
- ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
+ ASSERT(!(numbytes < (1 << btp->bt_sshift)));
+ ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask));
/* get tree root */
pag = xfs_perag_get(btp->bt_mount,
- xfs_daddr_to_agno(btp->bt_mount, ioff));
+ xfs_daddr_to_agno(btp->bt_mount, blkno));
/* walk tree */
spin_lock(&pag->pag_buf_lock);
@@ -455,20 +455,20 @@ _xfs_buf_find(
parent = *rbp;
bp = rb_entry(parent, struct xfs_buf, b_rbnode);
- if (range_base < bp->b_file_offset)
+ if (blkno < bp->b_bn)
rbp = &(*rbp)->rb_left;
- else if (range_base > bp->b_file_offset)
+ else if (blkno > bp->b_bn)
rbp = &(*rbp)->rb_right;
else {
/*
- * found a block offset match. If the range doesn't
+ * found a block number match. If the range doesn't
* match, the only way this is allowed is if the buffer
* in the cache is stale and the transaction that made
* it stale has not yet committed. i.e. we are
* reallocating a busy extent. Skip this buffer and
* continue searching to the right for an exact match.
*/
- if (bp->b_buffer_length != range_length) {
+ if (bp->b_length != numblks) {
ASSERT(bp->b_flags & XBF_STALE);
rbp = &(*rbp)->rb_right;
continue;
@@ -513,7 +513,7 @@ found:
*/
if (bp->b_flags & XBF_STALE) {
ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
- bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
+ bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
}
trace_xfs_buf_find(bp, flags, _RET_IP_);
@@ -528,63 +528,54 @@ found:
*/
struct xfs_buf *
xfs_buf_get(
- xfs_buftarg_t *target,/* target for buffer */
- xfs_off_t ioff, /* starting offset of range */
- size_t isize, /* length of range */
+ xfs_buftarg_t *target,
+ xfs_daddr_t blkno,
+ size_t numblks,
xfs_buf_flags_t flags)
{
struct xfs_buf *bp;
struct xfs_buf *new_bp;
int error = 0;
- bp = _xfs_buf_find(target, ioff, isize, flags, NULL);
+ bp = _xfs_buf_find(target, blkno, numblks, flags, NULL);
if (likely(bp))
goto found;
- new_bp = xfs_buf_alloc(target, ioff << BBSHIFT, isize << BBSHIFT,
- flags);
+ new_bp = xfs_buf_alloc(target, blkno, numblks, flags);
if (unlikely(!new_bp))
return NULL;
- bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
- if (!bp) {
+ error = xfs_buf_allocate_memory(new_bp, flags);
+ if (error) {
kmem_zone_free(xfs_buf_zone, new_bp);
return NULL;
}
- if (bp == new_bp) {
- error = xfs_buf_allocate_memory(bp, flags);
- if (error)
- goto no_buffer;
- } else
- kmem_zone_free(xfs_buf_zone, new_bp);
+ bp = _xfs_buf_find(target, blkno, numblks, flags, new_bp);
+ if (!bp) {
+ xfs_buf_free(new_bp);
+ return NULL;
+ }
- /*
- * Now we have a workable buffer, fill in the block number so
- * that we can do IO on it.
- */
- bp->b_bn = ioff;
- bp->b_count_desired = bp->b_buffer_length;
+ if (bp != new_bp)
+ xfs_buf_free(new_bp);
+
+ bp->b_io_length = bp->b_length;
found:
- if (!(bp->b_flags & XBF_MAPPED)) {
+ if (!bp->b_addr) {
error = _xfs_buf_map_pages(bp, flags);
if (unlikely(error)) {
xfs_warn(target->bt_mount,
"%s: failed to map pages\n", __func__);
- goto no_buffer;
+ xfs_buf_relse(bp);
+ return NULL;
}
}
XFS_STATS_INC(xb_get);
trace_xfs_buf_get(bp, flags, _RET_IP_);
return bp;
-
-no_buffer:
- if (flags & (XBF_LOCK | XBF_TRYLOCK))
- xfs_buf_unlock(bp);
- xfs_buf_rele(bp);
- return NULL;
}
STATIC int
@@ -592,32 +583,30 @@ _xfs_buf_read(
xfs_buf_t *bp,
xfs_buf_flags_t flags)
{
- int status;
-
- ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
+ ASSERT(!(flags & XBF_WRITE));
ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
- bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | XBF_READ_AHEAD);
+ bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
- status = xfs_buf_iorequest(bp);
- if (status || bp->b_error || (flags & XBF_ASYNC))
- return status;
+ xfs_buf_iorequest(bp);
+ if (flags & XBF_ASYNC)
+ return 0;
return xfs_buf_iowait(bp);
}
xfs_buf_t *
xfs_buf_read(
xfs_buftarg_t *target,
- xfs_off_t ioff,
- size_t isize,
+ xfs_daddr_t blkno,
+ size_t numblks,
xfs_buf_flags_t flags)
{
xfs_buf_t *bp;
flags |= XBF_READ;
- bp = xfs_buf_get(target, ioff, isize, flags);
+ bp = xfs_buf_get(target, blkno, numblks, flags);
if (bp) {
trace_xfs_buf_read(bp, flags, _RET_IP_);
@@ -629,7 +618,8 @@ xfs_buf_read(
* Read ahead call which is already satisfied,
* drop the buffer
*/
- goto no_buffer;
+ xfs_buf_relse(bp);
+ return NULL;
} else {
/* We do not want read in the flags */
bp->b_flags &= ~XBF_READ;
@@ -637,12 +627,6 @@ xfs_buf_read(
}
return bp;
-
- no_buffer:
- if (flags & (XBF_LOCK | XBF_TRYLOCK))
- xfs_buf_unlock(bp);
- xfs_buf_rele(bp);
- return NULL;
}
/*
@@ -652,14 +636,14 @@ xfs_buf_read(
void
xfs_buf_readahead(
xfs_buftarg_t *target,
- xfs_off_t ioff,
- size_t isize)
+ xfs_daddr_t blkno,
+ size_t numblks)
{
if (bdi_read_congested(target->bt_bdi))
return;
- xfs_buf_read(target, ioff, isize,
- XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
+ xfs_buf_read(target, blkno, numblks,
+ XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
}
/*
@@ -668,16 +652,15 @@ xfs_buf_readahead(
*/
struct xfs_buf *
xfs_buf_read_uncached(
- struct xfs_mount *mp,
struct xfs_buftarg *target,
xfs_daddr_t daddr,
- size_t length,
+ size_t numblks,
int flags)
{
xfs_buf_t *bp;
int error;
- bp = xfs_buf_get_uncached(target, length, flags);
+ bp = xfs_buf_get_uncached(target, numblks, flags);
if (!bp)
return NULL;
@@ -685,9 +668,9 @@ xfs_buf_read_uncached(
XFS_BUF_SET_ADDR(bp, daddr);
XFS_BUF_READ(bp);
- xfsbdstrat(mp, bp);
+ xfsbdstrat(target->bt_mount, bp);
error = xfs_buf_iowait(bp);
- if (error || bp->b_error) {
+ if (error) {
xfs_buf_relse(bp);
return NULL;
}
@@ -701,7 +684,7 @@ xfs_buf_read_uncached(
void
xfs_buf_set_empty(
struct xfs_buf *bp,
- size_t len)
+ size_t numblks)
{
if (bp->b_pages)
_xfs_buf_free_pages(bp);
@@ -709,10 +692,9 @@ xfs_buf_set_empty(
bp->b_pages = NULL;
bp->b_page_count = 0;
bp->b_addr = NULL;
- bp->b_file_offset = 0;
- bp->b_buffer_length = bp->b_count_desired = len;
+ bp->b_length = numblks;
+ bp->b_io_length = numblks;
bp->b_bn = XFS_BUF_DADDR_NULL;
- bp->b_flags &= ~XBF_MAPPED;
}
static inline struct page *
@@ -751,7 +733,7 @@ xfs_buf_associate_memory(
bp->b_pages = NULL;
bp->b_addr = mem;
- rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK);
+ rval = _xfs_buf_get_pages(bp, page_count, 0);
if (rval)
return rval;
@@ -762,9 +744,8 @@ xfs_buf_associate_memory(
pageaddr += PAGE_SIZE;
}
- bp->b_count_desired = len;
- bp->b_buffer_length = buflen;
- bp->b_flags |= XBF_MAPPED;
+ bp->b_io_length = BTOBB(len);
+ bp->b_length = BTOBB(buflen);
return 0;
}
@@ -772,17 +753,18 @@ xfs_buf_associate_memory(
xfs_buf_t *
xfs_buf_get_uncached(
struct xfs_buftarg *target,
- size_t len,
+ size_t numblks,
int flags)
{
- unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
+ unsigned long page_count;
int error, i;
xfs_buf_t *bp;
- bp = xfs_buf_alloc(target, 0, len, 0);
+ bp = xfs_buf_alloc(target, XFS_BUF_DADDR_NULL, numblks, 0);
if (unlikely(bp == NULL))
goto fail;
+ page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
error = _xfs_buf_get_pages(bp, page_count, 0);
if (error)
goto fail_free_buf;
@@ -794,7 +776,7 @@ xfs_buf_get_uncached(
}
bp->b_flags |= _XBF_PAGES;
- error = _xfs_buf_map_pages(bp, XBF_MAPPED);
+ error = _xfs_buf_map_pages(bp, 0);
if (unlikely(error)) {
xfs_warn(target->bt_mount,
"%s: failed to map pages\n", __func__);
@@ -857,7 +839,7 @@ xfs_buf_rele(
spin_unlock(&pag->pag_buf_lock);
} else {
xfs_buf_lru_del(bp);
- ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
+ ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
@@ -917,13 +899,6 @@ xfs_buf_lock(
trace_xfs_buf_lock_done(bp, _RET_IP_);
}
-/*
- * Releases the lock on the buffer object.
- * If the buffer is marked delwri but is not queued, do so before we
- * unlock the buffer as we need to set flags correctly. We also need to
- * take a reference for the delwri queue because the unlocker is going to
- * drop their's and they don't know we just queued it.
- */
void
xfs_buf_unlock(
struct xfs_buf *bp)
@@ -1010,29 +985,8 @@ xfs_buf_ioerror_alert(
const char *func)
{
xfs_alert(bp->b_target->bt_mount,
-"metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zd",
- (__uint64_t)XFS_BUF_ADDR(bp), func,
- bp->b_error, XFS_BUF_COUNT(bp));
-}
-
-int
-xfs_bwrite(
- struct xfs_buf *bp)
-{
- int error;
-
- bp->b_flags |= XBF_WRITE;
- bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
-
- xfs_buf_delwri_dequeue(bp);
- xfs_bdstrat_cb(bp);
-
- error = xfs_buf_iowait(bp);
- if (error) {
- xfs_force_shutdown(bp->b_target->bt_mount,
- SHUTDOWN_META_IO_ERROR);
- }
- return error;
+"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d",
+ (__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length);
}
/*
@@ -1104,14 +1058,7 @@ xfs_bioerror_relse(
return EIO;
}
-
-/*
- * All xfs metadata buffers except log state machine buffers
- * get this attached as their b_bdstrat callback function.
- * This is so that we can catch a buffer
- * after prematurely unpinning it to forcibly shutdown the filesystem.
- */
-int
+STATIC int
xfs_bdstrat_cb(
struct xfs_buf *bp)
{
@@ -1132,6 +1079,27 @@ xfs_bdstrat_cb(
return 0;
}
+int
+xfs_bwrite(
+ struct xfs_buf *bp)
+{
+ int error;
+
+ ASSERT(xfs_buf_islocked(bp));
+
+ bp->b_flags |= XBF_WRITE;
+ bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q);
+
+ xfs_bdstrat_cb(bp);
+
+ error = xfs_buf_iowait(bp);
+ if (error) {
+ xfs_force_shutdown(bp->b_target->bt_mount,
+ SHUTDOWN_META_IO_ERROR);
+ }
+ return error;
+}
+
/*
* Wrapper around bdstrat so that we can stop data from going to disk in case
* we are shutting down the filesystem. Typically user data goes thru this
@@ -1183,7 +1151,7 @@ _xfs_buf_ioapply(
int rw, map_i, total_nr_pages, nr_pages;
struct bio *bio;
int offset = bp->b_offset;
- int size = bp->b_count_desired;
+ int size = BBTOB(bp->b_io_length);
sector_t sector = bp->b_bn;
total_nr_pages = bp->b_page_count;
@@ -1231,7 +1199,7 @@ next_chunk:
break;
offset = 0;
- sector += nbytes >> BBSHIFT;
+ sector += BTOBB(nbytes);
size -= nbytes;
total_nr_pages--;
}
@@ -1250,13 +1218,13 @@ next_chunk:
}
}
-int
+void
xfs_buf_iorequest(
xfs_buf_t *bp)
{
trace_xfs_buf_iorequest(bp, _RET_IP_);
- ASSERT(!(bp->b_flags & XBF_DELWRI));
+ ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
if (bp->b_flags & XBF_WRITE)
xfs_buf_wait_unpin(bp);
@@ -1268,16 +1236,15 @@ xfs_buf_iorequest(
*/
atomic_set(&bp->b_io_remaining, 1);
_xfs_buf_ioapply(bp);
- _xfs_buf_ioend(bp, 0);
+ _xfs_buf_ioend(bp, 1);
xfs_buf_rele(bp);
- return 0;
}
/*
- * Waits for I/O to complete on the buffer supplied.
- * It returns immediately if no I/O is pending.
- * It returns the I/O error code, if any, or 0 if there was no error.
+ * Waits for I/O to complete on the buffer supplied. It returns immediately if
+ * no I/O is pending or there is already a pending error on the buffer. It
+ * returns the I/O error code, if any, or 0 if there was no error.
*/
int
xfs_buf_iowait(
@@ -1285,7 +1252,8 @@ xfs_buf_iowait(
{
trace_xfs_buf_iowait(bp, _RET_IP_);
- wait_for_completion(&bp->b_iowait);
+ if (!bp->b_error)
+ wait_for_completion(&bp->b_iowait);
trace_xfs_buf_iowait_done(bp, _RET_IP_);
return bp->b_error;
@@ -1298,7 +1266,7 @@ xfs_buf_offset(
{
struct page *page;
- if (bp->b_flags & XBF_MAPPED)
+ if (bp->b_addr)
return bp->b_addr + offset;
offset += bp->b_offset;
@@ -1317,27 +1285,30 @@ xfs_buf_iomove(
void *data, /* data address */
xfs_buf_rw_t mode) /* read/write/zero flag */
{
- size_t bend, cpoff, csize;
- struct page *page;
+ size_t bend;
bend = boff + bsize;
while (boff < bend) {
- page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
- cpoff = xfs_buf_poff(boff + bp->b_offset);
- csize = min_t(size_t,
- PAGE_SIZE-cpoff, bp->b_count_desired-boff);
+ struct page *page;
+ int page_index, page_offset, csize;
+
+ page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
+ page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
+ page = bp->b_pages[page_index];
+ csize = min_t(size_t, PAGE_SIZE - page_offset,
+ BBTOB(bp->b_io_length) - boff);
- ASSERT(((csize + cpoff) <= PAGE_SIZE));
+ ASSERT((csize + page_offset) <= PAGE_SIZE);
switch (mode) {
case XBRW_ZERO:
- memset(page_address(page) + cpoff, 0, csize);
+ memset(page_address(page) + page_offset, 0, csize);
break;
case XBRW_READ:
- memcpy(data, page_address(page) + cpoff, csize);
+ memcpy(data, page_address(page) + page_offset, csize);
break;
case XBRW_WRITE:
- memcpy(page_address(page) + cpoff, data, csize);
+ memcpy(page_address(page) + page_offset, data, csize);
}
boff += csize;
@@ -1370,7 +1341,7 @@ restart:
goto restart;
}
/*
- * clear the LRU reference count so the bufer doesn't get
+ * clear the LRU reference count so the buffer doesn't get
* ignored in xfs_buf_rele().
*/
atomic_set(&bp->b_lru_ref, 0);
@@ -1437,11 +1408,9 @@ xfs_free_buftarg(
{
unregister_shrinker(&btp->bt_shrinker);
- xfs_flush_buftarg(btp, 1);
if (mp->m_flags & XFS_MOUNT_BARRIER)
xfs_blkdev_issue_flush(btp);
- kthread_stop(btp->bt_task);
kmem_free(btp);
}
@@ -1493,20 +1462,6 @@ xfs_setsize_buftarg(
return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
}
-STATIC int
-xfs_alloc_delwri_queue(
- xfs_buftarg_t *btp,
- const char *fsname)
-{
- INIT_LIST_HEAD(&btp->bt_delwri_queue);
- spin_lock_init(&btp->bt_delwri_lock);
- btp->bt_flags = 0;
- btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
- if (IS_ERR(btp->bt_task))
- return PTR_ERR(btp->bt_task);
- return 0;
-}
-
xfs_buftarg_t *
xfs_alloc_buftarg(
struct xfs_mount *mp,
@@ -1529,8 +1484,6 @@ xfs_alloc_buftarg(
spin_lock_init(&btp->bt_lru_lock);
if (xfs_setsize_buftarg_early(btp, bdev))
goto error;
- if (xfs_alloc_delwri_queue(btp, fsname))
- goto error;
btp->bt_shrinker.shrink = xfs_buftarg_shrink;
btp->bt_shrinker.seeks = DEFAULT_SEEKS;
register_shrinker(&btp->bt_shrinker);
@@ -1541,125 +1494,52 @@ error:
return NULL;
}
-
/*
- * Delayed write buffer handling
+ * Add a buffer to the delayed write list.
+ *
+ * This queues a buffer for writeout if it hasn't already been. Note that
+ * neither this routine nor the buffer list submission functions perform
+ * any internal synchronization. It is expected that the lists are thread-local
+ * to the callers.
+ *
+ * Returns true if we queued up the buffer, or false if it already had
+ * been on the buffer list.
*/
-void
+bool
xfs_buf_delwri_queue(
- xfs_buf_t *bp)
+ struct xfs_buf *bp,
+ struct list_head *list)
{
- struct xfs_buftarg *btp = bp->b_target;
-
- trace_xfs_buf_delwri_queue(bp, _RET_IP_);
-
+ ASSERT(xfs_buf_islocked(bp));
ASSERT(!(bp->b_flags & XBF_READ));
- spin_lock(&btp->bt_delwri_lock);
- if (!list_empty(&bp->b_list)) {
- /* if already in the queue, move it to the tail */
- ASSERT(bp->b_flags & _XBF_DELWRI_Q);
- list_move_tail(&bp->b_list, &btp->bt_delwri_queue);
- } else {
- /* start xfsbufd as it is about to have something to do */
- if (list_empty(&btp->bt_delwri_queue))
- wake_up_process(bp->b_target->bt_task);
-
- atomic_inc(&bp->b_hold);
- bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC;
- list_add_tail(&bp->b_list, &btp->bt_delwri_queue);
- }
- bp->b_queuetime = jiffies;
- spin_unlock(&btp->bt_delwri_lock);
-}
-
-void
-xfs_buf_delwri_dequeue(
- xfs_buf_t *bp)
-{
- int dequeued = 0;
-
- spin_lock(&bp->b_target->bt_delwri_lock);
- if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
- ASSERT(bp->b_flags & _XBF_DELWRI_Q);
- list_del_init(&bp->b_list);
- dequeued = 1;
+ /*
+ * If the buffer is already marked delwri it already is queued up
+ * by someone else for imediate writeout. Just ignore it in that
+ * case.
+ */
+ if (bp->b_flags & _XBF_DELWRI_Q) {
+ trace_xfs_buf_delwri_queued(bp, _RET_IP_);
+ return false;
}
- bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
- spin_unlock(&bp->b_target->bt_delwri_lock);
-
- if (dequeued)
- xfs_buf_rele(bp);
- trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
-}
-
-/*
- * If a delwri buffer needs to be pushed before it has aged out, then promote
- * it to the head of the delwri queue so that it will be flushed on the next
- * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
- * than the age currently needed to flush the buffer. Hence the next time the
- * xfsbufd sees it is guaranteed to be considered old enough to flush.
- */
-void
-xfs_buf_delwri_promote(
- struct xfs_buf *bp)
-{
- struct xfs_buftarg *btp = bp->b_target;
- long age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
-
- ASSERT(bp->b_flags & XBF_DELWRI);
- ASSERT(bp->b_flags & _XBF_DELWRI_Q);
+ trace_xfs_buf_delwri_queue(bp, _RET_IP_);
/*
- * Check the buffer age before locking the delayed write queue as we
- * don't need to promote buffers that are already past the flush age.
+ * If a buffer gets written out synchronously or marked stale while it
+ * is on a delwri list we lazily remove it. To do this, the other party
+ * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
+ * It remains referenced and on the list. In a rare corner case it
+ * might get readded to a delwri list after the synchronous writeout, in
+ * which case we need just need to re-add the flag here.
*/
- if (bp->b_queuetime < jiffies - age)
- return;
- bp->b_queuetime = jiffies - age;
- spin_lock(&btp->bt_delwri_lock);
- list_move(&bp->b_list, &btp->bt_delwri_queue);
- spin_unlock(&btp->bt_delwri_lock);
-}
-
-/*
- * Move as many buffers as specified to the supplied list
- * idicating if we skipped any buffers to prevent deadlocks.
- */
-STATIC int
-xfs_buf_delwri_split(
- xfs_buftarg_t *target,
- struct list_head *list,
- unsigned long age)
-{
- xfs_buf_t *bp, *n;
- int skipped = 0;
- int force;
-
- force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
- INIT_LIST_HEAD(list);
- spin_lock(&target->bt_delwri_lock);
- list_for_each_entry_safe(bp, n, &target->bt_delwri_queue, b_list) {
- ASSERT(bp->b_flags & XBF_DELWRI);
-
- if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) {
- if (!force &&
- time_before(jiffies, bp->b_queuetime + age)) {
- xfs_buf_unlock(bp);
- break;
- }
-
- bp->b_flags &= ~(XBF_DELWRI | _XBF_DELWRI_Q);
- bp->b_flags |= XBF_WRITE;
- list_move_tail(&bp->b_list, list);
- trace_xfs_buf_delwri_split(bp, _RET_IP_);
- } else
- skipped++;
+ bp->b_flags |= _XBF_DELWRI_Q;
+ if (list_empty(&bp->b_list)) {
+ atomic_inc(&bp->b_hold);
+ list_add_tail(&bp->b_list, list);
}
- spin_unlock(&target->bt_delwri_lock);
- return skipped;
+ return true;
}
/*
@@ -1685,103 +1565,109 @@ xfs_buf_cmp(
return 0;
}
-STATIC int
-xfsbufd(
- void *data)
+static int
+__xfs_buf_delwri_submit(
+ struct list_head *buffer_list,
+ struct list_head *io_list,
+ bool wait)
{
- xfs_buftarg_t *target = (xfs_buftarg_t *)data;
-
- current->flags |= PF_MEMALLOC;
-
- set_freezable();
-
- do {
- long age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
- long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
- struct list_head tmp;
- struct blk_plug plug;
-
- if (unlikely(freezing(current))) {
- set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
- refrigerator();
+ struct blk_plug plug;
+ struct xfs_buf *bp, *n;
+ int pinned = 0;
+
+ list_for_each_entry_safe(bp, n, buffer_list, b_list) {
+ if (!wait) {
+ if (xfs_buf_ispinned(bp)) {
+ pinned++;
+ continue;
+ }
+ if (!xfs_buf_trylock(bp))
+ continue;
} else {
- clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
+ xfs_buf_lock(bp);
}
- /* sleep for a long time if there is nothing to do. */
- if (list_empty(&target->bt_delwri_queue))
- tout = MAX_SCHEDULE_TIMEOUT;
- schedule_timeout_interruptible(tout);
+ /*
+ * Someone else might have written the buffer synchronously or
+ * marked it stale in the meantime. In that case only the
+ * _XBF_DELWRI_Q flag got cleared, and we have to drop the
+ * reference and remove it from the list here.
+ */
+ if (!(bp->b_flags & _XBF_DELWRI_Q)) {
+ list_del_init(&bp->b_list);
+ xfs_buf_relse(bp);
+ continue;
+ }
+
+ list_move_tail(&bp->b_list, io_list);
+ trace_xfs_buf_delwri_split(bp, _RET_IP_);
+ }
- xfs_buf_delwri_split(target, &tmp, age);
- list_sort(NULL, &tmp, xfs_buf_cmp);
+ list_sort(NULL, io_list, xfs_buf_cmp);
- blk_start_plug(&plug);
- while (!list_empty(&tmp)) {
- struct xfs_buf *bp;
- bp = list_first_entry(&tmp, struct xfs_buf, b_list);
+ blk_start_plug(&plug);
+ list_for_each_entry_safe(bp, n, io_list, b_list) {
+ bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC);
+ bp->b_flags |= XBF_WRITE;
+
+ if (!wait) {
+ bp->b_flags |= XBF_ASYNC;
list_del_init(&bp->b_list);
- xfs_bdstrat_cb(bp);
}
- blk_finish_plug(&plug);
- } while (!kthread_should_stop());
+ xfs_bdstrat_cb(bp);
+ }
+ blk_finish_plug(&plug);
- return 0;
+ return pinned;
}
/*
- * Go through all incore buffers, and release buffers if they belong to
- * the given device. This is used in filesystem error handling to
- * preserve the consistency of its metadata.
+ * Write out a buffer list asynchronously.
+ *
+ * This will take the @buffer_list, write all non-locked and non-pinned buffers
+ * out and not wait for I/O completion on any of the buffers. This interface
+ * is only safely useable for callers that can track I/O completion by higher
+ * level means, e.g. AIL pushing as the @buffer_list is consumed in this
+ * function.
*/
int
-xfs_flush_buftarg(
- xfs_buftarg_t *target,
- int wait)
+xfs_buf_delwri_submit_nowait(
+ struct list_head *buffer_list)
{
- xfs_buf_t *bp;
- int pincount = 0;
- LIST_HEAD(tmp_list);
- LIST_HEAD(wait_list);
- struct blk_plug plug;
+ LIST_HEAD (io_list);
+ return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
+}
- flush_workqueue(xfslogd_workqueue);
+/*
+ * Write out a buffer list synchronously.
+ *
+ * This will take the @buffer_list, write all buffers out and wait for I/O
+ * completion on all of the buffers. @buffer_list is consumed by the function,
+ * so callers must have some other way of tracking buffers if they require such
+ * functionality.
+ */
+int
+xfs_buf_delwri_submit(
+ struct list_head *buffer_list)
+{
+ LIST_HEAD (io_list);
+ int error = 0, error2;
+ struct xfs_buf *bp;
- set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
- pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
+ __xfs_buf_delwri_submit(buffer_list, &io_list, true);
- /*
- * Dropped the delayed write list lock, now walk the temporary list.
- * All I/O is issued async and then if we need to wait for completion
- * we do that after issuing all the IO.
- */
- list_sort(NULL, &tmp_list, xfs_buf_cmp);
+ /* Wait for IO to complete. */
+ while (!list_empty(&io_list)) {
+ bp = list_first_entry(&io_list, struct xfs_buf, b_list);
- blk_start_plug(&plug);
- while (!list_empty(&tmp_list)) {
- bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
- ASSERT(target == bp->b_target);
list_del_init(&bp->b_list);
- if (wait) {
- bp->b_flags &= ~XBF_ASYNC;
- list_add(&bp->b_list, &wait_list);
- }
- xfs_bdstrat_cb(bp);
- }
- blk_finish_plug(&plug);
-
- if (wait) {
- /* Wait for IO to complete. */
- while (!list_empty(&wait_list)) {
- bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
-
- list_del_init(&bp->b_list);
- xfs_buf_iowait(bp);
- xfs_buf_relse(bp);
- }
+ error2 = xfs_buf_iowait(bp);
+ xfs_buf_relse(bp);
+ if (!error)
+ error = error2;
}
- return pincount;
+ return error;
}
int __init
@@ -1797,21 +1683,8 @@ xfs_buf_init(void)
if (!xfslogd_workqueue)
goto out_free_buf_zone;
- xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
- if (!xfsdatad_workqueue)
- goto out_destroy_xfslogd_workqueue;
-
- xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
- WQ_MEM_RECLAIM, 1);
- if (!xfsconvertd_workqueue)
- goto out_destroy_xfsdatad_workqueue;
-
return 0;
- out_destroy_xfsdatad_workqueue:
- destroy_workqueue(xfsdatad_workqueue);
- out_destroy_xfslogd_workqueue:
- destroy_workqueue(xfslogd_workqueue);
out_free_buf_zone:
kmem_zone_destroy(xfs_buf_zone);
out:
@@ -1821,8 +1694,6 @@ xfs_buf_init(void)
void
xfs_buf_terminate(void)
{
- destroy_workqueue(xfsconvertd_workqueue);
- destroy_workqueue(xfsdatad_workqueue);
destroy_workqueue(xfslogd_workqueue);
kmem_zone_destroy(xfs_buf_zone);
}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 5bab046e859..79344c48008 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -21,7 +21,6 @@
#include <linux/list.h>
#include <linux/types.h>
#include <linux/spinlock.h>
-#include <asm/system.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
@@ -33,11 +32,6 @@
#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
-#define xfs_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE)
-#define xfs_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT)
-#define xfs_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT)
-#define xfs_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK)
-
typedef enum {
XBRW_READ = 1, /* transfer into target memory */
XBRW_WRITE = 2, /* transfer from target memory */
@@ -47,11 +41,9 @@ typedef enum {
#define XBF_READ (1 << 0) /* buffer intended for reading from device */
#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
-#define XBF_MAPPED (1 << 3) /* buffer mapped (b_addr valid) */
#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
-#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */
-#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */
+#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
/* I/O hints for the BIO layer */
#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */
@@ -59,14 +51,13 @@ typedef enum {
#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */
/* flags used only as arguments to access routines */
-#define XBF_LOCK (1 << 15)/* lock requested */
#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */
-#define XBF_DONT_BLOCK (1 << 17)/* do not block in current thread */
+#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */
/* flags used only internally */
#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */
#define _XBF_KMEM (1 << 21)/* backed by heap memory */
-#define _XBF_DELWRI_Q (1 << 22)/* buffer on delwri queue */
+#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
typedef unsigned int xfs_buf_flags_t;
@@ -74,26 +65,18 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_READ, "READ" }, \
{ XBF_WRITE, "WRITE" }, \
{ XBF_READ_AHEAD, "READ_AHEAD" }, \
- { XBF_MAPPED, "MAPPED" }, \
{ XBF_ASYNC, "ASYNC" }, \
{ XBF_DONE, "DONE" }, \
- { XBF_DELWRI, "DELWRI" }, \
{ XBF_STALE, "STALE" }, \
{ XBF_SYNCIO, "SYNCIO" }, \
{ XBF_FUA, "FUA" }, \
{ XBF_FLUSH, "FLUSH" }, \
- { XBF_LOCK, "LOCK" }, /* should never be set */\
- { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\
- { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\
+ { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\
+ { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }
-typedef enum {
- XBT_FORCE_SLEEP = 0,
- XBT_FORCE_FLUSH = 1,
-} xfs_buftarg_flags_t;
-
typedef struct xfs_buftarg {
dev_t bt_dev;
struct block_device *bt_bdev;
@@ -103,12 +86,6 @@ typedef struct xfs_buftarg {
unsigned int bt_sshift;
size_t bt_smask;
- /* per device delwri queue */
- struct task_struct *bt_task;
- struct list_head bt_delwri_queue;
- spinlock_t bt_delwri_lock;
- unsigned long bt_flags;
-
/* LRU control structures */
struct shrinker bt_shrinker;
struct list_head bt_lru;
@@ -130,8 +107,8 @@ typedef struct xfs_buf {
* fast-path on locking.
*/
struct rb_node b_rbnode; /* rbtree node */
- xfs_off_t b_file_offset; /* offset in file */
- size_t b_buffer_length;/* size of buffer in bytes */
+ xfs_daddr_t b_bn; /* block number for I/O */
+ int b_length; /* size of buffer in BBs */
atomic_t b_hold; /* reference count */
atomic_t b_lru_ref; /* lru reclaim ref count */
xfs_buf_flags_t b_flags; /* status flags */
@@ -142,8 +119,6 @@ typedef struct xfs_buf {
struct list_head b_list;
struct xfs_perag *b_pag; /* contains rbtree root */
xfs_buftarg_t *b_target; /* buffer target (device) */
- xfs_daddr_t b_bn; /* block number for I/O */
- size_t b_count_desired;/* desired transfer size */
void *b_addr; /* virtual address of buffer */
struct work_struct b_iodone_work;
xfs_buf_iodone_t b_iodone; /* I/O completion function */
@@ -152,7 +127,7 @@ typedef struct xfs_buf {
struct xfs_trans *b_transp;
struct page **b_pages; /* array of page pointers */
struct page *b_page_array[XB_PAGES]; /* inline pages */
- unsigned long b_queuetime; /* time buffer was queued */
+ int b_io_length; /* IO size in BBs */
atomic_t b_pin_count; /* pin count */
atomic_t b_io_remaining; /* #outstanding I/O requests */
unsigned int b_page_count; /* size of page array */
@@ -165,26 +140,30 @@ typedef struct xfs_buf {
/* Finding and Reading Buffers */
-extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t,
- xfs_buf_flags_t, xfs_buf_t *);
+struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, xfs_daddr_t blkno,
+ size_t numblks, xfs_buf_flags_t flags,
+ struct xfs_buf *new_bp);
#define xfs_incore(buftarg,blkno,len,lockit) \
_xfs_buf_find(buftarg, blkno ,len, lockit, NULL)
-extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t,
- xfs_buf_flags_t);
-extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
- xfs_buf_flags_t);
-
-struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *, xfs_off_t, size_t,
- xfs_buf_flags_t);
-extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len);
-extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
-extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
-extern void xfs_buf_hold(xfs_buf_t *);
-extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t);
-struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp,
- struct xfs_buftarg *target,
- xfs_daddr_t daddr, size_t length, int flags);
+struct xfs_buf *xfs_buf_get(struct xfs_buftarg *target, xfs_daddr_t blkno,
+ size_t numblks, xfs_buf_flags_t flags);
+struct xfs_buf *xfs_buf_read(struct xfs_buftarg *target, xfs_daddr_t blkno,
+ size_t numblks, xfs_buf_flags_t flags);
+void xfs_buf_readahead(struct xfs_buftarg *target, xfs_daddr_t blkno,
+ size_t numblks);
+
+struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks);
+struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *target, xfs_daddr_t blkno,
+ size_t numblks, xfs_buf_flags_t flags);
+void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks);
+int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
+
+struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
+ int flags);
+struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target,
+ xfs_daddr_t daddr, size_t numblks, int flags);
+void xfs_buf_hold(struct xfs_buf *bp);
/* Releasing Buffers */
extern void xfs_buf_free(xfs_buf_t *);
@@ -201,12 +180,11 @@ extern void xfs_buf_unlock(xfs_buf_t *);
extern int xfs_bwrite(struct xfs_buf *bp);
extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
-extern int xfs_bdstrat_cb(struct xfs_buf *);
extern void xfs_buf_ioend(xfs_buf_t *, int);
extern void xfs_buf_ioerror(xfs_buf_t *, int);
extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func);
-extern int xfs_buf_iorequest(xfs_buf_t *);
+extern void xfs_buf_iorequest(xfs_buf_t *);
extern int xfs_buf_iowait(xfs_buf_t *);
extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
xfs_buf_rw_t);
@@ -222,24 +200,22 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp)
extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
/* Delayed Write Buffer Routines */
-extern void xfs_buf_delwri_queue(struct xfs_buf *);
-extern void xfs_buf_delwri_dequeue(struct xfs_buf *);
-extern void xfs_buf_delwri_promote(struct xfs_buf *);
+extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
+extern int xfs_buf_delwri_submit(struct list_head *);
+extern int xfs_buf_delwri_submit_nowait(struct list_head *);
/* Buffer Daemon Setup Routines */
extern int xfs_buf_init(void);
extern void xfs_buf_terminate(void);
#define XFS_BUF_ZEROFLAGS(bp) \
- ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \
+ ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \
XBF_SYNCIO|XBF_FUA|XBF_FLUSH))
void xfs_buf_stale(struct xfs_buf *bp);
#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
-#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI)
-
#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE)
#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE)
#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE)
@@ -258,12 +234,6 @@ void xfs_buf_stale(struct xfs_buf *bp);
#define XFS_BUF_ADDR(bp) ((bp)->b_bn)
#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno))
-#define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset)
-#define XFS_BUF_SET_OFFSET(bp, off) ((bp)->b_file_offset = (off))
-#define XFS_BUF_COUNT(bp) ((bp)->b_count_desired)
-#define XFS_BUF_SET_COUNT(bp, cnt) ((bp)->b_count_desired = (cnt))
-#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
-#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
{
@@ -289,7 +259,6 @@ extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
extern void xfs_wait_buftarg(xfs_buftarg_t *);
extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
-extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index eac97ef81e2..d9e451115f9 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -20,7 +20,6 @@
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -123,11 +122,11 @@ xfs_buf_item_log_check(
ASSERT(bip->bli_logged != NULL);
bp = bip->bli_buf;
- ASSERT(XFS_BUF_COUNT(bp) > 0);
+ ASSERT(bp->b_length > 0);
ASSERT(bp->b_addr != NULL);
orig = bip->bli_orig;
buffer = bp->b_addr;
- for (x = 0; x < XFS_BUF_COUNT(bp); x++) {
+ for (x = 0; x < BBTOB(bp->b_length); x++) {
if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) {
xfs_emerg(bp->b_mount,
"%s: bip %x buffer %x orig %x index %d",
@@ -418,7 +417,6 @@ xfs_buf_item_unpin(
if (freed && stale) {
ASSERT(bip->bli_flags & XFS_BLI_STALE);
ASSERT(xfs_buf_islocked(bp));
- ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
ASSERT(XFS_BUF_ISSTALE(bp));
ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
@@ -455,42 +453,42 @@ xfs_buf_item_unpin(
bp->b_iodone = NULL;
} else {
spin_lock(&ailp->xa_lock);
- xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
+ xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR);
xfs_buf_item_relse(bp);
ASSERT(bp->b_fspriv == NULL);
}
xfs_buf_relse(bp);
+ } else if (freed && remove) {
+ xfs_buf_lock(bp);
+ xfs_buf_ioerror(bp, EIO);
+ XFS_BUF_UNDONE(bp);
+ xfs_buf_stale(bp);
+ xfs_buf_ioend(bp, 0);
}
}
-/*
- * This is called to attempt to lock the buffer associated with this
- * buf log item. Don't sleep on the buffer lock. If we can't get
- * the lock right away, return 0. If we can get the lock, take a
- * reference to the buffer. If this is a delayed write buffer that
- * needs AIL help to be written back, invoke the pushbuf routine
- * rather than the normal success path.
- */
STATIC uint
-xfs_buf_item_trylock(
- struct xfs_log_item *lip)
+xfs_buf_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;
+ uint rval = XFS_ITEM_SUCCESS;
if (xfs_buf_ispinned(bp))
return XFS_ITEM_PINNED;
if (!xfs_buf_trylock(bp))
return XFS_ITEM_LOCKED;
- /* take a reference to the buffer. */
- xfs_buf_hold(bp);
-
ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
- trace_xfs_buf_item_trylock(bip);
- if (XFS_BUF_ISDELAYWRITE(bp))
- return XFS_ITEM_PUSHBUF;
- return XFS_ITEM_SUCCESS;
+
+ trace_xfs_buf_item_push(bip);
+
+ if (!xfs_buf_delwri_queue(bp, buffer_list))
+ rval = XFS_ITEM_FLUSHING;
+ xfs_buf_unlock(bp);
+ return rval;
}
/*
@@ -603,49 +601,6 @@ xfs_buf_item_committed(
return lsn;
}
-/*
- * The buffer is locked, but is not a delayed write buffer. This happens
- * if we race with IO completion and hence we don't want to try to write it
- * again. Just release the buffer.
- */
-STATIC void
-xfs_buf_item_push(
- struct xfs_log_item *lip)
-{
- struct xfs_buf_log_item *bip = BUF_ITEM(lip);
- struct xfs_buf *bp = bip->bli_buf;
-
- ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
- ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
-
- trace_xfs_buf_item_push(bip);
-
- xfs_buf_relse(bp);
-}
-
-/*
- * The buffer is locked and is a delayed write buffer. Promote the buffer
- * in the delayed write queue as the caller knows that they must invoke
- * the xfsbufd to get this buffer written. We have to unlock the buffer
- * to allow the xfsbufd to write it, too.
- */
-STATIC bool
-xfs_buf_item_pushbuf(
- struct xfs_log_item *lip)
-{
- struct xfs_buf_log_item *bip = BUF_ITEM(lip);
- struct xfs_buf *bp = bip->bli_buf;
-
- ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
- ASSERT(XFS_BUF_ISDELAYWRITE(bp));
-
- trace_xfs_buf_item_pushbuf(bip);
-
- xfs_buf_delwri_promote(bp);
- xfs_buf_relse(bp);
- return true;
-}
-
STATIC void
xfs_buf_item_committing(
struct xfs_log_item *lip,
@@ -661,11 +616,9 @@ static const struct xfs_item_ops xfs_buf_item_ops = {
.iop_format = xfs_buf_item_format,
.iop_pin = xfs_buf_item_pin,
.iop_unpin = xfs_buf_item_unpin,
- .iop_trylock = xfs_buf_item_trylock,
.iop_unlock = xfs_buf_item_unlock,
.iop_committed = xfs_buf_item_committed,
.iop_push = xfs_buf_item_push,
- .iop_pushbuf = xfs_buf_item_pushbuf,
.iop_committing = xfs_buf_item_committing
};
@@ -703,7 +656,8 @@ xfs_buf_item_init(
* truncate any pieces. map_size is the size of the
* bitmap needed to describe the chunks of the buffer.
*/
- chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT);
+ chunks = (int)((BBTOB(bp->b_length) + (XFS_BLF_CHUNK - 1)) >>
+ XFS_BLF_SHIFT);
map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
@@ -713,7 +667,7 @@ xfs_buf_item_init(
xfs_buf_hold(bp);
bip->bli_format.blf_type = XFS_LI_BUF;
bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
- bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
+ bip->bli_format.blf_len = (ushort)bp->b_length;
bip->bli_format.blf_map_size = map_size;
#ifdef XFS_TRANS_DEBUG
@@ -725,9 +679,9 @@ xfs_buf_item_init(
* the buffer to indicate which bytes the callers have asked
* to have logged.
*/
- bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP);
- memcpy(bip->bli_orig, bp->b_addr, XFS_BUF_COUNT(bp));
- bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP);
+ bip->bli_orig = kmem_alloc(BBTOB(bp->b_length), KM_SLEEP);
+ memcpy(bip->bli_orig, bp->b_addr, BBTOB(bp->b_length));
+ bip->bli_logged = kmem_zalloc(BBTOB(bp->b_length) / NBBY, KM_SLEEP);
#endif
/*
@@ -984,20 +938,27 @@ xfs_buf_iodone_callbacks(
* If the write was asynchronous then no one will be looking for the
* error. Clear the error state and write the buffer out again.
*
- * During sync or umount we'll write all pending buffers again
- * synchronous, which will catch these errors if they keep hanging
- * around.
+ * XXX: This helps against transient write errors, but we need to find
+ * a way to shut the filesystem down if the writes keep failing.
+ *
+ * In practice we'll shut the filesystem down soon as non-transient
+ * erorrs tend to affect the whole device and a failing log write
+ * will make us give up. But we really ought to do better here.
*/
if (XFS_BUF_ISASYNC(bp)) {
+ ASSERT(bp->b_iodone != NULL);
+
+ trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+
xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
if (!XFS_BUF_ISSTALE(bp)) {
- xfs_buf_delwri_queue(bp);
- XFS_BUF_DONE(bp);
+ bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE;
+ xfs_buf_iorequest(bp);
+ } else {
+ xfs_buf_relse(bp);
}
- ASSERT(bp->b_iodone != NULL);
- trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
- xfs_buf_relse(bp);
+
return;
}
@@ -1045,6 +1006,6 @@ xfs_buf_iodone(
* Either way, AIL is useless if we're forcing a shutdown.
*/
spin_lock(&ailp->xa_lock);
- xfs_trans_ail_delete(ailp, lip);
+ xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
xfs_buf_item_free(BUF_ITEM(lip));
}
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 77c74257c2a..015b946c580 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -20,7 +20,6 @@
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -108,6 +107,8 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
int error;
xfs_trans_t *tp;
+ trace_xfs_da_node_create(args);
+
tp = args->trans;
error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork);
if (error)
@@ -140,6 +141,8 @@ xfs_da_split(xfs_da_state_t *state)
xfs_dabuf_t *bp;
int max, action, error, i;
+ trace_xfs_da_split(state->args);
+
/*
* Walk back up the tree splitting/inserting/adjusting as necessary.
* If we need to insert and there isn't room, split the node, then
@@ -178,10 +181,12 @@ xfs_da_split(xfs_da_state_t *state)
state->extravalid = 1;
if (state->inleaf) {
state->extraafter = 0; /* before newblk */
+ trace_xfs_attr_leaf_split_before(state->args);
error = xfs_attr_leaf_split(state, oldblk,
&state->extrablk);
} else {
state->extraafter = 1; /* after newblk */
+ trace_xfs_attr_leaf_split_after(state->args);
error = xfs_attr_leaf_split(state, newblk,
&state->extrablk);
}
@@ -300,6 +305,8 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
xfs_mount_t *mp;
xfs_dir2_leaf_t *leaf;
+ trace_xfs_da_root_split(state->args);
+
/*
* Copy the existing (incorrect) block from the root node position
* to a free space somewhere.
@@ -380,6 +387,8 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
int newcount, error;
int useextra;
+ trace_xfs_da_node_split(state->args);
+
node = oldblk->bp->data;
ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
@@ -466,6 +475,8 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
int count, tmp;
xfs_trans_t *tp;
+ trace_xfs_da_node_rebalance(state->args);
+
node1 = blk1->bp->data;
node2 = blk2->bp->data;
/*
@@ -574,6 +585,8 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
xfs_da_node_entry_t *btree;
int tmp;
+ trace_xfs_da_node_add(state->args);
+
node = oldblk->bp->data;
ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count)));
@@ -619,6 +632,8 @@ xfs_da_join(xfs_da_state_t *state)
xfs_da_state_blk_t *drop_blk, *save_blk;
int action, error;
+ trace_xfs_da_join(state->args);
+
action = 0;
drop_blk = &state->path.blk[ state->path.active-1 ];
save_blk = &state->altpath.blk[ state->path.active-1 ];
@@ -723,6 +738,8 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
xfs_dabuf_t *bp;
int error;
+ trace_xfs_da_root_join(state->args);
+
args = state->args;
ASSERT(args != NULL);
ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
@@ -941,6 +958,8 @@ xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk)
xfs_da_node_entry_t *btree;
int tmp;
+ trace_xfs_da_node_remove(state->args);
+
node = drop_blk->bp->data;
ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count));
ASSERT(drop_blk->index >= 0);
@@ -984,6 +1003,8 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
int tmp;
xfs_trans_t *tp;
+ trace_xfs_da_node_unbalance(state->args);
+
drop_node = drop_blk->bp->data;
save_node = save_blk->bp->data;
ASSERT(drop_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
@@ -1230,6 +1251,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
/*
* Link new block in before existing block.
*/
+ trace_xfs_da_link_before(args);
new_info->forw = cpu_to_be32(old_blk->blkno);
new_info->back = old_info->back;
if (old_info->back) {
@@ -1251,6 +1273,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
/*
* Link new block in after existing block.
*/
+ trace_xfs_da_link_after(args);
new_info->forw = old_info->forw;
new_info->back = cpu_to_be32(old_blk->blkno);
if (old_info->forw) {
@@ -1348,6 +1371,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
* Unlink the leaf block from the doubly linked chain of leaves.
*/
if (be32_to_cpu(save_info->back) == drop_blk->blkno) {
+ trace_xfs_da_unlink_back(args);
save_info->back = drop_info->back;
if (drop_info->back) {
error = xfs_da_read_buf(args->trans, args->dp,
@@ -1365,6 +1389,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
xfs_da_buf_done(bp);
}
} else {
+ trace_xfs_da_unlink_forward(args);
save_info->forw = drop_info->forw;
if (drop_info->forw) {
error = xfs_da_read_buf(args->trans, args->dp,
@@ -1652,6 +1677,8 @@ xfs_da_grow_inode(
int count;
int error;
+ trace_xfs_da_grow_inode(args);
+
if (args->whichfork == XFS_DATA_FORK) {
bno = args->dp->i_mount->m_dirleafblk;
count = args->dp->i_mount->m_dirblkfsbs;
@@ -1690,6 +1717,8 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
xfs_dir2_leaf_t *dead_leaf2;
xfs_dahash_t dead_hash;
+ trace_xfs_da_swap_lastblock(args);
+
dead_buf = *dead_bufp;
dead_blkno = *dead_blknop;
tp = args->trans;
@@ -1878,6 +1907,8 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
xfs_trans_t *tp;
xfs_mount_t *mp;
+ trace_xfs_da_shrink_inode(args);
+
dp = args->dp;
w = args->whichfork;
tp = args->trans;
@@ -2245,20 +2276,20 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps)
if (nbuf == 1) {
dabuf->nbuf = 1;
bp = bps[0];
- dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp));
+ dabuf->bbcount = bp->b_length;
dabuf->data = bp->b_addr;
dabuf->bps[0] = bp;
} else {
dabuf->nbuf = nbuf;
for (i = 0, dabuf->bbcount = 0; i < nbuf; i++) {
dabuf->bps[i] = bp = bps[i];
- dabuf->bbcount += BTOBB(XFS_BUF_COUNT(bp));
+ dabuf->bbcount += bp->b_length;
}
dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP);
- for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) {
+ for (i = off = 0; i < nbuf; i++, off += BBTOB(bp->b_length)) {
bp = bps[i];
memcpy((char *)dabuf->data + off, bp->b_addr,
- XFS_BUF_COUNT(bp));
+ BBTOB(bp->b_length));
}
}
return dabuf;
@@ -2278,10 +2309,10 @@ xfs_da_buf_clean(xfs_dabuf_t *dabuf)
ASSERT(dabuf->nbuf > 1);
dabuf->dirty = 0;
for (i = off = 0; i < dabuf->nbuf;
- i++, off += XFS_BUF_COUNT(bp)) {
+ i++, off += BBTOB(bp->b_length)) {
bp = dabuf->bps[i];
memcpy(bp->b_addr, dabuf->data + off,
- XFS_BUF_COUNT(bp));
+ BBTOB(bp->b_length));
}
}
}
@@ -2324,10 +2355,10 @@ xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last)
}
dabuf->dirty = 1;
ASSERT(first <= last);
- for (i = off = 0; i < dabuf->nbuf; i++, off += XFS_BUF_COUNT(bp)) {
+ for (i = off = 0; i < dabuf->nbuf; i++, off += BBTOB(bp->b_length)) {
bp = dabuf->bps[i];
f = off;
- l = f + XFS_BUF_COUNT(bp) - 1;
+ l = f + BBTOB(bp->b_length) - 1;
if (f < first)
f = first;
if (l > last)
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 654dc6f05ba..e00de08dc8a 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -18,9 +18,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -163,12 +161,14 @@ xfs_swap_extents_check_format(
/* Check temp in extent form to max in target */
if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max)
+ XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
+ XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
return EINVAL;
/* Check target in extent form to max in temp */
if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
+ XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
+ XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
return EINVAL;
/*
@@ -180,18 +180,25 @@ xfs_swap_extents_check_format(
* (a common defrag case) which will occur when the temp inode is in
* extent format...
*/
- if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
- ((XFS_IFORK_BOFF(ip) &&
- tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) ||
- XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max))
- return EINVAL;
+ if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ if (XFS_IFORK_BOFF(ip) &&
+ tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
+ return EINVAL;
+ if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
+ XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+ return EINVAL;
+ }
/* Reciprocal target->temp btree format checks */
- if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
- ((XFS_IFORK_BOFF(tip) &&
- ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) ||
- XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max))
- return EINVAL;
+ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ if (XFS_IFORK_BOFF(tip) &&
+ ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
+ return EINVAL;
+
+ if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
+ XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+ return EINVAL;
+ }
return 0;
}
@@ -206,7 +213,7 @@ xfs_swap_extents(
xfs_trans_t *tp;
xfs_bstat_t *sbp = &sxp->sx_stat;
xfs_ifork_t *tempifp, *ifp, *tifp;
- int ilf_fields, tilf_fields;
+ int src_log_flags, target_log_flags;
int error = 0;
int aforkblks = 0;
int taforkblks = 0;
@@ -349,16 +356,6 @@ xfs_swap_extents(
*tifp = *tempifp; /* struct copy */
/*
- * Fix the in-memory data fork values that are dependent on the fork
- * offset in the inode. We can't assume they remain the same as attr2
- * has dynamic fork offsets.
- */
- ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
- (uint)sizeof(xfs_bmbt_rec_t);
- tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
- (uint)sizeof(xfs_bmbt_rec_t);
-
- /*
* Fix the on-disk inode values
*/
tmp = (__uint64_t)ip->i_d.di_nblocks;
@@ -386,9 +383,8 @@ xfs_swap_extents(
tip->i_delayed_blks = ip->i_delayed_blks;
ip->i_delayed_blks = 0;
- ilf_fields = XFS_ILOG_CORE;
-
- switch(ip->i_d.di_format) {
+ src_log_flags = XFS_ILOG_CORE;
+ switch (ip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
/* If the extents fit in the inode, fix the
* pointer. Otherwise it's already NULL or
@@ -398,16 +394,15 @@ xfs_swap_extents(
ifp->if_u1.if_extents =
ifp->if_u2.if_inline_ext;
}
- ilf_fields |= XFS_ILOG_DEXT;
+ src_log_flags |= XFS_ILOG_DEXT;
break;
case XFS_DINODE_FMT_BTREE:
- ilf_fields |= XFS_ILOG_DBROOT;
+ src_log_flags |= XFS_ILOG_DBROOT;
break;
}
- tilf_fields = XFS_ILOG_CORE;
-
- switch(tip->i_d.di_format) {
+ target_log_flags = XFS_ILOG_CORE;
+ switch (tip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
/* If the extents fit in the inode, fix the
* pointer. Otherwise it's already NULL or
@@ -417,10 +412,10 @@ xfs_swap_extents(
tifp->if_u1.if_extents =
tifp->if_u2.if_inline_ext;
}
- tilf_fields |= XFS_ILOG_DEXT;
+ target_log_flags |= XFS_ILOG_DEXT;
break;
case XFS_DINODE_FMT_BTREE:
- tilf_fields |= XFS_ILOG_DBROOT;
+ target_log_flags |= XFS_ILOG_DBROOT;
break;
}
@@ -428,8 +423,8 @@ xfs_swap_extents(
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- xfs_trans_log_inode(tp, ip, ilf_fields);
- xfs_trans_log_inode(tp, tip, tilf_fields);
+ xfs_trans_log_inode(tp, ip, src_log_flags);
+ xfs_trans_log_inode(tp, tip, target_log_flags);
/*
* If this is a synchronous mount, make sure that the
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index a2e27010c7f..67a250c36d4 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -18,7 +18,6 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 9245e029b8e..586732f2d80 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -19,7 +19,6 @@
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -29,6 +28,7 @@
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
+#include "xfs_dir2.h"
#include "xfs_dir2_format.h"
#include "xfs_dir2_priv.h"
#include "xfs_error.h"
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 5bbe2a8a023..2046988e9eb 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -19,7 +19,6 @@
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 66e108f561a..397ffbcbab1 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -20,7 +20,6 @@
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 0179a41d9e5..b0f26780449 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -19,7 +19,6 @@
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 79d05e84e29..19bf0c5e38f 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -19,7 +19,6 @@
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 8a24f0c6c86..f9c3fe304a1 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -17,7 +17,6 @@
*/
#include "xfs.h"
#include "xfs_sb.h"
-#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
@@ -30,6 +29,7 @@
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
+#include "xfs_extent_busy.h"
#include "xfs_discard.h"
#include "xfs_trace.h"
@@ -37,9 +37,9 @@ STATIC int
xfs_trim_extents(
struct xfs_mount *mp,
xfs_agnumber_t agno,
- xfs_fsblock_t start,
- xfs_fsblock_t end,
- xfs_fsblock_t minlen,
+ xfs_daddr_t start,
+ xfs_daddr_t end,
+ xfs_daddr_t minlen,
__uint64_t *blocks_trimmed)
{
struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
@@ -67,8 +67,8 @@ xfs_trim_extents(
/*
* Look up the longest btree in the AGF and start with it.
*/
- error = xfs_alloc_lookup_le(cur, 0,
- XFS_BUF_TO_AGF(agbp)->agf_longest, &i);
+ error = xfs_alloc_lookup_ge(cur, 0,
+ be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i);
if (error)
goto out_del_cursor;
@@ -77,19 +77,29 @@ xfs_trim_extents(
* enough to be worth discarding.
*/
while (i) {
- xfs_agblock_t fbno;
- xfs_extlen_t flen;
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ xfs_daddr_t dbno;
+ xfs_extlen_t dlen;
error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
if (error)
goto out_del_cursor;
XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
- ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest);
+ ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest));
+
+ /*
+ * use daddr format for all range/len calculations as that is
+ * the format the range/len variables are supplied in by
+ * userspace.
+ */
+ dbno = XFS_AGB_TO_DADDR(mp, agno, fbno);
+ dlen = XFS_FSB_TO_BB(mp, flen);
/*
* Too small? Give up.
*/
- if (flen < minlen) {
+ if (dlen < minlen) {
trace_xfs_discard_toosmall(mp, agno, fbno, flen);
goto out_del_cursor;
}
@@ -99,8 +109,7 @@ xfs_trim_extents(
* supposed to discard skip it. Do not bother to trim
* down partially overlapping ranges for now.
*/
- if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
- XFS_AGB_TO_FSB(mp, agno, fbno) > end) {
+ if (dbno + dlen < start || dbno > end) {
trace_xfs_discard_exclude(mp, agno, fbno, flen);
goto next_extent;
}
@@ -109,16 +118,13 @@ xfs_trim_extents(
* If any blocks in the range are still busy, skip the
* discard and try again the next time.
*/
- if (xfs_alloc_busy_search(mp, agno, fbno, flen)) {
+ if (xfs_extent_busy_search(mp, agno, fbno, flen)) {
trace_xfs_discard_busy(mp, agno, fbno, flen);
goto next_extent;
}
trace_xfs_discard_extent(mp, agno, fbno, flen);
- error = -blkdev_issue_discard(bdev,
- XFS_AGB_TO_DADDR(mp, agno, fbno),
- XFS_FSB_TO_BB(mp, flen),
- GFP_NOFS, 0);
+ error = -blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0);
if (error)
goto out_del_cursor;
*blocks_trimmed += flen;
@@ -137,6 +143,15 @@ out_put_perag:
return error;
}
+/*
+ * trim a range of the filesystem.
+ *
+ * Note: the parameters passed from userspace are byte ranges into the
+ * filesystem which does not match to the format we use for filesystem block
+ * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
+ * is a linear address range. Hence we need to use DADDR based conversions and
+ * comparisons for determining the correct offset and regions to trim.
+ */
int
xfs_ioc_trim(
struct xfs_mount *mp,
@@ -145,7 +160,7 @@ xfs_ioc_trim(
struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
unsigned int granularity = q->limits.discard_granularity;
struct fstrim_range range;
- xfs_fsblock_t start, end, minlen;
+ xfs_daddr_t start, end, minlen;
xfs_agnumber_t start_agno, end_agno, agno;
__uint64_t blocks_trimmed = 0;
int error, last_error = 0;
@@ -159,22 +174,22 @@ xfs_ioc_trim(
/*
* Truncating down the len isn't actually quite correct, but using
- * XFS_B_TO_FSB would mean we trivially get overflows for values
+ * BBTOB would mean we trivially get overflows for values
* of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
* used by the fstrim application. In the end it really doesn't
* matter as trimming blocks is an advisory interface.
*/
- start = XFS_B_TO_FSBT(mp, range.start);
- end = start + XFS_B_TO_FSBT(mp, range.len) - 1;
- minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
+ start = BTOBB(range.start);
+ end = start + BTOBBT(range.len) - 1;
+ minlen = BTOBB(max_t(u64, granularity, range.minlen));
- if (start >= mp->m_sb.sb_dblocks)
+ if (XFS_BB_TO_FSB(mp, start) >= mp->m_sb.sb_dblocks)
return -XFS_ERROR(EINVAL);
- if (end > mp->m_sb.sb_dblocks - 1)
- end = mp->m_sb.sb_dblocks - 1;
+ if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1)
+ end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1;
- start_agno = XFS_FSB_TO_AGNO(mp, start);
- end_agno = XFS_FSB_TO_AGNO(mp, end);
+ start_agno = xfs_daddr_to_agno(mp, start);
+ end_agno = xfs_daddr_to_agno(mp, end);
for (agno = start_agno; agno <= end_agno; agno++) {
error = -xfs_trim_extents(mp, agno, start, end, minlen,
@@ -197,7 +212,7 @@ xfs_discard_extents(
struct xfs_mount *mp,
struct list_head *list)
{
- struct xfs_busy_extent *busyp;
+ struct xfs_extent_busy *busyp;
int error = 0;
list_for_each_entry(busyp, list, list) {
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 25d7280e9f6..bf27fcca484 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -19,7 +19,6 @@
#include "xfs_fs.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -39,20 +38,18 @@
#include "xfs_qm.h"
#include "xfs_trace.h"
-
/*
- LOCK ORDER
-
- inode lock (ilock)
- dquot hash-chain lock (hashlock)
- xqm dquot freelist lock (freelistlock
- mount's dquot list lock (mplistlock)
- user dquot lock - lock ordering among dquots is based on the uid or gid
- group dquot lock - similar to udquots. Between the two dquots, the udquot
- has to be locked first.
- pin lock - the dquot lock must be held to take this lock.
- flush lock - ditto.
-*/
+ * Lock order:
+ *
+ * ip->i_lock
+ * qi->qi_tree_lock
+ * dquot->q_qlock (xfs_dqlock() and friends)
+ * dquot->q_flush (xfs_dqflock() and friends)
+ * qi->qi_lru_lock
+ *
+ * If two dquots need to be locked the order is user before group/project,
+ * otherwise by the lowest id first, see xfs_dqlock2.
+ */
#ifdef DEBUG
xfs_buftarg_t *xfs_dqerror_target;
@@ -61,83 +58,10 @@ int xfs_dqreq_num;
int xfs_dqerror_mod = 33;
#endif
-static struct lock_class_key xfs_dquot_other_class;
-
-/*
- * Allocate and initialize a dquot. We don't always allocate fresh memory;
- * we try to reclaim a free dquot if the number of incore dquots are above
- * a threshold.
- * The only field inside the core that gets initialized at this point
- * is the d_id field. The idea is to fill in the entire q_core
- * when we read in the on disk dquot.
- */
-STATIC xfs_dquot_t *
-xfs_qm_dqinit(
- xfs_mount_t *mp,
- xfs_dqid_t id,
- uint type)
-{
- xfs_dquot_t *dqp;
- boolean_t brandnewdquot;
-
- brandnewdquot = xfs_qm_dqalloc_incore(&dqp);
- dqp->dq_flags = type;
- dqp->q_core.d_id = cpu_to_be32(id);
- dqp->q_mount = mp;
-
- /*
- * No need to re-initialize these if this is a reclaimed dquot.
- */
- if (brandnewdquot) {
- INIT_LIST_HEAD(&dqp->q_freelist);
- mutex_init(&dqp->q_qlock);
- init_waitqueue_head(&dqp->q_pinwait);
-
- /*
- * Because we want to use a counting completion, complete
- * the flush completion once to allow a single access to
- * the flush completion without blocking.
- */
- init_completion(&dqp->q_flush);
- complete(&dqp->q_flush);
-
- trace_xfs_dqinit(dqp);
- } else {
- /*
- * Only the q_core portion was zeroed in dqreclaim_one().
- * So, we need to reset others.
- */
- dqp->q_nrefs = 0;
- dqp->q_blkno = 0;
- INIT_LIST_HEAD(&dqp->q_mplist);
- INIT_LIST_HEAD(&dqp->q_hashlist);
- dqp->q_bufoffset = 0;
- dqp->q_fileoffset = 0;
- dqp->q_transp = NULL;
- dqp->q_gdquot = NULL;
- dqp->q_res_bcount = 0;
- dqp->q_res_icount = 0;
- dqp->q_res_rtbcount = 0;
- atomic_set(&dqp->q_pincount, 0);
- dqp->q_hash = NULL;
- ASSERT(list_empty(&dqp->q_freelist));
-
- trace_xfs_dqreuse(dqp);
- }
-
- /*
- * In either case we need to make sure group quotas have a different
- * lock class than user quotas, to make sure lockdep knows we can
- * locks of one of each at the same time.
- */
- if (!(type & XFS_DQ_USER))
- lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
+struct kmem_zone *xfs_qm_dqtrxzone;
+static struct kmem_zone *xfs_qm_dqzone;
- /*
- * log item gets initialized later
- */
- return (dqp);
-}
+static struct lock_class_key xfs_dquot_other_class;
/*
* This is called to free all the memory associated with a dquot
@@ -146,30 +70,12 @@ void
xfs_qm_dqdestroy(
xfs_dquot_t *dqp)
{
- ASSERT(list_empty(&dqp->q_freelist));
+ ASSERT(list_empty(&dqp->q_lru));
mutex_destroy(&dqp->q_qlock);
- kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
+ kmem_zone_free(xfs_qm_dqzone, dqp);
- atomic_dec(&xfs_Gqm->qm_totaldquots);
-}
-
-/*
- * This is what a 'fresh' dquot inside a dquot chunk looks like on disk.
- */
-STATIC void
-xfs_qm_dqinit_core(
- xfs_dqid_t id,
- uint type,
- xfs_dqblk_t *d)
-{
- /*
- * Caller has zero'd the entire dquot 'chunk' already.
- */
- d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
- d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
- d->dd_diskdq.d_id = cpu_to_be32(id);
- d->dd_diskdq.d_flags = type;
+ XFS_STATS_DEC(xs_qm_dquot);
}
/*
@@ -234,10 +140,10 @@ xfs_qm_adjust_dqtimers(
if (!d->d_btimer) {
if ((d->d_blk_softlimit &&
- (be64_to_cpu(d->d_bcount) >=
+ (be64_to_cpu(d->d_bcount) >
be64_to_cpu(d->d_blk_softlimit))) ||
(d->d_blk_hardlimit &&
- (be64_to_cpu(d->d_bcount) >=
+ (be64_to_cpu(d->d_bcount) >
be64_to_cpu(d->d_blk_hardlimit)))) {
d->d_btimer = cpu_to_be32(get_seconds() +
mp->m_quotainfo->qi_btimelimit);
@@ -246,10 +152,10 @@ xfs_qm_adjust_dqtimers(
}
} else {
if ((!d->d_blk_softlimit ||
- (be64_to_cpu(d->d_bcount) <
+ (be64_to_cpu(d->d_bcount) <=
be64_to_cpu(d->d_blk_softlimit))) &&
(!d->d_blk_hardlimit ||
- (be64_to_cpu(d->d_bcount) <
+ (be64_to_cpu(d->d_bcount) <=
be64_to_cpu(d->d_blk_hardlimit)))) {
d->d_btimer = 0;
}
@@ -257,10 +163,10 @@ xfs_qm_adjust_dqtimers(
if (!d->d_itimer) {
if ((d->d_ino_softlimit &&
- (be64_to_cpu(d->d_icount) >=
+ (be64_to_cpu(d->d_icount) >
be64_to_cpu(d->d_ino_softlimit))) ||
(d->d_ino_hardlimit &&
- (be64_to_cpu(d->d_icount) >=
+ (be64_to_cpu(d->d_icount) >
be64_to_cpu(d->d_ino_hardlimit)))) {
d->d_itimer = cpu_to_be32(get_seconds() +
mp->m_quotainfo->qi_itimelimit);
@@ -269,10 +175,10 @@ xfs_qm_adjust_dqtimers(
}
} else {
if ((!d->d_ino_softlimit ||
- (be64_to_cpu(d->d_icount) <
+ (be64_to_cpu(d->d_icount) <=
be64_to_cpu(d->d_ino_softlimit))) &&
(!d->d_ino_hardlimit ||
- (be64_to_cpu(d->d_icount) <
+ (be64_to_cpu(d->d_icount) <=
be64_to_cpu(d->d_ino_hardlimit)))) {
d->d_itimer = 0;
}
@@ -280,10 +186,10 @@ xfs_qm_adjust_dqtimers(
if (!d->d_rtbtimer) {
if ((d->d_rtb_softlimit &&
- (be64_to_cpu(d->d_rtbcount) >=
+ (be64_to_cpu(d->d_rtbcount) >
be64_to_cpu(d->d_rtb_softlimit))) ||
(d->d_rtb_hardlimit &&
- (be64_to_cpu(d->d_rtbcount) >=
+ (be64_to_cpu(d->d_rtbcount) >
be64_to_cpu(d->d_rtb_hardlimit)))) {
d->d_rtbtimer = cpu_to_be32(get_seconds() +
mp->m_quotainfo->qi_rtbtimelimit);
@@ -292,10 +198,10 @@ xfs_qm_adjust_dqtimers(
}
} else {
if ((!d->d_rtb_softlimit ||
- (be64_to_cpu(d->d_rtbcount) <
+ (be64_to_cpu(d->d_rtbcount) <=
be64_to_cpu(d->d_rtb_softlimit))) &&
(!d->d_rtb_hardlimit ||
- (be64_to_cpu(d->d_rtbcount) <
+ (be64_to_cpu(d->d_rtbcount) <=
be64_to_cpu(d->d_rtb_hardlimit)))) {
d->d_rtbtimer = 0;
}
@@ -328,8 +234,13 @@ xfs_qm_init_dquot_blk(
curid = id - (id % q->qi_dqperchunk);
ASSERT(curid >= 0);
memset(d, 0, BBTOB(q->qi_dqchunklen));
- for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
- xfs_qm_dqinit_core(curid, type, d);
+ for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) {
+ d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+ d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
+ d->dd_diskdq.d_id = cpu_to_be32(curid);
+ d->dd_diskdq.d_flags = type;
+ }
+
xfs_trans_dquot_buf(tp, bp,
(type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
@@ -372,7 +283,7 @@ xfs_qm_dqalloc(
* Return if this type of quotas is turned off while we didn't
* have an inode lock
*/
- if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
+ if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
xfs_iunlock(quotip, XFS_ILOCK_EXCL);
return (ESRCH);
}
@@ -474,7 +385,7 @@ xfs_qm_dqtobp(
dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
xfs_ilock(quotip, XFS_ILOCK_SHARED);
- if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
+ if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
/*
* Return if this type of quotas is turned off while we
* didn't have the quota inode lock.
@@ -564,36 +475,87 @@ xfs_qm_dqtobp(
* Read in the ondisk dquot using dqtobp() then copy it to an incore version,
* and release the buffer immediately.
*
+ * If XFS_QMOPT_DQALLOC is set, allocate a dquot on disk if it needed.
*/
-/* ARGSUSED */
-STATIC int
+int
xfs_qm_dqread(
- xfs_trans_t **tpp,
- xfs_dqid_t id,
- xfs_dquot_t *dqp, /* dquot to get filled in */
- uint flags)
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ uint type,
+ uint flags,
+ struct xfs_dquot **O_dqpp)
{
- xfs_disk_dquot_t *ddqp;
- xfs_buf_t *bp;
- int error;
- xfs_trans_t *tp;
+ struct xfs_dquot *dqp;
+ struct xfs_disk_dquot *ddqp;
+ struct xfs_buf *bp;
+ struct xfs_trans *tp = NULL;
+ int error;
+ int cancelflags = 0;
+
+
+ dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP);
+
+ dqp->dq_flags = type;
+ dqp->q_core.d_id = cpu_to_be32(id);
+ dqp->q_mount = mp;
+ INIT_LIST_HEAD(&dqp->q_lru);
+ mutex_init(&dqp->q_qlock);
+ init_waitqueue_head(&dqp->q_pinwait);
- ASSERT(tpp);
+ /*
+ * Because we want to use a counting completion, complete
+ * the flush completion once to allow a single access to
+ * the flush completion without blocking.
+ */
+ init_completion(&dqp->q_flush);
+ complete(&dqp->q_flush);
+
+ /*
+ * Make sure group quotas have a different lock class than user
+ * quotas.
+ */
+ if (!(type & XFS_DQ_USER))
+ lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
+
+ XFS_STATS_INC(xs_qm_dquot);
trace_xfs_dqread(dqp);
+ if (flags & XFS_QMOPT_DQALLOC) {
+ tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
+ error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp),
+ XFS_WRITE_LOG_RES(mp) +
+ /*
+ * Round the chunklen up to the next multiple
+ * of 128 (buf log item chunk size)).
+ */
+ BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + 128,
+ 0,
+ XFS_TRANS_PERM_LOG_RES,
+ XFS_WRITE_LOG_COUNT);
+ if (error)
+ goto error1;
+ cancelflags = XFS_TRANS_RELEASE_LOG_RES;
+ }
+
/*
* get a pointer to the on-disk dquot and the buffer containing it
* dqp already knows its own type (GROUP/USER).
*/
- if ((error = xfs_qm_dqtobp(tpp, dqp, &ddqp, &bp, flags))) {
- return (error);
+ error = xfs_qm_dqtobp(&tp, dqp, &ddqp, &bp, flags);
+ if (error) {
+ /*
+ * This can happen if quotas got turned off (ESRCH),
+ * or if the dquot didn't exist on disk and we ask to
+ * allocate (ENOENT).
+ */
+ trace_xfs_dqread_fail(dqp);
+ cancelflags |= XFS_TRANS_ABORT;
+ goto error1;
}
- tp = *tpp;
/* copy everything from disk dquot to the incore dquot */
memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t));
- ASSERT(be32_to_cpu(dqp->q_core.d_id) == id);
xfs_qm_dquot_logitem_init(dqp);
/*
@@ -622,173 +584,22 @@ xfs_qm_dqread(
ASSERT(xfs_buf_islocked(bp));
xfs_trans_brelse(tp, bp);
- return (error);
-}
-
-
-/*
- * allocate an incore dquot from the kernel heap,
- * and fill its core with quota information kept on disk.
- * If XFS_QMOPT_DQALLOC is set, it'll allocate a dquot on disk
- * if it wasn't already allocated.
- */
-STATIC int
-xfs_qm_idtodq(
- xfs_mount_t *mp,
- xfs_dqid_t id, /* gid or uid, depending on type */
- uint type, /* UDQUOT or GDQUOT */
- uint flags, /* DQALLOC, DQREPAIR */
- xfs_dquot_t **O_dqpp)/* OUT : incore dquot, not locked */
-{
- xfs_dquot_t *dqp;
- int error;
- xfs_trans_t *tp;
- int cancelflags=0;
-
- dqp = xfs_qm_dqinit(mp, id, type);
- tp = NULL;
- if (flags & XFS_QMOPT_DQALLOC) {
- tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
- error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp),
- XFS_WRITE_LOG_RES(mp) +
- BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 +
- 128,
- 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_WRITE_LOG_COUNT);
- if (error) {
- cancelflags = 0;
- goto error0;
- }
- cancelflags = XFS_TRANS_RELEASE_LOG_RES;
- }
-
- /*
- * Read it from disk; xfs_dqread() takes care of
- * all the necessary initialization of dquot's fields (locks, etc)
- */
- if ((error = xfs_qm_dqread(&tp, id, dqp, flags))) {
- /*
- * This can happen if quotas got turned off (ESRCH),
- * or if the dquot didn't exist on disk and we ask to
- * allocate (ENOENT).
- */
- trace_xfs_dqread_fail(dqp);
- cancelflags |= XFS_TRANS_ABORT;
- goto error0;
- }
if (tp) {
- if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES)))
- goto error1;
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ goto error0;
}
*O_dqpp = dqp;
- return (0);
+ return error;
- error0:
- ASSERT(error);
+error1:
if (tp)
xfs_trans_cancel(tp, cancelflags);
- error1:
+error0:
xfs_qm_dqdestroy(dqp);
*O_dqpp = NULL;
- return (error);
-}
-
-/*
- * Lookup a dquot in the incore dquot hashtable. We keep two separate
- * hashtables for user and group dquots; and, these are global tables
- * inside the XQM, not per-filesystem tables.
- * The hash chain must be locked by caller, and it is left locked
- * on return. Returning dquot is locked.
- */
-STATIC int
-xfs_qm_dqlookup(
- xfs_mount_t *mp,
- xfs_dqid_t id,
- xfs_dqhash_t *qh,
- xfs_dquot_t **O_dqpp)
-{
- xfs_dquot_t *dqp;
- uint flist_locked;
-
- ASSERT(mutex_is_locked(&qh->qh_lock));
-
- flist_locked = B_FALSE;
-
- /*
- * Traverse the hashchain looking for a match
- */
- list_for_each_entry(dqp, &qh->qh_list, q_hashlist) {
- /*
- * We already have the hashlock. We don't need the
- * dqlock to look at the id field of the dquot, since the
- * id can't be modified without the hashlock anyway.
- */
- if (be32_to_cpu(dqp->q_core.d_id) == id && dqp->q_mount == mp) {
- trace_xfs_dqlookup_found(dqp);
-
- /*
- * All in core dquots must be on the dqlist of mp
- */
- ASSERT(!list_empty(&dqp->q_mplist));
-
- xfs_dqlock(dqp);
- if (dqp->q_nrefs == 0) {
- ASSERT(!list_empty(&dqp->q_freelist));
- if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
- trace_xfs_dqlookup_want(dqp);
-
- /*
- * We may have raced with dqreclaim_one()
- * (and lost). So, flag that we don't
- * want the dquot to be reclaimed.
- */
- dqp->dq_flags |= XFS_DQ_WANT;
- xfs_dqunlock(dqp);
- mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
- xfs_dqlock(dqp);
- dqp->dq_flags &= ~(XFS_DQ_WANT);
- }
- flist_locked = B_TRUE;
- }
-
- /*
- * id couldn't have changed; we had the hashlock all
- * along
- */
- ASSERT(be32_to_cpu(dqp->q_core.d_id) == id);
-
- if (flist_locked) {
- if (dqp->q_nrefs != 0) {
- mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
- flist_locked = B_FALSE;
- } else {
- /* take it off the freelist */
- trace_xfs_dqlookup_freelist(dqp);
- list_del_init(&dqp->q_freelist);
- xfs_Gqm->qm_dqfrlist_cnt--;
- }
- }
-
- XFS_DQHOLD(dqp);
-
- if (flist_locked)
- mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
- /*
- * move the dquot to the front of the hashchain
- */
- ASSERT(mutex_is_locked(&qh->qh_lock));
- list_move(&dqp->q_hashlist, &qh->qh_list);
- trace_xfs_dqlookup_done(dqp);
- *O_dqpp = dqp;
- return 0;
- }
- }
-
- *O_dqpp = NULL;
- ASSERT(mutex_is_locked(&qh->qh_lock));
- return (1);
+ return error;
}
/*
@@ -808,10 +619,10 @@ xfs_qm_dqget(
uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */
{
- xfs_dquot_t *dqp;
- xfs_dqhash_t *h;
- uint version;
- int error;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
+ struct xfs_dquot *dqp;
+ int error;
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
@@ -819,7 +630,6 @@ xfs_qm_dqget(
(! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
return (ESRCH);
}
- h = XFS_DQ_HASH(mp, id, type);
#ifdef DEBUG
if (xfs_do_dqerror) {
@@ -829,42 +639,39 @@ xfs_qm_dqget(
return (EIO);
}
}
-#endif
- again:
-
-#ifdef DEBUG
ASSERT(type == XFS_DQ_USER ||
type == XFS_DQ_PROJ ||
type == XFS_DQ_GROUP);
if (ip) {
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (type == XFS_DQ_USER)
- ASSERT(ip->i_udquot == NULL);
- else
- ASSERT(ip->i_gdquot == NULL);
+ ASSERT(xfs_inode_dquot(ip, type) == NULL);
}
#endif
- mutex_lock(&h->qh_lock);
- /*
- * Look in the cache (hashtable).
- * The chain is kept locked during lookup.
- */
- if (xfs_qm_dqlookup(mp, id, h, O_dqpp) == 0) {
- XQM_STATS_INC(xqmstats.xs_qm_dqcachehits);
- /*
- * The dquot was found, moved to the front of the chain,
- * taken off the freelist if it was on it, and locked
- * at this point. Just unlock the hashchain and return.
- */
- ASSERT(*O_dqpp);
- ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
- mutex_unlock(&h->qh_lock);
- trace_xfs_dqget_hit(*O_dqpp);
- return (0); /* success */
+restart:
+ mutex_lock(&qi->qi_tree_lock);
+ dqp = radix_tree_lookup(tree, id);
+ if (dqp) {
+ xfs_dqlock(dqp);
+ if (dqp->dq_flags & XFS_DQ_FREEING) {
+ xfs_dqunlock(dqp);
+ mutex_unlock(&qi->qi_tree_lock);
+ trace_xfs_dqget_freeing(dqp);
+ delay(1);
+ goto restart;
+ }
+
+ dqp->q_nrefs++;
+ mutex_unlock(&qi->qi_tree_lock);
+
+ trace_xfs_dqget_hit(dqp);
+ XFS_STATS_INC(xs_qm_dqcachehits);
+ *O_dqpp = dqp;
+ return 0;
}
- XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses);
+ mutex_unlock(&qi->qi_tree_lock);
+ XFS_STATS_INC(xs_qm_dqcachemisses);
/*
* Dquot cache miss. We don't want to keep the inode lock across
@@ -875,130 +682,62 @@ xfs_qm_dqget(
*/
if (ip)
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- /*
- * Save the hashchain version stamp, and unlock the chain, so that
- * we don't keep the lock across a disk read
- */
- version = h->qh_version;
- mutex_unlock(&h->qh_lock);
- /*
- * Allocate the dquot on the kernel heap, and read the ondisk
- * portion off the disk. Also, do all the necessary initialization
- * This can return ENOENT if dquot didn't exist on disk and we didn't
- * ask it to allocate; ESRCH if quotas got turned off suddenly.
- */
- if ((error = xfs_qm_idtodq(mp, id, type,
- flags & (XFS_QMOPT_DQALLOC|XFS_QMOPT_DQREPAIR|
- XFS_QMOPT_DOWARN),
- &dqp))) {
- if (ip)
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- return (error);
- }
-
- /*
- * See if this is mount code calling to look at the overall quota limits
- * which are stored in the id == 0 user or group's dquot.
- * Since we may not have done a quotacheck by this point, just return
- * the dquot without attaching it to any hashtables, lists, etc, or even
- * taking a reference.
- * The caller must dqdestroy this once done.
- */
- if (flags & XFS_QMOPT_DQSUSER) {
- ASSERT(id == 0);
- ASSERT(! ip);
- goto dqret;
- }
+ error = xfs_qm_dqread(mp, id, type, flags, &dqp);
- /*
- * Dquot lock comes after hashlock in the lock ordering
- */
- if (ip) {
+ if (ip)
xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
+ if (ip) {
/*
* A dquot could be attached to this inode by now, since
* we had dropped the ilock.
*/
- if (type == XFS_DQ_USER) {
- if (!XFS_IS_UQUOTA_ON(mp)) {
- /* inode stays locked on return */
- xfs_qm_dqdestroy(dqp);
- return XFS_ERROR(ESRCH);
- }
- if (ip->i_udquot) {
+ if (xfs_this_quota_on(mp, type)) {
+ struct xfs_dquot *dqp1;
+
+ dqp1 = xfs_inode_dquot(ip, type);
+ if (dqp1) {
xfs_qm_dqdestroy(dqp);
- dqp = ip->i_udquot;
+ dqp = dqp1;
xfs_dqlock(dqp);
goto dqret;
}
} else {
- if (!XFS_IS_OQUOTA_ON(mp)) {
- /* inode stays locked on return */
- xfs_qm_dqdestroy(dqp);
- return XFS_ERROR(ESRCH);
- }
- if (ip->i_gdquot) {
- xfs_qm_dqdestroy(dqp);
- dqp = ip->i_gdquot;
- xfs_dqlock(dqp);
- goto dqret;
- }
+ /* inode stays locked on return */
+ xfs_qm_dqdestroy(dqp);
+ return XFS_ERROR(ESRCH);
}
}
- /*
- * Hashlock comes after ilock in lock order
- */
- mutex_lock(&h->qh_lock);
- if (version != h->qh_version) {
- xfs_dquot_t *tmpdqp;
+ mutex_lock(&qi->qi_tree_lock);
+ error = -radix_tree_insert(tree, id, dqp);
+ if (unlikely(error)) {
+ WARN_ON(error != EEXIST);
+
/*
- * Now, see if somebody else put the dquot in the
- * hashtable before us. This can happen because we didn't
- * keep the hashchain lock. We don't have to worry about
- * lock order between the two dquots here since dqp isn't
- * on any findable lists yet.
+ * Duplicate found. Just throw away the new dquot and start
+ * over.
*/
- if (xfs_qm_dqlookup(mp, id, h, &tmpdqp) == 0) {
- /*
- * Duplicate found. Just throw away the new dquot
- * and start over.
- */
- xfs_qm_dqput(tmpdqp);
- mutex_unlock(&h->qh_lock);
- xfs_qm_dqdestroy(dqp);
- XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
- goto again;
- }
+ mutex_unlock(&qi->qi_tree_lock);
+ trace_xfs_dqget_dup(dqp);
+ xfs_qm_dqdestroy(dqp);
+ XFS_STATS_INC(xs_qm_dquot_dups);
+ goto restart;
}
/*
- * Put the dquot at the beginning of the hash-chain and mp's list
- * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock ..
- */
- ASSERT(mutex_is_locked(&h->qh_lock));
- dqp->q_hash = h;
- list_add(&dqp->q_hashlist, &h->qh_list);
- h->qh_version++;
-
- /*
- * Attach this dquot to this filesystem's list of all dquots,
- * kept inside the mount structure in m_quotainfo field
- */
- mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
-
- /*
* We return a locked dquot to the caller, with a reference taken
*/
xfs_dqlock(dqp);
dqp->q_nrefs = 1;
- list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist);
- mp->m_quotainfo->qi_dquots++;
- mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
- mutex_unlock(&h->qh_lock);
+ qi->qi_dquots++;
+ mutex_unlock(&qi->qi_tree_lock);
+
dqret:
ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
trace_xfs_dqget_miss(dqp);
@@ -1007,75 +746,61 @@ xfs_qm_dqget(
}
-/*
- * Release a reference to the dquot (decrement ref-count)
- * and unlock it. If there is a group quota attached to this
- * dquot, carefully release that too without tripping over
- * deadlocks'n'stuff.
- */
-void
-xfs_qm_dqput(
- xfs_dquot_t *dqp)
+STATIC void
+xfs_qm_dqput_final(
+ struct xfs_dquot *dqp)
{
- xfs_dquot_t *gdqp;
+ struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
+ struct xfs_dquot *gdqp;
- ASSERT(dqp->q_nrefs > 0);
- ASSERT(XFS_DQ_IS_LOCKED(dqp));
-
- trace_xfs_dqput(dqp);
+ trace_xfs_dqput_free(dqp);
- if (dqp->q_nrefs != 1) {
- dqp->q_nrefs--;
- xfs_dqunlock(dqp);
- return;
+ mutex_lock(&qi->qi_lru_lock);
+ if (list_empty(&dqp->q_lru)) {
+ list_add_tail(&dqp->q_lru, &qi->qi_lru_list);
+ qi->qi_lru_count++;
+ XFS_STATS_INC(xs_qm_dquot_unused);
}
+ mutex_unlock(&qi->qi_lru_lock);
/*
- * drop the dqlock and acquire the freelist and dqlock
- * in the right order; but try to get it out-of-order first
+ * If we just added a udquot to the freelist, then we want to release
+ * the gdquot reference that it (probably) has. Otherwise it'll keep
+ * the gdquot from getting reclaimed.
*/
- if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
- trace_xfs_dqput_wait(dqp);
- xfs_dqunlock(dqp);
- mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
- xfs_dqlock(dqp);
+ gdqp = dqp->q_gdquot;
+ if (gdqp) {
+ xfs_dqlock(gdqp);
+ dqp->q_gdquot = NULL;
}
+ xfs_dqunlock(dqp);
- while (1) {
- gdqp = NULL;
+ /*
+ * If we had a group quota hint, release it now.
+ */
+ if (gdqp)
+ xfs_qm_dqput(gdqp);
+}
- /* We can't depend on nrefs being == 1 here */
- if (--dqp->q_nrefs == 0) {
- trace_xfs_dqput_free(dqp);
+/*
+ * Release a reference to the dquot (decrement ref-count) and unlock it.
+ *
+ * If there is a group quota attached to this dquot, carefully release that
+ * too without tripping over deadlocks'n'stuff.
+ */
+void
+xfs_qm_dqput(
+ struct xfs_dquot *dqp)
+{
+ ASSERT(dqp->q_nrefs > 0);
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
- list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
- xfs_Gqm->qm_dqfrlist_cnt++;
+ trace_xfs_dqput(dqp);
- /*
- * If we just added a udquot to the freelist, then
- * we want to release the gdquot reference that
- * it (probably) has. Otherwise it'll keep the
- * gdquot from getting reclaimed.
- */
- if ((gdqp = dqp->q_gdquot)) {
- /*
- * Avoid a recursive dqput call
- */
- xfs_dqlock(gdqp);
- dqp->q_gdquot = NULL;
- }
- }
+ if (--dqp->q_nrefs > 0)
xfs_dqunlock(dqp);
-
- /*
- * If we had a group quota inside the user quota as a hint,
- * release it now.
- */
- if (! gdqp)
- break;
- dqp = gdqp;
- }
- mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+ else
+ xfs_qm_dqput_final(dqp);
}
/*
@@ -1131,7 +856,7 @@ xfs_qm_dqflush_done(
/* xfs_trans_ail_delete() drops the AIL lock. */
spin_lock(&ailp->xa_lock);
if (lip->li_lsn == qip->qli_flush_lsn)
- xfs_trans_ail_delete(ailp, lip);
+ xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
else
spin_unlock(&ailp->xa_lock);
}
@@ -1152,8 +877,8 @@ xfs_qm_dqflush_done(
*/
int
xfs_qm_dqflush(
- xfs_dquot_t *dqp,
- uint flags)
+ struct xfs_dquot *dqp,
+ struct xfs_buf **bpp)
{
struct xfs_mount *mp = dqp->q_mount;
struct xfs_buf *bp;
@@ -1165,25 +890,30 @@ xfs_qm_dqflush(
trace_xfs_dqflush(dqp);
- /*
- * If not dirty, or it's pinned and we are not supposed to block, nada.
- */
- if (!XFS_DQ_IS_DIRTY(dqp) ||
- (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) {
- xfs_dqfunlock(dqp);
- return 0;
- }
+ *bpp = NULL;
+
xfs_qm_dqunpin_wait(dqp);
/*
* This may have been unpinned because the filesystem is shutting
* down forcibly. If that's the case we must not write this dquot
- * to disk, because the log record didn't make it to disk!
+ * to disk, because the log record didn't make it to disk.
+ *
+ * We also have to remove the log item from the AIL in this case,
+ * as we wait for an emptry AIL as part of the unmount process.
*/
if (XFS_FORCED_SHUTDOWN(mp)) {
+ struct xfs_log_item *lip = &dqp->q_logitem.qli_item;
dqp->dq_flags &= ~XFS_DQ_DIRTY;
- xfs_dqfunlock(dqp);
- return XFS_ERROR(EIO);
+
+ spin_lock(&mp->m_ail->xa_lock);
+ if (lip->li_flags & XFS_LI_IN_AIL)
+ xfs_trans_ail_delete(mp->m_ail, lip,
+ SHUTDOWN_CORRUPT_INCORE);
+ else
+ spin_unlock(&mp->m_ail->xa_lock);
+ error = XFS_ERROR(EIO);
+ goto out_unlock;
}
/*
@@ -1191,11 +921,8 @@ xfs_qm_dqflush(
*/
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
mp->m_quotainfo->qi_dqchunklen, 0, &bp);
- if (error) {
- ASSERT(error != ENOENT);
- xfs_dqfunlock(dqp);
- return error;
- }
+ if (error)
+ goto out_unlock;
/*
* Calculate the location of the dquot inside the buffer.
@@ -1241,54 +968,13 @@ xfs_qm_dqflush(
xfs_log_force(mp, 0);
}
- if (flags & SYNC_WAIT)
- error = xfs_bwrite(bp);
- else
- xfs_buf_delwri_queue(bp);
-
- xfs_buf_relse(bp);
-
trace_xfs_dqflush_done(dqp);
+ *bpp = bp;
+ return 0;
- /*
- * dqp is still locked, but caller is free to unlock it now.
- */
- return error;
-
-}
-
-int
-xfs_qm_dqlock_nowait(
- xfs_dquot_t *dqp)
-{
- return mutex_trylock(&dqp->q_qlock);
-}
-
-void
-xfs_dqlock(
- xfs_dquot_t *dqp)
-{
- mutex_lock(&dqp->q_qlock);
-}
-
-void
-xfs_dqunlock(
- xfs_dquot_t *dqp)
-{
- mutex_unlock(&(dqp->q_qlock));
- if (dqp->q_logitem.qli_dquot == dqp) {
- /* Once was dqp->q_mount, but might just have been cleared */
- xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp,
- (xfs_log_item_t*)&(dqp->q_logitem));
- }
-}
-
-
-void
-xfs_dqunlock_nonotify(
- xfs_dquot_t *dqp)
-{
- mutex_unlock(&(dqp->q_qlock));
+out_unlock:
+ xfs_dqfunlock(dqp);
+ return XFS_ERROR(EIO);
}
/*
@@ -1319,138 +1005,30 @@ xfs_dqlock2(
}
}
-
-/*
- * Take a dquot out of the mount's dqlist as well as the hashlist.
- * This is called via unmount as well as quotaoff, and the purge
- * will always succeed unless there are soft (temp) references
- * outstanding.
- *
- * This returns 0 if it was purged, 1 if it wasn't. It's not an error code
- * that we're returning! XXXsup - not cool.
- */
-/* ARGSUSED */
-int
-xfs_qm_dqpurge(
- xfs_dquot_t *dqp)
+int __init
+xfs_qm_init(void)
{
- xfs_dqhash_t *qh = dqp->q_hash;
- xfs_mount_t *mp = dqp->q_mount;
-
- ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock));
- ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock));
+ xfs_qm_dqzone =
+ kmem_zone_init(sizeof(struct xfs_dquot), "xfs_dquot");
+ if (!xfs_qm_dqzone)
+ goto out;
- xfs_dqlock(dqp);
- /*
- * We really can't afford to purge a dquot that is
- * referenced, because these are hard refs.
- * It shouldn't happen in general because we went thru _all_ inodes in
- * dqrele_all_inodes before calling this and didn't let the mountlock go.
- * However it is possible that we have dquots with temporary
- * references that are not attached to an inode. e.g. see xfs_setattr().
- */
- if (dqp->q_nrefs != 0) {
- xfs_dqunlock(dqp);
- mutex_unlock(&dqp->q_hash->qh_lock);
- return (1);
- }
-
- ASSERT(!list_empty(&dqp->q_freelist));
-
- /*
- * If we're turning off quotas, we have to make sure that, for
- * example, we don't delete quota disk blocks while dquots are
- * in the process of getting written to those disk blocks.
- * This dquot might well be on AIL, and we can't leave it there
- * if we're turning off quotas. Basically, we need this flush
- * lock, and are willing to block on it.
- */
- if (!xfs_dqflock_nowait(dqp)) {
- /*
- * Block on the flush lock after nudging dquot buffer,
- * if it is incore.
- */
- xfs_qm_dqflock_pushbuf_wait(dqp);
- }
-
- /*
- * XXXIf we're turning this type of quotas off, we don't care
- * about the dirty metadata sitting in this dquot. OTOH, if
- * we're unmounting, we do care, so we flush it and wait.
- */
- if (XFS_DQ_IS_DIRTY(dqp)) {
- int error;
+ xfs_qm_dqtrxzone =
+ kmem_zone_init(sizeof(struct xfs_dquot_acct), "xfs_dqtrx");
+ if (!xfs_qm_dqtrxzone)
+ goto out_free_dqzone;
- /* dqflush unlocks dqflock */
- /*
- * Given that dqpurge is a very rare occurrence, it is OK
- * that we're holding the hashlist and mplist locks
- * across the disk write. But, ... XXXsup
- *
- * We don't care about getting disk errors here. We need
- * to purge this dquot anyway, so we go ahead regardless.
- */
- error = xfs_qm_dqflush(dqp, SYNC_WAIT);
- if (error)
- xfs_warn(mp, "%s: dquot %p flush failed",
- __func__, dqp);
- xfs_dqflock(dqp);
- }
- ASSERT(atomic_read(&dqp->q_pincount) == 0);
- ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
- !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
-
- list_del_init(&dqp->q_hashlist);
- qh->qh_version++;
- list_del_init(&dqp->q_mplist);
- mp->m_quotainfo->qi_dqreclaims++;
- mp->m_quotainfo->qi_dquots--;
- /*
- * XXX Move this to the front of the freelist, if we can get the
- * freelist lock.
- */
- ASSERT(!list_empty(&dqp->q_freelist));
+ return 0;
- dqp->q_mount = NULL;
- dqp->q_hash = NULL;
- dqp->dq_flags = XFS_DQ_INACTIVE;
- memset(&dqp->q_core, 0, sizeof(dqp->q_core));
- xfs_dqfunlock(dqp);
- xfs_dqunlock(dqp);
- mutex_unlock(&qh->qh_lock);
- return (0);
+out_free_dqzone:
+ kmem_zone_destroy(xfs_qm_dqzone);
+out:
+ return -ENOMEM;
}
-
-/*
- * Give the buffer a little push if it is incore and
- * wait on the flush lock.
- */
void
-xfs_qm_dqflock_pushbuf_wait(
- xfs_dquot_t *dqp)
+xfs_qm_exit(void)
{
- xfs_mount_t *mp = dqp->q_mount;
- xfs_buf_t *bp;
-
- /*
- * Check to see if the dquot has been flushed delayed
- * write. If so, grab its buffer and send it
- * out immediately. We'll be able to acquire
- * the flush lock when the I/O completes.
- */
- bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
- if (!bp)
- goto out_lock;
-
- if (XFS_BUF_ISDELAYWRITE(bp)) {
- if (xfs_buf_ispinned(bp))
- xfs_log_force(mp, 0);
- xfs_buf_delwri_promote(bp);
- wake_up_process(bp->b_target->bt_task);
- }
- xfs_buf_relse(bp);
-out_lock:
- xfs_dqflock(dqp);
+ kmem_zone_destroy(xfs_qm_dqtrxzone);
+ kmem_zone_destroy(xfs_qm_dqzone);
}
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 34b7e945dbf..7d20af27346 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -29,16 +29,6 @@
* when quotas are off.
*/
-/*
- * The hash chain headers (hash buckets)
- */
-typedef struct xfs_dqhash {
- struct list_head qh_list;
- struct mutex qh_lock;
- uint qh_version; /* ever increasing version */
- uint qh_nelems; /* number of dquots on the list */
-} xfs_dqhash_t;
-
struct xfs_mount;
struct xfs_trans;
@@ -47,10 +37,7 @@ struct xfs_trans;
*/
typedef struct xfs_dquot {
uint dq_flags; /* various flags (XFS_DQ_*) */
- struct list_head q_freelist; /* global free list of dquots */
- struct list_head q_mplist; /* mount's list of dquots */
- struct list_head q_hashlist; /* gloabl hash list of dquots */
- xfs_dqhash_t *q_hash; /* the hashchain header */
+ struct list_head q_lru; /* global free list of dquots */
struct xfs_mount*q_mount; /* filesystem this relates to */
struct xfs_trans*q_transp; /* trans this belongs to currently */
uint q_nrefs; /* # active refs from inodes */
@@ -80,8 +67,6 @@ enum {
XFS_QLOCK_NESTED,
};
-#define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++)
-
/*
* Manage the q_flush completion queue embedded in the dquot. This completion
* queue synchronizes processes attempting to flush the in-core dquot back to
@@ -102,6 +87,47 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
complete(&dqp->q_flush);
}
+static inline int xfs_dqlock_nowait(struct xfs_dquot *dqp)
+{
+ return mutex_trylock(&dqp->q_qlock);
+}
+
+static inline void xfs_dqlock(struct xfs_dquot *dqp)
+{
+ mutex_lock(&dqp->q_qlock);
+}
+
+static inline void xfs_dqunlock(struct xfs_dquot *dqp)
+{
+ mutex_unlock(&dqp->q_qlock);
+}
+
+static inline int xfs_this_quota_on(struct xfs_mount *mp, int type)
+{
+ switch (type & XFS_DQ_ALLTYPES) {
+ case XFS_DQ_USER:
+ return XFS_IS_UQUOTA_ON(mp);
+ case XFS_DQ_GROUP:
+ case XFS_DQ_PROJ:
+ return XFS_IS_OQUOTA_ON(mp);
+ default:
+ return 0;
+ }
+}
+
+static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
+{
+ switch (type & XFS_DQ_ALLTYPES) {
+ case XFS_DQ_USER:
+ return ip->i_udquot;
+ case XFS_DQ_GROUP:
+ case XFS_DQ_PROJ:
+ return ip->i_gdquot;
+ default:
+ return NULL;
+ }
+}
+
#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock)))
#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
@@ -112,16 +138,11 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
-#define XFS_IS_THIS_QUOTA_OFF(d) (! (XFS_QM_ISUDQ(d) ? \
- (XFS_IS_UQUOTA_ON((d)->q_mount)) : \
- (XFS_IS_OQUOTA_ON((d)->q_mount))))
-
+extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
+ uint, struct xfs_dquot **);
extern void xfs_qm_dqdestroy(xfs_dquot_t *);
-extern int xfs_qm_dqflush(xfs_dquot_t *, uint);
-extern int xfs_qm_dqpurge(xfs_dquot_t *);
+extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **);
extern void xfs_qm_dqunpin_wait(xfs_dquot_t *);
-extern int xfs_qm_dqlock_nowait(xfs_dquot_t *);
-extern void xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp);
extern void xfs_qm_adjust_dqtimers(xfs_mount_t *,
xfs_disk_dquot_t *);
extern void xfs_qm_adjust_dqlimits(xfs_mount_t *,
@@ -129,9 +150,15 @@ extern void xfs_qm_adjust_dqlimits(xfs_mount_t *,
extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
xfs_dqid_t, uint, uint, xfs_dquot_t **);
extern void xfs_qm_dqput(xfs_dquot_t *);
-extern void xfs_dqlock(xfs_dquot_t *);
-extern void xfs_dqlock2(xfs_dquot_t *, xfs_dquot_t *);
-extern void xfs_dqunlock(xfs_dquot_t *);
-extern void xfs_dqunlock_nonotify(xfs_dquot_t *);
+
+extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
+
+static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
+{
+ xfs_dqlock(dqp);
+ dqp->q_nrefs++;
+ xfs_dqunlock(dqp);
+ return dqp;
+}
#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 0dee0b71029..57aa4b03720 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -17,9 +17,7 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -73,7 +71,6 @@ xfs_qm_dquot_logitem_format(
logvec->i_len = sizeof(xfs_disk_dquot_t);
logvec->i_type = XLOG_REG_TYPE_DQUOT;
- ASSERT(2 == lip->li_desc->lid_size);
qlip->qli_format.qlf_size = 2;
}
@@ -109,38 +106,6 @@ xfs_qm_dquot_logitem_unpin(
wake_up(&dqp->q_pinwait);
}
-/*
- * Given the logitem, this writes the corresponding dquot entry to disk
- * asynchronously. This is called with the dquot entry securely locked;
- * we simply get xfs_qm_dqflush() to do the work, and unlock the dquot
- * at the end.
- */
-STATIC void
-xfs_qm_dquot_logitem_push(
- struct xfs_log_item *lip)
-{
- struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
- int error;
-
- ASSERT(XFS_DQ_IS_LOCKED(dqp));
- ASSERT(!completion_done(&dqp->q_flush));
-
- /*
- * Since we were able to lock the dquot's flush lock and
- * we found it on the AIL, the dquot must be dirty. This
- * is because the dquot is removed from the AIL while still
- * holding the flush lock in xfs_dqflush_done(). Thus, if
- * we found it in the AIL and were able to obtain the flush
- * lock without sleeping, then there must not have been
- * anyone in the process of flushing the dquot.
- */
- error = xfs_qm_dqflush(dqp, 0);
- if (error)
- xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
- __func__, error, dqp);
- xfs_dqunlock(dqp);
-}
-
STATIC xfs_lsn_t
xfs_qm_dquot_logitem_committed(
struct xfs_log_item *lip,
@@ -172,84 +137,57 @@ xfs_qm_dqunpin_wait(
wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
}
-/*
- * This is called when IOP_TRYLOCK returns XFS_ITEM_PUSHBUF to indicate that
- * the dquot is locked by us, but the flush lock isn't. So, here we are
- * going to see if the relevant dquot buffer is incore, waiting on DELWRI.
- * If so, we want to push it out to help us take this item off the AIL as soon
- * as possible.
- *
- * We must not be holding the AIL lock at this point. Calling incore() to
- * search the buffer cache can be a time consuming thing, and AIL lock is a
- * spinlock.
- */
-STATIC bool
-xfs_qm_dquot_logitem_pushbuf(
- struct xfs_log_item *lip)
-{
- struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
- struct xfs_dquot *dqp = qlip->qli_dquot;
- struct xfs_buf *bp;
- bool ret = true;
-
- ASSERT(XFS_DQ_IS_LOCKED(dqp));
-
- /*
- * If flushlock isn't locked anymore, chances are that the
- * inode flush completed and the inode was taken off the AIL.
- * So, just get out.
- */
- if (completion_done(&dqp->q_flush) ||
- !(lip->li_flags & XFS_LI_IN_AIL)) {
- xfs_dqunlock(dqp);
- return true;
- }
-
- bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno,
- dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
- xfs_dqunlock(dqp);
- if (!bp)
- return true;
- if (XFS_BUF_ISDELAYWRITE(bp))
- xfs_buf_delwri_promote(bp);
- if (xfs_buf_ispinned(bp))
- ret = false;
- xfs_buf_relse(bp);
- return ret;
-}
-
-/*
- * This is called to attempt to lock the dquot associated with this
- * dquot log item. Don't sleep on the dquot lock or the flush lock.
- * If the flush lock is already held, indicating that the dquot has
- * been or is in the process of being flushed, then see if we can
- * find the dquot's buffer in the buffer cache without sleeping. If
- * we can and it is marked delayed write, then we want to send it out.
- * We delay doing so until the push routine, though, to avoid sleeping
- * in any device strategy routines.
- */
STATIC uint
-xfs_qm_dquot_logitem_trylock(
- struct xfs_log_item *lip)
+xfs_qm_dquot_logitem_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
{
struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
+ struct xfs_buf *bp = NULL;
+ uint rval = XFS_ITEM_SUCCESS;
+ int error;
if (atomic_read(&dqp->q_pincount) > 0)
return XFS_ITEM_PINNED;
- if (!xfs_qm_dqlock_nowait(dqp))
+ if (!xfs_dqlock_nowait(dqp))
return XFS_ITEM_LOCKED;
+ /*
+ * Re-check the pincount now that we stabilized the value by
+ * taking the quota lock.
+ */
+ if (atomic_read(&dqp->q_pincount) > 0) {
+ rval = XFS_ITEM_PINNED;
+ goto out_unlock;
+ }
+
+ /*
+ * Someone else is already flushing the dquot. Nothing we can do
+ * here but wait for the flush to finish and remove the item from
+ * the AIL.
+ */
if (!xfs_dqflock_nowait(dqp)) {
- /*
- * dquot has already been flushed to the backing buffer,
- * leave it locked, pushbuf routine will unlock it.
- */
- return XFS_ITEM_PUSHBUF;
+ rval = XFS_ITEM_FLUSHING;
+ goto out_unlock;
}
- ASSERT(lip->li_flags & XFS_LI_IN_AIL);
- return XFS_ITEM_SUCCESS;
+ spin_unlock(&lip->li_ailp->xa_lock);
+
+ error = xfs_qm_dqflush(dqp, &bp);
+ if (error) {
+ xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
+ __func__, error, dqp);
+ } else {
+ if (!xfs_buf_delwri_queue(bp, buffer_list))
+ rval = XFS_ITEM_FLUSHING;
+ xfs_buf_relse(bp);
+ }
+
+ spin_lock(&lip->li_ailp->xa_lock);
+out_unlock:
+ xfs_dqunlock(dqp);
+ return rval;
}
/*
@@ -300,11 +238,9 @@ static const struct xfs_item_ops xfs_dquot_item_ops = {
.iop_format = xfs_qm_dquot_logitem_format,
.iop_pin = xfs_qm_dquot_logitem_pin,
.iop_unpin = xfs_qm_dquot_logitem_unpin,
- .iop_trylock = xfs_qm_dquot_logitem_trylock,
.iop_unlock = xfs_qm_dquot_logitem_unlock,
.iop_committed = xfs_qm_dquot_logitem_committed,
.iop_push = xfs_qm_dquot_logitem_push,
- .iop_pushbuf = xfs_qm_dquot_logitem_pushbuf,
.iop_committing = xfs_qm_dquot_logitem_committing
};
@@ -399,11 +335,13 @@ xfs_qm_qoff_logitem_unpin(
}
/*
- * Quotaoff items have no locking, so just return success.
+ * There isn't much you can do to push a quotaoff item. It is simply
+ * stuck waiting for the log to be flushed to disk.
*/
STATIC uint
-xfs_qm_qoff_logitem_trylock(
- struct xfs_log_item *lip)
+xfs_qm_qoff_logitem_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
{
return XFS_ITEM_LOCKED;
}
@@ -430,17 +368,6 @@ xfs_qm_qoff_logitem_committed(
return lsn;
}
-/*
- * There isn't much you can do to push on an quotaoff item. It is simply
- * stuck waiting for the log to be flushed to disk.
- */
-STATIC void
-xfs_qm_qoff_logitem_push(
- struct xfs_log_item *lip)
-{
-}
-
-
STATIC xfs_lsn_t
xfs_qm_qoffend_logitem_committed(
struct xfs_log_item *lip,
@@ -455,7 +382,7 @@ xfs_qm_qoffend_logitem_committed(
* xfs_trans_ail_delete() drops the AIL lock.
*/
spin_lock(&ailp->xa_lock);
- xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs);
+ xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR);
kmem_free(qfs);
kmem_free(qfe);
@@ -488,7 +415,6 @@ static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
.iop_format = xfs_qm_qoff_logitem_format,
.iop_pin = xfs_qm_qoff_logitem_pin,
.iop_unpin = xfs_qm_qoff_logitem_unpin,
- .iop_trylock = xfs_qm_qoff_logitem_trylock,
.iop_unlock = xfs_qm_qoff_logitem_unlock,
.iop_committed = xfs_qm_qoffend_logitem_committed,
.iop_push = xfs_qm_qoff_logitem_push,
@@ -503,7 +429,6 @@ static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
.iop_format = xfs_qm_qoff_logitem_format,
.iop_pin = xfs_qm_qoff_logitem_pin,
.iop_unpin = xfs_qm_qoff_logitem_unpin,
- .iop_trylock = xfs_qm_qoff_logitem_trylock,
.iop_unlock = xfs_qm_qoff_logitem_unlock,
.iop_committed = xfs_qm_qoff_logitem_committed,
.iop_push = xfs_qm_qoff_logitem_push,
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 39f06336b99..610456054dc 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -19,7 +19,6 @@
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 558910f5e3c..42679223a0f 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -17,7 +17,6 @@
*/
#include "xfs.h"
#include "xfs_types.h"
-#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
@@ -53,19 +52,18 @@ static int xfs_fileid_length(int fileid_type)
STATIC int
xfs_fs_encode_fh(
- struct dentry *dentry,
- __u32 *fh,
- int *max_len,
- int connectable)
+ struct inode *inode,
+ __u32 *fh,
+ int *max_len,
+ struct inode *parent)
{
struct fid *fid = (struct fid *)fh;
struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fh;
- struct inode *inode = dentry->d_inode;
int fileid_type;
int len;
/* Directories don't need their parent encoded, they have ".." */
- if (S_ISDIR(inode->i_mode) || !connectable)
+ if (!parent)
fileid_type = FILEID_INO32_GEN;
else
fileid_type = FILEID_INO32_GEN_PARENT;
@@ -97,20 +95,16 @@ xfs_fs_encode_fh(
switch (fileid_type) {
case FILEID_INO32_GEN_PARENT:
- spin_lock(&dentry->d_lock);
- fid->i32.parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino;
- fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation;
- spin_unlock(&dentry->d_lock);
+ fid->i32.parent_ino = XFS_I(parent)->i_ino;
+ fid->i32.parent_gen = parent->i_generation;
/*FALLTHRU*/
case FILEID_INO32_GEN:
fid->i32.ino = XFS_I(inode)->i_ino;
fid->i32.gen = inode->i_generation;
break;
case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
- spin_lock(&dentry->d_lock);
- fid64->parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino;
- fid64->parent_gen = dentry->d_parent->d_inode->i_generation;
- spin_unlock(&dentry->d_lock);
+ fid64->parent_ino = XFS_I(parent)->i_ino;
+ fid64->parent_gen = parent->i_generation;
/*FALLTHRU*/
case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
fid64->ino = XFS_I(inode)->i_ino;
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
new file mode 100644
index 00000000000..85e9f87a1a7
--- /dev/null
+++ b/fs/xfs/xfs_extent_busy.c
@@ -0,0 +1,603 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2010 David Chinner.
+ * Copyright (c) 2011 Christoph Hellwig.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_inode.h"
+#include "xfs_extent_busy.h"
+#include "xfs_trace.h"
+
+void
+xfs_extent_busy_insert(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ unsigned int flags)
+{
+ struct xfs_extent_busy *new;
+ struct xfs_extent_busy *busyp;
+ struct xfs_perag *pag;
+ struct rb_node **rbp;
+ struct rb_node *parent = NULL;
+
+ new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL);
+ if (!new) {
+ /*
+ * No Memory! Since it is now not possible to track the free
+ * block, make this a synchronous transaction to insure that
+ * the block is not reused before this transaction commits.
+ */
+ trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len);
+ xfs_trans_set_sync(tp);
+ return;
+ }
+
+ new->agno = agno;
+ new->bno = bno;
+ new->length = len;
+ INIT_LIST_HEAD(&new->list);
+ new->flags = flags;
+
+ /* trace before insert to be able to see failed inserts */
+ trace_xfs_extent_busy(tp->t_mountp, agno, bno, len);
+
+ pag = xfs_perag_get(tp->t_mountp, new->agno);
+ spin_lock(&pag->pagb_lock);
+ rbp = &pag->pagb_tree.rb_node;
+ while (*rbp) {
+ parent = *rbp;
+ busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);
+
+ if (new->bno < busyp->bno) {
+ rbp = &(*rbp)->rb_left;
+ ASSERT(new->bno + new->length <= busyp->bno);
+ } else if (new->bno > busyp->bno) {
+ rbp = &(*rbp)->rb_right;
+ ASSERT(bno >= busyp->bno + busyp->length);
+ } else {
+ ASSERT(0);
+ }
+ }
+
+ rb_link_node(&new->rb_node, parent, rbp);
+ rb_insert_color(&new->rb_node, &pag->pagb_tree);
+
+ list_add(&new->list, &tp->t_busy);
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+}
+
+/*
+ * Search for a busy extent within the range of the extent we are about to
+ * allocate. You need to be holding the busy extent tree lock when calling
+ * xfs_extent_busy_search(). This function returns 0 for no overlapping busy
+ * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
+ * match. This is done so that a non-zero return indicates an overlap that
+ * will require a synchronous transaction, but it can still be
+ * used to distinguish between a partial or exact match.
+ */
+int
+xfs_extent_busy_search(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len)
+{
+ struct xfs_perag *pag;
+ struct rb_node *rbp;
+ struct xfs_extent_busy *busyp;
+ int match = 0;
+
+ pag = xfs_perag_get(mp, agno);
+ spin_lock(&pag->pagb_lock);
+
+ rbp = pag->pagb_tree.rb_node;
+
+ /* find closest start bno overlap */
+ while (rbp) {
+ busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
+ if (bno < busyp->bno) {
+ /* may overlap, but exact start block is lower */
+ if (bno + len > busyp->bno)
+ match = -1;
+ rbp = rbp->rb_left;
+ } else if (bno > busyp->bno) {
+ /* may overlap, but exact start block is higher */
+ if (bno < busyp->bno + busyp->length)
+ match = -1;
+ rbp = rbp->rb_right;
+ } else {
+ /* bno matches busyp, length determines exact match */
+ match = (busyp->length == len) ? 1 : -1;
+ break;
+ }
+ }
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+ return match;
+}
+
+/*
+ * The found free extent [fbno, fend] overlaps part or all of the given busy
+ * extent. If the overlap covers the beginning, the end, or all of the busy
+ * extent, the overlapping portion can be made unbusy and used for the
+ * allocation. We can't split a busy extent because we can't modify a
+ * transaction/CIL context busy list, but we can update an entries block
+ * number or length.
+ *
+ * Returns true if the extent can safely be reused, or false if the search
+ * needs to be restarted.
+ */
+STATIC bool
+xfs_extent_busy_update_extent(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ struct xfs_extent_busy *busyp,
+ xfs_agblock_t fbno,
+ xfs_extlen_t flen,
+ bool userdata)
+{
+ xfs_agblock_t fend = fbno + flen;
+ xfs_agblock_t bbno = busyp->bno;
+ xfs_agblock_t bend = bbno + busyp->length;
+
+ /*
+ * This extent is currently being discarded. Give the thread
+ * performing the discard a chance to mark the extent unbusy
+ * and retry.
+ */
+ if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
+ spin_unlock(&pag->pagb_lock);
+ delay(1);
+ spin_lock(&pag->pagb_lock);
+ return false;
+ }
+
+ /*
+ * If there is a busy extent overlapping a user allocation, we have
+ * no choice but to force the log and retry the search.
+ *
+ * Fortunately this does not happen during normal operation, but
+ * only if the filesystem is very low on space and has to dip into
+ * the AGFL for normal allocations.
+ */
+ if (userdata)
+ goto out_force_log;
+
+ if (bbno < fbno && bend > fend) {
+ /*
+ * Case 1:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +---------+
+ * fbno fend
+ */
+
+ /*
+ * We would have to split the busy extent to be able to track
+ * it correct, which we cannot do because we would have to
+ * modify the list of busy extents attached to the transaction
+ * or CIL context, which is immutable.
+ *
+ * Force out the log to clear the busy extent and retry the
+ * search.
+ */
+ goto out_force_log;
+ } else if (bbno >= fbno && bend <= fend) {
+ /*
+ * Case 2:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-----------------+
+ * fbno fend
+ *
+ * Case 3:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +--------------------------+
+ * fbno fend
+ *
+ * Case 4:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +--------------------------+
+ * fbno fend
+ *
+ * Case 5:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-----------------------------------+
+ * fbno fend
+ *
+ */
+
+ /*
+ * The busy extent is fully covered by the extent we are
+ * allocating, and can simply be removed from the rbtree.
+ * However we cannot remove it from the immutable list
+ * tracking busy extents in the transaction or CIL context,
+ * so set the length to zero to mark it invalid.
+ *
+ * We also need to restart the busy extent search from the
+ * tree root, because erasing the node can rearrange the
+ * tree topology.
+ */
+ rb_erase(&busyp->rb_node, &pag->pagb_tree);
+ busyp->length = 0;
+ return false;
+ } else if (fend < bend) {
+ /*
+ * Case 6:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +---------+
+ * fbno fend
+ *
+ * Case 7:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +------------------+
+ * fbno fend
+ *
+ */
+ busyp->bno = fend;
+ } else if (bbno < fbno) {
+ /*
+ * Case 8:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-------------+
+ * fbno fend
+ *
+ * Case 9:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +----------------------+
+ * fbno fend
+ */
+ busyp->length = fbno - busyp->bno;
+ } else {
+ ASSERT(0);
+ }
+
+ trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen);
+ return true;
+
+out_force_log:
+ spin_unlock(&pag->pagb_lock);
+ xfs_log_force(mp, XFS_LOG_SYNC);
+ trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen);
+ spin_lock(&pag->pagb_lock);
+ return false;
+}
+
+
+/*
+ * For a given extent [fbno, flen], make sure we can reuse it safely.
+ */
+void
+xfs_extent_busy_reuse(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t fbno,
+ xfs_extlen_t flen,
+ bool userdata)
+{
+ struct xfs_perag *pag;
+ struct rb_node *rbp;
+
+ ASSERT(flen > 0);
+
+ pag = xfs_perag_get(mp, agno);
+ spin_lock(&pag->pagb_lock);
+restart:
+ rbp = pag->pagb_tree.rb_node;
+ while (rbp) {
+ struct xfs_extent_busy *busyp =
+ rb_entry(rbp, struct xfs_extent_busy, rb_node);
+ xfs_agblock_t bbno = busyp->bno;
+ xfs_agblock_t bend = bbno + busyp->length;
+
+ if (fbno + flen <= bbno) {
+ rbp = rbp->rb_left;
+ continue;
+ } else if (fbno >= bend) {
+ rbp = rbp->rb_right;
+ continue;
+ }
+
+ if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen,
+ userdata))
+ goto restart;
+ }
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+}
+
+/*
+ * For a given extent [fbno, flen], search the busy extent list to find a
+ * subset of the extent that is not busy. If *rlen is smaller than
+ * args->minlen no suitable extent could be found, and the higher level
+ * code needs to force out the log and retry the allocation.
+ */
+void
+xfs_extent_busy_trim(
+ struct xfs_alloc_arg *args,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ xfs_agblock_t *rbno,
+ xfs_extlen_t *rlen)
+{
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ struct rb_node *rbp;
+
+ ASSERT(len > 0);
+
+ spin_lock(&args->pag->pagb_lock);
+restart:
+ fbno = bno;
+ flen = len;
+ rbp = args->pag->pagb_tree.rb_node;
+ while (rbp && flen >= args->minlen) {
+ struct xfs_extent_busy *busyp =
+ rb_entry(rbp, struct xfs_extent_busy, rb_node);
+ xfs_agblock_t fend = fbno + flen;
+ xfs_agblock_t bbno = busyp->bno;
+ xfs_agblock_t bend = bbno + busyp->length;
+
+ if (fend <= bbno) {
+ rbp = rbp->rb_left;
+ continue;
+ } else if (fbno >= bend) {
+ rbp = rbp->rb_right;
+ continue;
+ }
+
+ /*
+ * If this is a metadata allocation, try to reuse the busy
+ * extent instead of trimming the allocation.
+ */
+ if (!args->userdata &&
+ !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
+ if (!xfs_extent_busy_update_extent(args->mp, args->pag,
+ busyp, fbno, flen,
+ false))
+ goto restart;
+ continue;
+ }
+
+ if (bbno <= fbno) {
+ /* start overlap */
+
+ /*
+ * Case 1:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +---------+
+ * fbno fend
+ *
+ * Case 2:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-------------+
+ * fbno fend
+ *
+ * Case 3:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-------------+
+ * fbno fend
+ *
+ * Case 4:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-----------------+
+ * fbno fend
+ *
+ * No unbusy region in extent, return failure.
+ */
+ if (fend <= bend)
+ goto fail;
+
+ /*
+ * Case 5:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +----------------------+
+ * fbno fend
+ *
+ * Case 6:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +--------------------------+
+ * fbno fend
+ *
+ * Needs to be trimmed to:
+ * +-------+
+ * fbno fend
+ */
+ fbno = bend;
+ } else if (bend >= fend) {
+ /* end overlap */
+
+ /*
+ * Case 7:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +------------------+
+ * fbno fend
+ *
+ * Case 8:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +--------------------------+
+ * fbno fend
+ *
+ * Needs to be trimmed to:
+ * +-------+
+ * fbno fend
+ */
+ fend = bbno;
+ } else {
+ /* middle overlap */
+
+ /*
+ * Case 9:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-----------------------------------+
+ * fbno fend
+ *
+ * Can be trimmed to:
+ * +-------+ OR +-------+
+ * fbno fend fbno fend
+ *
+ * Backward allocation leads to significant
+ * fragmentation of directories, which degrades
+ * directory performance, therefore we always want to
+ * choose the option that produces forward allocation
+ * patterns.
+ * Preferring the lower bno extent will make the next
+ * request use "fend" as the start of the next
+ * allocation; if the segment is no longer busy at
+ * that point, we'll get a contiguous allocation, but
+ * even if it is still busy, we will get a forward
+ * allocation.
+ * We try to avoid choosing the segment at "bend",
+ * because that can lead to the next allocation
+ * taking the segment at "fbno", which would be a
+ * backward allocation. We only use the segment at
+ * "fbno" if it is much larger than the current
+ * requested size, because in that case there's a
+ * good chance subsequent allocations will be
+ * contiguous.
+ */
+ if (bbno - fbno >= args->maxlen) {
+ /* left candidate fits perfect */
+ fend = bbno;
+ } else if (fend - bend >= args->maxlen * 4) {
+ /* right candidate has enough free space */
+ fbno = bend;
+ } else if (bbno - fbno >= args->minlen) {
+ /* left candidate fits minimum requirement */
+ fend = bbno;
+ } else {
+ goto fail;
+ }
+ }
+
+ flen = fend - fbno;
+ }
+ spin_unlock(&args->pag->pagb_lock);
+
+ if (fbno != bno || flen != len) {
+ trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len,
+ fbno, flen);
+ }
+ *rbno = fbno;
+ *rlen = flen;
+ return;
+fail:
+ /*
+ * Return a zero extent length as failure indications. All callers
+ * re-check if the trimmed extent satisfies the minlen requirement.
+ */
+ spin_unlock(&args->pag->pagb_lock);
+ trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
+ *rbno = fbno;
+ *rlen = 0;
+}
+
+STATIC void
+xfs_extent_busy_clear_one(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ struct xfs_extent_busy *busyp)
+{
+ if (busyp->length) {
+ trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno,
+ busyp->length);
+ rb_erase(&busyp->rb_node, &pag->pagb_tree);
+ }
+
+ list_del_init(&busyp->list);
+ kmem_free(busyp);
+}
+
+/*
+ * Remove all extents on the passed in list from the busy extents tree.
+ * If do_discard is set skip extents that need to be discarded, and mark
+ * these as undergoing a discard operation instead.
+ */
+void
+xfs_extent_busy_clear(
+ struct xfs_mount *mp,
+ struct list_head *list,
+ bool do_discard)
+{
+ struct xfs_extent_busy *busyp, *n;
+ struct xfs_perag *pag = NULL;
+ xfs_agnumber_t agno = NULLAGNUMBER;
+
+ list_for_each_entry_safe(busyp, n, list, list) {
+ if (busyp->agno != agno) {
+ if (pag) {
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+ }
+ pag = xfs_perag_get(mp, busyp->agno);
+ spin_lock(&pag->pagb_lock);
+ agno = busyp->agno;
+ }
+
+ if (do_discard && busyp->length &&
+ !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD))
+ busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
+ else
+ xfs_extent_busy_clear_one(mp, pag, busyp);
+ }
+
+ if (pag) {
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+ }
+}
+
+/*
+ * Callback for list_sort to sort busy extents by the AG they reside in.
+ */
+int
+xfs_extent_busy_ag_cmp(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ return container_of(a, struct xfs_extent_busy, list)->agno -
+ container_of(b, struct xfs_extent_busy, list)->agno;
+}
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
new file mode 100644
index 00000000000..985412d65ba
--- /dev/null
+++ b/fs/xfs/xfs_extent_busy.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2010 David Chinner.
+ * Copyright (c) 2011 Christoph Hellwig.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_EXTENT_BUSY_H__
+#define __XFS_EXTENT_BUSY_H__
+
+/*
+ * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
+ * have been freed but whose transactions aren't committed to disk yet.
+ *
+ * Note that we use the transaction ID to record the transaction, not the
+ * transaction structure itself. See xfs_extent_busy_insert() for details.
+ */
+struct xfs_extent_busy {
+ struct rb_node rb_node; /* ag by-bno indexed search tree */
+ struct list_head list; /* transaction busy extent list */
+ xfs_agnumber_t agno;
+ xfs_agblock_t bno;
+ xfs_extlen_t length;
+ unsigned int flags;
+#define XFS_EXTENT_BUSY_DISCARDED 0x01 /* undergoing a discard op. */
+#define XFS_EXTENT_BUSY_SKIP_DISCARD 0x02 /* do not discard */
+};
+
+void
+xfs_extent_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
+ xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
+
+void
+xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list,
+ bool do_discard);
+
+int
+xfs_extent_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t bno, xfs_extlen_t len);
+
+void
+xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
+
+void
+xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t bno,
+ xfs_extlen_t len, xfs_agblock_t *rbno, xfs_extlen_t *rlen);
+
+int
+xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
+
+static inline void xfs_extent_busy_sort(struct list_head *list)
+{
+ list_sort(NULL, list, xfs_extent_busy_ag_cmp);
+}
+
+#endif /* __XFS_EXTENT_BUSY_H__ */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 35c2aff38b2..feb36d7551a 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -19,7 +19,6 @@
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_sb.h"
@@ -64,7 +63,8 @@ __xfs_efi_release(
if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) {
spin_lock(&ailp->xa_lock);
/* xfs_trans_ail_delete() drops the AIL lock. */
- xfs_trans_ail_delete(ailp, &efip->efi_item);
+ xfs_trans_ail_delete(ailp, &efip->efi_item,
+ SHUTDOWN_LOG_IO_ERROR);
xfs_efi_item_free(efip);
}
}
@@ -147,22 +147,20 @@ xfs_efi_item_unpin(
}
/*
- * Efi items have no locking or pushing. However, since EFIs are
- * pulled from the AIL when their corresponding EFDs are committed
- * to disk, their situation is very similar to being pinned. Return
- * XFS_ITEM_PINNED so that the caller will eventually flush the log.
- * This should help in getting the EFI out of the AIL.
+ * Efi items have no locking or pushing. However, since EFIs are pulled from
+ * the AIL when their corresponding EFDs are committed to disk, their situation
+ * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller
+ * will eventually flush the log. This should help in getting the EFI out of
+ * the AIL.
*/
STATIC uint
-xfs_efi_item_trylock(
- struct xfs_log_item *lip)
+xfs_efi_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
{
return XFS_ITEM_PINNED;
}
-/*
- * Efi items have no locking, so just return.
- */
STATIC void
xfs_efi_item_unlock(
struct xfs_log_item *lip)
@@ -190,17 +188,6 @@ xfs_efi_item_committed(
}
/*
- * There isn't much you can do to push on an efi item. It is simply
- * stuck waiting for all of its corresponding efd items to be
- * committed to disk.
- */
-STATIC void
-xfs_efi_item_push(
- struct xfs_log_item *lip)
-{
-}
-
-/*
* The EFI dependency tracking op doesn't do squat. It can't because
* it doesn't know where the free extent is coming from. The dependency
* tracking has to be handled by the "enclosing" metadata object. For
@@ -222,7 +209,6 @@ static const struct xfs_item_ops xfs_efi_item_ops = {
.iop_format = xfs_efi_item_format,
.iop_pin = xfs_efi_item_pin,
.iop_unpin = xfs_efi_item_unpin,
- .iop_trylock = xfs_efi_item_trylock,
.iop_unlock = xfs_efi_item_unlock,
.iop_committed = xfs_efi_item_committed,
.iop_push = xfs_efi_item_push,
@@ -404,19 +390,17 @@ xfs_efd_item_unpin(
}
/*
- * Efd items have no locking, so just return success.
+ * There isn't much you can do to push on an efd item. It is simply stuck
+ * waiting for the log to be flushed to disk.
*/
STATIC uint
-xfs_efd_item_trylock(
- struct xfs_log_item *lip)
+xfs_efd_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
{
- return XFS_ITEM_LOCKED;
+ return XFS_ITEM_PINNED;
}
-/*
- * Efd items have no locking or pushing, so return failure
- * so that the caller doesn't bother with us.
- */
STATIC void
xfs_efd_item_unlock(
struct xfs_log_item *lip)
@@ -451,16 +435,6 @@ xfs_efd_item_committed(
}
/*
- * There isn't much you can do to push on an efd item. It is simply
- * stuck waiting for the log to be flushed to disk.
- */
-STATIC void
-xfs_efd_item_push(
- struct xfs_log_item *lip)
-{
-}
-
-/*
* The EFD dependency tracking op doesn't do squat. It can't because
* it doesn't know where the free extent is coming from. The dependency
* tracking has to be handled by the "enclosing" metadata object. For
@@ -482,7 +456,6 @@ static const struct xfs_item_ops xfs_efd_item_ops = {
.iop_format = xfs_efd_item_format,
.iop_pin = xfs_efd_item_pin,
.iop_unpin = xfs_efd_item_unpin,
- .iop_trylock = xfs_efd_item_trylock,
.iop_unlock = xfs_efd_item_unlock,
.iop_committed = xfs_efd_item_committed,
.iop_push = xfs_efd_item_push,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 753ed9b5c70..9f7ec15a652 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -17,9 +17,7 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_trans.h"
@@ -163,7 +161,6 @@ xfs_file_fsync(
struct inode *inode = file->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- struct xfs_trans *tp;
int error = 0;
int log_flushed = 0;
xfs_lsn_t lsn = 0;
@@ -194,75 +191,18 @@ xfs_file_fsync(
}
/*
- * We always need to make sure that the required inode state is safe on
- * disk. The inode might be clean but we still might need to force the
- * log because of committed transactions that haven't hit the disk yet.
- * Likewise, there could be unflushed non-transactional changes to the
- * inode core that have to go to disk and this requires us to issue
- * a synchronous transaction to capture these changes correctly.
- *
- * This code relies on the assumption that if the i_update_core field
- * of the inode is clear and the inode is unpinned then it is clean
- * and no action is required.
+ * All metadata updates are logged, which means that we just have
+ * to flush the log up to the latest LSN that touched the inode.
*/
xfs_ilock(ip, XFS_ILOCK_SHARED);
-
- /*
- * First check if the VFS inode is marked dirty. All the dirtying
- * of non-transactional updates no goes through mark_inode_dirty*,
- * which allows us to distinguish beteeen pure timestamp updates
- * and i_size updates which need to be caught for fdatasync.
- * After that also theck for the dirty state in the XFS inode, which
- * might gets cleared when the inode gets written out via the AIL
- * or xfs_iflush_cluster.
- */
- if (((inode->i_state & I_DIRTY_DATASYNC) ||
- ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
- ip->i_update_core) {
- /*
- * Kick off a transaction to log the inode core to get the
- * updates. The sync transaction will also force the log.
- */
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
- error = xfs_trans_reserve(tp, 0,
- XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
- if (error) {
- xfs_trans_cancel(tp, 0);
- return -error;
- }
- xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- /*
- * Note - it's possible that we might have pushed ourselves out
- * of the way during trans_reserve which would flush the inode.
- * But there's no guarantee that the inode buffer has actually
- * gone out yet (it's delwri). Plus the buffer could be pinned
- * anyway if it's part of an inode in another recent
- * transaction. So we play it safe and fire off the
- * transaction anyway.
- */
- xfs_trans_ijoin(tp, ip, 0);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = xfs_trans_commit(tp, 0);
-
- lsn = ip->i_itemp->ili_last_lsn;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- } else {
- /*
- * Timestamps/size haven't changed since last inode flush or
- * inode transaction commit. That means either nothing got
- * written or a transaction committed which caught the updates.
- * If the latter happened and the transaction hasn't hit the
- * disk yet, the inode will be still be pinned. If it is,
- * force the log.
- */
- if (xfs_ipincount(ip))
+ if (xfs_ipincount(ip)) {
+ if (!datasync ||
+ (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP))
lsn = ip->i_itemp->ili_last_lsn;
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
}
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
- if (!error && lsn)
+ if (lsn)
error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
/*
@@ -327,7 +267,7 @@ xfs_file_aio_read(
mp->m_rtdev_targp : mp->m_ddev_targp;
if ((iocb->ki_pos & target->bt_smask) ||
(size & target->bt_smask)) {
- if (iocb->ki_pos == ip->i_size)
+ if (iocb->ki_pos == i_size_read(inode))
return 0;
return -XFS_ERROR(EINVAL);
}
@@ -412,51 +352,6 @@ xfs_file_splice_read(
return ret;
}
-STATIC void
-xfs_aio_write_isize_update(
- struct inode *inode,
- loff_t *ppos,
- ssize_t bytes_written)
-{
- struct xfs_inode *ip = XFS_I(inode);
- xfs_fsize_t isize = i_size_read(inode);
-
- if (bytes_written > 0)
- XFS_STATS_ADD(xs_write_bytes, bytes_written);
-
- if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
- *ppos > isize))
- *ppos = isize;
-
- if (*ppos > ip->i_size) {
- xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
- if (*ppos > ip->i_size)
- ip->i_size = *ppos;
- xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
- }
-}
-
-/*
- * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
- * part of the I/O may have been written to disk before the error occurred. In
- * this case the on-disk file size may have been adjusted beyond the in-memory
- * file size and now needs to be truncated back.
- */
-STATIC void
-xfs_aio_write_newsize_update(
- struct xfs_inode *ip,
- xfs_fsize_t new_size)
-{
- if (new_size == ip->i_new_size) {
- xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
- if (new_size == ip->i_new_size)
- ip->i_new_size = 0;
- if (ip->i_d.di_size > ip->i_size)
- ip->i_d.di_size = ip->i_size;
- xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
- }
-}
-
/*
* xfs_file_splice_write() does not use xfs_rw_ilock() because
* generic_file_splice_write() takes the i_mutex itself. This, in theory,
@@ -475,7 +370,6 @@ xfs_file_splice_write(
{
struct inode *inode = outfilp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- xfs_fsize_t new_size;
int ioflags = 0;
ssize_t ret;
@@ -489,132 +383,107 @@ xfs_file_splice_write(
xfs_ilock(ip, XFS_IOLOCK_EXCL);
- new_size = *ppos + count;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (new_size > ip->i_size)
- ip->i_new_size = new_size;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
+ if (ret > 0)
+ XFS_STATS_ADD(xs_write_bytes, ret);
- xfs_aio_write_isize_update(inode, ppos, ret);
- xfs_aio_write_newsize_update(ip, new_size);
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
}
/*
- * This routine is called to handle zeroing any space in the last
- * block of the file that is beyond the EOF. We do this since the
- * size is being increased without writing anything to that block
- * and we don't want anyone to read the garbage on the disk.
+ * This routine is called to handle zeroing any space in the last block of the
+ * file that is beyond the EOF. We do this since the size is being increased
+ * without writing anything to that block and we don't want to read the
+ * garbage on the disk.
*/
STATIC int /* error (positive) */
xfs_zero_last_block(
- xfs_inode_t *ip,
- xfs_fsize_t offset,
- xfs_fsize_t isize)
+ struct xfs_inode *ip,
+ xfs_fsize_t offset,
+ xfs_fsize_t isize)
{
- xfs_fileoff_t last_fsb;
- xfs_mount_t *mp = ip->i_mount;
- int nimaps;
- int zero_offset;
- int zero_len;
- int error = 0;
- xfs_bmbt_irec_t imap;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
- zero_offset = XFS_B_FSB_OFFSET(mp, isize);
- if (zero_offset == 0) {
- /*
- * There are no extra bytes in the last block on disk to
- * zero, so return.
- */
- return 0;
- }
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize);
+ int zero_offset = XFS_B_FSB_OFFSET(mp, isize);
+ int zero_len;
+ int nimaps = 1;
+ int error = 0;
+ struct xfs_bmbt_irec imap;
- last_fsb = XFS_B_TO_FSBT(mp, isize);
- nimaps = 1;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
return error;
+
ASSERT(nimaps > 0);
+
/*
* If the block underlying isize is just a hole, then there
* is nothing to zero.
*/
- if (imap.br_startblock == HOLESTARTBLOCK) {
+ if (imap.br_startblock == HOLESTARTBLOCK)
return 0;
- }
- /*
- * Zero the part of the last block beyond the EOF, and write it
- * out sync. We need to drop the ilock while we do this so we
- * don't deadlock when the buffer cache calls back to us.
- */
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
zero_len = mp->m_sb.sb_blocksize - zero_offset;
if (isize + zero_len > offset)
zero_len = offset - isize;
- error = xfs_iozero(ip, isize, zero_len);
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- ASSERT(error >= 0);
- return error;
+ return xfs_iozero(ip, isize, zero_len);
}
/*
- * Zero any on disk space between the current EOF and the new,
- * larger EOF. This handles the normal case of zeroing the remainder
- * of the last block in the file and the unusual case of zeroing blocks
- * out beyond the size of the file. This second case only happens
- * with fixed size extents and when the system crashes before the inode
- * size was updated but after blocks were allocated. If fill is set,
- * then any holes in the range are filled and zeroed. If not, the holes
- * are left alone as holes.
+ * Zero any on disk space between the current EOF and the new, larger EOF.
+ *
+ * This handles the normal case of zeroing the remainder of the last block in
+ * the file and the unusual case of zeroing blocks out beyond the size of the
+ * file. This second case only happens with fixed size extents and when the
+ * system crashes before the inode size was updated but after blocks were
+ * allocated.
+ *
+ * Expects the iolock to be held exclusive, and will take the ilock internally.
*/
-
int /* error (positive) */
xfs_zero_eof(
- xfs_inode_t *ip,
- xfs_off_t offset, /* starting I/O offset */
- xfs_fsize_t isize) /* current inode size */
+ struct xfs_inode *ip,
+ xfs_off_t offset, /* starting I/O offset */
+ xfs_fsize_t isize) /* current inode size */
{
- xfs_mount_t *mp = ip->i_mount;
- xfs_fileoff_t start_zero_fsb;
- xfs_fileoff_t end_zero_fsb;
- xfs_fileoff_t zero_count_fsb;
- xfs_fileoff_t last_fsb;
- xfs_fileoff_t zero_off;
- xfs_fsize_t zero_len;
- int nimaps;
- int error = 0;
- xfs_bmbt_irec_t imap;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t start_zero_fsb;
+ xfs_fileoff_t end_zero_fsb;
+ xfs_fileoff_t zero_count_fsb;
+ xfs_fileoff_t last_fsb;
+ xfs_fileoff_t zero_off;
+ xfs_fsize_t zero_len;
+ int nimaps;
+ int error = 0;
+ struct xfs_bmbt_irec imap;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
ASSERT(offset > isize);
/*
* First handle zeroing the block on which isize resides.
+ *
* We only zero a part of that block so it is handled specially.
*/
- error = xfs_zero_last_block(ip, offset, isize);
- if (error) {
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
- return error;
+ if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
+ error = xfs_zero_last_block(ip, offset, isize);
+ if (error)
+ return error;
}
/*
- * Calculate the range between the new size and the old
- * where blocks needing to be zeroed may exist. To get the
- * block where the last byte in the file currently resides,
- * we need to subtract one from the size and truncate back
- * to a block boundary. We subtract 1 in case the size is
- * exactly on a block boundary.
+ * Calculate the range between the new size and the old where blocks
+ * needing to be zeroed may exist.
+ *
+ * To get the block where the last byte in the file currently resides,
+ * we need to subtract one from the size and truncate back to a block
+ * boundary. We subtract 1 in case the size is exactly on a block
+ * boundary.
*/
last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
@@ -632,23 +501,18 @@ xfs_zero_eof(
while (start_zero_fsb <= end_zero_fsb) {
nimaps = 1;
zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
&imap, &nimaps, 0);
- if (error) {
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
return error;
- }
+
ASSERT(nimaps > 0);
if (imap.br_state == XFS_EXT_UNWRITTEN ||
imap.br_startblock == HOLESTARTBLOCK) {
- /*
- * This loop handles initializing pages that were
- * partially initialized by the code below this
- * loop. It basically zeroes the part of the page
- * that sits on a hole and sets the page as P_HOLE
- * and calls remapf if it is a mapped file.
- */
start_zero_fsb = imap.br_startoff + imap.br_blockcount;
ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
continue;
@@ -656,11 +520,7 @@ xfs_zero_eof(
/*
* There are blocks we need to zero.
- * Drop the inode lock while we're doing the I/O.
- * We'll still have the iolock to protect us.
*/
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
@@ -668,99 +528,76 @@ xfs_zero_eof(
zero_len = offset - zero_off;
error = xfs_iozero(ip, zero_off, zero_len);
- if (error) {
- goto out_lock;
- }
+ if (error)
+ return error;
start_zero_fsb = imap.br_startoff + imap.br_blockcount;
ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
}
return 0;
-
-out_lock:
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- ASSERT(error >= 0);
- return error;
}
/*
* Common pre-write limit and setup checks.
*
- * Returns with iolock held according to @iolock.
+ * Called with the iolocked held either shared and exclusive according to
+ * @iolock, and returns with it held. Might upgrade the iolock to exclusive
+ * if called for a direct write beyond i_size.
*/
STATIC ssize_t
xfs_file_aio_write_checks(
struct file *file,
loff_t *pos,
size_t *count,
- xfs_fsize_t *new_sizep,
int *iolock)
{
struct inode *inode = file->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- xfs_fsize_t new_size;
int error = 0;
- xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
- *new_sizep = 0;
restart:
error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
- if (error) {
- xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
- *iolock = 0;
+ if (error)
return error;
- }
-
- if (likely(!(file->f_mode & FMODE_NOCMTIME)))
- file_update_time(file);
/*
* If the offset is beyond the size of the file, we need to zero any
* blocks that fall between the existing EOF and the start of this
- * write. There is no need to issue zeroing if another in-flght IO ends
- * at or before this one If zeronig is needed and we are currently
- * holding the iolock shared, we need to update it to exclusive which
- * involves dropping all locks and relocking to maintain correct locking
- * order. If we do this, restart the function to ensure all checks and
- * values are still valid.
+ * write. If zeroing is needed and we are currently holding the
+ * iolock shared, we need to update it to exclusive which implies
+ * having to redo all checks before.
*/
- if ((ip->i_new_size && *pos > ip->i_new_size) ||
- (!ip->i_new_size && *pos > ip->i_size)) {
+ if (*pos > i_size_read(inode)) {
if (*iolock == XFS_IOLOCK_SHARED) {
- xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
+ xfs_rw_iunlock(ip, *iolock);
*iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
+ xfs_rw_ilock(ip, *iolock);
goto restart;
}
- error = -xfs_zero_eof(ip, *pos, ip->i_size);
+ error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
+ if (error)
+ return error;
}
/*
- * If this IO extends beyond EOF, we may need to update ip->i_new_size.
- * We have already zeroed space beyond EOF (if necessary). Only update
- * ip->i_new_size if this IO ends beyond any other in-flight writes.
+ * Updating the timestamps will grab the ilock again from
+ * xfs_fs_dirty_inode, so we have to call it after dropping the
+ * lock above. Eventually we should look into a way to avoid
+ * the pointless lock roundtrip.
*/
- new_size = *pos + *count;
- if (new_size > ip->i_size) {
- if (new_size > ip->i_new_size)
- ip->i_new_size = new_size;
- *new_sizep = new_size;
+ if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
+ error = file_update_time(file);
+ if (error)
+ return error;
}
- xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- return error;
-
/*
* If we're writing the file then make sure to clear the setuid and
* setgid bits if the process is not being run by root. This keeps
* people from modifying setuid and setgid binaries.
*/
return file_remove_suid(file);
-
}
/*
@@ -794,9 +631,7 @@ xfs_file_dio_aio_write(
const struct iovec *iovp,
unsigned long nr_segs,
loff_t pos,
- size_t ocount,
- xfs_fsize_t *new_size,
- int *iolock)
+ size_t ocount)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -806,10 +641,10 @@ xfs_file_dio_aio_write(
ssize_t ret = 0;
size_t count = ocount;
int unaligned_io = 0;
+ int iolock;
struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
- *iolock = 0;
if ((pos & target->bt_smask) || (count & target->bt_smask))
return -XFS_ERROR(EINVAL);
@@ -824,31 +659,31 @@ xfs_file_dio_aio_write(
* EOF zeroing cases and fill out the new inode size as appropriate.
*/
if (unaligned_io || mapping->nrpages)
- *iolock = XFS_IOLOCK_EXCL;
+ iolock = XFS_IOLOCK_EXCL;
else
- *iolock = XFS_IOLOCK_SHARED;
- xfs_rw_ilock(ip, *iolock);
+ iolock = XFS_IOLOCK_SHARED;
+ xfs_rw_ilock(ip, iolock);
/*
* Recheck if there are cached pages that need invalidate after we got
* the iolock to protect against other threads adding new pages while
* we were waiting for the iolock.
*/
- if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) {
- xfs_rw_iunlock(ip, *iolock);
- *iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, *iolock);
+ if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
+ xfs_rw_iunlock(ip, iolock);
+ iolock = XFS_IOLOCK_EXCL;
+ xfs_rw_ilock(ip, iolock);
}
- ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
+ ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
if (ret)
- return ret;
+ goto out;
if (mapping->nrpages) {
ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
FI_REMAPF_LOCKED);
if (ret)
- return ret;
+ goto out;
}
/*
@@ -857,15 +692,18 @@ xfs_file_dio_aio_write(
*/
if (unaligned_io)
inode_dio_wait(inode);
- else if (*iolock == XFS_IOLOCK_EXCL) {
+ else if (iolock == XFS_IOLOCK_EXCL) {
xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
- *iolock = XFS_IOLOCK_SHARED;
+ iolock = XFS_IOLOCK_SHARED;
}
trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
ret = generic_file_direct_write(iocb, iovp,
&nr_segs, pos, &iocb->ki_pos, count, ocount);
+out:
+ xfs_rw_iunlock(ip, iolock);
+
/* No fallback to buffered IO on errors for XFS. */
ASSERT(ret < 0 || ret == count);
return ret;
@@ -877,9 +715,7 @@ xfs_file_buffered_aio_write(
const struct iovec *iovp,
unsigned long nr_segs,
loff_t pos,
- size_t ocount,
- xfs_fsize_t *new_size,
- int *iolock)
+ size_t ocount)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -887,14 +723,14 @@ xfs_file_buffered_aio_write(
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
int enospc = 0;
+ int iolock = XFS_IOLOCK_EXCL;
size_t count = ocount;
- *iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, *iolock);
+ xfs_rw_ilock(ip, iolock);
- ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
+ ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
if (ret)
- return ret;
+ goto out;
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
@@ -908,13 +744,15 @@ write_retry:
* page locks and retry *once*
*/
if (ret == -ENOSPC && !enospc) {
- ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
- if (ret)
- return ret;
enospc = 1;
- goto write_retry;
+ ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
+ if (!ret)
+ goto write_retry;
}
+
current->backing_dev_info = NULL;
+out:
+ xfs_rw_iunlock(ip, iolock);
return ret;
}
@@ -930,9 +768,7 @@ xfs_file_aio_write(
struct inode *inode = mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
- int iolock;
size_t ocount = 0;
- xfs_fsize_t new_size = 0;
XFS_STATS_INC(xs_write_calls);
@@ -951,33 +787,22 @@ xfs_file_aio_write(
return -EIO;
if (unlikely(file->f_flags & O_DIRECT))
- ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
- ocount, &new_size, &iolock);
+ ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount);
else
ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
- ocount, &new_size, &iolock);
-
- xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
+ ocount);
- if (ret <= 0)
- goto out_unlock;
+ if (ret > 0) {
+ ssize_t err;
- /* Handle various SYNC-type writes */
- if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
- loff_t end = pos + ret - 1;
- int error;
+ XFS_STATS_ADD(xs_write_bytes, ret);
- xfs_rw_iunlock(ip, iolock);
- error = xfs_file_fsync(file, pos, end,
- (file->f_flags & __O_SYNC) ? 0 : 1);
- xfs_rw_ilock(ip, iolock);
- if (error)
- ret = error;
+ /* Handle various SYNC-type writes */
+ err = generic_write_sync(file, pos, ret);
+ if (err < 0)
+ ret = err;
}
-out_unlock:
- xfs_aio_write_newsize_update(ip, new_size);
- xfs_rw_iunlock(ip, iolock);
return ret;
}
@@ -1141,8 +966,149 @@ xfs_vm_page_mkwrite(
return block_page_mkwrite(vma, vmf, xfs_get_blocks);
}
+STATIC loff_t
+xfs_seek_data(
+ struct file *file,
+ loff_t start,
+ u32 type)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec map[2];
+ int nmap = 2;
+ loff_t uninitialized_var(offset);
+ xfs_fsize_t isize;
+ xfs_fileoff_t fsbno;
+ xfs_filblks_t end;
+ uint lock;
+ int error;
+
+ lock = xfs_ilock_map_shared(ip);
+
+ isize = i_size_read(inode);
+ if (start >= isize) {
+ error = ENXIO;
+ goto out_unlock;
+ }
+
+ fsbno = XFS_B_TO_FSBT(mp, start);
+
+ /*
+ * Try to read extents from the first block indicated
+ * by fsbno to the end block of the file.
+ */
+ end = XFS_B_TO_FSB(mp, isize);
+
+ error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
+ XFS_BMAPI_ENTIRE);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * Treat unwritten extent as data extent since it might
+ * contains dirty data in page cache.
+ */
+ if (map[0].br_startblock != HOLESTARTBLOCK) {
+ offset = max_t(loff_t, start,
+ XFS_FSB_TO_B(mp, map[0].br_startoff));
+ } else {
+ if (nmap == 1) {
+ error = ENXIO;
+ goto out_unlock;
+ }
+
+ offset = max_t(loff_t, start,
+ XFS_FSB_TO_B(mp, map[1].br_startoff));
+ }
+
+ if (offset != file->f_pos)
+ file->f_pos = offset;
+
+out_unlock:
+ xfs_iunlock_map_shared(ip, lock);
+
+ if (error)
+ return -error;
+ return offset;
+}
+
+STATIC loff_t
+xfs_seek_hole(
+ struct file *file,
+ loff_t start,
+ u32 type)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ loff_t uninitialized_var(offset);
+ loff_t holeoff;
+ xfs_fsize_t isize;
+ xfs_fileoff_t fsbno;
+ uint lock;
+ int error;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -XFS_ERROR(EIO);
+
+ lock = xfs_ilock_map_shared(ip);
+
+ isize = i_size_read(inode);
+ if (start >= isize) {
+ error = ENXIO;
+ goto out_unlock;
+ }
+
+ fsbno = XFS_B_TO_FSBT(mp, start);
+ error = xfs_bmap_first_unused(NULL, ip, 1, &fsbno, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+
+ holeoff = XFS_FSB_TO_B(mp, fsbno);
+ if (holeoff <= start)
+ offset = start;
+ else {
+ /*
+ * xfs_bmap_first_unused() could return a value bigger than
+ * isize if there are no more holes past the supplied offset.
+ */
+ offset = min_t(loff_t, holeoff, isize);
+ }
+
+ if (offset != file->f_pos)
+ file->f_pos = offset;
+
+out_unlock:
+ xfs_iunlock_map_shared(ip, lock);
+
+ if (error)
+ return -error;
+ return offset;
+}
+
+STATIC loff_t
+xfs_file_llseek(
+ struct file *file,
+ loff_t offset,
+ int origin)
+{
+ switch (origin) {
+ case SEEK_END:
+ case SEEK_CUR:
+ case SEEK_SET:
+ return generic_file_llseek(file, offset, origin);
+ case SEEK_DATA:
+ return xfs_seek_data(file, offset, origin);
+ case SEEK_HOLE:
+ return xfs_seek_hole(file, offset, origin);
+ default:
+ return -EINVAL;
+ }
+}
+
const struct file_operations xfs_file_operations = {
- .llseek = generic_file_llseek,
+ .llseek = xfs_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.aio_read = xfs_file_aio_read,
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
index ed88ed16811..652b875a9d4 100644
--- a/fs/xfs/xfs_fs_subr.c
+++ b/fs/xfs/xfs_fs_subr.c
@@ -90,7 +90,7 @@ xfs_wait_on_pages(
if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
return -filemap_fdatawait_range(mapping, first,
- last == -1 ? ip->i_size - 1 : last);
+ last == -1 ? XFS_ISIZE(ip) - 1 : last);
}
return 0;
}
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 1c6fdeb702f..c25b094efbf 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -18,8 +18,6 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
@@ -39,7 +37,6 @@
#include "xfs_itable.h"
#include "xfs_trans_space.h"
#include "xfs_rtalloc.h"
-#include "xfs_rw.h"
#include "xfs_filestream.h"
#include "xfs_trace.h"
@@ -147,9 +144,9 @@ xfs_growfs_data_private(
if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
return error;
dpct = pct - mp->m_sb.sb_imax_pct;
- bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
+ bp = xfs_buf_read_uncached(mp->m_ddev_targp,
XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
- BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
+ XFS_FSS_TO_BB(mp, 1), 0);
if (!bp)
return EIO;
xfs_buf_relse(bp);
@@ -193,7 +190,7 @@ xfs_growfs_data_private(
*/
bp = xfs_buf_get(mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED);
+ XFS_FSS_TO_BB(mp, 1), 0);
if (!bp) {
error = ENOMEM;
goto error0;
@@ -230,7 +227,7 @@ xfs_growfs_data_private(
*/
bp = xfs_buf_get(mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED);
+ XFS_FSS_TO_BB(mp, 1), 0);
if (!bp) {
error = ENOMEM;
goto error0;
@@ -259,8 +256,7 @@ xfs_growfs_data_private(
*/
bp = xfs_buf_get(mp->m_ddev_targp,
XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize),
- XBF_LOCK | XBF_MAPPED);
+ BTOBB(mp->m_sb.sb_blocksize), 0);
if (!bp) {
error = ENOMEM;
goto error0;
@@ -286,8 +282,7 @@ xfs_growfs_data_private(
*/
bp = xfs_buf_get(mp->m_ddev_targp,
XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize),
- XBF_LOCK | XBF_MAPPED);
+ BTOBB(mp->m_sb.sb_blocksize), 0);
if (!bp) {
error = ENOMEM;
goto error0;
@@ -314,8 +309,7 @@ xfs_growfs_data_private(
*/
bp = xfs_buf_get(mp->m_ddev_targp,
XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize),
- XBF_LOCK | XBF_MAPPED);
+ BTOBB(mp->m_sb.sb_blocksize), 0);
if (!bp) {
error = ENOMEM;
goto error0;
@@ -405,7 +399,7 @@ xfs_growfs_data_private(
/* update secondary superblocks. */
for (agno = 1; agno < nagcount; agno++) {
- error = xfs_read_buf(mp, mp->m_ddev_targp,
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &bp);
if (error) {
@@ -693,3 +687,63 @@ xfs_fs_goingdown(
return 0;
}
+
+/*
+ * Force a shutdown of the filesystem instantly while keeping the filesystem
+ * consistent. We don't do an unmount here; just shutdown the shop, make sure
+ * that absolutely nothing persistent happens to this filesystem after this
+ * point.
+ */
+void
+xfs_do_force_shutdown(
+ xfs_mount_t *mp,
+ int flags,
+ char *fname,
+ int lnnum)
+{
+ int logerror;
+
+ logerror = flags & SHUTDOWN_LOG_IO_ERROR;
+
+ if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
+ xfs_notice(mp,
+ "%s(0x%x) called from line %d of file %s. Return address = 0x%p",
+ __func__, flags, lnnum, fname, __return_address);
+ }
+ /*
+ * No need to duplicate efforts.
+ */
+ if (XFS_FORCED_SHUTDOWN(mp) && !logerror)
+ return;
+
+ /*
+ * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
+ * queue up anybody new on the log reservations, and wakes up
+ * everybody who's sleeping on log reservations to tell them
+ * the bad news.
+ */
+ if (xfs_log_force_umount(mp, logerror))
+ return;
+
+ if (flags & SHUTDOWN_CORRUPT_INCORE) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
+ "Corruption of in-memory data detected. Shutting down filesystem");
+ if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
+ xfs_stack_trace();
+ } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
+ if (logerror) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
+ "Log I/O Error Detected. Shutting down filesystem");
+ } else if (flags & SHUTDOWN_DEVICE_REQ) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
+ "All device paths lost. Shutting down filesystem");
+ } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
+ "I/O Error Detected. Shutting down filesystem");
+ }
+ }
+ if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
+ xfs_alert(mp,
+ "Please umount the filesystem and rectify the problem(s)");
+ }
+}
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 169380e6605..177a21a7ac4 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -200,8 +200,7 @@ xfs_ialloc_inode_init(
*/
d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
- mp->m_bsize * blks_per_cluster,
- XBF_LOCK);
+ mp->m_bsize * blks_per_cluster, 0);
if (!fbuf)
return ENOMEM;
/*
@@ -447,7 +446,7 @@ STATIC xfs_buf_t * /* allocation group buffer */
xfs_ialloc_ag_select(
xfs_trans_t *tp, /* transaction pointer */
xfs_ino_t parent, /* parent directory inode number */
- mode_t mode, /* bits set to indicate file type */
+ umode_t mode, /* bits set to indicate file type */
int okalloc) /* ok to allocate more space */
{
xfs_buf_t *agbp; /* allocation group header buffer */
@@ -610,6 +609,13 @@ xfs_ialloc_get_rec(
/*
* Visible inode allocation functions.
*/
+/*
+ * Find a free (set) bit in the inode bitmask.
+ */
+static inline int xfs_ialloc_find_free(xfs_inofree_t *fp)
+{
+ return xfs_lowbit64(*fp);
+}
/*
* Allocate an inode on disk.
@@ -640,7 +646,7 @@ int
xfs_dialloc(
xfs_trans_t *tp, /* transaction pointer */
xfs_ino_t parent, /* parent inode (directory) */
- mode_t mode, /* mode bits for new inode */
+ umode_t mode, /* mode bits for new inode */
int okalloc, /* ok to allocate more space */
xfs_buf_t **IO_agbp, /* in/out ag header's buffer */
boolean_t *alloc_done, /* true if we needed to replenish
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index bb5385475e1..65ac57c8063 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -47,15 +47,6 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
}
/*
- * Find a free (set) bit in the inode bitmask.
- */
-static inline int xfs_ialloc_find_free(xfs_inofree_t *fp)
-{
- return xfs_lowbit64(*fp);
-}
-
-
-/*
* Allocate an inode on disk.
* Mode is used to tell whether the new inode will need space, and whether
* it is a directory.
@@ -81,7 +72,7 @@ int /* error */
xfs_dialloc(
struct xfs_trans *tp, /* transaction pointer */
xfs_ino_t parent, /* parent inode (directory) */
- mode_t mode, /* mode bits for new inode */
+ umode_t mode, /* mode bits for new inode */
int okalloc, /* ok to allocate more space */
struct xfs_buf **agbp, /* buf for a.g. inode header */
boolean_t *alloc_done, /* an allocation was done to replenish
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index c6a75815aea..2b8b7a37aa1 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -20,7 +20,6 @@
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 0fa98b1c70e..1bb4365e8c2 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -19,7 +19,6 @@
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_acl.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
@@ -77,7 +76,7 @@ xfs_inode_alloc(
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
- ASSERT(completion_done(&ip->i_flush));
+ ASSERT(!xfs_isiflocked(ip));
ASSERT(ip->i_ino == 0);
mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
@@ -91,11 +90,8 @@ xfs_inode_alloc(
ip->i_afp = NULL;
memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
ip->i_flags = 0;
- ip->i_update_core = 0;
ip->i_delayed_blks = 0;
memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
- ip->i_size = 0;
- ip->i_new_size = 0;
return ip;
}
@@ -107,7 +103,6 @@ xfs_inode_free_callback(
struct inode *inode = container_of(head, struct inode, i_rcu);
struct xfs_inode *ip = XFS_I(inode);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_zone_free(xfs_inode_zone, ip);
}
@@ -127,23 +122,7 @@ xfs_inode_free(
xfs_idestroy_fork(ip, XFS_ATTR_FORK);
if (ip->i_itemp) {
- /*
- * Only if we are shutting down the fs will we see an
- * inode still in the AIL. If it is there, we should remove
- * it to prevent a use-after-free from occurring.
- */
- xfs_log_item_t *lip = &ip->i_itemp->ili_item;
- struct xfs_ail *ailp = lip->li_ailp;
-
- ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
- XFS_FORCED_SHUTDOWN(ip->i_mount));
- if (lip->li_flags & XFS_LI_IN_AIL) {
- spin_lock(&ailp->xa_lock);
- if (lip->li_flags & XFS_LI_IN_AIL)
- xfs_trans_ail_delete(ailp, lip);
- else
- spin_unlock(&ailp->xa_lock);
- }
+ ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
xfs_inode_item_destroy(ip);
ip->i_itemp = NULL;
}
@@ -151,7 +130,7 @@ xfs_inode_free(
/* asserts to verify all state is correct here */
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
- ASSERT(completion_done(&ip->i_flush));
+ ASSERT(!xfs_isiflocked(ip));
/*
* Because we use RCU freeing we need to ensure the inode always
@@ -293,7 +272,7 @@ xfs_iget_cache_hit(
if (lock_flags != 0)
xfs_ilock(ip, lock_flags);
- xfs_iflags_clear(ip, XFS_ISTALE);
+ xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
XFS_STATS_INC(xs_ig_found);
return 0;
@@ -318,6 +297,7 @@ xfs_iget_cache_miss(
struct xfs_inode *ip;
int error;
xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
+ int iflags;
ip = xfs_inode_alloc(mp, ino);
if (!ip)
@@ -337,9 +317,10 @@ xfs_iget_cache_miss(
/*
* Preload the radix tree so we can insert safely under the
* write spinlock. Note that we cannot sleep inside the preload
- * region.
+ * region. Since we can be called from transaction context, don't
+ * recurse into the file system.
*/
- if (radix_tree_preload(GFP_KERNEL)) {
+ if (radix_tree_preload(GFP_NOFS)) {
error = EAGAIN;
goto out_destroy;
}
@@ -353,9 +334,23 @@ xfs_iget_cache_miss(
BUG();
}
- spin_lock(&pag->pag_ici_lock);
+ /*
+ * These values must be set before inserting the inode into the radix
+ * tree as the moment it is inserted a concurrent lookup (allowed by the
+ * RCU locking mechanism) can find it and that lookup must see that this
+ * is an inode currently under construction (i.e. that XFS_INEW is set).
+ * The ip->i_flags_lock that protects the XFS_INEW flag forms the
+ * memory barrier that ensures this detection works correctly at lookup
+ * time.
+ */
+ iflags = XFS_INEW;
+ if (flags & XFS_IGET_DONTCACHE)
+ iflags |= XFS_IDONTCACHE;
+ ip->i_udquot = ip->i_gdquot = NULL;
+ xfs_iflags_set(ip, iflags);
/* insert the new inode */
+ spin_lock(&pag->pag_ici_lock);
error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
if (unlikely(error)) {
WARN_ON(error != -EEXIST);
@@ -363,11 +358,6 @@ xfs_iget_cache_miss(
error = EAGAIN;
goto out_preload_end;
}
-
- /* These values _must_ be set before releasing the radix tree lock! */
- ip->i_udquot = ip->i_gdquot = NULL;
- xfs_iflags_set(ip, XFS_INEW);
-
spin_unlock(&pag->pag_ici_lock);
radix_tree_preload_end();
@@ -421,6 +411,15 @@ xfs_iget(
xfs_perag_t *pag;
xfs_agino_t agino;
+ /*
+ * xfs_reclaim_inode() uses the ILOCK to ensure an inode
+ * doesn't get freed while it's being referenced during a
+ * radix tree traversal here. It assumes this function
+ * aqcuires only the ILOCK (and therefore it has no need to
+ * involve the IOLOCK in this synchronization).
+ */
+ ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
+
/* reject inode numbers outside existing AGs */
if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
return EINVAL;
@@ -451,8 +450,6 @@ again:
*ipp = ip;
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
/*
* If we have a real type for an on-disk inode, we can set ops(&unlock)
* now. If it's a new inode being created, xfs_ialloc will handle it.
@@ -647,8 +644,7 @@ xfs_iunlock(
(XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY |
- XFS_LOCK_DEP_MASK)) == 0);
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
ASSERT(lock_flags != 0);
if (lock_flags & XFS_IOLOCK_EXCL)
@@ -661,16 +657,6 @@ xfs_iunlock(
else if (lock_flags & XFS_ILOCK_SHARED)
mrunlock_shared(&ip->i_lock);
- if ((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) &&
- !(lock_flags & XFS_IUNLOCK_NONOTIFY) && ip->i_itemp) {
- /*
- * Let the AIL know that this item has been unlocked in case
- * it is in the AIL and anyone is waiting on it. Don't do
- * this if the caller has asked us not to.
- */
- xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp,
- (xfs_log_item_t*)(ip->i_itemp));
- }
trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
}
@@ -716,3 +702,19 @@ xfs_isilocked(
return 0;
}
#endif
+
+void
+__xfs_iflock(
+ struct xfs_inode *ip)
+{
+ wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
+ DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
+
+ do {
+ prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ if (xfs_isiflocked(ip))
+ io_schedule();
+ } while (!xfs_iflock_nowait(ip));
+
+ finish_wait(wq, &wait.wait);
+}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 755ee816488..a59eea09930 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -20,7 +20,6 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
@@ -61,6 +60,20 @@ STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
+/*
+ * helper function to extract extent size hint from inode
+ */
+xfs_extlen_t
+xfs_get_extsz_hint(
+ struct xfs_inode *ip)
+{
+ if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
+ return ip->i_d.di_extsize;
+ if (XFS_IS_REALTIME_INODE(ip))
+ return ip->i_mount->m_sb.sb_rextsize;
+ return 0;
+}
+
#ifdef DEBUG
/*
* Make sure that the extents in the given memory buffer
@@ -137,6 +150,7 @@ xfs_imap_to_bp(
int ni;
xfs_buf_t *bp;
+ buf_flags |= XBF_UNMAPPED;
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
(int)imap->im_len, buf_flags, &bp);
if (error) {
@@ -226,7 +240,7 @@ xfs_inotobp(
if (error)
return error;
- error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags);
+ error = xfs_imap_to_bp(mp, tp, &imap, &bp, 0, imap_flags);
if (error)
return error;
@@ -299,11 +313,8 @@ xfs_iformat(
{
xfs_attr_shortform_t *atp;
int size;
- int error;
+ int error = 0;
xfs_fsize_t di_size;
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
- error = 0;
if (unlikely(be32_to_cpu(dip->di_nextents) +
be16_to_cpu(dip->di_anextents) >
@@ -350,7 +361,6 @@ xfs_iformat(
return XFS_ERROR(EFSCORRUPTED);
}
ip->i_d.di_size = 0;
- ip->i_size = 0;
ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
break;
@@ -409,10 +419,10 @@ xfs_iformat(
}
if (!XFS_DFORK_Q(dip))
return 0;
+
ASSERT(ip->i_afp == NULL);
ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
- ip->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+
switch (dip->di_aformat) {
case XFS_DINODE_FMT_LOCAL:
atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
@@ -604,10 +614,11 @@ xfs_iformat_btree(
* or the number of extents is greater than the number of
* blocks.
*/
- if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max
- || XFS_BMDR_SPACE_CALC(nrecs) >
- XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
- || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
+ if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
+ XFS_IFORK_MAXEXT(ip, whichfork) ||
+ XFS_BMDR_SPACE_CALC(nrecs) >
+ XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) ||
+ XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
(unsigned long long) ip->i_ino);
XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
@@ -785,8 +796,7 @@ xfs_iread(
/*
* Get pointers to the on-disk inode and the buffer containing it.
*/
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp,
- XBF_LOCK, iget_flags);
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, 0, iget_flags);
if (error)
return error;
dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
@@ -835,12 +845,6 @@ xfs_iread(
* with the uninitialized part of it.
*/
ip->i_d.di_mode = 0;
- /*
- * Initialize the per-fork minima and maxima for a new
- * inode here. xfs_iformat will do it for old inodes.
- */
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
}
/*
@@ -861,7 +865,6 @@ xfs_iread(
}
ip->i_delayed_blks = 0;
- ip->i_size = ip->i_d.di_size;
/*
* Mark the buffer containing the inode as something to keep
@@ -961,7 +964,7 @@ int
xfs_ialloc(
xfs_trans_t *tp,
xfs_inode_t *pip,
- mode_t mode,
+ umode_t mode,
xfs_nlink_t nlink,
xfs_dev_t rdev,
prid_t prid,
@@ -1002,7 +1005,7 @@ xfs_ialloc(
return error;
ASSERT(ip != NULL);
- ip->i_d.di_mode = (__uint16_t)mode;
+ ip->i_d.di_mode = mode;
ip->i_d.di_onlink = 0;
ip->i_d.di_nlink = nlink;
ASSERT(ip->i_d.di_nlink == nlink);
@@ -1051,7 +1054,6 @@ xfs_ialloc(
}
ip->i_d.di_size = 0;
- ip->i_size = 0;
ip->i_d.di_nextents = 0;
ASSERT(ip->i_d.di_nblocks == 0);
@@ -1166,52 +1168,6 @@ xfs_ialloc(
}
/*
- * Check to make sure that there are no blocks allocated to the
- * file beyond the size of the file. We don't check this for
- * files with fixed size extents or real time extents, but we
- * at least do it for regular files.
- */
-#ifdef DEBUG
-STATIC void
-xfs_isize_check(
- struct xfs_inode *ip,
- xfs_fsize_t isize)
-{
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t map_first;
- int nimaps;
- xfs_bmbt_irec_t imaps[2];
- int error;
-
- if (!S_ISREG(ip->i_d.di_mode))
- return;
-
- if (XFS_IS_REALTIME_INODE(ip))
- return;
-
- if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
- return;
-
- nimaps = 2;
- map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
- /*
- * The filesystem could be shutting down, so bmapi may return
- * an error.
- */
- error = xfs_bmapi_read(ip, map_first,
- (XFS_B_TO_FSB(mp,
- (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first),
- imaps, &nimaps, XFS_BMAPI_ENTIRE);
- if (error)
- return;
- ASSERT(nimaps == 1);
- ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
-}
-#else /* DEBUG */
-#define xfs_isize_check(ip, isize)
-#endif /* DEBUG */
-
-/*
* Free up the underlying blocks past new_size. The new size must be smaller
* than the current size. This routine can be used both for the attribute and
* data fork, and does not modify the inode size, which is left to the caller.
@@ -1252,12 +1208,14 @@ xfs_itruncate_extents(
int done = 0;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
- ASSERT(new_size <= ip->i_size);
+ ASSERT(new_size <= XFS_ISIZE(ip));
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
ASSERT(ip->i_itemp != NULL);
ASSERT(ip->i_itemp->ili_lock_flags == 0);
ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+ trace_xfs_itruncate_extents_start(ip, new_size);
+
/*
* Since it is possible for space to become allocated beyond
* the end of the file (in a crash where the space is allocated
@@ -1325,6 +1283,14 @@ xfs_itruncate_extents(
goto out;
}
+ /*
+ * Always re-log the inode so that our permanent transaction can keep
+ * on rolling it forward in the log.
+ */
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ trace_xfs_itruncate_extents_end(ip, new_size);
+
out:
*tpp = tp;
return error;
@@ -1338,74 +1304,6 @@ out_bmap_cancel:
goto out;
}
-int
-xfs_itruncate_data(
- struct xfs_trans **tpp,
- struct xfs_inode *ip,
- xfs_fsize_t new_size)
-{
- int error;
-
- trace_xfs_itruncate_data_start(ip, new_size);
-
- /*
- * The first thing we do is set the size to new_size permanently on
- * disk. This way we don't have to worry about anyone ever being able
- * to look at the data being freed even in the face of a crash.
- * What we're getting around here is the case where we free a block, it
- * is allocated to another file, it is written to, and then we crash.
- * If the new data gets written to the file but the log buffers
- * containing the free and reallocation don't, then we'd end up with
- * garbage in the blocks being freed. As long as we make the new_size
- * permanent before actually freeing any blocks it doesn't matter if
- * they get written to.
- */
- if (ip->i_d.di_nextents > 0) {
- /*
- * If we are not changing the file size then do not update
- * the on-disk file size - we may be called from
- * xfs_inactive_free_eofblocks(). If we update the on-disk
- * file size and then the system crashes before the contents
- * of the file are flushed to disk then the files may be
- * full of holes (ie NULL files bug).
- */
- if (ip->i_size != new_size) {
- ip->i_d.di_size = new_size;
- ip->i_size = new_size;
- xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
- }
- }
-
- error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size);
- if (error)
- return error;
-
- /*
- * If we are not changing the file size then do not update the on-disk
- * file size - we may be called from xfs_inactive_free_eofblocks().
- * If we update the on-disk file size and then the system crashes
- * before the contents of the file are flushed to disk then the files
- * may be full of holes (ie NULL files bug).
- */
- xfs_isize_check(ip, new_size);
- if (ip->i_size != new_size) {
- ip->i_d.di_size = new_size;
- ip->i_size = new_size;
- }
-
- ASSERT(new_size != 0 || ip->i_delayed_blks == 0);
- ASSERT(new_size != 0 || ip->i_d.di_nextents == 0);
-
- /*
- * Always re-log the inode so that our permanent transaction can keep
- * on rolling it forward in the log.
- */
- xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
-
- trace_xfs_itruncate_data_end(ip, new_size);
- return 0;
-}
-
/*
* This is called when the inode's link count goes to 0.
* We place the on-disk inode on a list in the AGI. It
@@ -1457,7 +1355,7 @@ xfs_iunlink(
* Here we put the head pointer into our next pointer,
* and then we fall through to point the head at us.
*/
- error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
+ error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
if (error)
return error;
@@ -1538,7 +1436,7 @@ xfs_iunlink_remove(
* of dealing with the buffer when there is no need to
* change it.
*/
- error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
+ error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
if (error) {
xfs_warn(mp, "%s: xfs_itobp() returned error %d.",
__func__, error);
@@ -1599,7 +1497,7 @@ xfs_iunlink_remove(
* Now last_ibp points to the buffer previous to us on
* the unlinked list. Pull us from the list.
*/
- error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
+ error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
if (error) {
xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.",
__func__, error);
@@ -1681,8 +1579,7 @@ xfs_ifree_cluster(
* to mark all the active inodes on the buffer stale.
*/
bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
- mp->m_bsize * blks_per_cluster,
- XBF_LOCK);
+ mp->m_bsize * blks_per_cluster, 0);
if (!bp)
return ENOMEM;
@@ -1771,14 +1668,13 @@ retry:
iip = ip->i_itemp;
if (!iip || xfs_inode_clean(ip)) {
ASSERT(ip != free_ip);
- ip->i_update_core = 0;
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
continue;
}
- iip->ili_last_fields = iip->ili_format.ilf_fields;
- iip->ili_format.ilf_fields = 0;
+ iip->ili_last_fields = iip->ili_fields;
+ iip->ili_fields = 0;
iip->ili_logged = 1;
xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
&iip->ili_item.li_lsn);
@@ -1824,8 +1720,7 @@ xfs_ifree(
ASSERT(ip->i_d.di_nlink == 0);
ASSERT(ip->i_d.di_nextents == 0);
ASSERT(ip->i_d.di_anextents == 0);
- ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) ||
- (!S_ISREG(ip->i_d.di_mode)));
+ ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode));
ASSERT(ip->i_d.di_nblocks == 0);
/*
@@ -1844,8 +1739,6 @@ xfs_ifree(
ip->i_d.di_flags = 0;
ip->i_d.di_dmevmask = 0;
ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
/*
@@ -1856,7 +1749,7 @@ xfs_ifree(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK);
+ error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0);
if (error)
return error;
@@ -2151,7 +2044,7 @@ xfs_idestroy_fork(
* once someone is waiting for it to be unpinned.
*/
static void
-xfs_iunpin_nowait(
+xfs_iunpin(
struct xfs_inode *ip)
{
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
@@ -2163,14 +2056,29 @@ xfs_iunpin_nowait(
}
+static void
+__xfs_iunpin_wait(
+ struct xfs_inode *ip)
+{
+ wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
+ DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
+
+ xfs_iunpin(ip);
+
+ do {
+ prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ if (xfs_ipincount(ip))
+ io_schedule();
+ } while (xfs_ipincount(ip));
+ finish_wait(wq, &wait.wait);
+}
+
void
xfs_iunpin_wait(
struct xfs_inode *ip)
{
- if (xfs_ipincount(ip)) {
- xfs_iunpin_nowait(ip);
- wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0));
- }
+ if (xfs_ipincount(ip))
+ __xfs_iunpin_wait(ip);
}
/*
@@ -2280,7 +2188,7 @@ xfs_iflush_fork(
mp = ip->i_mount;
switch (XFS_IFORK_FORMAT(ip, whichfork)) {
case XFS_DINODE_FMT_LOCAL:
- if ((iip->ili_format.ilf_fields & dataflag[whichfork]) &&
+ if ((iip->ili_fields & dataflag[whichfork]) &&
(ifp->if_bytes > 0)) {
ASSERT(ifp->if_u1.if_data != NULL);
ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
@@ -2290,8 +2198,8 @@ xfs_iflush_fork(
case XFS_DINODE_FMT_EXTENTS:
ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
- !(iip->ili_format.ilf_fields & extflag[whichfork]));
- if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
+ !(iip->ili_fields & extflag[whichfork]));
+ if ((iip->ili_fields & extflag[whichfork]) &&
(ifp->if_bytes > 0)) {
ASSERT(xfs_iext_get_ext(ifp, 0));
ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
@@ -2301,7 +2209,7 @@ xfs_iflush_fork(
break;
case XFS_DINODE_FMT_BTREE:
- if ((iip->ili_format.ilf_fields & brootflag[whichfork]) &&
+ if ((iip->ili_fields & brootflag[whichfork]) &&
(ifp->if_broot_bytes > 0)) {
ASSERT(ifp->if_broot != NULL);
ASSERT(ifp->if_broot_bytes <=
@@ -2314,14 +2222,14 @@ xfs_iflush_fork(
break;
case XFS_DINODE_FMT_DEV:
- if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
+ if (iip->ili_fields & XFS_ILOG_DEV) {
ASSERT(whichfork == XFS_DATA_FORK);
xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
}
break;
case XFS_DINODE_FMT_UUID:
- if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
+ if (iip->ili_fields & XFS_ILOG_UUID) {
ASSERT(whichfork == XFS_DATA_FORK);
memcpy(XFS_DFORK_DPTR(dip),
&ip->i_df.if_u2.if_uuid,
@@ -2451,11 +2359,11 @@ cluster_corrupt_out:
*/
rcu_read_unlock();
/*
- * Clean up the buffer. If it was B_DELWRI, just release it --
+ * Clean up the buffer. If it was delwri, just release it --
* brelse can handle it with no problems. If not, shut down the
* filesystem before releasing the buffer.
*/
- bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp);
+ bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q);
if (bufwasdelwri)
xfs_buf_relse(bp);
@@ -2481,58 +2389,40 @@ cluster_corrupt_out:
/*
* Unlocks the flush lock
*/
- xfs_iflush_abort(iq);
+ xfs_iflush_abort(iq, false);
kmem_free(ilist);
xfs_perag_put(pag);
return XFS_ERROR(EFSCORRUPTED);
}
/*
- * xfs_iflush() will write a modified inode's changes out to the
- * inode's on disk home. The caller must have the inode lock held
- * in at least shared mode and the inode flush completion must be
- * active as well. The inode lock will still be held upon return from
- * the call and the caller is free to unlock it.
- * The inode flush will be completed when the inode reaches the disk.
- * The flags indicate how the inode's buffer should be written out.
+ * Flush dirty inode metadata into the backing buffer.
+ *
+ * The caller must have the inode lock and the inode flush lock held. The
+ * inode lock will still be held upon return to the caller, and the inode
+ * flush lock will be released after the inode has reached the disk.
+ *
+ * The caller must write out the buffer returned in *bpp and release it.
*/
int
xfs_iflush(
- xfs_inode_t *ip,
- uint flags)
+ struct xfs_inode *ip,
+ struct xfs_buf **bpp)
{
- xfs_inode_log_item_t *iip;
- xfs_buf_t *bp;
- xfs_dinode_t *dip;
- xfs_mount_t *mp;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_buf *bp;
+ struct xfs_dinode *dip;
int error;
XFS_STATS_INC(xs_iflush_count);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(!completion_done(&ip->i_flush));
+ ASSERT(xfs_isiflocked(ip));
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
- ip->i_d.di_nextents > ip->i_df.if_ext_max);
+ ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
- iip = ip->i_itemp;
- mp = ip->i_mount;
+ *bpp = NULL;
- /*
- * We can't flush the inode until it is unpinned, so wait for it if we
- * are allowed to block. We know no one new can pin it, because we are
- * holding the inode lock shared and you need to hold it exclusively to
- * pin the inode.
- *
- * If we are not allowed to block, force the log out asynchronously so
- * that when we come back the inode will be unpinned. If other inodes
- * in the same cluster are dirty, they will probably write the inode
- * out for us if they occur after the log force completes.
- */
- if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
- xfs_iunpin_nowait(ip);
- xfs_ifunlock(ip);
- return EAGAIN;
- }
xfs_iunpin_wait(ip);
/*
@@ -2551,21 +2441,20 @@ xfs_iflush(
/*
* This may have been unpinned because the filesystem is shutting
* down forcibly. If that's the case we must not write this inode
- * to disk, because the log record didn't make it to disk!
+ * to disk, because the log record didn't make it to disk.
+ *
+ * We also have to remove the log item from the AIL in this case,
+ * as we wait for an empty AIL as part of the unmount process.
*/
if (XFS_FORCED_SHUTDOWN(mp)) {
- ip->i_update_core = 0;
- if (iip)
- iip->ili_format.ilf_fields = 0;
- xfs_ifunlock(ip);
- return XFS_ERROR(EIO);
+ error = XFS_ERROR(EIO);
+ goto abort_out;
}
/*
* Get the buffer containing the on-disk inode.
*/
- error = xfs_itobp(mp, NULL, ip, &dip, &bp,
- (flags & SYNC_TRYLOCK) ? XBF_TRYLOCK : XBF_LOCK);
+ error = xfs_itobp(mp, NULL, ip, &dip, &bp, XBF_TRYLOCK);
if (error || !bp) {
xfs_ifunlock(ip);
return error;
@@ -2593,23 +2482,20 @@ xfs_iflush(
if (error)
goto cluster_corrupt_out;
- if (flags & SYNC_WAIT)
- error = xfs_bwrite(bp);
- else
- xfs_buf_delwri_queue(bp);
-
- xfs_buf_relse(bp);
- return error;
+ *bpp = bp;
+ return 0;
corrupt_out:
xfs_buf_relse(bp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
cluster_corrupt_out:
+ error = XFS_ERROR(EFSCORRUPTED);
+abort_out:
/*
* Unlocks the flush lock
*/
- xfs_iflush_abort(ip);
- return XFS_ERROR(EFSCORRUPTED);
+ xfs_iflush_abort(ip, false);
+ return error;
}
@@ -2626,9 +2512,9 @@ xfs_iflush_int(
#endif
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(!completion_done(&ip->i_flush));
+ ASSERT(xfs_isiflocked(ip));
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
- ip->i_d.di_nextents > ip->i_df.if_ext_max);
+ ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
iip = ip->i_itemp;
mp = ip->i_mount;
@@ -2636,26 +2522,6 @@ xfs_iflush_int(
/* set *dip = inode's place in the buffer */
dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
- /*
- * Clear i_update_core before copying out the data.
- * This is for coordination with our timestamp updates
- * that don't hold the inode lock. They will always
- * update the timestamps BEFORE setting i_update_core,
- * so if we clear i_update_core after they set it we
- * are guaranteed to see their updates to the timestamps.
- * I believe that this depends on strongly ordered memory
- * semantics, but we have that. We use the SYNCHRONIZE
- * macro to make sure that the compiler does not reorder
- * the i_update_core access below the data copy below.
- */
- ip->i_update_core = 0;
- SYNCHRONIZE();
-
- /*
- * Make sure to get the latest timestamps from the Linux inode.
- */
- xfs_synchronize_times(ip);
-
if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
@@ -2766,36 +2632,33 @@ xfs_iflush_int(
xfs_inobp_check(mp, bp);
/*
- * We've recorded everything logged in the inode, so we'd
- * like to clear the ilf_fields bits so we don't log and
- * flush things unnecessarily. However, we can't stop
- * logging all this information until the data we've copied
- * into the disk buffer is written to disk. If we did we might
- * overwrite the copy of the inode in the log with all the
- * data after re-logging only part of it, and in the face of
- * a crash we wouldn't have all the data we need to recover.
+ * We've recorded everything logged in the inode, so we'd like to clear
+ * the ili_fields bits so we don't log and flush things unnecessarily.
+ * However, we can't stop logging all this information until the data
+ * we've copied into the disk buffer is written to disk. If we did we
+ * might overwrite the copy of the inode in the log with all the data
+ * after re-logging only part of it, and in the face of a crash we
+ * wouldn't have all the data we need to recover.
*
- * What we do is move the bits to the ili_last_fields field.
- * When logging the inode, these bits are moved back to the
- * ilf_fields field. In the xfs_iflush_done() routine we
- * clear ili_last_fields, since we know that the information
- * those bits represent is permanently on disk. As long as
- * the flush completes before the inode is logged again, then
- * both ilf_fields and ili_last_fields will be cleared.
+ * What we do is move the bits to the ili_last_fields field. When
+ * logging the inode, these bits are moved back to the ili_fields field.
+ * In the xfs_iflush_done() routine we clear ili_last_fields, since we
+ * know that the information those bits represent is permanently on
+ * disk. As long as the flush completes before the inode is logged
+ * again, then both ili_fields and ili_last_fields will be cleared.
*
- * We can play with the ilf_fields bits here, because the inode
- * lock must be held exclusively in order to set bits there
- * and the flush lock protects the ili_last_fields bits.
- * Set ili_logged so the flush done
- * routine can tell whether or not to look in the AIL.
- * Also, store the current LSN of the inode so that we can tell
- * whether the item has moved in the AIL from xfs_iflush_done().
- * In order to read the lsn we need the AIL lock, because
- * it is a 64 bit value that cannot be read atomically.
- */
- if (iip != NULL && iip->ili_format.ilf_fields != 0) {
- iip->ili_last_fields = iip->ili_format.ilf_fields;
- iip->ili_format.ilf_fields = 0;
+ * We can play with the ili_fields bits here, because the inode lock
+ * must be held exclusively in order to set bits there and the flush
+ * lock protects the ili_last_fields bits. Set ili_logged so the flush
+ * done routine can tell whether or not to look in the AIL. Also, store
+ * the current LSN of the inode so that we can tell whether the item has
+ * moved in the AIL from xfs_iflush_done(). In order to read the lsn we
+ * need the AIL lock, because it is a 64 bit value that cannot be read
+ * atomically.
+ */
+ if (iip != NULL && iip->ili_fields != 0) {
+ iip->ili_last_fields = iip->ili_fields;
+ iip->ili_fields = 0;
iip->ili_logged = 1;
xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
@@ -2814,8 +2677,7 @@ xfs_iflush_int(
} else {
/*
* We're flushing an inode which is not in the AIL and has
- * not been logged but has i_update_core set. For this
- * case we can use a B_DELWRI flush and immediately drop
+ * not been logged. For this case we can immediately drop
* the inode flush lock because we can avoid the whole
* AIL state thing. It's OK to drop the flush lock now,
* because we've already locked the buffer and to do anything
@@ -2835,27 +2697,6 @@ corrupt_out:
return XFS_ERROR(EFSCORRUPTED);
}
-void
-xfs_promote_inode(
- struct xfs_inode *ip)
-{
- struct xfs_buf *bp;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-
- bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno,
- ip->i_imap.im_len, XBF_TRYLOCK);
- if (!bp)
- return;
-
- if (XFS_BUF_ISDELAYWRITE(bp)) {
- xfs_buf_delwri_promote(bp);
- wake_up_process(ip->i_mount->m_ddev_targp->bt_task);
- }
-
- xfs_buf_relse(bp);
-}
-
/*
* Return a pointer to the extent record at file index idx.
*/
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index b4cd4739f98..1efff36a75b 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -66,7 +66,6 @@ typedef struct xfs_ifork {
struct xfs_btree_block *if_broot; /* file's incore btree root */
short if_broot_bytes; /* bytes allocated for root */
unsigned char if_flags; /* per-fork flags */
- unsigned char if_ext_max; /* max # of extent records */
union {
xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
xfs_ext_irec_t *if_ext_irec; /* irec map file exts */
@@ -206,12 +205,12 @@ typedef struct xfs_icdinode {
((w) == XFS_DATA_FORK ? \
((ip)->i_d.di_nextents = (n)) : \
((ip)->i_d.di_anextents = (n)))
-
+#define XFS_IFORK_MAXEXT(ip, w) \
+ (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
#ifdef __KERNEL__
-struct bhv_desc;
struct xfs_buf;
struct xfs_bmap_free;
struct xfs_bmbt_irec;
@@ -220,12 +219,6 @@ struct xfs_mount;
struct xfs_trans;
struct xfs_dquot;
-typedef struct dm_attrs_s {
- __uint32_t da_dmevmask; /* DMIG event mask */
- __uint16_t da_dmstate; /* DMIG state info */
- __uint16_t da_pad; /* DMIG extra padding */
-} dm_attrs_t;
-
typedef struct xfs_inode {
/* Inode linking and identification information. */
struct xfs_mount *i_mount; /* fs mount struct ptr */
@@ -244,27 +237,18 @@ typedef struct xfs_inode {
struct xfs_inode_log_item *i_itemp; /* logging information */
mrlock_t i_lock; /* inode lock */
mrlock_t i_iolock; /* inode IO lock */
- struct completion i_flush; /* inode flush completion q */
atomic_t i_pincount; /* inode pin count */
- wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */
spinlock_t i_flags_lock; /* inode i_flags lock */
/* Miscellaneous state. */
- unsigned short i_flags; /* see defined flags below */
- unsigned char i_update_core; /* timestamps/size is dirty */
+ unsigned long i_flags; /* see defined flags below */
unsigned int i_delayed_blks; /* count of delay alloc blks */
xfs_icdinode_t i_d; /* most of ondisk inode */
- xfs_fsize_t i_size; /* in-memory size */
- xfs_fsize_t i_new_size; /* size when write completes */
-
/* VFS inode */
struct inode i_vnode; /* embedded VFS inode */
} xfs_inode_t;
-#define XFS_ISIZE(ip) S_ISREG((ip)->i_d.di_mode) ? \
- (ip)->i_size : (ip)->i_d.di_size;
-
/* Convert from vfs inode to xfs inode */
static inline struct xfs_inode *XFS_I(struct inode *inode)
{
@@ -278,6 +262,32 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
}
/*
+ * For regular files we only update the on-disk filesize when actually
+ * writing data back to disk. Until then only the copy in the VFS inode
+ * is uptodate.
+ */
+static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip)
+{
+ if (S_ISREG(ip->i_d.di_mode))
+ return i_size_read(VFS_I(ip));
+ return ip->i_d.di_size;
+}
+
+/*
+ * If this I/O goes past the on-disk inode size update it unless it would
+ * be past the current in-core inode size.
+ */
+static inline xfs_fsize_t
+xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
+{
+ xfs_fsize_t i_size = i_size_read(VFS_I(ip));
+
+ if (new_size > i_size)
+ new_size = i_size;
+ return new_size > ip->i_d.di_size ? new_size : 0;
+}
+
+/*
* i_flags helper functions
*/
static inline void
@@ -331,6 +341,19 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
return ret;
}
+static inline int
+xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags)
+{
+ int ret;
+
+ spin_lock(&ip->i_flags_lock);
+ ret = ip->i_flags & flags;
+ if (!ret)
+ ip->i_flags |= flags;
+ spin_unlock(&ip->i_flags_lock);
+ return ret;
+}
+
/*
* Project quota id helpers (previously projid was 16bit only
* and using two 16bit values to hold new 32bit projid was chosen
@@ -351,39 +374,24 @@ xfs_set_projid(struct xfs_inode *ip,
}
/*
- * Manage the i_flush queue embedded in the inode. This completion
- * queue synchronizes processes attempting to flush the in-core
- * inode back to disk.
- */
-static inline void xfs_iflock(xfs_inode_t *ip)
-{
- wait_for_completion(&ip->i_flush);
-}
-
-static inline int xfs_iflock_nowait(xfs_inode_t *ip)
-{
- return try_wait_for_completion(&ip->i_flush);
-}
-
-static inline void xfs_ifunlock(xfs_inode_t *ip)
-{
- complete(&ip->i_flush);
-}
-
-/*
* In-core inode flags.
*/
-#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */
-#define XFS_ISTALE 0x0002 /* inode has been staled */
-#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
-#define XFS_INEW 0x0008 /* inode has just been allocated */
-#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
-#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
-#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */
+#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */
+#define XFS_ISTALE (1 << 1) /* inode has been staled */
+#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */
+#define XFS_INEW (1 << 3) /* inode has just been allocated */
+#define XFS_IFILESTREAM (1 << 4) /* inode is in a filestream dir. */
+#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */
+#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */
+#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */
+#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT)
+#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */
+#define XFS_IPINNED (1 << __XFS_IPINNED_BIT)
+#define XFS_IDONTCACHE (1 << 9) /* don't cache the inode long term */
/*
* Per-lifetime flags need to be reset when re-using a reclaimable inode during
- * inode lookup. Thi prevents unintended behaviour on the new inode from
+ * inode lookup. This prevents unintended behaviour on the new inode from
* ocurring.
*/
#define XFS_IRECLAIM_RESET_FLAGS \
@@ -392,6 +400,34 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
XFS_IFILESTREAM);
/*
+ * Synchronize processes attempting to flush the in-core inode back to disk.
+ */
+
+extern void __xfs_iflock(struct xfs_inode *ip);
+
+static inline int xfs_iflock_nowait(struct xfs_inode *ip)
+{
+ return !xfs_iflags_test_and_set(ip, XFS_IFLOCK);
+}
+
+static inline void xfs_iflock(struct xfs_inode *ip)
+{
+ if (!xfs_iflock_nowait(ip))
+ __xfs_iflock(ip);
+}
+
+static inline void xfs_ifunlock(struct xfs_inode *ip)
+{
+ xfs_iflags_clear(ip, XFS_IFLOCK);
+ wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);
+}
+
+static inline int xfs_isiflocked(struct xfs_inode *ip)
+{
+ return xfs_iflags_test(ip, XFS_IFLOCK);
+}
+
+/*
* Flags for inode locking.
* Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield)
* 1<<16 - 1<<32-1 -- lockdep annotation (integers)
@@ -400,7 +436,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
#define XFS_IOLOCK_SHARED (1<<1)
#define XFS_ILOCK_EXCL (1<<2)
#define XFS_ILOCK_SHARED (1<<3)
-#define XFS_IUNLOCK_NONOTIFY (1<<4)
#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
| XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)
@@ -409,8 +444,7 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
{ XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \
{ XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \
{ XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \
- { XFS_ILOCK_SHARED, "ILOCK_SHARED" }, \
- { XFS_IUNLOCK_NONOTIFY, "IUNLOCK_NONOTIFY" }
+ { XFS_ILOCK_SHARED, "ILOCK_SHARED" }
/*
@@ -481,7 +515,7 @@ void xfs_inode_free(struct xfs_inode *ip);
/*
* xfs_inode.c prototypes.
*/
-int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
+int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
xfs_nlink_t, xfs_dev_t, prid_t, int,
struct xfs_buf **, boolean_t *, xfs_inode_t **);
@@ -491,20 +525,15 @@ int xfs_ifree(struct xfs_trans *, xfs_inode_t *,
struct xfs_bmap_free *);
int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
int, xfs_fsize_t);
-int xfs_itruncate_data(struct xfs_trans **, struct xfs_inode *,
- xfs_fsize_t);
int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
void xfs_iext_realloc(xfs_inode_t *, int, int);
void xfs_iunpin_wait(xfs_inode_t *);
-int xfs_iflush(xfs_inode_t *, uint);
-void xfs_promote_inode(struct xfs_inode *);
+int xfs_iflush(struct xfs_inode *, struct xfs_buf **);
void xfs_lock_inodes(xfs_inode_t **, int, uint);
void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
-void xfs_synchronize_times(xfs_inode_t *);
-void xfs_mark_inode_dirty(xfs_inode_t *);
-void xfs_mark_inode_dirty_sync(xfs_inode_t *);
+xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
#define IHOLD(ip) \
do { \
@@ -526,6 +555,7 @@ do { \
*/
#define XFS_IGET_CREATE 0x1
#define XFS_IGET_UNTRUSTED 0x2
+#define XFS_IGET_DONTCACHE 0x4
int xfs_inotobp(struct xfs_mount *, struct xfs_trans *,
xfs_ino_t, struct xfs_dinode **,
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index abaafdbb3e6..d041d47d9d8 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -18,9 +18,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -57,79 +55,28 @@ xfs_inode_item_size(
struct xfs_inode *ip = iip->ili_inode;
uint nvecs = 2;
- /*
- * Only log the data/extents/b-tree root if there is something
- * left to log.
- */
- iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
-
switch (ip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
- iip->ili_format.ilf_fields &=
- ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
- XFS_ILOG_DEV | XFS_ILOG_UUID);
- if ((iip->ili_format.ilf_fields & XFS_ILOG_DEXT) &&
- (ip->i_d.di_nextents > 0) &&
- (ip->i_df.if_bytes > 0)) {
- ASSERT(ip->i_df.if_u1.if_extents != NULL);
+ if ((iip->ili_fields & XFS_ILOG_DEXT) &&
+ ip->i_d.di_nextents > 0 &&
+ ip->i_df.if_bytes > 0)
nvecs++;
- } else {
- iip->ili_format.ilf_fields &= ~XFS_ILOG_DEXT;
- }
break;
case XFS_DINODE_FMT_BTREE:
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
- iip->ili_format.ilf_fields &=
- ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
- XFS_ILOG_DEV | XFS_ILOG_UUID);
- if ((iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) &&
- (ip->i_df.if_broot_bytes > 0)) {
- ASSERT(ip->i_df.if_broot != NULL);
+ if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
+ ip->i_df.if_broot_bytes > 0)
nvecs++;
- } else {
- ASSERT(!(iip->ili_format.ilf_fields &
- XFS_ILOG_DBROOT));
-#ifdef XFS_TRANS_DEBUG
- if (iip->ili_root_size > 0) {
- ASSERT(iip->ili_root_size ==
- ip->i_df.if_broot_bytes);
- ASSERT(memcmp(iip->ili_orig_root,
- ip->i_df.if_broot,
- iip->ili_root_size) == 0);
- } else {
- ASSERT(ip->i_df.if_broot_bytes == 0);
- }
-#endif
- iip->ili_format.ilf_fields &= ~XFS_ILOG_DBROOT;
- }
break;
case XFS_DINODE_FMT_LOCAL:
- iip->ili_format.ilf_fields &=
- ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
- XFS_ILOG_DEV | XFS_ILOG_UUID);
- if ((iip->ili_format.ilf_fields & XFS_ILOG_DDATA) &&
- (ip->i_df.if_bytes > 0)) {
- ASSERT(ip->i_df.if_u1.if_data != NULL);
- ASSERT(ip->i_d.di_size > 0);
+ if ((iip->ili_fields & XFS_ILOG_DDATA) &&
+ ip->i_df.if_bytes > 0)
nvecs++;
- } else {
- iip->ili_format.ilf_fields &= ~XFS_ILOG_DDATA;
- }
break;
case XFS_DINODE_FMT_DEV:
- iip->ili_format.ilf_fields &=
- ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
- XFS_ILOG_DEXT | XFS_ILOG_UUID);
- break;
-
case XFS_DINODE_FMT_UUID:
- iip->ili_format.ilf_fields &=
- ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
- XFS_ILOG_DEXT | XFS_ILOG_DEV);
break;
default:
@@ -137,56 +84,31 @@ xfs_inode_item_size(
break;
}
- /*
- * If there are no attributes associated with this file,
- * then there cannot be anything more to log.
- * Clear all attribute-related log flags.
- */
- if (!XFS_IFORK_Q(ip)) {
- iip->ili_format.ilf_fields &=
- ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
+ if (!XFS_IFORK_Q(ip))
return nvecs;
- }
+
/*
* Log any necessary attribute data.
*/
switch (ip->i_d.di_aformat) {
case XFS_DINODE_FMT_EXTENTS:
- iip->ili_format.ilf_fields &=
- ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
- if ((iip->ili_format.ilf_fields & XFS_ILOG_AEXT) &&
- (ip->i_d.di_anextents > 0) &&
- (ip->i_afp->if_bytes > 0)) {
- ASSERT(ip->i_afp->if_u1.if_extents != NULL);
+ if ((iip->ili_fields & XFS_ILOG_AEXT) &&
+ ip->i_d.di_anextents > 0 &&
+ ip->i_afp->if_bytes > 0)
nvecs++;
- } else {
- iip->ili_format.ilf_fields &= ~XFS_ILOG_AEXT;
- }
break;
case XFS_DINODE_FMT_BTREE:
- iip->ili_format.ilf_fields &=
- ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
- if ((iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) &&
- (ip->i_afp->if_broot_bytes > 0)) {
- ASSERT(ip->i_afp->if_broot != NULL);
+ if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
+ ip->i_afp->if_broot_bytes > 0)
nvecs++;
- } else {
- iip->ili_format.ilf_fields &= ~XFS_ILOG_ABROOT;
- }
break;
case XFS_DINODE_FMT_LOCAL:
- iip->ili_format.ilf_fields &=
- ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
- if ((iip->ili_format.ilf_fields & XFS_ILOG_ADATA) &&
- (ip->i_afp->if_bytes > 0)) {
- ASSERT(ip->i_afp->if_u1.if_data != NULL);
+ if ((iip->ili_fields & XFS_ILOG_ADATA) &&
+ ip->i_afp->if_bytes > 0)
nvecs++;
- } else {
- iip->ili_format.ilf_fields &= ~XFS_ILOG_ADATA;
- }
break;
default:
@@ -256,48 +178,11 @@ xfs_inode_item_format(
vecp++;
nvecs = 1;
- /*
- * Clear i_update_core if the timestamps (or any other
- * non-transactional modification) need flushing/logging
- * and we're about to log them with the rest of the core.
- *
- * This is the same logic as xfs_iflush() but this code can't
- * run at the same time as xfs_iflush because we're in commit
- * processing here and so we have the inode lock held in
- * exclusive mode. Although it doesn't really matter
- * for the timestamps if both routines were to grab the
- * timestamps or not. That would be ok.
- *
- * We clear i_update_core before copying out the data.
- * This is for coordination with our timestamp updates
- * that don't hold the inode lock. They will always
- * update the timestamps BEFORE setting i_update_core,
- * so if we clear i_update_core after they set it we
- * are guaranteed to see their updates to the timestamps
- * either here. Likewise, if they set it after we clear it
- * here, we'll see it either on the next commit of this
- * inode or the next time the inode gets flushed via
- * xfs_iflush(). This depends on strongly ordered memory
- * semantics, but we have that. We use the SYNCHRONIZE
- * macro to make sure that the compiler does not reorder
- * the i_update_core access below the data copy below.
- */
- if (ip->i_update_core) {
- ip->i_update_core = 0;
- SYNCHRONIZE();
- }
-
- /*
- * Make sure to get the latest timestamps from the Linux inode.
- */
- xfs_synchronize_times(ip);
-
vecp->i_addr = &ip->i_d;
vecp->i_len = sizeof(struct xfs_icdinode);
vecp->i_type = XLOG_REG_TYPE_ICORE;
vecp++;
nvecs++;
- iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
/*
* If this is really an old format inode, then we need to
@@ -330,16 +215,17 @@ xfs_inode_item_format(
switch (ip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
- ASSERT(!(iip->ili_format.ilf_fields &
- (XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
- XFS_ILOG_DEV | XFS_ILOG_UUID)));
- if (iip->ili_format.ilf_fields & XFS_ILOG_DEXT) {
- ASSERT(ip->i_df.if_bytes > 0);
+ iip->ili_fields &=
+ ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+ XFS_ILOG_DEV | XFS_ILOG_UUID);
+
+ if ((iip->ili_fields & XFS_ILOG_DEXT) &&
+ ip->i_d.di_nextents > 0 &&
+ ip->i_df.if_bytes > 0) {
ASSERT(ip->i_df.if_u1.if_extents != NULL);
- ASSERT(ip->i_d.di_nextents > 0);
+ ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0);
ASSERT(iip->ili_extents_buf == NULL);
- ASSERT((ip->i_df.if_bytes /
- (uint)sizeof(xfs_bmbt_rec_t)) > 0);
+
#ifdef XFS_NATIVE_HOST
if (ip->i_d.di_nextents == ip->i_df.if_bytes /
(uint)sizeof(xfs_bmbt_rec_t)) {
@@ -361,15 +247,18 @@ xfs_inode_item_format(
iip->ili_format.ilf_dsize = vecp->i_len;
vecp++;
nvecs++;
+ } else {
+ iip->ili_fields &= ~XFS_ILOG_DEXT;
}
break;
case XFS_DINODE_FMT_BTREE:
- ASSERT(!(iip->ili_format.ilf_fields &
- (XFS_ILOG_DDATA | XFS_ILOG_DEXT |
- XFS_ILOG_DEV | XFS_ILOG_UUID)));
- if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) {
- ASSERT(ip->i_df.if_broot_bytes > 0);
+ iip->ili_fields &=
+ ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
+ XFS_ILOG_DEV | XFS_ILOG_UUID);
+
+ if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
+ ip->i_df.if_broot_bytes > 0) {
ASSERT(ip->i_df.if_broot != NULL);
vecp->i_addr = ip->i_df.if_broot;
vecp->i_len = ip->i_df.if_broot_bytes;
@@ -377,15 +266,30 @@ xfs_inode_item_format(
vecp++;
nvecs++;
iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
+ } else {
+ ASSERT(!(iip->ili_fields &
+ XFS_ILOG_DBROOT));
+#ifdef XFS_TRANS_DEBUG
+ if (iip->ili_root_size > 0) {
+ ASSERT(iip->ili_root_size ==
+ ip->i_df.if_broot_bytes);
+ ASSERT(memcmp(iip->ili_orig_root,
+ ip->i_df.if_broot,
+ iip->ili_root_size) == 0);
+ } else {
+ ASSERT(ip->i_df.if_broot_bytes == 0);
+ }
+#endif
+ iip->ili_fields &= ~XFS_ILOG_DBROOT;
}
break;
case XFS_DINODE_FMT_LOCAL:
- ASSERT(!(iip->ili_format.ilf_fields &
- (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
- XFS_ILOG_DEV | XFS_ILOG_UUID)));
- if (iip->ili_format.ilf_fields & XFS_ILOG_DDATA) {
- ASSERT(ip->i_df.if_bytes > 0);
+ iip->ili_fields &=
+ ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
+ XFS_ILOG_DEV | XFS_ILOG_UUID);
+ if ((iip->ili_fields & XFS_ILOG_DDATA) &&
+ ip->i_df.if_bytes > 0) {
ASSERT(ip->i_df.if_u1.if_data != NULL);
ASSERT(ip->i_d.di_size > 0);
@@ -403,24 +307,26 @@ xfs_inode_item_format(
vecp++;
nvecs++;
iip->ili_format.ilf_dsize = (unsigned)data_bytes;
+ } else {
+ iip->ili_fields &= ~XFS_ILOG_DDATA;
}
break;
case XFS_DINODE_FMT_DEV:
- ASSERT(!(iip->ili_format.ilf_fields &
- (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
- XFS_ILOG_DDATA | XFS_ILOG_UUID)));
- if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
+ iip->ili_fields &=
+ ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+ XFS_ILOG_DEXT | XFS_ILOG_UUID);
+ if (iip->ili_fields & XFS_ILOG_DEV) {
iip->ili_format.ilf_u.ilfu_rdev =
ip->i_df.if_u2.if_rdev;
}
break;
case XFS_DINODE_FMT_UUID:
- ASSERT(!(iip->ili_format.ilf_fields &
- (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
- XFS_ILOG_DDATA | XFS_ILOG_DEV)));
- if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
+ iip->ili_fields &=
+ ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+ XFS_ILOG_DEXT | XFS_ILOG_DEV);
+ if (iip->ili_fields & XFS_ILOG_UUID) {
iip->ili_format.ilf_u.ilfu_uuid =
ip->i_df.if_u2.if_uuid;
}
@@ -432,32 +338,25 @@ xfs_inode_item_format(
}
/*
- * If there are no attributes associated with the file,
- * then we're done.
- * Assert that no attribute-related log flags are set.
+ * If there are no attributes associated with the file, then we're done.
*/
if (!XFS_IFORK_Q(ip)) {
- ASSERT(nvecs == lip->li_desc->lid_size);
- iip->ili_format.ilf_size = nvecs;
- ASSERT(!(iip->ili_format.ilf_fields &
- (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
- return;
+ iip->ili_fields &=
+ ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
+ goto out;
}
switch (ip->i_d.di_aformat) {
case XFS_DINODE_FMT_EXTENTS:
- ASSERT(!(iip->ili_format.ilf_fields &
- (XFS_ILOG_ADATA | XFS_ILOG_ABROOT)));
- if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) {
-#ifdef DEBUG
- int nrecs = ip->i_afp->if_bytes /
- (uint)sizeof(xfs_bmbt_rec_t);
- ASSERT(nrecs > 0);
- ASSERT(nrecs == ip->i_d.di_anextents);
- ASSERT(ip->i_afp->if_bytes > 0);
+ iip->ili_fields &=
+ ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
+
+ if ((iip->ili_fields & XFS_ILOG_AEXT) &&
+ ip->i_d.di_anextents > 0 &&
+ ip->i_afp->if_bytes > 0) {
+ ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) ==
+ ip->i_d.di_anextents);
ASSERT(ip->i_afp->if_u1.if_extents != NULL);
- ASSERT(ip->i_d.di_anextents > 0);
-#endif
#ifdef XFS_NATIVE_HOST
/*
* There are not delayed allocation extents
@@ -474,29 +373,36 @@ xfs_inode_item_format(
iip->ili_format.ilf_asize = vecp->i_len;
vecp++;
nvecs++;
+ } else {
+ iip->ili_fields &= ~XFS_ILOG_AEXT;
}
break;
case XFS_DINODE_FMT_BTREE:
- ASSERT(!(iip->ili_format.ilf_fields &
- (XFS_ILOG_ADATA | XFS_ILOG_AEXT)));
- if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) {
- ASSERT(ip->i_afp->if_broot_bytes > 0);
+ iip->ili_fields &=
+ ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
+
+ if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
+ ip->i_afp->if_broot_bytes > 0) {
ASSERT(ip->i_afp->if_broot != NULL);
+
vecp->i_addr = ip->i_afp->if_broot;
vecp->i_len = ip->i_afp->if_broot_bytes;
vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT;
vecp++;
nvecs++;
iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
+ } else {
+ iip->ili_fields &= ~XFS_ILOG_ABROOT;
}
break;
case XFS_DINODE_FMT_LOCAL:
- ASSERT(!(iip->ili_format.ilf_fields &
- (XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
- if (iip->ili_format.ilf_fields & XFS_ILOG_ADATA) {
- ASSERT(ip->i_afp->if_bytes > 0);
+ iip->ili_fields &=
+ ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
+
+ if ((iip->ili_fields & XFS_ILOG_ADATA) &&
+ ip->i_afp->if_bytes > 0) {
ASSERT(ip->i_afp->if_u1.if_data != NULL);
vecp->i_addr = ip->i_afp->if_u1.if_data;
@@ -513,6 +419,8 @@ xfs_inode_item_format(
vecp++;
nvecs++;
iip->ili_format.ilf_asize = (unsigned)data_bytes;
+ } else {
+ iip->ili_fields &= ~XFS_ILOG_ADATA;
}
break;
@@ -521,7 +429,15 @@ xfs_inode_item_format(
break;
}
- ASSERT(nvecs == lip->li_desc->lid_size);
+out:
+ /*
+ * Now update the log format that goes out to disk from the in-core
+ * values. We always write the inode core to make the arithmetic
+ * games in recovery easier, which isn't a big deal as just about any
+ * transaction would dirty it anyway.
+ */
+ iip->ili_format.ilf_fields = XFS_ILOG_CORE |
+ (iip->ili_fields & ~XFS_ILOG_TIMESTAMP);
iip->ili_format.ilf_size = nvecs;
}
@@ -559,28 +475,19 @@ xfs_inode_item_unpin(
trace_xfs_inode_unpin(ip, _RET_IP_);
ASSERT(atomic_read(&ip->i_pincount) > 0);
if (atomic_dec_and_test(&ip->i_pincount))
- wake_up(&ip->i_ipin_wait);
+ wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
}
-/*
- * This is called to attempt to lock the inode associated with this
- * inode log item, in preparation for the push routine which does the actual
- * iflush. Don't sleep on the inode lock or the flush lock.
- *
- * If the flush lock is already held, indicating that the inode has
- * been or is in the process of being flushed, then (ideally) we'd like to
- * see if the inode's buffer is still incore, and if so give it a nudge.
- * We delay doing so until the pushbuf routine, though, to avoid holding
- * the AIL lock across a call to the blackhole which is the buffer cache.
- * Also we don't want to sleep in any device strategy routines, which can happen
- * if we do the subsequent bawrite in here.
- */
STATIC uint
-xfs_inode_item_trylock(
- struct xfs_log_item *lip)
+xfs_inode_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
{
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
+ struct xfs_buf *bp = NULL;
+ uint rval = XFS_ITEM_SUCCESS;
+ int error;
if (xfs_ipincount(ip) > 0)
return XFS_ITEM_PINNED;
@@ -588,34 +495,49 @@ xfs_inode_item_trylock(
if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
return XFS_ITEM_LOCKED;
- if (!xfs_iflock_nowait(ip)) {
- /*
- * inode has already been flushed to the backing buffer,
- * leave it locked in shared mode, pushbuf routine will
- * unlock it.
- */
- return XFS_ITEM_PUSHBUF;
+ /*
+ * Re-check the pincount now that we stabilized the value by
+ * taking the ilock.
+ */
+ if (xfs_ipincount(ip) > 0) {
+ rval = XFS_ITEM_PINNED;
+ goto out_unlock;
}
- /* Stale items should force out the iclog */
+ /*
+ * Stale inode items should force out the iclog.
+ */
if (ip->i_flags & XFS_ISTALE) {
- xfs_ifunlock(ip);
- /*
- * we hold the AIL lock - notify the unlock routine of this
- * so it doesn't try to get the lock again.
- */
- xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
- return XFS_ITEM_PINNED;
+ rval = XFS_ITEM_PINNED;
+ goto out_unlock;
}
-#ifdef DEBUG
- if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- ASSERT(iip->ili_format.ilf_fields != 0);
- ASSERT(iip->ili_logged == 0);
- ASSERT(lip->li_flags & XFS_LI_IN_AIL);
+ /*
+ * Someone else is already flushing the inode. Nothing we can do
+ * here but wait for the flush to finish and remove the item from
+ * the AIL.
+ */
+ if (!xfs_iflock_nowait(ip)) {
+ rval = XFS_ITEM_FLUSHING;
+ goto out_unlock;
}
-#endif
- return XFS_ITEM_SUCCESS;
+
+ ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount));
+ ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount));
+
+ spin_unlock(&lip->li_ailp->xa_lock);
+
+ error = xfs_iflush(ip, &bp);
+ if (!error) {
+ if (!xfs_buf_delwri_queue(bp, buffer_list))
+ rval = XFS_ITEM_FLUSHING;
+ xfs_buf_relse(bp);
+ }
+
+ spin_lock(&lip->li_ailp->xa_lock);
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return rval;
}
/*
@@ -642,7 +564,7 @@ xfs_inode_item_unlock(
if (iip->ili_extents_buf != NULL) {
ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS);
ASSERT(ip->i_d.di_nextents > 0);
- ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT);
+ ASSERT(iip->ili_fields & XFS_ILOG_DEXT);
ASSERT(ip->i_df.if_bytes > 0);
kmem_free(iip->ili_extents_buf);
iip->ili_extents_buf = NULL;
@@ -650,7 +572,7 @@ xfs_inode_item_unlock(
if (iip->ili_aextents_buf != NULL) {
ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS);
ASSERT(ip->i_d.di_anextents > 0);
- ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT);
+ ASSERT(iip->ili_fields & XFS_ILOG_AEXT);
ASSERT(ip->i_afp->if_bytes > 0);
kmem_free(iip->ili_aextents_buf);
iip->ili_aextents_buf = NULL;
@@ -700,87 +622,6 @@ xfs_inode_item_committed(
}
/*
- * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
- * failed to get the inode flush lock but did get the inode locked SHARED.
- * Here we're trying to see if the inode buffer is incore, and if so whether it's
- * marked delayed write. If that's the case, we'll promote it and that will
- * allow the caller to write the buffer by triggering the xfsbufd to run.
- */
-STATIC bool
-xfs_inode_item_pushbuf(
- struct xfs_log_item *lip)
-{
- struct xfs_inode_log_item *iip = INODE_ITEM(lip);
- struct xfs_inode *ip = iip->ili_inode;
- struct xfs_buf *bp;
- bool ret = true;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
-
- /*
- * If a flush is not in progress anymore, chances are that the
- * inode was taken off the AIL. So, just get out.
- */
- if (completion_done(&ip->i_flush) ||
- !(lip->li_flags & XFS_LI_IN_AIL)) {
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- return true;
- }
-
- bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno,
- iip->ili_format.ilf_len, XBF_TRYLOCK);
-
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- if (!bp)
- return true;
- if (XFS_BUF_ISDELAYWRITE(bp))
- xfs_buf_delwri_promote(bp);
- if (xfs_buf_ispinned(bp))
- ret = false;
- xfs_buf_relse(bp);
- return ret;
-}
-
-/*
- * This is called to asynchronously write the inode associated with this
- * inode log item out to disk. The inode will already have been locked by
- * a successful call to xfs_inode_item_trylock().
- */
-STATIC void
-xfs_inode_item_push(
- struct xfs_log_item *lip)
-{
- struct xfs_inode_log_item *iip = INODE_ITEM(lip);
- struct xfs_inode *ip = iip->ili_inode;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
- ASSERT(!completion_done(&ip->i_flush));
-
- /*
- * Since we were able to lock the inode's flush lock and
- * we found it on the AIL, the inode must be dirty. This
- * is because the inode is removed from the AIL while still
- * holding the flush lock in xfs_iflush_done(). Thus, if
- * we found it in the AIL and were able to obtain the flush
- * lock without sleeping, then there must not have been
- * anyone in the process of flushing the inode.
- */
- ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) ||
- iip->ili_format.ilf_fields != 0);
-
- /*
- * Push the inode to it's backing buffer. This will not remove the
- * inode from the AIL - a further push will be required to trigger a
- * buffer push. However, this allows all the dirty inodes to be pushed
- * to the buffer before it is pushed to disk. The buffer IO completion
- * will pull the inode from the AIL, mark it clean and unlock the flush
- * lock.
- */
- (void) xfs_iflush(ip, SYNC_TRYLOCK);
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-}
-
-/*
* XXX rcc - this one really has to do something. Probably needs
* to stamp in a new field in the incore inode.
*/
@@ -800,11 +641,9 @@ static const struct xfs_item_ops xfs_inode_item_ops = {
.iop_format = xfs_inode_item_format,
.iop_pin = xfs_inode_item_pin,
.iop_unpin = xfs_inode_item_unpin,
- .iop_trylock = xfs_inode_item_trylock,
.iop_unlock = xfs_inode_item_unlock,
.iop_committed = xfs_inode_item_committed,
.iop_push = xfs_inode_item_push,
- .iop_pushbuf = xfs_inode_item_pushbuf,
.iop_committing = xfs_inode_item_committing
};
@@ -935,7 +774,8 @@ xfs_iflush_done(
ASSERT(i <= need_ail);
}
/* xfs_trans_ail_delete_bulk() drops the AIL lock. */
- xfs_trans_ail_delete_bulk(ailp, log_items, i);
+ xfs_trans_ail_delete_bulk(ailp, log_items, i,
+ SHUTDOWN_CORRUPT_INCORE);
}
@@ -956,16 +796,15 @@ xfs_iflush_done(
}
/*
- * This is the inode flushing abort routine. It is called
- * from xfs_iflush when the filesystem is shutting down to clean
- * up the inode state.
- * It is responsible for removing the inode item
- * from the AIL if it has not been re-logged, and unlocking the inode's
- * flush lock.
+ * This is the inode flushing abort routine. It is called from xfs_iflush when
+ * the filesystem is shutting down to clean up the inode state. It is
+ * responsible for removing the inode item from the AIL if it has not been
+ * re-logged, and unlocking the inode's flush lock.
*/
void
xfs_iflush_abort(
- xfs_inode_t *ip)
+ xfs_inode_t *ip,
+ bool stale)
{
xfs_inode_log_item_t *iip = ip->i_itemp;
@@ -975,7 +814,10 @@ xfs_iflush_abort(
spin_lock(&ailp->xa_lock);
if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
/* xfs_trans_ail_delete() drops the AIL lock. */
- xfs_trans_ail_delete(ailp, (xfs_log_item_t *)iip);
+ xfs_trans_ail_delete(ailp, &iip->ili_item,
+ stale ?
+ SHUTDOWN_LOG_IO_ERROR :
+ SHUTDOWN_CORRUPT_INCORE);
} else
spin_unlock(&ailp->xa_lock);
}
@@ -989,7 +831,7 @@ xfs_iflush_abort(
* Clear the inode logging fields so no more flushes are
* attempted.
*/
- iip->ili_format.ilf_fields = 0;
+ iip->ili_fields = 0;
}
/*
* Release the inode's flush lock since we're done with it.
@@ -1002,7 +844,7 @@ xfs_istale_done(
struct xfs_buf *bp,
struct xfs_log_item *lip)
{
- xfs_iflush_abort(INODE_ITEM(lip)->ili_inode);
+ xfs_iflush_abort(INODE_ITEM(lip)->ili_inode, true);
}
/*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index d3dee61e6d9..376d4d0b263 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -86,6 +86,15 @@ typedef struct xfs_inode_log_format_64 {
#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */
#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */
+
+/*
+ * The timestamps are dirty, but not necessarily anything else in the inode
+ * core. Unlike the other fields above this one must never make it to disk
+ * in the ilf_fields of the inode_log_format, but is purely store in-memory in
+ * ili_fields in the inode_log_item.
+ */
+#define XFS_ILOG_TIMESTAMP 0x4000
+
#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
XFS_ILOG_UUID | XFS_ILOG_ADATA | \
@@ -101,7 +110,7 @@ typedef struct xfs_inode_log_format_64 {
XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
XFS_ILOG_DEV | XFS_ILOG_UUID | \
XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
- XFS_ILOG_ABROOT)
+ XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP)
static inline int xfs_ilog_fbroot(int w)
{
@@ -134,6 +143,7 @@ typedef struct xfs_inode_log_item {
unsigned short ili_lock_flags; /* lock flags */
unsigned short ili_logged; /* flushed logged data */
unsigned int ili_last_fields; /* fields when flushed */
+ unsigned int ili_fields; /* fields to be logged */
struct xfs_bmbt_rec *ili_extents_buf; /* array of logged
data exts */
struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged
@@ -148,16 +158,14 @@ typedef struct xfs_inode_log_item {
static inline int xfs_inode_clean(xfs_inode_t *ip)
{
- return (!ip->i_itemp ||
- !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
- !ip->i_update_core;
+ return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL);
}
extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
extern void xfs_inode_item_destroy(struct xfs_inode *);
extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *);
extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *);
-extern void xfs_iflush_abort(struct xfs_inode *);
+extern void xfs_iflush_abort(struct xfs_inode *, bool);
extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
xfs_inode_log_format_t *);
diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h
index b253c0ea5be..90efdaf1706 100644
--- a/fs/xfs/xfs_inum.h
+++ b/fs/xfs/xfs_inum.h
@@ -26,11 +26,6 @@
* high agno_log-agblklog-inopblog bits - 0
*/
-typedef __uint32_t xfs_agino_t; /* within allocation grp inode number */
-
-#define NULLFSINO ((xfs_ino_t)-1)
-#define NULLAGINO ((xfs_agino_t)-1)
-
struct xfs_mount;
#define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index d99a9051890..3a05a41b5d7 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -17,9 +17,7 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -209,6 +207,7 @@ xfs_open_by_handle(
struct file *filp;
struct inode *inode;
struct dentry *dentry;
+ fmode_t fmode;
if (!capable(CAP_SYS_ADMIN))
return -XFS_ERROR(EPERM);
@@ -228,26 +227,21 @@ xfs_open_by_handle(
hreq->oflags |= O_LARGEFILE;
#endif
- /* Put open permission in namei format. */
permflag = hreq->oflags;
- if ((permflag+1) & O_ACCMODE)
- permflag++;
- if (permflag & O_TRUNC)
- permflag |= 2;
-
+ fmode = OPEN_FMODE(permflag);
if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
- (permflag & FMODE_WRITE) && IS_APPEND(inode)) {
+ (fmode & FMODE_WRITE) && IS_APPEND(inode)) {
error = -XFS_ERROR(EPERM);
goto out_dput;
}
- if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
+ if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
error = -XFS_ERROR(EACCES);
goto out_dput;
}
/* Can't write directories. */
- if (S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) {
+ if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) {
error = -XFS_ERROR(EISDIR);
goto out_dput;
}
@@ -450,9 +444,12 @@ xfs_attrmulti_attr_get(
if (*len > XATTR_SIZE_MAX)
return EINVAL;
- kbuf = kmalloc(*len, GFP_KERNEL);
- if (!kbuf)
- return ENOMEM;
+ kbuf = kmem_zalloc(*len, KM_SLEEP | KM_MAYFAIL);
+ if (!kbuf) {
+ kbuf = kmem_zalloc_large(*len);
+ if (!kbuf)
+ return ENOMEM;
+ }
error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags);
if (error)
@@ -462,7 +459,10 @@ xfs_attrmulti_attr_get(
error = EFAULT;
out_kfree:
- kfree(kbuf);
+ if (is_vmalloc_addr(kbuf))
+ kmem_free_large(kbuf);
+ else
+ kmem_free(kbuf);
return error;
}
@@ -559,23 +559,23 @@ xfs_attrmulti_by_handle(
ops[i].am_flags);
break;
case ATTR_OP_SET:
- ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+ ops[i].am_error = mnt_want_write_file(parfilp);
if (ops[i].am_error)
break;
ops[i].am_error = xfs_attrmulti_attr_set(
dentry->d_inode, attr_name,
ops[i].am_attrvalue, ops[i].am_length,
ops[i].am_flags);
- mnt_drop_write(parfilp->f_path.mnt);
+ mnt_drop_write_file(parfilp);
break;
case ATTR_OP_REMOVE:
- ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+ ops[i].am_error = mnt_want_write_file(parfilp);
if (ops[i].am_error)
break;
ops[i].am_error = xfs_attrmulti_attr_remove(
dentry->d_inode, attr_name,
ops[i].am_flags);
- mnt_drop_write(parfilp->f_path.mnt);
+ mnt_drop_write_file(parfilp);
break;
default:
ops[i].am_error = EINVAL;
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 54e623bfbb8..c4f2da0d2bf 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -22,9 +22,7 @@
#include <asm/uaccess.h>
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -293,7 +291,7 @@ xfs_compat_ioc_bulkstat(
int res;
error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer,
- sizeof(compat_xfs_bstat_t), 0, &res);
+ sizeof(compat_xfs_bstat_t), NULL, &res);
} else if (cmd == XFS_IOC_FSBULKSTAT_32) {
error = xfs_bulkstat(mp, &inlast, &count,
xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t),
@@ -454,23 +452,23 @@ xfs_compat_attrmulti_by_handle(
&ops[i].am_length, ops[i].am_flags);
break;
case ATTR_OP_SET:
- ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+ ops[i].am_error = mnt_want_write_file(parfilp);
if (ops[i].am_error)
break;
ops[i].am_error = xfs_attrmulti_attr_set(
dentry->d_inode, attr_name,
compat_ptr(ops[i].am_attrvalue),
ops[i].am_length, ops[i].am_flags);
- mnt_drop_write(parfilp->f_path.mnt);
+ mnt_drop_write_file(parfilp);
break;
case ATTR_OP_REMOVE:
- ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+ ops[i].am_error = mnt_want_write_file(parfilp);
if (ops[i].am_error)
break;
ops[i].am_error = xfs_attrmulti_attr_remove(
dentry->d_inode, attr_name,
ops[i].am_flags);
- mnt_drop_write(parfilp->f_path.mnt);
+ mnt_drop_write_file(parfilp);
break;
default:
ops[i].am_error = EINVAL;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 9afa282aa93..aadfce6681e 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -17,9 +17,7 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -31,12 +29,12 @@
#include "xfs_ialloc_btree.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_inode_item.h"
#include "xfs_btree.h"
#include "xfs_bmap.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
#include "xfs_itable.h"
-#include "xfs_rw.h"
#include "xfs_attr.h"
#include "xfs_buf_item.h"
#include "xfs_trans_space.h"
@@ -57,26 +55,26 @@ xfs_iomap_eof_align_last_fsb(
xfs_fileoff_t *last_fsb)
{
xfs_fileoff_t new_last_fsb = 0;
- xfs_extlen_t align;
+ xfs_extlen_t align = 0;
int eof, error;
- if (XFS_IS_REALTIME_INODE(ip))
- ;
- /*
- * If mounted with the "-o swalloc" option, roundup the allocation
- * request to a stripe width boundary if the file size is >=
- * stripe width and we are allocating past the allocation eof.
- */
- else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) &&
- (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth)))
- new_last_fsb = roundup_64(*last_fsb, mp->m_swidth);
- /*
- * Roundup the allocation request to a stripe unit (m_dalign) boundary
- * if the file size is >= stripe unit size, and we are allocating past
- * the allocation eof.
- */
- else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign)))
- new_last_fsb = roundup_64(*last_fsb, mp->m_dalign);
+ if (!XFS_IS_REALTIME_INODE(ip)) {
+ /*
+ * Round up the allocation request to a stripe unit
+ * (m_dalign) boundary if the file size is >= stripe unit
+ * size, and we are allocating past the allocation eof.
+ *
+ * If mounted with the "-o swalloc" option the alignment is
+ * increased from the strip unit size to the stripe width.
+ */
+ if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
+ align = mp->m_swidth;
+ else if (mp->m_dalign)
+ align = mp->m_dalign;
+
+ if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align))
+ new_last_fsb = roundup_64(*last_fsb, align);
+ }
/*
* Always round up the allocation request to an extent boundary
@@ -141,11 +139,7 @@ xfs_iomap_write_direct(
int committed;
int error;
- /*
- * Make sure that the dquots are there. This doesn't hold
- * the ilock across a disk read.
- */
- error = xfs_qm_dqattach_locked(ip, 0);
+ error = xfs_qm_dqattach(ip, 0);
if (error)
return XFS_ERROR(error);
@@ -154,10 +148,10 @@ xfs_iomap_write_direct(
offset_fsb = XFS_B_TO_FSBT(mp, offset);
last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
- if ((offset + count) > ip->i_size) {
+ if ((offset + count) > XFS_ISIZE(ip)) {
error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
if (error)
- goto error_out;
+ return XFS_ERROR(error);
} else {
if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
last_fsb = MIN(last_fsb, (xfs_fileoff_t)
@@ -189,7 +183,6 @@ xfs_iomap_write_direct(
/*
* Allocate and setup the transaction
*/
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
error = xfs_trans_reserve(tp, resblks,
XFS_WRITE_LOG_RES(mp), resrtextents,
@@ -198,20 +191,21 @@ xfs_iomap_write_direct(
/*
* Check for running out of space, note: need lock to return
*/
- if (error)
+ if (error) {
xfs_trans_cancel(tp, 0);
+ return XFS_ERROR(error);
+ }
+
xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (error)
- goto error_out;
error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
if (error)
- goto error1;
+ goto out_trans_cancel;
xfs_trans_ijoin(tp, ip, 0);
bmapi_flag = 0;
- if (offset < ip->i_size || extsz)
+ if (offset < XFS_ISIZE(ip) || extsz)
bmapi_flag |= XFS_BMAPI_PREALLOC;
/*
@@ -223,42 +217,39 @@ xfs_iomap_write_direct(
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag,
&firstfsb, 0, imap, &nimaps, &free_list);
if (error)
- goto error0;
+ goto out_bmap_cancel;
/*
* Complete the transaction
*/
error = xfs_bmap_finish(&tp, &free_list, &committed);
if (error)
- goto error0;
+ goto out_bmap_cancel;
error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
if (error)
- goto error_out;
+ goto out_unlock;
/*
* Copy any maps to caller's array and return any error.
*/
if (nimaps == 0) {
- error = ENOSPC;
- goto error_out;
+ error = XFS_ERROR(ENOSPC);
+ goto out_unlock;
}
- if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) {
+ if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
error = xfs_alert_fsblock_zero(ip, imap);
- goto error_out;
- }
- return 0;
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
-error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
+out_bmap_cancel:
xfs_bmap_cancel(&free_list);
- xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
-
-error1: /* Just cancel transaction */
+ xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
+out_trans_cancel:
xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-
-error_out:
- return XFS_ERROR(error);
+ goto out_unlock;
}
/*
@@ -286,7 +277,7 @@ xfs_iomap_eof_want_preallocate(
int found_delalloc = 0;
*prealloc = 0;
- if ((offset + count) <= ip->i_size)
+ if (offset + count <= XFS_ISIZE(ip))
return 0;
/*
@@ -340,7 +331,7 @@ xfs_iomap_prealloc_size(
* if we pass in alloc_blocks = 0. Hence the "+ 1" to
* ensure we always pass in a non-zero value.
*/
- alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1;
+ alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1;
alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
rounddown_pow_of_two(alloc_blocks));
@@ -421,6 +412,15 @@ retry:
return error;
}
+ /*
+ * Make sure preallocation does not create extents beyond the range we
+ * actually support in this filesystem.
+ */
+ if (last_fsb > XFS_B_TO_FSB(mp, mp->m_maxioffset))
+ last_fsb = XFS_B_TO_FSB(mp, mp->m_maxioffset);
+
+ ASSERT(last_fsb > offset_fsb);
+
nimaps = XFS_WRITE_IMAPS;
error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb,
imap, &nimaps, XFS_BMAPI_ENTIRE);
@@ -564,7 +564,7 @@ xfs_iomap_write_allocate(
* back....
*/
nimaps = 1;
- end_fsb = XFS_B_TO_FSB(mp, ip->i_size);
+ end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
error = xfs_bmap_last_offset(NULL, ip, &last_block,
XFS_DATA_FORK);
if (error)
@@ -645,6 +645,7 @@ xfs_iomap_write_unwritten(
xfs_trans_t *tp;
xfs_bmbt_irec_t imap;
xfs_bmap_free_t free_list;
+ xfs_fsize_t i_size;
uint resblks;
int committed;
int error;
@@ -705,7 +706,22 @@ xfs_iomap_write_unwritten(
if (error)
goto error_on_bmapi_transaction;
- error = xfs_bmap_finish(&(tp), &(free_list), &committed);
+ /*
+ * Log the updated inode size as we go. We have to be careful
+ * to only log it up to the actual write offset if it is
+ * halfway into a block.
+ */
+ i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
+ if (i_size > offset + count)
+ i_size = offset + count;
+
+ i_size = xfs_new_eof(ip, i_size);
+ if (i_size) {
+ ip->i_d.di_size = i_size;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ }
+
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
if (error)
goto error_on_bmapi_transaction;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 23ce927973a..1a25fd80279 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -18,9 +18,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_acl.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -34,7 +32,6 @@
#include "xfs_rtalloc.h"
#include "xfs_error.h"
#include "xfs_itable.h"
-#include "xfs_rw.h"
#include "xfs_attr.h"
#include "xfs_buf_item.h"
#include "xfs_utils.h"
@@ -50,65 +47,15 @@
#include <linux/fiemap.h>
#include <linux/slab.h>
-/*
- * Bring the timestamps in the XFS inode uptodate.
- *
- * Used before writing the inode to disk.
- */
-void
-xfs_synchronize_times(
- xfs_inode_t *ip)
-{
- struct inode *inode = VFS_I(ip);
-
- ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
- ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
- ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec;
- ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec;
- ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec;
- ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec;
-}
-
-/*
- * If the linux inode is valid, mark it dirty, else mark the dirty state
- * in the XFS inode to make sure we pick it up when reclaiming the inode.
- */
-void
-xfs_mark_inode_dirty_sync(
- xfs_inode_t *ip)
-{
- struct inode *inode = VFS_I(ip);
-
- if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
- mark_inode_dirty_sync(inode);
- else {
- barrier();
- ip->i_update_core = 1;
- }
-}
-
-void
-xfs_mark_inode_dirty(
- xfs_inode_t *ip)
-{
- struct inode *inode = VFS_I(ip);
-
- if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
- mark_inode_dirty(inode);
- else {
- barrier();
- ip->i_update_core = 1;
- }
-
-}
-
-
-int xfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
- void *fs_info)
+static int
+xfs_initxattrs(
+ struct inode *inode,
+ const struct xattr *xattr_array,
+ void *fs_info)
{
- const struct xattr *xattr;
- struct xfs_inode *ip = XFS_I(inode);
- int error = 0;
+ const struct xattr *xattr;
+ struct xfs_inode *ip = XFS_I(inode);
+ int error = 0;
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
error = xfs_attr_set(ip, xattr->name, xattr->value,
@@ -168,7 +115,7 @@ STATIC int
xfs_vn_mknod(
struct inode *dir,
struct dentry *dentry,
- int mode,
+ umode_t mode,
dev_t rdev)
{
struct inode *inode;
@@ -231,7 +178,7 @@ STATIC int
xfs_vn_create(
struct inode *dir,
struct dentry *dentry,
- int mode,
+ umode_t mode,
struct nameidata *nd)
{
return xfs_vn_mknod(dir, dentry, mode, 0);
@@ -241,7 +188,7 @@ STATIC int
xfs_vn_mkdir(
struct inode *dir,
struct dentry *dentry,
- int mode)
+ umode_t mode)
{
return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0);
}
@@ -366,7 +313,7 @@ xfs_vn_symlink(
struct xfs_inode *cip = NULL;
struct xfs_name name;
int error;
- mode_t mode;
+ umode_t mode;
mode = S_IFLNK |
(irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
@@ -678,19 +625,16 @@ xfs_setattr_nonsize(
inode->i_atime = iattr->ia_atime;
ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
- ip->i_update_core = 1;
}
if (mask & ATTR_CTIME) {
inode->i_ctime = iattr->ia_ctime;
ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
- ip->i_update_core = 1;
}
if (mask & ATTR_MTIME) {
inode->i_mtime = iattr->ia_mtime;
ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
- ip->i_update_core = 1;
}
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -750,9 +694,10 @@ xfs_setattr_size(
struct xfs_mount *mp = ip->i_mount;
struct inode *inode = VFS_I(ip);
int mask = iattr->ia_valid;
+ xfs_off_t oldsize, newsize;
struct xfs_trans *tp;
int error;
- uint lock_flags;
+ uint lock_flags = 0;
uint commit_flags = 0;
trace_xfs_setattr(ip);
@@ -772,16 +717,18 @@ xfs_setattr_size(
ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID|
ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
- lock_flags = XFS_ILOCK_EXCL;
- if (!(flags & XFS_ATTR_NOLOCK))
+ if (!(flags & XFS_ATTR_NOLOCK)) {
lock_flags |= XFS_IOLOCK_EXCL;
- xfs_ilock(ip, lock_flags);
+ xfs_ilock(ip, lock_flags);
+ }
+
+ oldsize = inode->i_size;
+ newsize = iattr->ia_size;
/*
* Short circuit the truncate case for zero length files.
*/
- if (iattr->ia_size == 0 &&
- ip->i_size == 0 && ip->i_d.di_nextents == 0) {
+ if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
if (!(mask & (ATTR_CTIME|ATTR_MTIME)))
goto out_unlock;
@@ -796,7 +743,7 @@ xfs_setattr_size(
/*
* Make sure that the dquots are attached to the inode.
*/
- error = xfs_qm_dqattach_locked(ip, 0);
+ error = xfs_qm_dqattach(ip, 0);
if (error)
goto out_unlock;
@@ -807,19 +754,17 @@ xfs_setattr_size(
* the inode to the transaction, because the inode cannot be unlocked
* once it is a part of the transaction.
*/
- if (iattr->ia_size > ip->i_size) {
+ if (newsize > oldsize) {
/*
* Do the first part of growing a file: zero any data in the
* last block that is beyond the old EOF. We need to do this
* before the inode is joined to the transaction to modify
* i_size.
*/
- error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size);
+ error = xfs_zero_eof(ip, newsize, oldsize);
if (error)
goto out_unlock;
}
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- lock_flags &= ~XFS_ILOCK_EXCL;
/*
* We are going to log the inode size change in this transaction so
@@ -833,8 +778,8 @@ xfs_setattr_size(
* here and prevents waiting for other data not within the range we
* care about here.
*/
- if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) {
- error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0,
+ if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
+ error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0,
FI_NONE);
if (error)
goto out_unlock;
@@ -845,8 +790,7 @@ xfs_setattr_size(
*/
inode_dio_wait(inode);
- error = -block_truncate_page(inode->i_mapping, iattr->ia_size,
- xfs_get_blocks);
+ error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
if (error)
goto out_unlock;
@@ -857,7 +801,7 @@ xfs_setattr_size(
if (error)
goto out_trans_cancel;
- truncate_setsize(inode, iattr->ia_size);
+ truncate_setsize(inode, newsize);
commit_flags = XFS_TRANS_RELEASE_LOG_RES;
lock_flags |= XFS_ILOCK_EXCL;
@@ -876,19 +820,29 @@ xfs_setattr_size(
* these flags set. For all other operations the VFS set these flags
* explicitly if it wants a timestamp update.
*/
- if (iattr->ia_size != ip->i_size &&
- (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
+ if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
iattr->ia_ctime = iattr->ia_mtime =
current_fs_time(inode->i_sb);
mask |= ATTR_CTIME | ATTR_MTIME;
}
- if (iattr->ia_size > ip->i_size) {
- ip->i_d.di_size = iattr->ia_size;
- ip->i_size = iattr->ia_size;
- } else if (iattr->ia_size <= ip->i_size ||
- (iattr->ia_size == 0 && ip->i_d.di_nextents)) {
- error = xfs_itruncate_data(&tp, ip, iattr->ia_size);
+ /*
+ * The first thing we do is set the size to new_size permanently on
+ * disk. This way we don't have to worry about anyone ever being able
+ * to look at the data being freed even in the face of a crash.
+ * What we're getting around here is the case where we free a block, it
+ * is allocated to another file, it is written to, and then we crash.
+ * If the new data gets written to the file but the log buffers
+ * containing the free and reallocation don't, then we'd end up with
+ * garbage in the blocks being freed. As long as we make the new size
+ * permanent before actually freeing any blocks it doesn't matter if
+ * they get written to.
+ */
+ ip->i_d.di_size = newsize;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ if (newsize <= oldsize) {
+ error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
if (error)
goto out_trans_abort;
@@ -906,13 +860,11 @@ xfs_setattr_size(
inode->i_ctime = iattr->ia_ctime;
ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
- ip->i_update_core = 1;
}
if (mask & ATTR_MTIME) {
inode->i_mtime = iattr->ia_mtime;
ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
- ip->i_update_core = 1;
}
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 751e94fe1f7..eff577a9b67 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -18,7 +18,6 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
@@ -62,7 +61,6 @@ xfs_bulkstat_one_int(
{
struct xfs_icdinode *dic; /* dinode core info pointer */
struct xfs_inode *ip; /* incore inode pointer */
- struct inode *inode;
struct xfs_bstat *buf; /* return buffer */
int error = 0; /* error value */
@@ -76,7 +74,8 @@ xfs_bulkstat_one_int(
return XFS_ERROR(ENOMEM);
error = xfs_iget(mp, NULL, ino,
- XFS_IGET_UNTRUSTED, XFS_ILOCK_SHARED, &ip);
+ (XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED),
+ XFS_ILOCK_SHARED, &ip);
if (error) {
*stat = BULKSTAT_RV_NOTHING;
goto out_free;
@@ -86,7 +85,6 @@ xfs_bulkstat_one_int(
ASSERT(ip->i_imap.im_blkno != 0);
dic = &ip->i_d;
- inode = VFS_I(ip);
/* xfs_iget returns the following without needing
* further change.
@@ -99,19 +97,12 @@ xfs_bulkstat_one_int(
buf->bs_uid = dic->di_uid;
buf->bs_gid = dic->di_gid;
buf->bs_size = dic->di_size;
-
- /*
- * We need to read the timestamps from the Linux inode because
- * the VFS keeps writing directly into the inode structure instead
- * of telling us about the updates.
- */
- buf->bs_atime.tv_sec = inode->i_atime.tv_sec;
- buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec;
- buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec;
- buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec;
- buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec;
- buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec;
-
+ buf->bs_atime.tv_sec = dic->di_atime.t_sec;
+ buf->bs_atime.tv_nsec = dic->di_atime.t_nsec;
+ buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
+ buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
+ buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
+ buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec;
buf->bs_xflags = xfs_ip2xflags(ip);
buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
buf->bs_extents = dic->di_nextents;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 34817adf4b9..d90d4a38860 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -18,9 +18,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -35,19 +33,26 @@
#include "xfs_trans_priv.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_rw.h"
#include "xfs_trace.h"
kmem_zone_t *xfs_log_ticket_zone;
/* Local miscellaneous function prototypes */
-STATIC int xlog_commit_record(struct log *log, struct xlog_ticket *ticket,
- xlog_in_core_t **, xfs_lsn_t *);
+STATIC int
+xlog_commit_record(
+ struct xlog *log,
+ struct xlog_ticket *ticket,
+ struct xlog_in_core **iclog,
+ xfs_lsn_t *commitlsnp);
+
STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
xfs_buftarg_t *log_target,
xfs_daddr_t blk_offset,
int num_bblks);
-STATIC int xlog_space_left(struct log *log, atomic64_t *head);
+STATIC int
+xlog_space_left(
+ struct xlog *log,
+ atomic64_t *head);
STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
STATIC void xlog_dealloc_log(xlog_t *log);
@@ -67,21 +72,20 @@ STATIC void xlog_state_switch_iclogs(xlog_t *log,
int eventual_size);
STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
-/* local functions to manipulate grant head */
-STATIC int xlog_grant_log_space(xlog_t *log,
- xlog_ticket_t *xtic);
-STATIC void xlog_grant_push_ail(struct log *log,
- int need_bytes);
+STATIC void
+xlog_grant_push_ail(
+ struct xlog *log,
+ int need_bytes);
STATIC void xlog_regrant_reserve_log_space(xlog_t *log,
xlog_ticket_t *ticket);
-STATIC int xlog_regrant_write_log_space(xlog_t *log,
- xlog_ticket_t *ticket);
STATIC void xlog_ungrant_log_space(xlog_t *log,
xlog_ticket_t *ticket);
#if defined(DEBUG)
STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
-STATIC void xlog_verify_grant_tail(struct log *log);
+STATIC void
+xlog_verify_grant_tail(
+ struct xlog *log);
STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
int count, boolean_t syncing);
STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
@@ -97,9 +101,9 @@ STATIC int xlog_iclogs_empty(xlog_t *log);
static void
xlog_grant_sub_space(
- struct log *log,
- atomic64_t *head,
- int bytes)
+ struct xlog *log,
+ atomic64_t *head,
+ int bytes)
{
int64_t head_val = atomic64_read(head);
int64_t new, old;
@@ -123,9 +127,9 @@ xlog_grant_sub_space(
static void
xlog_grant_add_space(
- struct log *log,
- atomic64_t *head,
- int bytes)
+ struct xlog *log,
+ atomic64_t *head,
+ int bytes)
{
int64_t head_val = atomic64_read(head);
int64_t new, old;
@@ -150,78 +154,93 @@ xlog_grant_add_space(
} while (head_val != old);
}
-STATIC bool
-xlog_reserveq_wake(
- struct log *log,
- int *free_bytes)
+STATIC void
+xlog_grant_head_init(
+ struct xlog_grant_head *head)
+{
+ xlog_assign_grant_head(&head->grant, 1, 0);
+ INIT_LIST_HEAD(&head->waiters);
+ spin_lock_init(&head->lock);
+}
+
+STATIC void
+xlog_grant_head_wake_all(
+ struct xlog_grant_head *head)
{
struct xlog_ticket *tic;
- int need_bytes;
- list_for_each_entry(tic, &log->l_reserveq, t_queue) {
+ spin_lock(&head->lock);
+ list_for_each_entry(tic, &head->waiters, t_queue)
+ wake_up_process(tic->t_task);
+ spin_unlock(&head->lock);
+}
+
+static inline int
+xlog_ticket_reservation(
+ struct xlog *log,
+ struct xlog_grant_head *head,
+ struct xlog_ticket *tic)
+{
+ if (head == &log->l_write_head) {
+ ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
+ return tic->t_unit_res;
+ } else {
if (tic->t_flags & XLOG_TIC_PERM_RESERV)
- need_bytes = tic->t_unit_res * tic->t_cnt;
+ return tic->t_unit_res * tic->t_cnt;
else
- need_bytes = tic->t_unit_res;
-
- if (*free_bytes < need_bytes)
- return false;
- *free_bytes -= need_bytes;
-
- trace_xfs_log_grant_wake_up(log, tic);
- wake_up(&tic->t_wait);
+ return tic->t_unit_res;
}
-
- return true;
}
STATIC bool
-xlog_writeq_wake(
- struct log *log,
+xlog_grant_head_wake(
+ struct xlog *log,
+ struct xlog_grant_head *head,
int *free_bytes)
{
struct xlog_ticket *tic;
int need_bytes;
- list_for_each_entry(tic, &log->l_writeq, t_queue) {
- ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
-
- need_bytes = tic->t_unit_res;
-
+ list_for_each_entry(tic, &head->waiters, t_queue) {
+ need_bytes = xlog_ticket_reservation(log, head, tic);
if (*free_bytes < need_bytes)
return false;
- *free_bytes -= need_bytes;
- trace_xfs_log_regrant_write_wake_up(log, tic);
- wake_up(&tic->t_wait);
+ *free_bytes -= need_bytes;
+ trace_xfs_log_grant_wake_up(log, tic);
+ wake_up_process(tic->t_task);
}
return true;
}
STATIC int
-xlog_reserveq_wait(
- struct log *log,
+xlog_grant_head_wait(
+ struct xlog *log,
+ struct xlog_grant_head *head,
struct xlog_ticket *tic,
int need_bytes)
{
- list_add_tail(&tic->t_queue, &log->l_reserveq);
+ list_add_tail(&tic->t_queue, &head->waiters);
do {
if (XLOG_FORCED_SHUTDOWN(log))
goto shutdown;
xlog_grant_push_ail(log, need_bytes);
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock(&head->lock);
+
XFS_STATS_INC(xs_sleep_logspace);
- trace_xfs_log_grant_sleep(log, tic);
- xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
+ trace_xfs_log_grant_sleep(log, tic);
+ schedule();
trace_xfs_log_grant_wake(log, tic);
- spin_lock(&log->l_grant_reserve_lock);
+ spin_lock(&head->lock);
if (XLOG_FORCED_SHUTDOWN(log))
goto shutdown;
- } while (xlog_space_left(log, &log->l_grant_reserve_head) < need_bytes);
+ } while (xlog_space_left(log, &head->grant) < need_bytes);
list_del_init(&tic->t_queue);
return 0;
@@ -230,35 +249,58 @@ shutdown:
return XFS_ERROR(EIO);
}
+/*
+ * Atomically get the log space required for a log ticket.
+ *
+ * Once a ticket gets put onto head->waiters, it will only return after the
+ * needed reservation is satisfied.
+ *
+ * This function is structured so that it has a lock free fast path. This is
+ * necessary because every new transaction reservation will come through this
+ * path. Hence any lock will be globally hot if we take it unconditionally on
+ * every pass.
+ *
+ * As tickets are only ever moved on and off head->waiters under head->lock, we
+ * only need to take that lock if we are going to add the ticket to the queue
+ * and sleep. We can avoid taking the lock if the ticket was never added to
+ * head->waiters because the t_queue list head will be empty and we hold the
+ * only reference to it so it can safely be checked unlocked.
+ */
STATIC int
-xlog_writeq_wait(
- struct log *log,
+xlog_grant_head_check(
+ struct xlog *log,
+ struct xlog_grant_head *head,
struct xlog_ticket *tic,
- int need_bytes)
+ int *need_bytes)
{
- list_add_tail(&tic->t_queue, &log->l_writeq);
-
- do {
- if (XLOG_FORCED_SHUTDOWN(log))
- goto shutdown;
- xlog_grant_push_ail(log, need_bytes);
-
- XFS_STATS_INC(xs_sleep_logspace);
- trace_xfs_log_regrant_write_sleep(log, tic);
+ int free_bytes;
+ int error = 0;
- xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
- trace_xfs_log_regrant_write_wake(log, tic);
+ ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
- spin_lock(&log->l_grant_write_lock);
- if (XLOG_FORCED_SHUTDOWN(log))
- goto shutdown;
- } while (xlog_space_left(log, &log->l_grant_write_head) < need_bytes);
+ /*
+ * If there are other waiters on the queue then give them a chance at
+ * logspace before us. Wake up the first waiters, if we do not wake
+ * up all the waiters then go to sleep waiting for more free space,
+ * otherwise try to get some space for this transaction.
+ */
+ *need_bytes = xlog_ticket_reservation(log, head, tic);
+ free_bytes = xlog_space_left(log, &head->grant);
+ if (!list_empty_careful(&head->waiters)) {
+ spin_lock(&head->lock);
+ if (!xlog_grant_head_wake(log, head, &free_bytes) ||
+ free_bytes < *need_bytes) {
+ error = xlog_grant_head_wait(log, head, tic,
+ *need_bytes);
+ }
+ spin_unlock(&head->lock);
+ } else if (free_bytes < *need_bytes) {
+ spin_lock(&head->lock);
+ error = xlog_grant_head_wait(log, head, tic, *need_bytes);
+ spin_unlock(&head->lock);
+ }
- list_del_init(&tic->t_queue);
- return 0;
-shutdown:
- list_del_init(&tic->t_queue);
- return XFS_ERROR(EIO);
+ return error;
}
static void
@@ -286,6 +328,128 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
}
/*
+ * Replenish the byte reservation required by moving the grant write head.
+ */
+int
+xfs_log_regrant(
+ struct xfs_mount *mp,
+ struct xlog_ticket *tic)
+{
+ struct xlog *log = mp->m_log;
+ int need_bytes;
+ int error = 0;
+
+ if (XLOG_FORCED_SHUTDOWN(log))
+ return XFS_ERROR(EIO);
+
+ XFS_STATS_INC(xs_try_logspace);
+
+ /*
+ * This is a new transaction on the ticket, so we need to change the
+ * transaction ID so that the next transaction has a different TID in
+ * the log. Just add one to the existing tid so that we can see chains
+ * of rolling transactions in the log easily.
+ */
+ tic->t_tid++;
+
+ xlog_grant_push_ail(log, tic->t_unit_res);
+
+ tic->t_curr_res = tic->t_unit_res;
+ xlog_tic_reset_res(tic);
+
+ if (tic->t_cnt > 0)
+ return 0;
+
+ trace_xfs_log_regrant(log, tic);
+
+ error = xlog_grant_head_check(log, &log->l_write_head, tic,
+ &need_bytes);
+ if (error)
+ goto out_error;
+
+ xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes);
+ trace_xfs_log_regrant_exit(log, tic);
+ xlog_verify_grant_tail(log);
+ return 0;
+
+out_error:
+ /*
+ * If we are failing, make sure the ticket doesn't have any current
+ * reservations. We don't want to add this back when the ticket/
+ * transaction gets cancelled.
+ */
+ tic->t_curr_res = 0;
+ tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
+ return error;
+}
+
+/*
+ * Reserve log space and return a ticket corresponding the reservation.
+ *
+ * Each reservation is going to reserve extra space for a log record header.
+ * When writes happen to the on-disk log, we don't subtract the length of the
+ * log record header from any reservation. By wasting space in each
+ * reservation, we prevent over allocation problems.
+ */
+int
+xfs_log_reserve(
+ struct xfs_mount *mp,
+ int unit_bytes,
+ int cnt,
+ struct xlog_ticket **ticp,
+ __uint8_t client,
+ bool permanent,
+ uint t_type)
+{
+ struct xlog *log = mp->m_log;
+ struct xlog_ticket *tic;
+ int need_bytes;
+ int error = 0;
+
+ ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
+
+ if (XLOG_FORCED_SHUTDOWN(log))
+ return XFS_ERROR(EIO);
+
+ XFS_STATS_INC(xs_try_logspace);
+
+ ASSERT(*ticp == NULL);
+ tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
+ KM_SLEEP | KM_MAYFAIL);
+ if (!tic)
+ return XFS_ERROR(ENOMEM);
+
+ tic->t_trans_type = t_type;
+ *ticp = tic;
+
+ xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt);
+
+ trace_xfs_log_reserve(log, tic);
+
+ error = xlog_grant_head_check(log, &log->l_reserve_head, tic,
+ &need_bytes);
+ if (error)
+ goto out_error;
+
+ xlog_grant_add_space(log, &log->l_reserve_head.grant, need_bytes);
+ xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes);
+ trace_xfs_log_reserve_exit(log, tic);
+ xlog_verify_grant_tail(log);
+ return 0;
+
+out_error:
+ /*
+ * If we are failing, make sure the ticket doesn't have any current
+ * reservations. We don't want to add this back when the ticket/
+ * transaction gets cancelled.
+ */
+ tic->t_curr_res = 0;
+ tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
+ return error;
+}
+
+
+/*
* NOTES:
*
* 1. currblock field gets updated at startup and after in-core logs
@@ -313,7 +477,7 @@ xfs_log_done(
struct xlog_in_core **iclog,
uint flags)
{
- struct log *log = mp->m_log;
+ struct xlog *log = mp->m_log;
xfs_lsn_t lsn = 0;
if (XLOG_FORCED_SHUTDOWN(log) ||
@@ -395,88 +559,6 @@ xfs_log_release_iclog(
}
/*
- * 1. Reserve an amount of on-disk log space and return a ticket corresponding
- * to the reservation.
- * 2. Potentially, push buffers at tail of log to disk.
- *
- * Each reservation is going to reserve extra space for a log record header.
- * When writes happen to the on-disk log, we don't subtract the length of the
- * log record header from any reservation. By wasting space in each
- * reservation, we prevent over allocation problems.
- */
-int
-xfs_log_reserve(
- struct xfs_mount *mp,
- int unit_bytes,
- int cnt,
- struct xlog_ticket **ticket,
- __uint8_t client,
- uint flags,
- uint t_type)
-{
- struct log *log = mp->m_log;
- struct xlog_ticket *internal_ticket;
- int retval = 0;
-
- ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
-
- if (XLOG_FORCED_SHUTDOWN(log))
- return XFS_ERROR(EIO);
-
- XFS_STATS_INC(xs_try_logspace);
-
-
- if (*ticket != NULL) {
- ASSERT(flags & XFS_LOG_PERM_RESERV);
- internal_ticket = *ticket;
-
- /*
- * this is a new transaction on the ticket, so we need to
- * change the transaction ID so that the next transaction has a
- * different TID in the log. Just add one to the existing tid
- * so that we can see chains of rolling transactions in the log
- * easily.
- */
- internal_ticket->t_tid++;
-
- trace_xfs_log_reserve(log, internal_ticket);
-
- xlog_grant_push_ail(log, internal_ticket->t_unit_res);
- retval = xlog_regrant_write_log_space(log, internal_ticket);
- } else {
- /* may sleep if need to allocate more tickets */
- internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
- client, flags,
- KM_SLEEP|KM_MAYFAIL);
- if (!internal_ticket)
- return XFS_ERROR(ENOMEM);
- internal_ticket->t_trans_type = t_type;
- *ticket = internal_ticket;
-
- trace_xfs_log_reserve(log, internal_ticket);
-
- xlog_grant_push_ail(log,
- (internal_ticket->t_unit_res *
- internal_ticket->t_cnt));
- retval = xlog_grant_log_space(log, internal_ticket);
- }
-
- if (unlikely(retval)) {
- /*
- * If we are failing, make sure the ticket doesn't have any
- * current reservations. We don't want to add this back
- * when the ticket/ transaction gets cancelled.
- */
- internal_ticket->t_curr_res = 0;
- /* ungrant will give back unit_res * t_cnt. */
- internal_ticket->t_cnt = 0;
- }
-
- return retval;
-}
-
-
-/*
* Mount a log filesystem
*
* mp - ubiquitous xfs mount point structure
@@ -653,8 +735,9 @@ xfs_log_unmount_write(xfs_mount_t *mp)
.lv_iovecp = &reg,
};
- /* remove inited flag */
+ /* remove inited flag, and account for space used */
tic->t_flags = 0;
+ tic->t_curr_res -= sizeof(magic);
error = xlog_write(log, &vec, tic, &lsn,
NULL, XLOG_UNMOUNT_TRANS);
/*
@@ -739,6 +822,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
void
xfs_log_unmount(xfs_mount_t *mp)
{
+ cancel_delayed_work_sync(&mp->m_sync_work);
xfs_trans_ail_destroy(mp);
xlog_dealloc_log(mp->m_log);
}
@@ -761,95 +845,34 @@ xfs_log_item_init(
}
/*
- * Write region vectors to log. The write happens using the space reservation
- * of the ticket (tic). It is not a requirement that all writes for a given
- * transaction occur with one call to xfs_log_write(). However, it is important
- * to note that the transaction reservation code makes an assumption about the
- * number of log headers a transaction requires that may be violated if you
- * don't pass all the transaction vectors in one call....
+ * Wake up processes waiting for log space after we have moved the log tail.
*/
-int
-xfs_log_write(
- struct xfs_mount *mp,
- struct xfs_log_iovec reg[],
- int nentries,
- struct xlog_ticket *tic,
- xfs_lsn_t *start_lsn)
-{
- struct log *log = mp->m_log;
- int error;
- struct xfs_log_vec vec = {
- .lv_niovecs = nentries,
- .lv_iovecp = reg,
- };
-
- if (XLOG_FORCED_SHUTDOWN(log))
- return XFS_ERROR(EIO);
-
- error = xlog_write(log, &vec, tic, start_lsn, NULL, 0);
- if (error)
- xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
- return error;
-}
-
void
-xfs_log_move_tail(xfs_mount_t *mp,
- xfs_lsn_t tail_lsn)
+xfs_log_space_wake(
+ struct xfs_mount *mp)
{
- xlog_ticket_t *tic;
- xlog_t *log = mp->m_log;
- int need_bytes, free_bytes;
+ struct xlog *log = mp->m_log;
+ int free_bytes;
if (XLOG_FORCED_SHUTDOWN(log))
return;
- if (tail_lsn == 0)
- tail_lsn = atomic64_read(&log->l_last_sync_lsn);
+ if (!list_empty_careful(&log->l_write_head.waiters)) {
+ ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
- /* tail_lsn == 1 implies that we weren't passed a valid value. */
- if (tail_lsn != 1)
- atomic64_set(&log->l_tail_lsn, tail_lsn);
-
- if (!list_empty_careful(&log->l_writeq)) {
-#ifdef DEBUG
- if (log->l_flags & XLOG_ACTIVE_RECOVERY)
- panic("Recovery problem");
-#endif
- spin_lock(&log->l_grant_write_lock);
- free_bytes = xlog_space_left(log, &log->l_grant_write_head);
- list_for_each_entry(tic, &log->l_writeq, t_queue) {
- ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
-
- if (free_bytes < tic->t_unit_res && tail_lsn != 1)
- break;
- tail_lsn = 0;
- free_bytes -= tic->t_unit_res;
- trace_xfs_log_regrant_write_wake_up(log, tic);
- wake_up(&tic->t_wait);
- }
- spin_unlock(&log->l_grant_write_lock);
+ spin_lock(&log->l_write_head.lock);
+ free_bytes = xlog_space_left(log, &log->l_write_head.grant);
+ xlog_grant_head_wake(log, &log->l_write_head, &free_bytes);
+ spin_unlock(&log->l_write_head.lock);
}
- if (!list_empty_careful(&log->l_reserveq)) {
-#ifdef DEBUG
- if (log->l_flags & XLOG_ACTIVE_RECOVERY)
- panic("Recovery problem");
-#endif
- spin_lock(&log->l_grant_reserve_lock);
- free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
- list_for_each_entry(tic, &log->l_reserveq, t_queue) {
- if (tic->t_flags & XLOG_TIC_PERM_RESERV)
- need_bytes = tic->t_unit_res*tic->t_cnt;
- else
- need_bytes = tic->t_unit_res;
- if (free_bytes < need_bytes && tail_lsn != 1)
- break;
- tail_lsn = 0;
- free_bytes -= need_bytes;
- trace_xfs_log_grant_wake_up(log, tic);
- wake_up(&tic->t_wait);
- }
- spin_unlock(&log->l_grant_reserve_lock);
+ if (!list_empty_careful(&log->l_reserve_head.waiters)) {
+ ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
+
+ spin_lock(&log->l_reserve_head.lock);
+ free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
+ xlog_grant_head_wake(log, &log->l_reserve_head, &free_bytes);
+ spin_unlock(&log->l_reserve_head.lock);
}
}
@@ -899,38 +922,46 @@ xfs_log_need_covered(xfs_mount_t *mp)
return needed;
}
-/******************************************************************************
- *
- * local routines
- *
- ******************************************************************************
- */
-
-/* xfs_trans_tail_ail returns 0 when there is nothing in the list.
- * The log manager must keep track of the last LR which was committed
- * to disk. The lsn of this LR will become the new tail_lsn whenever
- * xfs_trans_tail_ail returns 0. If we don't do this, we run into
- * the situation where stuff could be written into the log but nothing
- * was ever in the AIL when asked. Eventually, we panic since the
- * tail hits the head.
- *
+/*
* We may be holding the log iclog lock upon entering this routine.
*/
xfs_lsn_t
-xlog_assign_tail_lsn(
+xlog_assign_tail_lsn_locked(
struct xfs_mount *mp)
{
+ struct xlog *log = mp->m_log;
+ struct xfs_log_item *lip;
xfs_lsn_t tail_lsn;
- struct log *log = mp->m_log;
- tail_lsn = xfs_ail_min_lsn(mp->m_ail);
- if (!tail_lsn)
- tail_lsn = atomic64_read(&log->l_last_sync_lsn);
+ assert_spin_locked(&mp->m_ail->xa_lock);
+ /*
+ * To make sure we always have a valid LSN for the log tail we keep
+ * track of the last LSN which was committed in log->l_last_sync_lsn,
+ * and use that when the AIL was empty.
+ */
+ lip = xfs_ail_min(mp->m_ail);
+ if (lip)
+ tail_lsn = lip->li_lsn;
+ else
+ tail_lsn = atomic64_read(&log->l_last_sync_lsn);
atomic64_set(&log->l_tail_lsn, tail_lsn);
return tail_lsn;
}
+xfs_lsn_t
+xlog_assign_tail_lsn(
+ struct xfs_mount *mp)
+{
+ xfs_lsn_t tail_lsn;
+
+ spin_lock(&mp->m_ail->xa_lock);
+ tail_lsn = xlog_assign_tail_lsn_locked(mp);
+ spin_unlock(&mp->m_ail->xa_lock);
+
+ return tail_lsn;
+}
+
/*
* Return the space in the log between the tail and the head. The head
* is passed in the cycle/bytes formal parms. In the special case where
@@ -947,7 +978,7 @@ xlog_assign_tail_lsn(
*/
STATIC int
xlog_space_left(
- struct log *log,
+ struct xlog *log,
atomic64_t *head)
{
int free_bytes;
@@ -1132,12 +1163,9 @@ xlog_alloc_log(xfs_mount_t *mp,
xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
- xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0);
- xlog_assign_grant_head(&log->l_grant_write_head, 1, 0);
- INIT_LIST_HEAD(&log->l_reserveq);
- INIT_LIST_HEAD(&log->l_writeq);
- spin_lock_init(&log->l_grant_reserve_lock);
- spin_lock_init(&log->l_grant_write_lock);
+
+ xlog_grant_head_init(&log->l_reserve_head);
+ xlog_grant_head_init(&log->l_write_head);
error = EFSCORRUPTED;
if (xfs_sb_version_hassector(&mp->m_sb)) {
@@ -1169,7 +1197,7 @@ xlog_alloc_log(xfs_mount_t *mp,
xlog_get_iclog_buffer_size(mp, log);
error = ENOMEM;
- bp = xfs_buf_alloc(mp->m_logdev_targp, 0, log->l_iclog_size, 0);
+ bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0);
if (!bp)
goto out_free_log;
bp->b_iodone = xlog_iodone;
@@ -1179,9 +1207,6 @@ xlog_alloc_log(xfs_mount_t *mp,
spin_lock_init(&log->l_icloglock);
init_waitqueue_head(&log->l_flush_wait);
- /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
- ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
-
iclogp = &log->l_iclog;
/*
* The amount of memory to allocate for the iclog structure is
@@ -1201,7 +1226,7 @@ xlog_alloc_log(xfs_mount_t *mp,
prev_iclog = iclog;
bp = xfs_buf_get_uncached(mp->m_logdev_targp,
- log->l_iclog_size, 0);
+ BTOBB(log->l_iclog_size), 0);
if (!bp)
goto out_free_iclog;
@@ -1221,7 +1246,7 @@ xlog_alloc_log(xfs_mount_t *mp,
head->h_fmt = cpu_to_be32(XLOG_FMT);
memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
- iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize;
+ iclog->ic_size = BBTOB(bp->b_length) - log->l_iclog_hsize;
iclog->ic_state = XLOG_STATE_ACTIVE;
iclog->ic_log = log;
atomic_set(&iclog->ic_refcnt, 0);
@@ -1265,7 +1290,7 @@ out:
*/
STATIC int
xlog_commit_record(
- struct log *log,
+ struct xlog *log,
struct xlog_ticket *ticket,
struct xlog_in_core **iclog,
xfs_lsn_t *commitlsnp)
@@ -1299,7 +1324,7 @@ xlog_commit_record(
*/
STATIC void
xlog_grant_push_ail(
- struct log *log,
+ struct xlog *log,
int need_bytes)
{
xfs_lsn_t threshold_lsn = 0;
@@ -1312,7 +1337,7 @@ xlog_grant_push_ail(
ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
- free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
+ free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
free_blocks = BTOBBT(free_bytes);
/*
@@ -1444,8 +1469,8 @@ xlog_sync(xlog_t *log,
roundoff < BBTOB(1)));
/* move grant heads by roundoff in sync */
- xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff);
- xlog_grant_add_space(log, &log->l_grant_write_head, roundoff);
+ xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
+ xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);
/* put cycle number in every block */
xlog_pack_data(log, iclog, roundoff);
@@ -1472,7 +1497,7 @@ xlog_sync(xlog_t *log,
} else {
iclog->ic_bwritecnt = 1;
}
- XFS_BUF_SET_COUNT(bp, count);
+ bp->b_io_length = BTOBB(count);
bp->b_fspriv = iclog;
XFS_BUF_ZEROFLAGS(bp);
XFS_BUF_ASYNC(bp);
@@ -1570,7 +1595,7 @@ xlog_dealloc_log(xlog_t *log)
* always need to ensure that the extra buffer does not point to memory
* owned by another log buffer before we free it.
*/
- xfs_buf_set_empty(log->l_xbuf, log->l_iclog_size);
+ xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size));
xfs_buf_free(log->l_xbuf);
iclog = log->l_iclog;
@@ -1685,7 +1710,7 @@ xlog_print_tic_res(
};
xfs_warn(mp,
- "xfs_log_write: reservation summary:\n"
+ "xlog_write: reservation summary:\n"
" trans type = %s (%u)\n"
" unit res = %d bytes\n"
" current res = %d bytes\n"
@@ -1714,7 +1739,7 @@ xlog_print_tic_res(
}
xfs_alert_tag(mp, XFS_PTAG_LOGRES,
- "xfs_log_write: reservation ran out. Need to up reservation");
+ "xlog_write: reservation ran out. Need to up reservation");
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
}
@@ -1778,7 +1803,7 @@ xlog_write_start_rec(
static xlog_op_header_t *
xlog_write_setup_ophdr(
- struct log *log,
+ struct xlog *log,
struct xlog_op_header *ophdr,
struct xlog_ticket *ticket,
uint flags)
@@ -1861,7 +1886,7 @@ xlog_write_setup_copy(
static int
xlog_write_copy_finish(
- struct log *log,
+ struct xlog *log,
struct xlog_in_core *iclog,
uint flags,
int *record_cnt,
@@ -1946,7 +1971,7 @@ xlog_write_copy_finish(
*/
int
xlog_write(
- struct log *log,
+ struct xlog *log,
struct xfs_log_vec *log_vector,
struct xlog_ticket *ticket,
xfs_lsn_t *start_lsn,
@@ -1968,23 +1993,21 @@ xlog_write(
*start_lsn = 0;
len = xlog_write_calc_vec_length(ticket, log_vector);
- if (log->l_cilp) {
- /*
- * Region headers and bytes are already accounted for.
- * We only need to take into account start records and
- * split regions in this function.
- */
- if (ticket->t_flags & XLOG_TIC_INITED)
- ticket->t_curr_res -= sizeof(xlog_op_header_t);
- /*
- * Commit record headers need to be accounted for. These
- * come in as separate writes so are easy to detect.
- */
- if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
- ticket->t_curr_res -= sizeof(xlog_op_header_t);
- } else
- ticket->t_curr_res -= len;
+ /*
+ * Region headers and bytes are already accounted for.
+ * We only need to take into account start records and
+ * split regions in this function.
+ */
+ if (ticket->t_flags & XLOG_TIC_INITED)
+ ticket->t_curr_res -= sizeof(xlog_op_header_t);
+
+ /*
+ * Commit record headers need to be accounted for. These
+ * come in as separate writes so are easy to detect.
+ */
+ if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
+ ticket->t_curr_res -= sizeof(xlog_op_header_t);
if (ticket->t_curr_res < 0)
xlog_print_tic_res(log->l_mp, ticket);
@@ -2600,119 +2623,6 @@ restart:
return 0;
} /* xlog_state_get_iclog_space */
-/*
- * Atomically get the log space required for a log ticket.
- *
- * Once a ticket gets put onto the reserveq, it will only return after the
- * needed reservation is satisfied.
- *
- * This function is structured so that it has a lock free fast path. This is
- * necessary because every new transaction reservation will come through this
- * path. Hence any lock will be globally hot if we take it unconditionally on
- * every pass.
- *
- * As tickets are only ever moved on and off the reserveq under the
- * l_grant_reserve_lock, we only need to take that lock if we are going to add
- * the ticket to the queue and sleep. We can avoid taking the lock if the ticket
- * was never added to the reserveq because the t_queue list head will be empty
- * and we hold the only reference to it so it can safely be checked unlocked.
- */
-STATIC int
-xlog_grant_log_space(
- struct log *log,
- struct xlog_ticket *tic)
-{
- int free_bytes, need_bytes;
- int error = 0;
-
- ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
-
- trace_xfs_log_grant_enter(log, tic);
-
- /*
- * If there are other waiters on the queue then give them a chance at
- * logspace before us. Wake up the first waiters, if we do not wake
- * up all the waiters then go to sleep waiting for more free space,
- * otherwise try to get some space for this transaction.
- */
- need_bytes = tic->t_unit_res;
- if (tic->t_flags & XFS_LOG_PERM_RESERV)
- need_bytes *= tic->t_ocnt;
- free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
- if (!list_empty_careful(&log->l_reserveq)) {
- spin_lock(&log->l_grant_reserve_lock);
- if (!xlog_reserveq_wake(log, &free_bytes) ||
- free_bytes < need_bytes)
- error = xlog_reserveq_wait(log, tic, need_bytes);
- spin_unlock(&log->l_grant_reserve_lock);
- } else if (free_bytes < need_bytes) {
- spin_lock(&log->l_grant_reserve_lock);
- error = xlog_reserveq_wait(log, tic, need_bytes);
- spin_unlock(&log->l_grant_reserve_lock);
- }
- if (error)
- return error;
-
- xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
- xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
- trace_xfs_log_grant_exit(log, tic);
- xlog_verify_grant_tail(log);
- return 0;
-}
-
-/*
- * Replenish the byte reservation required by moving the grant write head.
- *
- * Similar to xlog_grant_log_space, the function is structured to have a lock
- * free fast path.
- */
-STATIC int
-xlog_regrant_write_log_space(
- struct log *log,
- struct xlog_ticket *tic)
-{
- int free_bytes, need_bytes;
- int error = 0;
-
- tic->t_curr_res = tic->t_unit_res;
- xlog_tic_reset_res(tic);
-
- if (tic->t_cnt > 0)
- return 0;
-
- ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
-
- trace_xfs_log_regrant_write_enter(log, tic);
-
- /*
- * If there are other waiters on the queue then give them a chance at
- * logspace before us. Wake up the first waiters, if we do not wake
- * up all the waiters then go to sleep waiting for more free space,
- * otherwise try to get some space for this transaction.
- */
- need_bytes = tic->t_unit_res;
- free_bytes = xlog_space_left(log, &log->l_grant_write_head);
- if (!list_empty_careful(&log->l_writeq)) {
- spin_lock(&log->l_grant_write_lock);
- if (!xlog_writeq_wake(log, &free_bytes) ||
- free_bytes < need_bytes)
- error = xlog_writeq_wait(log, tic, need_bytes);
- spin_unlock(&log->l_grant_write_lock);
- } else if (free_bytes < need_bytes) {
- spin_lock(&log->l_grant_write_lock);
- error = xlog_writeq_wait(log, tic, need_bytes);
- spin_unlock(&log->l_grant_write_lock);
- }
-
- if (error)
- return error;
-
- xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
- trace_xfs_log_regrant_write_exit(log, tic);
- xlog_verify_grant_tail(log);
- return 0;
-}
-
/* The first cnt-1 times through here we don't need to
* move the grant write head because the permanent
* reservation has reserved cnt times the unit amount.
@@ -2729,9 +2639,9 @@ xlog_regrant_reserve_log_space(xlog_t *log,
if (ticket->t_cnt > 0)
ticket->t_cnt--;
- xlog_grant_sub_space(log, &log->l_grant_reserve_head,
+ xlog_grant_sub_space(log, &log->l_reserve_head.grant,
ticket->t_curr_res);
- xlog_grant_sub_space(log, &log->l_grant_write_head,
+ xlog_grant_sub_space(log, &log->l_write_head.grant,
ticket->t_curr_res);
ticket->t_curr_res = ticket->t_unit_res;
xlog_tic_reset_res(ticket);
@@ -2742,7 +2652,7 @@ xlog_regrant_reserve_log_space(xlog_t *log,
if (ticket->t_cnt > 0)
return;
- xlog_grant_add_space(log, &log->l_grant_reserve_head,
+ xlog_grant_add_space(log, &log->l_reserve_head.grant,
ticket->t_unit_res);
trace_xfs_log_regrant_reserve_exit(log, ticket);
@@ -2788,14 +2698,13 @@ xlog_ungrant_log_space(xlog_t *log,
bytes += ticket->t_unit_res*ticket->t_cnt;
}
- xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes);
- xlog_grant_sub_space(log, &log->l_grant_write_head, bytes);
+ xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes);
+ xlog_grant_sub_space(log, &log->l_write_head.grant, bytes);
trace_xfs_log_ungrant_exit(log, ticket);
- xfs_log_move_tail(log->l_mp, 1);
-} /* xlog_ungrant_log_space */
-
+ xfs_log_space_wake(log->l_mp);
+}
/*
* Flush iclog to disk if this is the last reference to the given iclog and
@@ -2925,14 +2834,13 @@ _xfs_log_force(
uint flags,
int *log_flushed)
{
- struct log *log = mp->m_log;
+ struct xlog *log = mp->m_log;
struct xlog_in_core *iclog;
xfs_lsn_t lsn;
XFS_STATS_INC(xs_log_force);
- if (log->l_cilp)
- xlog_cil_force(log);
+ xlog_cil_force(log);
spin_lock(&log->l_icloglock);
@@ -3046,6 +2954,7 @@ xfs_log_force(
{
int error;
+ trace_xfs_log_force(mp, 0);
error = _xfs_log_force(mp, flags, NULL);
if (error)
xfs_warn(mp, "%s: error %d returned.", __func__, error);
@@ -3073,7 +2982,7 @@ _xfs_log_force_lsn(
uint flags,
int *log_flushed)
{
- struct log *log = mp->m_log;
+ struct xlog *log = mp->m_log;
struct xlog_in_core *iclog;
int already_slept = 0;
@@ -3081,11 +2990,9 @@ _xfs_log_force_lsn(
XFS_STATS_INC(xs_log_force);
- if (log->l_cilp) {
- lsn = xlog_cil_force_lsn(log, lsn);
- if (lsn == NULLCOMMITLSN)
- return 0;
- }
+ lsn = xlog_cil_force_lsn(log, lsn);
+ if (lsn == NULLCOMMITLSN)
+ return 0;
try_again:
spin_lock(&log->l_icloglock);
@@ -3196,6 +3103,7 @@ xfs_log_force_lsn(
{
int error;
+ trace_xfs_log_force(mp, lsn);
error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
if (error)
xfs_warn(mp, "%s: error %d returned.", __func__, error);
@@ -3252,12 +3160,12 @@ xfs_log_ticket_get(
*/
xlog_ticket_t *
xlog_ticket_alloc(
- struct log *log,
+ struct xlog *log,
int unit_bytes,
int cnt,
char client,
- uint xflags,
- int alloc_flags)
+ bool permanent,
+ xfs_km_flags_t alloc_flags)
{
struct xlog_ticket *tic;
uint num_headers;
@@ -3350,6 +3258,7 @@ xlog_ticket_alloc(
}
atomic_set(&tic->t_ref, 1);
+ tic->t_task = current;
INIT_LIST_HEAD(&tic->t_queue);
tic->t_unit_res = unit_bytes;
tic->t_curr_res = unit_bytes;
@@ -3359,9 +3268,8 @@ xlog_ticket_alloc(
tic->t_clientid = client;
tic->t_flags = XLOG_TIC_INITED;
tic->t_trans_type = 0;
- if (xflags & XFS_LOG_PERM_RESERV)
+ if (permanent)
tic->t_flags |= XLOG_TIC_PERM_RESERV;
- init_waitqueue_head(&tic->t_wait);
xlog_tic_reset_res(tic);
@@ -3383,7 +3291,7 @@ xlog_ticket_alloc(
*/
void
xlog_verify_dest_ptr(
- struct log *log,
+ struct xlog *log,
char *ptr)
{
int i;
@@ -3412,12 +3320,12 @@ xlog_verify_dest_ptr(
*/
STATIC void
xlog_verify_grant_tail(
- struct log *log)
+ struct xlog *log)
{
int tail_cycle, tail_blocks;
int cycle, space;
- xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space);
+ xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &space);
xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
if (tail_cycle != cycle) {
if (cycle - 1 != tail_cycle &&
@@ -3619,7 +3527,6 @@ xfs_log_force_umount(
struct xfs_mount *mp,
int logerror)
{
- xlog_ticket_t *tic;
xlog_t *log;
int retval;
@@ -3653,7 +3560,7 @@ xfs_log_force_umount(
* completed transactions are flushed to disk with the xfs_log_force()
* call below.
*/
- if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
+ if (!logerror)
xlog_cil_force(log);
/*
@@ -3687,15 +3594,8 @@ xfs_log_force_umount(
* we don't enqueue anything once the SHUTDOWN flag is set, and this
* action is protected by the grant locks.
*/
- spin_lock(&log->l_grant_reserve_lock);
- list_for_each_entry(tic, &log->l_reserveq, t_queue)
- wake_up(&tic->t_wait);
- spin_unlock(&log->l_grant_reserve_lock);
-
- spin_lock(&log->l_grant_write_lock);
- list_for_each_entry(tic, &log->l_writeq, t_queue)
- wake_up(&tic->t_wait);
- spin_unlock(&log->l_grant_write_lock);
+ xlog_grant_head_wake_all(&log->l_reserve_head);
+ xlog_grant_head_wake_all(&log->l_write_head);
if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
ASSERT(!logerror);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 3f7bf451c03..748d312850e 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -53,15 +53,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
#define XFS_LOG_REL_PERM_RESERV 0x1
/*
- * Flags to xfs_log_reserve()
- *
- * XFS_LOG_PERM_RESERV: Permanent reservation. When writes are
- * performed against this type of reservation, the reservation
- * is not decreased. Long running transactions should use this.
- */
-#define XFS_LOG_PERM_RESERV 0x2
-
-/*
* Flags to xfs_log_force()
*
* XFS_LOG_SYNC: Synchronous force in-core log to disk
@@ -160,8 +151,9 @@ int xfs_log_mount(struct xfs_mount *mp,
xfs_daddr_t start_block,
int num_bblocks);
int xfs_log_mount_finish(struct xfs_mount *mp);
-void xfs_log_move_tail(struct xfs_mount *mp,
- xfs_lsn_t tail_lsn);
+xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
+xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
+void xfs_log_space_wake(struct xfs_mount *mp);
int xfs_log_notify(struct xfs_mount *mp,
struct xlog_in_core *iclog,
xfs_log_callback_t *callback_entry);
@@ -172,13 +164,9 @@ int xfs_log_reserve(struct xfs_mount *mp,
int count,
struct xlog_ticket **ticket,
__uint8_t clientid,
- uint flags,
+ bool permanent,
uint t_type);
-int xfs_log_write(struct xfs_mount *mp,
- xfs_log_iovec_t region[],
- int nentries,
- struct xlog_ticket *ticket,
- xfs_lsn_t *start_lsn);
+int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
int xfs_log_unmount_write(struct xfs_mount *mp);
void xfs_log_unmount(struct xfs_mount *mp);
int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
@@ -189,8 +177,7 @@ void xlog_iodone(struct xfs_buf *);
struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
void xfs_log_ticket_put(struct xlog_ticket *ticket);
-void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
- struct xfs_log_vec *log_vector,
+int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_lsn_t *commit_lsn, int flags);
bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index c7755d5a5fb..ddc4529d07d 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -18,9 +18,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_log_priv.h"
@@ -29,71 +27,10 @@
#include "xfs_mount.h"
#include "xfs_error.h"
#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
#include "xfs_discard.h"
/*
- * Perform initial CIL structure initialisation. If the CIL is not
- * enabled in this filesystem, ensure the log->l_cilp is null so
- * we can check this conditional to determine if we are doing delayed
- * logging or not.
- */
-int
-xlog_cil_init(
- struct log *log)
-{
- struct xfs_cil *cil;
- struct xfs_cil_ctx *ctx;
-
- log->l_cilp = NULL;
- if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG))
- return 0;
-
- cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
- if (!cil)
- return ENOMEM;
-
- ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
- if (!ctx) {
- kmem_free(cil);
- return ENOMEM;
- }
-
- INIT_LIST_HEAD(&cil->xc_cil);
- INIT_LIST_HEAD(&cil->xc_committing);
- spin_lock_init(&cil->xc_cil_lock);
- init_rwsem(&cil->xc_ctx_lock);
- init_waitqueue_head(&cil->xc_commit_wait);
-
- INIT_LIST_HEAD(&ctx->committing);
- INIT_LIST_HEAD(&ctx->busy_extents);
- ctx->sequence = 1;
- ctx->cil = cil;
- cil->xc_ctx = ctx;
- cil->xc_current_sequence = ctx->sequence;
-
- cil->xc_log = log;
- log->l_cilp = cil;
- return 0;
-}
-
-void
-xlog_cil_destroy(
- struct log *log)
-{
- if (!log->l_cilp)
- return;
-
- if (log->l_cilp->xc_ctx) {
- if (log->l_cilp->xc_ctx->ticket)
- xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
- kmem_free(log->l_cilp->xc_ctx);
- }
-
- ASSERT(list_empty(&log->l_cilp->xc_cil));
- kmem_free(log->l_cilp);
-}
-
-/*
* Allocate a new ticket. Failing to get a new ticket makes it really hard to
* recover, so we don't allow failure here. Also, we allocate in a context that
* we don't want to be issuing transactions from, so we need to tell the
@@ -107,7 +44,7 @@ xlog_cil_destroy(
*/
static struct xlog_ticket *
xlog_cil_ticket_alloc(
- struct log *log)
+ struct xlog *log)
{
struct xlog_ticket *tic;
@@ -135,11 +72,8 @@ xlog_cil_ticket_alloc(
*/
void
xlog_cil_init_post_recovery(
- struct log *log)
+ struct xlog *log)
{
- if (!log->l_cilp)
- return;
-
log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
log->l_cilp->xc_ctx->sequence = 1;
log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
@@ -172,37 +106,73 @@ xlog_cil_init_post_recovery(
* format the regions into the iclog as though they are being formatted
* directly out of the objects themselves.
*/
-static void
-xlog_cil_format_items(
- struct log *log,
- struct xfs_log_vec *log_vector)
+static struct xfs_log_vec *
+xlog_cil_prepare_log_vecs(
+ struct xfs_trans *tp)
{
- struct xfs_log_vec *lv;
+ struct xfs_log_item_desc *lidp;
+ struct xfs_log_vec *lv = NULL;
+ struct xfs_log_vec *ret_lv = NULL;
- ASSERT(log_vector);
- for (lv = log_vector; lv; lv = lv->lv_next) {
+
+ /* Bail out if we didn't find a log item. */
+ if (list_empty(&tp->t_items)) {
+ ASSERT(0);
+ return NULL;
+ }
+
+ list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+ struct xfs_log_vec *new_lv;
void *ptr;
int index;
int len = 0;
+ uint niovecs;
+
+ /* Skip items which aren't dirty in this transaction. */
+ if (!(lidp->lid_flags & XFS_LID_DIRTY))
+ continue;
+
+ /* Skip items that do not have any vectors for writing */
+ niovecs = IOP_SIZE(lidp->lid_item);
+ if (!niovecs)
+ continue;
+
+ new_lv = kmem_zalloc(sizeof(*new_lv) +
+ niovecs * sizeof(struct xfs_log_iovec),
+ KM_SLEEP);
+
+ /* The allocated iovec region lies beyond the log vector. */
+ new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
+ new_lv->lv_niovecs = niovecs;
+ new_lv->lv_item = lidp->lid_item;
/* build the vector array and calculate it's length */
- IOP_FORMAT(lv->lv_item, lv->lv_iovecp);
- for (index = 0; index < lv->lv_niovecs; index++)
- len += lv->lv_iovecp[index].i_len;
+ IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp);
+ for (index = 0; index < new_lv->lv_niovecs; index++)
+ len += new_lv->lv_iovecp[index].i_len;
- lv->lv_buf_len = len;
- lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
- ptr = lv->lv_buf;
+ new_lv->lv_buf_len = len;
+ new_lv->lv_buf = kmem_alloc(new_lv->lv_buf_len,
+ KM_SLEEP|KM_NOFS);
+ ptr = new_lv->lv_buf;
- for (index = 0; index < lv->lv_niovecs; index++) {
- struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
+ for (index = 0; index < new_lv->lv_niovecs; index++) {
+ struct xfs_log_iovec *vec = &new_lv->lv_iovecp[index];
memcpy(ptr, vec->i_addr, vec->i_len);
vec->i_addr = ptr;
ptr += vec->i_len;
}
- ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
+ ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len);
+
+ if (!ret_lv)
+ ret_lv = new_lv;
+ else
+ lv->lv_next = new_lv;
+ lv = new_lv;
}
+
+ return ret_lv;
}
/*
@@ -212,7 +182,7 @@ xlog_cil_format_items(
*/
STATIC void
xfs_cil_prepare_item(
- struct log *log,
+ struct xlog *log,
struct xfs_log_vec *lv,
int *len,
int *diff_iovecs)
@@ -256,12 +226,12 @@ xfs_cil_prepare_item(
* Insert the log items into the CIL and calculate the difference in space
* consumed by the item. Add the space to the checkpoint ticket and calculate
* if the change requires additional log metadata. If it does, take that space
- * as well. Remove the amount of space we addded to the checkpoint ticket from
+ * as well. Remove the amount of space we added to the checkpoint ticket from
* the current transaction ticket so that the accounting works out correctly.
*/
static void
xlog_cil_insert_items(
- struct log *log,
+ struct xlog *log,
struct xfs_log_vec *log_vector,
struct xlog_ticket *ticket)
{
@@ -367,8 +337,8 @@ xlog_cil_committed(
xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
ctx->start_lsn, abort);
- xfs_alloc_busy_sort(&ctx->busy_extents);
- xfs_alloc_busy_clear(mp, &ctx->busy_extents,
+ xfs_extent_busy_sort(&ctx->busy_extents);
+ xfs_extent_busy_clear(mp, &ctx->busy_extents,
(mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
spin_lock(&ctx->cil->xc_cil_lock);
@@ -381,7 +351,7 @@ xlog_cil_committed(
ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
xfs_discard_extents(mp, &ctx->busy_extents);
- xfs_alloc_busy_clear(mp, &ctx->busy_extents, false);
+ xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
}
kmem_free(ctx);
@@ -403,8 +373,7 @@ xlog_cil_committed(
*/
STATIC int
xlog_cil_push(
- struct log *log,
- xfs_lsn_t push_seq)
+ struct xlog *log)
{
struct xfs_cil *cil = log->l_cilp;
struct xfs_log_vec *lv;
@@ -420,39 +389,36 @@ xlog_cil_push(
struct xfs_log_iovec lhdr;
struct xfs_log_vec lvhdr = { NULL };
xfs_lsn_t commit_lsn;
+ xfs_lsn_t push_seq;
if (!cil)
return 0;
- ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence);
-
new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
new_ctx->ticket = xlog_cil_ticket_alloc(log);
- /*
- * Lock out transaction commit, but don't block for background pushes
- * unless we are well over the CIL space limit. See the definition of
- * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic
- * used here.
- */
- if (!down_write_trylock(&cil->xc_ctx_lock)) {
- if (!push_seq &&
- cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log))
- goto out_free_ticket;
- down_write(&cil->xc_ctx_lock);
- }
+ down_write(&cil->xc_ctx_lock);
ctx = cil->xc_ctx;
- /* check if we've anything to push */
- if (list_empty(&cil->xc_cil))
- goto out_skip;
+ spin_lock(&cil->xc_cil_lock);
+ push_seq = cil->xc_push_seq;
+ ASSERT(push_seq <= ctx->sequence);
- /* check for spurious background flush */
- if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+ /*
+ * Check if we've anything to push. If there is nothing, then we don't
+ * move on to a new sequence number and so we have to be able to push
+ * this sequence again later.
+ */
+ if (list_empty(&cil->xc_cil)) {
+ cil->xc_push_seq = 0;
+ spin_unlock(&cil->xc_cil_lock);
goto out_skip;
+ }
+ spin_unlock(&cil->xc_cil_lock);
+
/* check for a previously pushed seqeunce */
- if (push_seq && push_seq < cil->xc_ctx->sequence)
+ if (push_seq < cil->xc_ctx->sequence)
goto out_skip;
/*
@@ -606,7 +572,6 @@ restart:
out_skip:
up_write(&cil->xc_ctx_lock);
-out_free_ticket:
xfs_log_ticket_put(new_ctx->ticket);
kmem_free(new_ctx);
return 0;
@@ -618,6 +583,82 @@ out_abort:
return XFS_ERROR(EIO);
}
+static void
+xlog_cil_push_work(
+ struct work_struct *work)
+{
+ struct xfs_cil *cil = container_of(work, struct xfs_cil,
+ xc_push_work);
+ xlog_cil_push(cil->xc_log);
+}
+
+/*
+ * We need to push CIL every so often so we don't cache more than we can fit in
+ * the log. The limit really is that a checkpoint can't be more than half the
+ * log (the current checkpoint is not allowed to overwrite the previous
+ * checkpoint), but commit latency and memory usage limit this to a smaller
+ * size.
+ */
+static void
+xlog_cil_push_background(
+ struct xlog *log)
+{
+ struct xfs_cil *cil = log->l_cilp;
+
+ /*
+ * The cil won't be empty because we are called while holding the
+ * context lock so whatever we added to the CIL will still be there
+ */
+ ASSERT(!list_empty(&cil->xc_cil));
+
+ /*
+ * don't do a background push if we haven't used up all the
+ * space available yet.
+ */
+ if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+ return;
+
+ spin_lock(&cil->xc_cil_lock);
+ if (cil->xc_push_seq < cil->xc_current_sequence) {
+ cil->xc_push_seq = cil->xc_current_sequence;
+ queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
+ }
+ spin_unlock(&cil->xc_cil_lock);
+
+}
+
+static void
+xlog_cil_push_foreground(
+ struct xlog *log,
+ xfs_lsn_t push_seq)
+{
+ struct xfs_cil *cil = log->l_cilp;
+
+ if (!cil)
+ return;
+
+ ASSERT(push_seq && push_seq <= cil->xc_current_sequence);
+
+ /* start on any pending background push to minimise wait time on it */
+ flush_work(&cil->xc_push_work);
+
+ /*
+ * If the CIL is empty or we've already pushed the sequence then
+ * there's no work we need to do.
+ */
+ spin_lock(&cil->xc_cil_lock);
+ if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
+ spin_unlock(&cil->xc_cil_lock);
+ return;
+ }
+
+ cil->xc_push_seq = push_seq;
+ spin_unlock(&cil->xc_cil_lock);
+
+ /* do the push now */
+ xlog_cil_push(log);
+}
+
/*
* Commit a transaction with the given vector to the Committed Item List.
*
@@ -635,28 +676,29 @@ out_abort:
* background commit, returns without it held once background commits are
* allowed again.
*/
-void
+int
xfs_log_commit_cil(
struct xfs_mount *mp,
struct xfs_trans *tp,
- struct xfs_log_vec *log_vector,
xfs_lsn_t *commit_lsn,
int flags)
{
- struct log *log = mp->m_log;
+ struct xlog *log = mp->m_log;
int log_flags = 0;
- int push = 0;
+ struct xfs_log_vec *log_vector;
if (flags & XFS_TRANS_RELEASE_LOG_RES)
log_flags = XFS_LOG_REL_PERM_RESERV;
/*
- * do all the hard work of formatting items (including memory
+ * Do all the hard work of formatting items (including memory
* allocation) outside the CIL context lock. This prevents stalling CIL
* pushes when we are low on memory and a transaction commit spends a
* lot of time in memory reclaim.
*/
- xlog_cil_format_items(log, log_vector);
+ log_vector = xlog_cil_prepare_log_vecs(tp);
+ if (!log_vector)
+ return ENOMEM;
/* lock out background commit */
down_read(&log->l_cilp->xc_ctx_lock);
@@ -694,21 +736,10 @@ xfs_log_commit_cil(
*/
xfs_trans_free_items(tp, *commit_lsn, 0);
- /* check for background commit before unlock */
- if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
- push = 1;
+ xlog_cil_push_background(log);
up_read(&log->l_cilp->xc_ctx_lock);
-
- /*
- * We need to push CIL every so often so we don't cache more than we
- * can fit in the log. The limit really is that a checkpoint can't be
- * more than half the log (the current checkpoint is not allowed to
- * overwrite the previous checkpoint), but commit latency and memory
- * usage limit this to a smaller size in most cases.
- */
- if (push)
- xlog_cil_push(log, 0);
+ return 0;
}
/*
@@ -720,13 +751,10 @@ xfs_log_commit_cil(
*
* We return the current commit lsn to allow the callers to determine if a
* iclog flush is necessary following this call.
- *
- * XXX: Initially, just push the CIL unconditionally and return whatever
- * commit lsn is there. It'll be empty, so this is broken for now.
*/
xfs_lsn_t
xlog_cil_force_lsn(
- struct log *log,
+ struct xlog *log,
xfs_lsn_t sequence)
{
struct xfs_cil *cil = log->l_cilp;
@@ -740,8 +768,7 @@ xlog_cil_force_lsn(
* xlog_cil_push() handles racing pushes for the same sequence,
* so no need to deal with it here.
*/
- if (sequence == cil->xc_current_sequence)
- xlog_cil_push(log, sequence);
+ xlog_cil_push_foreground(log, sequence);
/*
* See if we can find a previous sequence still committing.
@@ -786,8 +813,6 @@ xfs_log_item_in_current_chkpt(
{
struct xfs_cil_ctx *ctx;
- if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG))
- return false;
if (list_empty(&lip->li_cil))
return false;
@@ -802,3 +827,57 @@ xfs_log_item_in_current_chkpt(
return false;
return true;
}
+
+/*
+ * Perform initial CIL structure initialisation.
+ */
+int
+xlog_cil_init(
+ struct xlog *log)
+{
+ struct xfs_cil *cil;
+ struct xfs_cil_ctx *ctx;
+
+ cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
+ if (!cil)
+ return ENOMEM;
+
+ ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
+ if (!ctx) {
+ kmem_free(cil);
+ return ENOMEM;
+ }
+
+ INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
+ INIT_LIST_HEAD(&cil->xc_cil);
+ INIT_LIST_HEAD(&cil->xc_committing);
+ spin_lock_init(&cil->xc_cil_lock);
+ init_rwsem(&cil->xc_ctx_lock);
+ init_waitqueue_head(&cil->xc_commit_wait);
+
+ INIT_LIST_HEAD(&ctx->committing);
+ INIT_LIST_HEAD(&ctx->busy_extents);
+ ctx->sequence = 1;
+ ctx->cil = cil;
+ cil->xc_ctx = ctx;
+ cil->xc_current_sequence = ctx->sequence;
+
+ cil->xc_log = log;
+ log->l_cilp = cil;
+ return 0;
+}
+
+void
+xlog_cil_destroy(
+ struct xlog *log)
+{
+ if (log->l_cilp->xc_ctx) {
+ if (log->l_cilp->xc_ctx->ticket)
+ xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
+ kmem_free(log->l_cilp->xc_ctx);
+ }
+
+ ASSERT(list_empty(&log->l_cilp->xc_cil));
+ kmem_free(log->l_cilp);
+}
+
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 2d3b6a498d6..72eba2201b1 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -19,7 +19,7 @@
#define __XFS_LOG_PRIV_H__
struct xfs_buf;
-struct log;
+struct xlog;
struct xlog_ticket;
struct xfs_mount;
@@ -239,8 +239,8 @@ typedef struct xlog_res {
} xlog_res_t;
typedef struct xlog_ticket {
- wait_queue_head_t t_wait; /* ticket wait queue */
struct list_head t_queue; /* reserve/write queue */
+ struct task_struct *t_task; /* task that owns this ticket */
xlog_tid_t t_tid; /* transaction identifier : 4 */
atomic_t t_ref; /* ticket reference count : 4 */
int t_curr_res; /* current reservation in bytes : 4 */
@@ -352,7 +352,7 @@ typedef struct xlog_in_core {
struct xlog_in_core *ic_next;
struct xlog_in_core *ic_prev;
struct xfs_buf *ic_bp;
- struct log *ic_log;
+ struct xlog *ic_log;
int ic_size;
int ic_offset;
int ic_bwritecnt;
@@ -409,7 +409,7 @@ struct xfs_cil_ctx {
* operations almost as efficient as the old logging methods.
*/
struct xfs_cil {
- struct log *xc_log;
+ struct xlog *xc_log;
struct list_head xc_cil;
spinlock_t xc_cil_lock;
struct xfs_cil_ctx *xc_ctx;
@@ -417,6 +417,8 @@ struct xfs_cil {
struct list_head xc_committing;
wait_queue_head_t xc_commit_wait;
xfs_lsn_t xc_current_sequence;
+ struct work_struct xc_push_work;
+ xfs_lsn_t xc_push_seq;
};
/*
@@ -470,12 +472,22 @@ struct xfs_cil {
#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4))
/*
+ * ticket grant locks, queues and accounting have their own cachlines
+ * as these are quite hot and can be operated on concurrently.
+ */
+struct xlog_grant_head {
+ spinlock_t lock ____cacheline_aligned_in_smp;
+ struct list_head waiters;
+ atomic64_t grant;
+};
+
+/*
* The reservation head lsn is not made up of a cycle number and block number.
* Instead, it uses a cycle number and byte number. Logs don't expect to
* overflow 31 bits worth of byte offset, so using a byte number will mean
* that round off problems won't occur when releasing partial reservations.
*/
-typedef struct log {
+typedef struct xlog {
/* The following fields don't need locking */
struct xfs_mount *l_mp; /* mount point */
struct xfs_ail *l_ailp; /* AIL log is working with */
@@ -520,17 +532,8 @@ typedef struct log {
/* lsn of 1st LR with unflushed * buffers */
atomic64_t l_tail_lsn ____cacheline_aligned_in_smp;
- /*
- * ticket grant locks, queues and accounting have their own cachlines
- * as these are quite hot and can be operated on concurrently.
- */
- spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp;
- struct list_head l_reserveq;
- atomic64_t l_grant_reserve_head;
-
- spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp;
- struct list_head l_writeq;
- atomic64_t l_grant_write_head;
+ struct xlog_grant_head l_reserve_head;
+ struct xlog_grant_head l_write_head;
/* The following field are used for debugging; need to hold icloglock */
#ifdef DEBUG
@@ -545,15 +548,19 @@ typedef struct log {
#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
/* common routines */
-extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
extern int xlog_recover(xlog_t *log);
extern int xlog_recover_finish(xlog_t *log);
extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
extern kmem_zone_t *xfs_log_ticket_zone;
-struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
- int count, char client, uint xflags,
- int alloc_flags);
+struct xlog_ticket *
+xlog_ticket_alloc(
+ struct xlog *log,
+ int unit_bytes,
+ int count,
+ char client,
+ bool permanent,
+ xfs_km_flags_t alloc_flags);
static inline void
@@ -565,9 +572,14 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
}
void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
-int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
- struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
- xlog_in_core_t **commit_iclog, uint flags);
+int
+xlog_write(
+ struct xlog *log,
+ struct xfs_log_vec *log_vector,
+ struct xlog_ticket *tic,
+ xfs_lsn_t *start_lsn,
+ struct xlog_in_core **commit_iclog,
+ uint flags);
/*
* When we crack an atomic LSN, we sample it first so that the value will not
@@ -627,17 +639,23 @@ xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
/*
* Committed Item List interfaces
*/
-int xlog_cil_init(struct log *log);
-void xlog_cil_init_post_recovery(struct log *log);
-void xlog_cil_destroy(struct log *log);
+int
+xlog_cil_init(struct xlog *log);
+void
+xlog_cil_init_post_recovery(struct xlog *log);
+void
+xlog_cil_destroy(struct xlog *log);
/*
* CIL force routines
*/
-xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence);
+xfs_lsn_t
+xlog_cil_force_lsn(
+ struct xlog *log,
+ xfs_lsn_t sequence);
static inline void
-xlog_cil_force(struct log *log)
+xlog_cil_force(struct xlog *log)
{
xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
}
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 541a508adea..a7be98abd6a 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -40,7 +40,6 @@
#include "xfs_extfree_item.h"
#include "xfs_trans_priv.h"
#include "xfs_quota.h"
-#include "xfs_rw.h"
#include "xfs_utils.h"
#include "xfs_trace.h"
@@ -120,7 +119,7 @@ xlog_get_bp(
nbblks += log->l_sectBBsize;
nbblks = round_up(nbblks, log->l_sectBBsize);
- bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, BBTOB(nbblks), 0);
+ bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0);
if (bp)
xfs_buf_unlock(bp);
return bp;
@@ -146,7 +145,7 @@ xlog_align(
{
xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
- ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp));
+ ASSERT(offset + nbblks <= bp->b_length);
return bp->b_addr + BBTOB(offset);
}
@@ -174,11 +173,12 @@ xlog_bread_noalign(
nbblks = round_up(nbblks, log->l_sectBBsize);
ASSERT(nbblks > 0);
- ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
+ ASSERT(nbblks <= bp->b_length);
XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
XFS_BUF_READ(bp);
- XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
+ bp->b_io_length = nbblks;
+ bp->b_error = 0;
xfsbdstrat(log->l_mp, bp);
error = xfs_buf_iowait(bp);
@@ -218,7 +218,7 @@ xlog_bread_offset(
xfs_caddr_t offset)
{
xfs_caddr_t orig_offset = bp->b_addr;
- int orig_len = bp->b_buffer_length;
+ int orig_len = BBTOB(bp->b_length);
int error, error2;
error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
@@ -259,13 +259,14 @@ xlog_bwrite(
nbblks = round_up(nbblks, log->l_sectBBsize);
ASSERT(nbblks > 0);
- ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
+ ASSERT(nbblks <= bp->b_length);
XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
XFS_BUF_ZEROFLAGS(bp);
xfs_buf_hold(bp);
xfs_buf_lock(bp);
- XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
+ bp->b_io_length = nbblks;
+ bp->b_error = 0;
error = xfs_bwrite(bp);
if (error)
@@ -440,6 +441,8 @@ xlog_find_verify_cycle(
* a log sector, or we're out of luck.
*/
bufblks = 1 << ffs(nbblks);
+ while (bufblks > log->l_logBBsize)
+ bufblks >>= 1;
while (!(bp = xlog_get_bp(log, bufblks))) {
bufblks >>= 1;
if (bufblks < log->l_sectBBsize)
@@ -965,9 +968,9 @@ xlog_find_tail(
log->l_curr_cycle++;
atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
- xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle,
+ xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
BBTOB(log->l_curr_block));
- xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle,
+ xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
BBTOB(log->l_curr_block));
/*
@@ -1225,6 +1228,8 @@ xlog_write_log_records(
* log sector, or we're out of luck.
*/
bufblks = 1 << ffs(blocks);
+ while (bufblks > log->l_logBBsize)
+ bufblks >>= 1;
while (!(bp = xlog_get_bp(log, bufblks))) {
bufblks >>= 1;
if (bufblks < sectbb)
@@ -1466,8 +1471,8 @@ xlog_recover_add_item(
STATIC int
xlog_recover_add_to_cont_trans(
- struct log *log,
- xlog_recover_t *trans,
+ struct xlog *log,
+ struct xlog_recover *trans,
xfs_caddr_t dp,
int len)
{
@@ -1489,7 +1494,7 @@ xlog_recover_add_to_cont_trans(
old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
old_len = item->ri_buf[item->ri_cnt-1].i_len;
- ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u);
+ ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
memcpy(&ptr[old_len], dp, len); /* d, s, l */
item->ri_buf[item->ri_cnt-1].i_len += len;
item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -1512,8 +1517,8 @@ xlog_recover_add_to_cont_trans(
*/
STATIC int
xlog_recover_add_to_trans(
- struct log *log,
- xlog_recover_t *trans,
+ struct xlog *log,
+ struct xlog_recover *trans,
xfs_caddr_t dp,
int len)
{
@@ -1583,8 +1588,8 @@ xlog_recover_add_to_trans(
*/
STATIC int
xlog_recover_reorder_trans(
- struct log *log,
- xlog_recover_t *trans,
+ struct xlog *log,
+ struct xlog_recover *trans,
int pass)
{
xlog_recover_item_t *item, *n;
@@ -1637,8 +1642,8 @@ xlog_recover_reorder_trans(
*/
STATIC int
xlog_recover_buffer_pass1(
- struct log *log,
- xlog_recover_item_t *item)
+ struct xlog *log,
+ struct xlog_recover_item *item)
{
xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
struct list_head *bucket;
@@ -1691,7 +1696,7 @@ xlog_recover_buffer_pass1(
*/
STATIC int
xlog_check_buffer_cancelled(
- struct log *log,
+ struct xlog *log,
xfs_daddr_t blkno,
uint len,
ushort flags)
@@ -1772,7 +1777,7 @@ xlog_recover_do_inode_buffer(
trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
- inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
+ inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
for (i = 0; i < inodes_per_buf; i++) {
next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
offsetof(xfs_dinode_t, di_next_unlinked);
@@ -1814,7 +1819,8 @@ xlog_recover_do_inode_buffer(
ASSERT(item->ri_buf[item_index].i_addr != NULL);
ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
- ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
+ ASSERT((reg_buf_offset + reg_buf_bytes) <=
+ BBTOB(bp->b_io_length));
/*
* The current logged region contains a copy of the
@@ -1873,8 +1879,8 @@ xlog_recover_do_reg_buffer(
ASSERT(nbits > 0);
ASSERT(item->ri_buf[i].i_addr != NULL);
ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
- ASSERT(XFS_BUF_COUNT(bp) >=
- ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT));
+ ASSERT(BBTOB(bp->b_io_length) >=
+ ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
/*
* Do a sanity check if this is a dquot buffer. Just checking
@@ -1981,7 +1987,7 @@ xfs_qm_dqcheck(
if (!errs && ddq->d_id) {
if (ddq->d_blk_softlimit &&
- be64_to_cpu(ddq->d_bcount) >=
+ be64_to_cpu(ddq->d_bcount) >
be64_to_cpu(ddq->d_blk_softlimit)) {
if (!ddq->d_btimer) {
if (flags & XFS_QMOPT_DOWARN)
@@ -1992,7 +1998,7 @@ xfs_qm_dqcheck(
}
}
if (ddq->d_ino_softlimit &&
- be64_to_cpu(ddq->d_icount) >=
+ be64_to_cpu(ddq->d_icount) >
be64_to_cpu(ddq->d_ino_softlimit)) {
if (!ddq->d_itimer) {
if (flags & XFS_QMOPT_DOWARN)
@@ -2003,7 +2009,7 @@ xfs_qm_dqcheck(
}
}
if (ddq->d_rtb_softlimit &&
- be64_to_cpu(ddq->d_rtbcount) >=
+ be64_to_cpu(ddq->d_rtbcount) >
be64_to_cpu(ddq->d_rtb_softlimit)) {
if (!ddq->d_rtbtimer) {
if (flags & XFS_QMOPT_DOWARN)
@@ -2103,6 +2109,7 @@ xlog_recover_do_dquot_buffer(
STATIC int
xlog_recover_buffer_pass2(
xlog_t *log,
+ struct list_head *buffer_list,
xlog_recover_item_t *item)
{
xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
@@ -2123,9 +2130,9 @@ xlog_recover_buffer_pass2(
trace_xfs_log_recover_buf_recover(log, buf_f);
- buf_flags = XBF_LOCK;
- if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF))
- buf_flags |= XBF_MAPPED;
+ buf_flags = 0;
+ if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
+ buf_flags |= XBF_UNMAPPED;
bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
buf_flags);
@@ -2166,14 +2173,14 @@ xlog_recover_buffer_pass2(
*/
if (XFS_DINODE_MAGIC ==
be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
- (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
+ (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize,
(__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
xfs_buf_stale(bp);
error = xfs_bwrite(bp);
} else {
ASSERT(bp->b_target->bt_mount == mp);
bp->b_iodone = xlog_recover_iodone;
- xfs_buf_delwri_queue(bp);
+ xfs_buf_delwri_queue(bp, buffer_list);
}
xfs_buf_relse(bp);
@@ -2183,6 +2190,7 @@ xlog_recover_buffer_pass2(
STATIC int
xlog_recover_inode_pass2(
xlog_t *log,
+ struct list_head *buffer_list,
xlog_recover_item_t *item)
{
xfs_inode_log_format_t *in_f;
@@ -2220,8 +2228,7 @@ xlog_recover_inode_pass2(
}
trace_xfs_log_recover_inode_recover(log, in_f);
- bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
- XBF_LOCK);
+ bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0);
if (!bp) {
error = ENOMEM;
goto error;
@@ -2436,7 +2443,7 @@ xlog_recover_inode_pass2(
write_inode_buffer:
ASSERT(bp->b_target->bt_mount == mp);
bp->b_iodone = xlog_recover_iodone;
- xfs_buf_delwri_queue(bp);
+ xfs_buf_delwri_queue(bp, buffer_list);
xfs_buf_relse(bp);
error:
if (need_free)
@@ -2477,6 +2484,7 @@ xlog_recover_quotaoff_pass1(
STATIC int
xlog_recover_dquot_pass2(
xlog_t *log,
+ struct list_head *buffer_list,
xlog_recover_item_t *item)
{
xfs_mount_t *mp = log->l_mp;
@@ -2530,14 +2538,11 @@ xlog_recover_dquot_pass2(
return XFS_ERROR(EIO);
ASSERT(dq_f->qlf_len == 1);
- error = xfs_read_buf(mp, mp->m_ddev_targp,
- dq_f->qlf_blkno,
- XFS_FSB_TO_BB(mp, dq_f->qlf_len),
- 0, &bp);
- if (error) {
- xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#3)");
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
+ XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp);
+ if (error)
return error;
- }
+
ASSERT(bp);
ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
@@ -2558,7 +2563,7 @@ xlog_recover_dquot_pass2(
ASSERT(dq_f->qlf_size == 2);
ASSERT(bp->b_target->bt_mount == mp);
bp->b_iodone = xlog_recover_iodone;
- xfs_buf_delwri_queue(bp);
+ xfs_buf_delwri_queue(bp, buffer_list);
xfs_buf_relse(bp);
return (0);
@@ -2642,7 +2647,8 @@ xlog_recover_efd_pass2(
* xfs_trans_ail_delete() drops the
* AIL lock.
*/
- xfs_trans_ail_delete(ailp, lip);
+ xfs_trans_ail_delete(ailp, lip,
+ SHUTDOWN_CORRUPT_INCORE);
xfs_efi_item_free(efip);
spin_lock(&ailp->xa_lock);
break;
@@ -2683,9 +2689,9 @@ xlog_recover_free_trans(
STATIC int
xlog_recover_commit_pass1(
- struct log *log,
- struct xlog_recover *trans,
- xlog_recover_item_t *item)
+ struct xlog *log,
+ struct xlog_recover *trans,
+ struct xlog_recover_item *item)
{
trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
@@ -2710,23 +2716,24 @@ xlog_recover_commit_pass1(
STATIC int
xlog_recover_commit_pass2(
- struct log *log,
- struct xlog_recover *trans,
- xlog_recover_item_t *item)
+ struct xlog *log,
+ struct xlog_recover *trans,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item)
{
trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
switch (ITEM_TYPE(item)) {
case XFS_LI_BUF:
- return xlog_recover_buffer_pass2(log, item);
+ return xlog_recover_buffer_pass2(log, buffer_list, item);
case XFS_LI_INODE:
- return xlog_recover_inode_pass2(log, item);
+ return xlog_recover_inode_pass2(log, buffer_list, item);
case XFS_LI_EFI:
return xlog_recover_efi_pass2(log, item, trans->r_lsn);
case XFS_LI_EFD:
return xlog_recover_efd_pass2(log, item);
case XFS_LI_DQUOT:
- return xlog_recover_dquot_pass2(log, item);
+ return xlog_recover_dquot_pass2(log, buffer_list, item);
case XFS_LI_QUOTAOFF:
/* nothing to do in pass2 */
return 0;
@@ -2746,12 +2753,13 @@ xlog_recover_commit_pass2(
*/
STATIC int
xlog_recover_commit_trans(
- struct log *log,
+ struct xlog *log,
struct xlog_recover *trans,
int pass)
{
- int error = 0;
+ int error = 0, error2;
xlog_recover_item_t *item;
+ LIST_HEAD (buffer_list);
hlist_del(&trans->r_list);
@@ -2760,22 +2768,33 @@ xlog_recover_commit_trans(
return error;
list_for_each_entry(item, &trans->r_itemq, ri_list) {
- if (pass == XLOG_RECOVER_PASS1)
+ switch (pass) {
+ case XLOG_RECOVER_PASS1:
error = xlog_recover_commit_pass1(log, trans, item);
- else
- error = xlog_recover_commit_pass2(log, trans, item);
+ break;
+ case XLOG_RECOVER_PASS2:
+ error = xlog_recover_commit_pass2(log, trans,
+ &buffer_list, item);
+ break;
+ default:
+ ASSERT(0);
+ }
+
if (error)
- return error;
+ goto out;
}
xlog_recover_free_trans(trans);
- return 0;
+
+out:
+ error2 = xfs_buf_delwri_submit(&buffer_list);
+ return error ? error : error2;
}
STATIC int
xlog_recover_unmount_trans(
- struct log *log,
- xlog_recover_t *trans)
+ struct xlog *log,
+ struct xlog_recover *trans)
{
/* Do nothing now */
xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
@@ -3079,7 +3098,7 @@ xlog_recover_process_one_iunlink(
/*
* Get the on disk inode to find the next inode in the bucket.
*/
- error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK);
+ error = xfs_itobp(mp, NULL, ip, &dip, &ibp, 0);
if (error)
goto fail_iput;
@@ -3161,37 +3180,26 @@ xlog_recover_process_iunlinks(
*/
continue;
}
+ /*
+ * Unlock the buffer so that it can be acquired in the normal
+ * course of the transaction to truncate and free each inode.
+ * Because we are not racing with anyone else here for the AGI
+ * buffer, we don't even need to hold it locked to read the
+ * initial unlinked bucket entries out of the buffer. We keep
+ * buffer reference though, so that it stays pinned in memory
+ * while we need the buffer.
+ */
agi = XFS_BUF_TO_AGI(agibp);
+ xfs_buf_unlock(agibp);
for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
agino = be32_to_cpu(agi->agi_unlinked[bucket]);
while (agino != NULLAGINO) {
- /*
- * Release the agi buffer so that it can
- * be acquired in the normal course of the
- * transaction to truncate and free the inode.
- */
- xfs_buf_relse(agibp);
-
agino = xlog_recover_process_one_iunlink(mp,
agno, agino, bucket);
-
- /*
- * Reacquire the agibuffer and continue around
- * the loop. This should never fail as we know
- * the buffer was good earlier on.
- */
- error = xfs_read_agi(mp, NULL, agno, &agibp);
- ASSERT(error == 0);
- agi = XFS_BUF_TO_AGI(agibp);
}
}
-
- /*
- * Release the buffer for the current agi so we can
- * go on to the next one.
- */
- xfs_buf_relse(agibp);
+ xfs_buf_rele(agibp);
}
mp->m_dmevmask = mp_dmevmask;
@@ -3650,11 +3658,8 @@ xlog_do_recover(
* First replay the images in the log.
*/
error = xlog_do_log_recovery(log, head_blk, tail_blk);
- if (error) {
+ if (error)
return error;
- }
-
- xfs_flush_buftarg(log->l_mp->m_ddev_targp, 1);
/*
* If IO errors happened during recovery, bail out.
@@ -3681,7 +3686,6 @@ xlog_do_recover(
bp = xfs_getsb(log->l_mp, 0);
XFS_BUF_UNDONE(bp);
ASSERT(!(XFS_BUF_ISWRITE(bp)));
- ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
XFS_BUF_READ(bp);
XFS_BUF_UNASYNC(bp);
xfsbdstrat(log->l_mp, bp);
@@ -3695,7 +3699,7 @@ xlog_do_recover(
/* Convert superblock from on-disk format */
sbp = &log->l_mp->m_sb;
- xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
+ xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp));
ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
ASSERT(xfs_sb_good_version(sbp));
xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index bd672def95a..331cd9f83a7 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -19,7 +19,6 @@
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d06afbc3540..536021fb3d4 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -22,6 +22,7 @@
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
@@ -37,7 +38,6 @@
#include "xfs_rtalloc.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
-#include "xfs_rw.h"
#include "xfs_quota.h"
#include "xfs_fsops.h"
#include "xfs_utils.h"
@@ -158,7 +158,7 @@ xfs_uuid_mount(
out_duplicate:
mutex_unlock(&xfs_uuid_table_mutex);
- xfs_warn(mp, "Filesystem has duplicate UUID - can't mount");
+ xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid);
return XFS_ERROR(EINVAL);
}
@@ -553,9 +553,11 @@ out_unwind:
void
xfs_sb_from_disk(
- xfs_sb_t *to,
+ struct xfs_mount *mp,
xfs_dsb_t *from)
{
+ struct xfs_sb *to = &mp->m_sb;
+
to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
@@ -681,8 +683,8 @@ xfs_readsb(xfs_mount_t *mp, int flags)
sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
reread:
- bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
- XFS_SB_DADDR, sector_size, 0);
+ bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
+ BTOBB(sector_size), 0);
if (!bp) {
if (loud)
xfs_warn(mp, "SB buffer read failed");
@@ -693,7 +695,7 @@ reread:
* Initialize the mount structure from the superblock.
* But first do some basic consistency checking.
*/
- xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
+ xfs_sb_from_disk(mp, XFS_BUF_TO_SBP(bp));
error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
if (error) {
if (loud)
@@ -1030,9 +1032,9 @@ xfs_check_sizes(xfs_mount_t *mp)
xfs_warn(mp, "filesystem size mismatch detected");
return XFS_ERROR(EFBIG);
}
- bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
+ bp = xfs_buf_read_uncached(mp->m_ddev_targp,
d - XFS_FSS_TO_BB(mp, 1),
- BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
+ XFS_FSS_TO_BB(mp, 1), 0);
if (!bp) {
xfs_warn(mp, "last sector read failed");
return EIO;
@@ -1045,9 +1047,9 @@ xfs_check_sizes(xfs_mount_t *mp)
xfs_warn(mp, "log size mismatch detected");
return XFS_ERROR(EFBIG);
}
- bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp,
+ bp = xfs_buf_read_uncached(mp->m_logdev_targp,
d - XFS_FSB_TO_BB(mp, 1),
- XFS_FSB_TO_B(mp, 1), 0);
+ XFS_FSB_TO_BB(mp, 1), 0);
if (!bp) {
xfs_warn(mp, "log device read failed");
return EIO;
@@ -1286,7 +1288,7 @@ xfs_mountfs(
XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
if (error) {
xfs_warn(mp, "log mount failed");
- goto out_free_perag;
+ goto out_fail_wait;
}
/*
@@ -1313,7 +1315,7 @@ xfs_mountfs(
!mp->m_sb.sb_inprogress) {
error = xfs_initialize_perag_data(mp, sbp->sb_agcount);
if (error)
- goto out_free_perag;
+ goto out_fail_wait;
}
/*
@@ -1437,6 +1439,10 @@ xfs_mountfs(
IRELE(rip);
out_log_dealloc:
xfs_log_unmount(mp);
+ out_fail_wait:
+ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
+ xfs_wait_buftarg(mp->m_logdev_targp);
+ xfs_wait_buftarg(mp->m_ddev_targp);
out_free_perag:
xfs_free_perag(mp);
out_remove_uuid:
@@ -1473,15 +1479,15 @@ xfs_unmountfs(
xfs_log_force(mp, XFS_LOG_SYNC);
/*
- * Do a delwri reclaim pass first so that as many dirty inodes are
- * queued up for IO as possible. Then flush the buffers before making
- * a synchronous path to catch all the remaining inodes are reclaimed.
- * This makes the reclaim process as quick as possible by avoiding
- * synchronous writeout and blocking on inodes already in the delwri
- * state as much as possible.
+ * Flush all pending changes from the AIL.
+ */
+ xfs_ail_push_all_sync(mp->m_ail);
+
+ /*
+ * And reclaim all inodes. At this point there should be no dirty
+ * inode, and none should be pinned or locked, but use synchronous
+ * reclaim just to be sure.
*/
- xfs_reclaim_inodes(mp, 0);
- xfs_flush_buftarg(mp->m_ddev_targp, 1);
xfs_reclaim_inodes(mp, SYNC_WAIT);
xfs_qm_unmount(mp);
@@ -1517,15 +1523,12 @@ xfs_unmountfs(
if (error)
xfs_warn(mp, "Unable to update superblock counters. "
"Freespace may not be correct on next mount.");
- xfs_unmountfs_writesb(mp);
/*
- * Make sure all buffers have been flushed and completed before
- * unmounting the log.
+ * At this point we might have modified the superblock again and thus
+ * added an item to the AIL, thus flush it again.
*/
- error = xfs_flush_buftarg(mp->m_ddev_targp, 1);
- if (error)
- xfs_warn(mp, "%d busy buffers during unmount.", error);
+ xfs_ail_push_all_sync(mp->m_ail);
xfs_wait_buftarg(mp->m_ddev_targp);
xfs_log_unmount_write(mp);
@@ -1586,36 +1589,6 @@ xfs_log_sbcount(xfs_mount_t *mp)
return error;
}
-int
-xfs_unmountfs_writesb(xfs_mount_t *mp)
-{
- xfs_buf_t *sbp;
- int error = 0;
-
- /*
- * skip superblock write if fs is read-only, or
- * if we are doing a forced umount.
- */
- if (!((mp->m_flags & XFS_MOUNT_RDONLY) ||
- XFS_FORCED_SHUTDOWN(mp))) {
-
- sbp = xfs_getsb(mp, 0);
-
- XFS_BUF_UNDONE(sbp);
- XFS_BUF_UNREAD(sbp);
- xfs_buf_delwri_dequeue(sbp);
- XFS_BUF_WRITE(sbp);
- XFS_BUF_UNASYNC(sbp);
- ASSERT(sbp->b_target == mp->m_ddev_targp);
- xfsbdstrat(mp, sbp);
- error = xfs_buf_iowait(sbp);
- if (error)
- xfs_buf_ioerror_alert(sbp, __func__);
- xfs_buf_relse(sbp);
- }
- return error;
-}
-
/*
* xfs_mod_sb() can be used to copy arbitrary changes to the
* in-core superblock into the superblock buffer to be logged.
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index bb24dac42a2..90c1fc9eaea 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -53,7 +53,7 @@ typedef struct xfs_trans_reservations {
#include "xfs_sync.h"
-struct log;
+struct xlog;
struct xfs_mount_args;
struct xfs_inode;
struct xfs_bmbt_irec;
@@ -133,7 +133,7 @@ typedef struct xfs_mount {
uint m_readio_blocks; /* min read size blocks */
uint m_writeio_log; /* min write size log bytes */
uint m_writeio_blocks; /* min write size blocks */
- struct log *m_log; /* log specific stuff */
+ struct xlog *m_log; /* log specific stuff */
int m_logbufs; /* number of log buffers */
int m_logbsize; /* size of each log buffer */
uint m_rsumlevels; /* rt summary levels */
@@ -211,6 +211,10 @@ typedef struct xfs_mount {
struct shrinker m_inode_shrink; /* inode reclaim shrinker */
int64_t m_low_space[XFS_LOWSP_MAX];
/* low free space thresholds */
+
+ struct workqueue_struct *m_data_workqueue;
+ struct workqueue_struct *m_unwritten_workqueue;
+ struct workqueue_struct *m_cil_workqueue;
} xfs_mount_t;
/*
@@ -219,7 +223,6 @@ typedef struct xfs_mount {
#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
must be synchronous except
for space allocations */
-#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */
#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
operations, typically for
@@ -376,7 +379,6 @@ extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
extern int xfs_mountfs(xfs_mount_t *mp);
extern void xfs_unmountfs(xfs_mount_t *);
-extern int xfs_unmountfs_writesb(xfs_mount_t *);
extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
uint, int);
@@ -396,7 +398,7 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *);
extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
xfs_agnumber_t *);
-extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
+extern void xfs_sb_from_disk(struct xfs_mount *, struct xfs_dsb *);
extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 0bbb1a41998..249db198776 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -19,7 +19,6 @@
#include "xfs_fs.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -48,227 +47,182 @@
* quota functionality, including maintaining the freelist and hash
* tables of dquots.
*/
-struct mutex xfs_Gqm_lock;
-struct xfs_qm *xfs_Gqm;
-uint ndquot;
-
-kmem_zone_t *qm_dqzone;
-kmem_zone_t *qm_dqtrxzone;
-
-STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int);
-STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
-
STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *);
-static struct shrinker xfs_qm_shaker = {
- .shrink = xfs_qm_shake,
- .seeks = DEFAULT_SEEKS,
-};
-
/*
- * Initialize the XQM structure.
- * Note that there is not one quota manager per file system.
+ * We use the batch lookup interface to iterate over the dquots as it
+ * currently is the only interface into the radix tree code that allows
+ * fuzzy lookups instead of exact matches. Holding the lock over multiple
+ * operations is fine as all callers are used either during mount/umount
+ * or quotaoff.
*/
-STATIC struct xfs_qm *
-xfs_Gqm_init(void)
+#define XFS_DQ_LOOKUP_BATCH 32
+
+STATIC int
+xfs_qm_dquot_walk(
+ struct xfs_mount *mp,
+ int type,
+ int (*execute)(struct xfs_dquot *dqp, void *data),
+ void *data)
{
- xfs_dqhash_t *udqhash, *gdqhash;
- xfs_qm_t *xqm;
- size_t hsize;
- uint i;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
+ uint32_t next_index;
+ int last_error = 0;
+ int skipped;
+ int nr_found;
+
+restart:
+ skipped = 0;
+ next_index = 0;
+ nr_found = 0;
+
+ while (1) {
+ struct xfs_dquot *batch[XFS_DQ_LOOKUP_BATCH];
+ int error = 0;
+ int i;
+
+ mutex_lock(&qi->qi_tree_lock);
+ nr_found = radix_tree_gang_lookup(tree, (void **)batch,
+ next_index, XFS_DQ_LOOKUP_BATCH);
+ if (!nr_found) {
+ mutex_unlock(&qi->qi_tree_lock);
+ break;
+ }
- /*
- * Initialize the dquot hash tables.
- */
- udqhash = kmem_zalloc_greedy(&hsize,
- XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t),
- XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t));
- if (!udqhash)
- goto out;
+ for (i = 0; i < nr_found; i++) {
+ struct xfs_dquot *dqp = batch[i];
- gdqhash = kmem_zalloc_large(hsize);
- if (!gdqhash)
- goto out_free_udqhash;
+ next_index = be32_to_cpu(dqp->q_core.d_id) + 1;
- hsize /= sizeof(xfs_dqhash_t);
- ndquot = hsize << 8;
+ error = execute(batch[i], data);
+ if (error == EAGAIN) {
+ skipped++;
+ continue;
+ }
+ if (error && last_error != EFSCORRUPTED)
+ last_error = error;
+ }
- xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP);
- xqm->qm_dqhashmask = hsize - 1;
- xqm->qm_usr_dqhtable = udqhash;
- xqm->qm_grp_dqhtable = gdqhash;
- ASSERT(xqm->qm_usr_dqhtable != NULL);
- ASSERT(xqm->qm_grp_dqhtable != NULL);
+ mutex_unlock(&qi->qi_tree_lock);
- for (i = 0; i < hsize; i++) {
- xfs_qm_list_init(&(xqm->qm_usr_dqhtable[i]), "uxdqh", i);
- xfs_qm_list_init(&(xqm->qm_grp_dqhtable[i]), "gxdqh", i);
+ /* bail out if the filesystem is corrupted. */
+ if (last_error == EFSCORRUPTED) {
+ skipped = 0;
+ break;
+ }
}
- /*
- * Freelist of all dquots of all file systems
- */
- INIT_LIST_HEAD(&xqm->qm_dqfrlist);
- xqm->qm_dqfrlist_cnt = 0;
- mutex_init(&xqm->qm_dqfrlist_lock);
-
- /*
- * dquot zone. we register our own low-memory callback.
- */
- if (!qm_dqzone) {
- xqm->qm_dqzone = kmem_zone_init(sizeof(xfs_dquot_t),
- "xfs_dquots");
- qm_dqzone = xqm->qm_dqzone;
- } else
- xqm->qm_dqzone = qm_dqzone;
-
- register_shrinker(&xfs_qm_shaker);
-
- /*
- * The t_dqinfo portion of transactions.
- */
- if (!qm_dqtrxzone) {
- xqm->qm_dqtrxzone = kmem_zone_init(sizeof(xfs_dquot_acct_t),
- "xfs_dqtrx");
- qm_dqtrxzone = xqm->qm_dqtrxzone;
- } else
- xqm->qm_dqtrxzone = qm_dqtrxzone;
-
- atomic_set(&xqm->qm_totaldquots, 0);
- xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO;
- xqm->qm_nrefs = 0;
- return xqm;
+ if (skipped) {
+ delay(1);
+ goto restart;
+ }
- out_free_udqhash:
- kmem_free_large(udqhash);
- out:
- return NULL;
+ return last_error;
}
-/*
- * Destroy the global quota manager when its reference count goes to zero.
- */
-STATIC void
-xfs_qm_destroy(
- struct xfs_qm *xqm)
-{
- struct xfs_dquot *dqp, *n;
- int hsize, i;
-
- ASSERT(xqm != NULL);
- ASSERT(xqm->qm_nrefs == 0);
- unregister_shrinker(&xfs_qm_shaker);
- hsize = xqm->qm_dqhashmask + 1;
- for (i = 0; i < hsize; i++) {
- xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
- xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i]));
- }
- kmem_free_large(xqm->qm_usr_dqhtable);
- kmem_free_large(xqm->qm_grp_dqhtable);
- xqm->qm_usr_dqhtable = NULL;
- xqm->qm_grp_dqhtable = NULL;
- xqm->qm_dqhashmask = 0;
-
- /* frlist cleanup */
- mutex_lock(&xqm->qm_dqfrlist_lock);
- list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) {
- xfs_dqlock(dqp);
- list_del_init(&dqp->q_freelist);
- xfs_Gqm->qm_dqfrlist_cnt--;
- xfs_dqunlock(dqp);
- xfs_qm_dqdestroy(dqp);
- }
- mutex_unlock(&xqm->qm_dqfrlist_lock);
- mutex_destroy(&xqm->qm_dqfrlist_lock);
- kmem_free(xqm);
-}
/*
- * Called at mount time to let XQM know that another file system is
- * starting quotas. This isn't crucial information as the individual mount
- * structures are pretty independent, but it helps the XQM keep a
- * global view of what's going on.
+ * Purge a dquot from all tracking data structures and free it.
*/
-/* ARGSUSED */
STATIC int
-xfs_qm_hold_quotafs_ref(
- struct xfs_mount *mp)
+xfs_qm_dqpurge(
+ struct xfs_dquot *dqp,
+ void *data)
{
- /*
- * Need to lock the xfs_Gqm structure for things like this. For example,
- * the structure could disappear between the entry to this routine and
- * a HOLD operation if not locked.
- */
- mutex_lock(&xfs_Gqm_lock);
+ struct xfs_mount *mp = dqp->q_mount;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct xfs_dquot *gdqp = NULL;
- if (!xfs_Gqm) {
- xfs_Gqm = xfs_Gqm_init();
- if (!xfs_Gqm) {
- mutex_unlock(&xfs_Gqm_lock);
- return ENOMEM;
- }
+ xfs_dqlock(dqp);
+ if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
+ xfs_dqunlock(dqp);
+ return EAGAIN;
}
/*
- * We can keep a list of all filesystems with quotas mounted for
- * debugging and statistical purposes, but ...
- * Just take a reference and get out.
+ * If this quota has a group hint attached, prepare for releasing it
+ * now.
*/
- xfs_Gqm->qm_nrefs++;
- mutex_unlock(&xfs_Gqm_lock);
+ gdqp = dqp->q_gdquot;
+ if (gdqp) {
+ xfs_dqlock(gdqp);
+ dqp->q_gdquot = NULL;
+ }
- return 0;
-}
+ dqp->dq_flags |= XFS_DQ_FREEING;
-
-/*
- * Release the reference that a filesystem took at mount time,
- * so that we know when we need to destroy the entire quota manager.
- */
-/* ARGSUSED */
-STATIC void
-xfs_qm_rele_quotafs_ref(
- struct xfs_mount *mp)
-{
- xfs_dquot_t *dqp, *n;
-
- ASSERT(xfs_Gqm);
- ASSERT(xfs_Gqm->qm_nrefs > 0);
+ xfs_dqflock(dqp);
/*
- * Go thru the freelist and destroy all inactive dquots.
+ * If we are turning this type of quotas off, we don't care
+ * about the dirty metadata sitting in this dquot. OTOH, if
+ * we're unmounting, we do care, so we flush it and wait.
*/
- mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
-
- list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) {
- xfs_dqlock(dqp);
- if (dqp->dq_flags & XFS_DQ_INACTIVE) {
- ASSERT(dqp->q_mount == NULL);
- ASSERT(! XFS_DQ_IS_DIRTY(dqp));
- ASSERT(list_empty(&dqp->q_hashlist));
- ASSERT(list_empty(&dqp->q_mplist));
- list_del_init(&dqp->q_freelist);
- xfs_Gqm->qm_dqfrlist_cnt--;
- xfs_dqunlock(dqp);
- xfs_qm_dqdestroy(dqp);
+ if (XFS_DQ_IS_DIRTY(dqp)) {
+ struct xfs_buf *bp = NULL;
+ int error;
+
+ /*
+ * We don't care about getting disk errors here. We need
+ * to purge this dquot anyway, so we go ahead regardless.
+ */
+ error = xfs_qm_dqflush(dqp, &bp);
+ if (error) {
+ xfs_warn(mp, "%s: dquot %p flush failed",
+ __func__, dqp);
} else {
- xfs_dqunlock(dqp);
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
}
+ xfs_dqflock(dqp);
}
- mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+
+ ASSERT(atomic_read(&dqp->q_pincount) == 0);
+ ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
+ !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
+
+ xfs_dqfunlock(dqp);
+ xfs_dqunlock(dqp);
+
+ radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
+ be32_to_cpu(dqp->q_core.d_id));
+ qi->qi_dquots--;
/*
- * Destroy the entire XQM. If somebody mounts with quotaon, this'll
- * be restarted.
+ * We move dquots to the freelist as soon as their reference count
+ * hits zero, so it really should be on the freelist here.
*/
- mutex_lock(&xfs_Gqm_lock);
- if (--xfs_Gqm->qm_nrefs == 0) {
- xfs_qm_destroy(xfs_Gqm);
- xfs_Gqm = NULL;
- }
- mutex_unlock(&xfs_Gqm_lock);
+ mutex_lock(&qi->qi_lru_lock);
+ ASSERT(!list_empty(&dqp->q_lru));
+ list_del_init(&dqp->q_lru);
+ qi->qi_lru_count--;
+ XFS_STATS_DEC(xs_qm_dquot_unused);
+ mutex_unlock(&qi->qi_lru_lock);
+
+ xfs_qm_dqdestroy(dqp);
+
+ if (gdqp)
+ xfs_qm_dqput(gdqp);
+ return 0;
+}
+
+/*
+ * Purge the dquot cache.
+ */
+void
+xfs_qm_dqpurge_all(
+ struct xfs_mount *mp,
+ uint flags)
+{
+ if (flags & XFS_QMOPT_UQUOTA)
+ xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL);
+ if (flags & XFS_QMOPT_GQUOTA)
+ xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL);
+ if (flags & XFS_QMOPT_PQUOTA)
+ xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL);
}
/*
@@ -409,200 +363,6 @@ xfs_qm_unmount_quotas(
}
}
-/*
- * Flush all dquots of the given file system to disk. The dquots are
- * _not_ purged from memory here, just their data written to disk.
- */
-STATIC int
-xfs_qm_dqflush_all(
- struct xfs_mount *mp,
- int sync_mode)
-{
- struct xfs_quotainfo *q = mp->m_quotainfo;
- int recl;
- struct xfs_dquot *dqp;
- int error;
-
- if (!q)
- return 0;
-again:
- mutex_lock(&q->qi_dqlist_lock);
- list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
- xfs_dqlock(dqp);
- if (! XFS_DQ_IS_DIRTY(dqp)) {
- xfs_dqunlock(dqp);
- continue;
- }
-
- /* XXX a sentinel would be better */
- recl = q->qi_dqreclaims;
- if (!xfs_dqflock_nowait(dqp)) {
- /*
- * If we can't grab the flush lock then check
- * to see if the dquot has been flushed delayed
- * write. If so, grab its buffer and send it
- * out immediately. We'll be able to acquire
- * the flush lock when the I/O completes.
- */
- xfs_qm_dqflock_pushbuf_wait(dqp);
- }
- /*
- * Let go of the mplist lock. We don't want to hold it
- * across a disk write.
- */
- mutex_unlock(&q->qi_dqlist_lock);
- error = xfs_qm_dqflush(dqp, sync_mode);
- xfs_dqunlock(dqp);
- if (error)
- return error;
-
- mutex_lock(&q->qi_dqlist_lock);
- if (recl != q->qi_dqreclaims) {
- mutex_unlock(&q->qi_dqlist_lock);
- /* XXX restart limit */
- goto again;
- }
- }
-
- mutex_unlock(&q->qi_dqlist_lock);
- /* return ! busy */
- return 0;
-}
-/*
- * Release the group dquot pointers the user dquots may be
- * carrying around as a hint. mplist is locked on entry and exit.
- */
-STATIC void
-xfs_qm_detach_gdquots(
- struct xfs_mount *mp)
-{
- struct xfs_quotainfo *q = mp->m_quotainfo;
- struct xfs_dquot *dqp, *gdqp;
- int nrecl;
-
- again:
- ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
- list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
- xfs_dqlock(dqp);
- if ((gdqp = dqp->q_gdquot)) {
- xfs_dqlock(gdqp);
- dqp->q_gdquot = NULL;
- }
- xfs_dqunlock(dqp);
-
- if (gdqp) {
- /*
- * Can't hold the mplist lock across a dqput.
- * XXXmust convert to marker based iterations here.
- */
- nrecl = q->qi_dqreclaims;
- mutex_unlock(&q->qi_dqlist_lock);
- xfs_qm_dqput(gdqp);
-
- mutex_lock(&q->qi_dqlist_lock);
- if (nrecl != q->qi_dqreclaims)
- goto again;
- }
- }
-}
-
-/*
- * Go through all the incore dquots of this file system and take them
- * off the mplist and hashlist, if the dquot type matches the dqtype
- * parameter. This is used when turning off quota accounting for
- * users and/or groups, as well as when the filesystem is unmounting.
- */
-STATIC int
-xfs_qm_dqpurge_int(
- struct xfs_mount *mp,
- uint flags)
-{
- struct xfs_quotainfo *q = mp->m_quotainfo;
- struct xfs_dquot *dqp, *n;
- uint dqtype;
- int nrecl;
- int nmisses;
-
- if (!q)
- return 0;
-
- dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
- dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0;
- dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
-
- mutex_lock(&q->qi_dqlist_lock);
-
- /*
- * In the first pass through all incore dquots of this filesystem,
- * we release the group dquot pointers the user dquots may be
- * carrying around as a hint. We need to do this irrespective of
- * what's being turned off.
- */
- xfs_qm_detach_gdquots(mp);
-
- again:
- nmisses = 0;
- ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
- /*
- * Try to get rid of all of the unwanted dquots. The idea is to
- * get them off mplist and hashlist, but leave them on freelist.
- */
- list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) {
- /*
- * It's OK to look at the type without taking dqlock here.
- * We're holding the mplist lock here, and that's needed for
- * a dqreclaim.
- */
- if ((dqp->dq_flags & dqtype) == 0)
- continue;
-
- if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
- nrecl = q->qi_dqreclaims;
- mutex_unlock(&q->qi_dqlist_lock);
- mutex_lock(&dqp->q_hash->qh_lock);
- mutex_lock(&q->qi_dqlist_lock);
-
- /*
- * XXXTheoretically, we can get into a very long
- * ping pong game here.
- * No one can be adding dquots to the mplist at
- * this point, but somebody might be taking things off.
- */
- if (nrecl != q->qi_dqreclaims) {
- mutex_unlock(&dqp->q_hash->qh_lock);
- goto again;
- }
- }
-
- /*
- * Take the dquot off the mplist and hashlist. It may remain on
- * freelist in INACTIVE state.
- */
- nmisses += xfs_qm_dqpurge(dqp);
- }
- mutex_unlock(&q->qi_dqlist_lock);
- return nmisses;
-}
-
-int
-xfs_qm_dqpurge_all(
- xfs_mount_t *mp,
- uint flags)
-{
- int ndquots;
-
- /*
- * Purge the dquot cache.
- * None of the dquots should really be busy at this point.
- */
- if (mp->m_quotainfo) {
- while ((ndquots = xfs_qm_dqpurge_int(mp, flags))) {
- delay(ndquots * 10);
- }
- }
- return 0;
-}
-
STATIC int
xfs_qm_dqattach_one(
xfs_inode_t *ip,
@@ -648,12 +408,9 @@ xfs_qm_dqattach_one(
*/
dqp = udqhint->q_gdquot;
if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) {
- xfs_dqlock(dqp);
- XFS_DQHOLD(dqp);
ASSERT(*IO_idqpp == NULL);
- *IO_idqpp = dqp;
- xfs_dqunlock(dqp);
+ *IO_idqpp = xfs_qm_dqhold(dqp);
xfs_dqunlock(udqhint);
return 0;
}
@@ -693,11 +450,7 @@ xfs_qm_dqattach_one(
/*
* Given a udquot and gdquot, attach a ptr to the group dquot in the
- * udquot as a hint for future lookups. The idea sounds simple, but the
- * execution isn't, because the udquot might have a group dquot attached
- * already and getting rid of that gets us into lock ordering constraints.
- * The process is complicated more by the fact that the dquots may or may not
- * be locked on entry.
+ * udquot as a hint for future lookups.
*/
STATIC void
xfs_qm_dqattach_grouphint(
@@ -708,48 +461,37 @@ xfs_qm_dqattach_grouphint(
xfs_dqlock(udq);
- if ((tmp = udq->q_gdquot)) {
- if (tmp == gdq) {
- xfs_dqunlock(udq);
- return;
- }
+ tmp = udq->q_gdquot;
+ if (tmp) {
+ if (tmp == gdq)
+ goto done;
udq->q_gdquot = NULL;
- /*
- * We can't keep any dqlocks when calling dqrele,
- * because the freelist lock comes before dqlocks.
- */
- xfs_dqunlock(udq);
- /*
- * we took a hard reference once upon a time in dqget,
- * so give it back when the udquot no longer points at it
- * dqput() does the unlocking of the dquot.
- */
xfs_qm_dqrele(tmp);
-
- xfs_dqlock(udq);
- xfs_dqlock(gdq);
-
- } else {
- ASSERT(XFS_DQ_IS_LOCKED(udq));
- xfs_dqlock(gdq);
}
- ASSERT(XFS_DQ_IS_LOCKED(udq));
- ASSERT(XFS_DQ_IS_LOCKED(gdq));
- /*
- * Somebody could have attached a gdquot here,
- * when we dropped the uqlock. If so, just do nothing.
- */
- if (udq->q_gdquot == NULL) {
- XFS_DQHOLD(gdq);
- udq->q_gdquot = gdq;
- }
-
- xfs_dqunlock(gdq);
+ udq->q_gdquot = xfs_qm_dqhold(gdq);
+done:
xfs_dqunlock(udq);
}
+static bool
+xfs_qm_need_dqattach(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (!XFS_IS_QUOTA_RUNNING(mp))
+ return false;
+ if (!XFS_IS_QUOTA_ON(mp))
+ return false;
+ if (!XFS_NOT_DQATTACHED(mp, ip))
+ return false;
+ if (ip->i_ino == mp->m_sb.sb_uquotino ||
+ ip->i_ino == mp->m_sb.sb_gquotino)
+ return false;
+ return true;
+}
/*
* Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
@@ -767,11 +509,7 @@ xfs_qm_dqattach_locked(
uint nquotas = 0;
int error = 0;
- if (!XFS_IS_QUOTA_RUNNING(mp) ||
- !XFS_IS_QUOTA_ON(mp) ||
- !XFS_NOT_DQATTACHED(mp, ip) ||
- ip->i_ino == mp->m_sb.sb_uquotino ||
- ip->i_ino == mp->m_sb.sb_gquotino)
+ if (!xfs_qm_need_dqattach(ip))
return 0;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -813,17 +551,13 @@ xfs_qm_dqattach_locked(
ASSERT(ip->i_gdquot);
/*
- * We may or may not have the i_udquot locked at this point,
- * but this check is OK since we don't depend on the i_gdquot to
- * be accurate 100% all the time. It is just a hint, and this
- * will succeed in general.
- */
- if (ip->i_udquot->q_gdquot == ip->i_gdquot)
- goto done;
- /*
- * Attach i_gdquot to the gdquot hint inside the i_udquot.
+ * We do not have i_udquot locked at this point, but this check
+ * is OK since we don't depend on the i_gdquot to be accurate
+ * 100% all the time. It is just a hint, and this will
+ * succeed in general.
*/
- xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot);
+ if (ip->i_udquot->q_gdquot != ip->i_gdquot)
+ xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot);
}
done:
@@ -846,6 +580,9 @@ xfs_qm_dqattach(
{
int error;
+ if (!xfs_qm_need_dqattach(ip))
+ return 0;
+
xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_qm_dqattach_locked(ip, flags);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -879,108 +616,6 @@ xfs_qm_dqdetach(
}
}
-int
-xfs_qm_sync(
- struct xfs_mount *mp,
- int flags)
-{
- struct xfs_quotainfo *q = mp->m_quotainfo;
- int recl, restarts;
- struct xfs_dquot *dqp;
- int error;
-
- if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
- return 0;
-
- restarts = 0;
-
- again:
- mutex_lock(&q->qi_dqlist_lock);
- /*
- * dqpurge_all() also takes the mplist lock and iterate thru all dquots
- * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared
- * when we have the mplist lock, we know that dquots will be consistent
- * as long as we have it locked.
- */
- if (!XFS_IS_QUOTA_ON(mp)) {
- mutex_unlock(&q->qi_dqlist_lock);
- return 0;
- }
- ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
- list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
- /*
- * If this is vfs_sync calling, then skip the dquots that
- * don't 'seem' to be dirty. ie. don't acquire dqlock.
- * This is very similar to what xfs_sync does with inodes.
- */
- if (flags & SYNC_TRYLOCK) {
- if (!XFS_DQ_IS_DIRTY(dqp))
- continue;
- if (!xfs_qm_dqlock_nowait(dqp))
- continue;
- } else {
- xfs_dqlock(dqp);
- }
-
- /*
- * Now, find out for sure if this dquot is dirty or not.
- */
- if (! XFS_DQ_IS_DIRTY(dqp)) {
- xfs_dqunlock(dqp);
- continue;
- }
-
- /* XXX a sentinel would be better */
- recl = q->qi_dqreclaims;
- if (!xfs_dqflock_nowait(dqp)) {
- if (flags & SYNC_TRYLOCK) {
- xfs_dqunlock(dqp);
- continue;
- }
- /*
- * If we can't grab the flush lock then if the caller
- * really wanted us to give this our best shot, so
- * see if we can give a push to the buffer before we wait
- * on the flush lock. At this point, we know that
- * even though the dquot is being flushed,
- * it has (new) dirty data.
- */
- xfs_qm_dqflock_pushbuf_wait(dqp);
- }
- /*
- * Let go of the mplist lock. We don't want to hold it
- * across a disk write
- */
- mutex_unlock(&q->qi_dqlist_lock);
- error = xfs_qm_dqflush(dqp, flags);
- xfs_dqunlock(dqp);
- if (error && XFS_FORCED_SHUTDOWN(mp))
- return 0; /* Need to prevent umount failure */
- else if (error)
- return error;
-
- mutex_lock(&q->qi_dqlist_lock);
- if (recl != q->qi_dqreclaims) {
- if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS)
- break;
-
- mutex_unlock(&q->qi_dqlist_lock);
- goto again;
- }
- }
-
- mutex_unlock(&q->qi_dqlist_lock);
- return 0;
-}
-
-/*
- * The hash chains and the mplist use the same xfs_dqhash structure as
- * their list head, but we can take the mplist qh_lock and one of the
- * hash qh_locks at the same time without any problem as they aren't
- * related.
- */
-static struct lock_class_key xfs_quota_mplist_class;
-
/*
* This initializes all the quota information that's kept in the
* mount structure
@@ -995,13 +630,6 @@ xfs_qm_init_quotainfo(
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
- /*
- * Tell XQM that we exist as soon as possible.
- */
- if ((error = xfs_qm_hold_quotafs_ref(mp))) {
- return error;
- }
-
qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
/*
@@ -1014,11 +642,13 @@ xfs_qm_init_quotainfo(
return error;
}
- INIT_LIST_HEAD(&qinf->qi_dqlist);
- mutex_init(&qinf->qi_dqlist_lock);
- lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class);
+ INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
+ INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
+ mutex_init(&qinf->qi_tree_lock);
- qinf->qi_dqreclaims = 0;
+ INIT_LIST_HEAD(&qinf->qi_lru_list);
+ qinf->qi_lru_count = 0;
+ mutex_init(&qinf->qi_lru_lock);
/* mutex used to serialize quotaoffs */
mutex_init(&qinf->qi_quotaofflock);
@@ -1034,18 +664,21 @@ xfs_qm_init_quotainfo(
/*
* We try to get the limits from the superuser's limits fields.
* This is quite hacky, but it is standard quota practice.
+ *
* We look at the USR dquot with id == 0 first, but if user quotas
* are not enabled we goto the GRP dquot with id == 0.
* We don't really care to keep separate default limits for user
* and group quotas, at least not at this point.
+ *
+ * Since we may not have done a quotacheck by this point, just read
+ * the dquot without attaching it to any hashtables or lists.
*/
- error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)0,
- XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER :
- (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP :
- XFS_DQ_PROJ),
- XFS_QMOPT_DQSUSER|XFS_QMOPT_DOWARN,
- &dqp);
- if (! error) {
+ error = xfs_qm_dqread(mp, 0,
+ XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER :
+ (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP :
+ XFS_DQ_PROJ),
+ XFS_QMOPT_DOWARN, &dqp);
+ if (!error) {
xfs_disk_dquot_t *ddqp = &dqp->q_core;
/*
@@ -1072,11 +705,6 @@ xfs_qm_init_quotainfo(
qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
- /*
- * We sent the XFS_QMOPT_DQSUSER flag to dqget because
- * we don't want this dquot cached. We haven't done a
- * quotacheck yet, and quotacheck doesn't like incore dquots.
- */
xfs_qm_dqdestroy(dqp);
} else {
qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
@@ -1087,6 +715,9 @@ xfs_qm_init_quotainfo(
qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
}
+ qinf->qi_shrinker.shrink = xfs_qm_shake;
+ qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
+ register_shrinker(&qinf->qi_shrinker);
return 0;
}
@@ -1104,17 +735,8 @@ xfs_qm_destroy_quotainfo(
qi = mp->m_quotainfo;
ASSERT(qi != NULL);
- ASSERT(xfs_Gqm != NULL);
-
- /*
- * Release the reference that XQM kept, so that we know
- * when the XQM structure should be freed. We cannot assume
- * that xfs_Gqm is non-null after this point.
- */
- xfs_qm_rele_quotafs_ref(mp);
- ASSERT(list_empty(&qi->qi_dqlist));
- mutex_destroy(&qi->qi_dqlist_lock);
+ unregister_shrinker(&qi->qi_shrinker);
if (qi->qi_uquotaip) {
IRELE(qi->qi_uquotaip);
@@ -1129,30 +751,6 @@ xfs_qm_destroy_quotainfo(
mp->m_quotainfo = NULL;
}
-
-
-/* ------------------- PRIVATE STATIC FUNCTIONS ----------------------- */
-
-/* ARGSUSED */
-STATIC void
-xfs_qm_list_init(
- xfs_dqlist_t *list,
- char *str,
- int n)
-{
- mutex_init(&list->qh_lock);
- INIT_LIST_HEAD(&list->qh_list);
- list->qh_version = 0;
- list->qh_nelems = 0;
-}
-
-STATIC void
-xfs_qm_list_destroy(
- xfs_dqlist_t *list)
-{
- mutex_destroy(&(list->qh_lock));
-}
-
/*
* Create an inode and return with a reference already taken, but unlocked
* This is how we create quota inodes
@@ -1265,15 +863,16 @@ xfs_qm_reset_dqcounts(
STATIC int
xfs_qm_dqiter_bufs(
- xfs_mount_t *mp,
- xfs_dqid_t firstid,
- xfs_fsblock_t bno,
- xfs_filblks_t blkcnt,
- uint flags)
+ struct xfs_mount *mp,
+ xfs_dqid_t firstid,
+ xfs_fsblock_t bno,
+ xfs_filblks_t blkcnt,
+ uint flags,
+ struct list_head *buffer_list)
{
- xfs_buf_t *bp;
- int error;
- int type;
+ struct xfs_buf *bp;
+ int error;
+ int type;
ASSERT(blkcnt > 0);
type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER :
@@ -1297,7 +896,7 @@ xfs_qm_dqiter_bufs(
break;
xfs_qm_reset_dqcounts(mp, bp, firstid, type);
- xfs_buf_delwri_queue(bp);
+ xfs_buf_delwri_queue(bp, buffer_list);
xfs_buf_relse(bp);
/*
* goto the next block.
@@ -1305,6 +904,7 @@ xfs_qm_dqiter_bufs(
bno++;
firstid += mp->m_quotainfo->qi_dqperchunk;
}
+
return error;
}
@@ -1314,11 +914,12 @@ xfs_qm_dqiter_bufs(
*/
STATIC int
xfs_qm_dqiterate(
- xfs_mount_t *mp,
- xfs_inode_t *qip,
- uint flags)
+ struct xfs_mount *mp,
+ struct xfs_inode *qip,
+ uint flags,
+ struct list_head *buffer_list)
{
- xfs_bmbt_irec_t *map;
+ struct xfs_bmbt_irec *map;
int i, nmaps; /* number of map entries */
int error; /* return value */
xfs_fileoff_t lblkno;
@@ -1385,21 +986,17 @@ xfs_qm_dqiterate(
* Iterate thru all the blks in the extent and
* reset the counters of all the dquots inside them.
*/
- if ((error = xfs_qm_dqiter_bufs(mp,
- firstid,
- map[i].br_startblock,
- map[i].br_blockcount,
- flags))) {
- break;
- }
+ error = xfs_qm_dqiter_bufs(mp, firstid,
+ map[i].br_startblock,
+ map[i].br_blockcount,
+ flags, buffer_list);
+ if (error)
+ goto out;
}
-
- if (error)
- break;
} while (nmaps > 0);
+out:
kmem_free(map);
-
return error;
}
@@ -1590,6 +1187,33 @@ error0:
return error;
}
+STATIC int
+xfs_qm_flush_one(
+ struct xfs_dquot *dqp,
+ void *data)
+{
+ struct list_head *buffer_list = data;
+ struct xfs_buf *bp = NULL;
+ int error = 0;
+
+ xfs_dqlock(dqp);
+ if (dqp->dq_flags & XFS_DQ_FREEING)
+ goto out_unlock;
+ if (!XFS_DQ_IS_DIRTY(dqp))
+ goto out_unlock;
+
+ xfs_dqflock(dqp);
+ error = xfs_qm_dqflush(dqp, &bp);
+ if (error)
+ goto out_unlock;
+
+ xfs_buf_delwri_queue(bp, buffer_list);
+ xfs_buf_relse(bp);
+out_unlock:
+ xfs_dqunlock(dqp);
+ return error;
+}
+
/*
* Walk thru all the filesystem inodes and construct a consistent view
* of the disk quota world. If the quotacheck fails, disable quotas.
@@ -1598,11 +1222,12 @@ int
xfs_qm_quotacheck(
xfs_mount_t *mp)
{
- int done, count, error;
+ int done, count, error, error2;
xfs_ino_t lastino;
size_t structsz;
xfs_inode_t *uip, *gip;
uint flags;
+ LIST_HEAD (buffer_list);
count = INT_MAX;
structsz = 1;
@@ -1612,12 +1237,6 @@ xfs_qm_quotacheck(
ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
- /*
- * There should be no cached dquots. The (simplistic) quotacheck
- * algorithm doesn't like that.
- */
- ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
-
xfs_notice(mp, "Quotacheck needed: Please wait.");
/*
@@ -1627,7 +1246,8 @@ xfs_qm_quotacheck(
*/
uip = mp->m_quotainfo->qi_uquotaip;
if (uip) {
- error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA);
+ error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA,
+ &buffer_list);
if (error)
goto error_return;
flags |= XFS_UQUOTA_CHKD;
@@ -1636,7 +1256,8 @@ xfs_qm_quotacheck(
gip = mp->m_quotainfo->qi_gquotaip;
if (gip) {
error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
- XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
+ XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA,
+ &buffer_list);
if (error)
goto error_return;
flags |= XFS_OQUOTA_CHKD;
@@ -1656,12 +1277,29 @@ xfs_qm_quotacheck(
} while (!done);
/*
- * We've made all the changes that we need to make incore.
- * Flush them down to disk buffers if everything was updated
- * successfully.
+ * We've made all the changes that we need to make incore. Flush them
+ * down to disk buffers if everything was updated successfully.
*/
+ if (XFS_IS_UQUOTA_ON(mp)) {
+ error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one,
+ &buffer_list);
+ }
+ if (XFS_IS_GQUOTA_ON(mp)) {
+ error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one,
+ &buffer_list);
+ if (!error)
+ error = error2;
+ }
+ if (XFS_IS_PQUOTA_ON(mp)) {
+ error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one,
+ &buffer_list);
+ if (!error)
+ error = error2;
+ }
+
+ error2 = xfs_buf_delwri_submit(&buffer_list);
if (!error)
- error = xfs_qm_dqflush_all(mp, 0);
+ error = error2;
/*
* We can get this error if we couldn't do a dquot allocation inside
@@ -1676,23 +1314,21 @@ xfs_qm_quotacheck(
}
/*
- * We didn't log anything, because if we crashed, we'll have to
- * start the quotacheck from scratch anyway. However, we must make
- * sure that our dquot changes are secure before we put the
- * quotacheck'd stamp on the superblock. So, here we do a synchronous
- * flush.
- */
- xfs_flush_buftarg(mp->m_ddev_targp, 1);
-
- /*
* If one type of quotas is off, then it will lose its
* quotachecked status, since we won't be doing accounting for
* that type anymore.
*/
- mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD);
+ mp->m_qflags &= ~XFS_ALL_QUOTA_CHKD;
mp->m_qflags |= flags;
error_return:
+ while (!list_empty(&buffer_list)) {
+ struct xfs_buf *bp =
+ list_first_entry(&buffer_list, struct xfs_buf, b_list);
+ list_del_init(&bp->b_list);
+ xfs_buf_relse(bp);
+ }
+
if (error) {
xfs_warn(mp,
"Quotacheck: Unsuccessful (Error %d): Disabling quotas.",
@@ -1701,7 +1337,6 @@ xfs_qm_quotacheck(
* We must turn off quotas.
*/
ASSERT(mp->m_quotainfo != NULL);
- ASSERT(xfs_Gqm != NULL);
xfs_qm_destroy_quotainfo(mp);
if (xfs_mount_reset_sbqflags(mp)) {
xfs_warn(mp,
@@ -1790,257 +1425,150 @@ xfs_qm_init_quotainos(
return 0;
}
+STATIC void
+xfs_qm_dqfree_one(
+ struct xfs_dquot *dqp)
+{
+ struct xfs_mount *mp = dqp->q_mount;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ mutex_lock(&qi->qi_tree_lock);
+ radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
+ be32_to_cpu(dqp->q_core.d_id));
-/*
- * Just pop the least recently used dquot off the freelist and
- * recycle it. The returned dquot is locked.
- */
-STATIC xfs_dquot_t *
-xfs_qm_dqreclaim_one(void)
-{
- xfs_dquot_t *dqpout;
- xfs_dquot_t *dqp;
- int restarts;
- int startagain;
+ qi->qi_dquots--;
+ mutex_unlock(&qi->qi_tree_lock);
- restarts = 0;
- dqpout = NULL;
+ xfs_qm_dqdestroy(dqp);
+}
- /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
-again:
- startagain = 0;
- mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
+STATIC void
+xfs_qm_dqreclaim_one(
+ struct xfs_dquot *dqp,
+ struct list_head *buffer_list,
+ struct list_head *dispose_list)
+{
+ struct xfs_mount *mp = dqp->q_mount;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ int error;
- list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
- struct xfs_mount *mp = dqp->q_mount;
- xfs_dqlock(dqp);
+ if (!xfs_dqlock_nowait(dqp))
+ goto out_busy;
- /*
- * We are racing with dqlookup here. Naturally we don't
- * want to reclaim a dquot that lookup wants. We release the
- * freelist lock and start over, so that lookup will grab
- * both the dquot and the freelistlock.
- */
- if (dqp->dq_flags & XFS_DQ_WANT) {
- ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
-
- trace_xfs_dqreclaim_want(dqp);
- XQM_STATS_INC(xqmstats.xs_qm_dqwants);
- restarts++;
- startagain = 1;
- goto dqunlock;
- }
+ /*
+ * This dquot has acquired a reference in the meantime remove it from
+ * the freelist and try again.
+ */
+ if (dqp->q_nrefs) {
+ xfs_dqunlock(dqp);
- /*
- * If the dquot is inactive, we are assured that it is
- * not on the mplist or the hashlist, and that makes our
- * life easier.
- */
- if (dqp->dq_flags & XFS_DQ_INACTIVE) {
- ASSERT(mp == NULL);
- ASSERT(! XFS_DQ_IS_DIRTY(dqp));
- ASSERT(list_empty(&dqp->q_hashlist));
- ASSERT(list_empty(&dqp->q_mplist));
- list_del_init(&dqp->q_freelist);
- xfs_Gqm->qm_dqfrlist_cnt--;
- dqpout = dqp;
- XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
- goto dqunlock;
- }
+ trace_xfs_dqreclaim_want(dqp);
+ XFS_STATS_INC(xs_qm_dqwants);
- ASSERT(dqp->q_hash);
- ASSERT(!list_empty(&dqp->q_mplist));
+ list_del_init(&dqp->q_lru);
+ qi->qi_lru_count--;
+ XFS_STATS_DEC(xs_qm_dquot_unused);
+ return;
+ }
- /*
- * Try to grab the flush lock. If this dquot is in the process
- * of getting flushed to disk, we don't want to reclaim it.
- */
- if (!xfs_dqflock_nowait(dqp))
- goto dqunlock;
+ /*
+ * Try to grab the flush lock. If this dquot is in the process of
+ * getting flushed to disk, we don't want to reclaim it.
+ */
+ if (!xfs_dqflock_nowait(dqp))
+ goto out_busy;
- /*
- * We have the flush lock so we know that this is not in the
- * process of being flushed. So, if this is dirty, flush it
- * DELWRI so that we don't get a freelist infested with
- * dirty dquots.
- */
- if (XFS_DQ_IS_DIRTY(dqp)) {
- int error;
+ if (XFS_DQ_IS_DIRTY(dqp)) {
+ struct xfs_buf *bp = NULL;
- trace_xfs_dqreclaim_dirty(dqp);
+ trace_xfs_dqreclaim_dirty(dqp);
- /*
- * We flush it delayed write, so don't bother
- * releasing the freelist lock.
- */
- error = xfs_qm_dqflush(dqp, 0);
- if (error) {
- xfs_warn(mp, "%s: dquot %p flush failed",
- __func__, dqp);
- }
- goto dqunlock;
+ error = xfs_qm_dqflush(dqp, &bp);
+ if (error) {
+ xfs_warn(mp, "%s: dquot %p flush failed",
+ __func__, dqp);
+ goto out_busy;
}
+ xfs_buf_delwri_queue(bp, buffer_list);
+ xfs_buf_relse(bp);
/*
- * We're trying to get the hashlock out of order. This races
- * with dqlookup; so, we giveup and goto the next dquot if
- * we couldn't get the hashlock. This way, we won't starve
- * a dqlookup process that holds the hashlock that is
- * waiting for the freelist lock.
+ * Give the dquot another try on the freelist, as the
+ * flushing will take some time.
*/
- if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
- restarts++;
- goto dqfunlock;
- }
+ goto out_busy;
+ }
+ xfs_dqfunlock(dqp);
- /*
- * This races with dquot allocation code as well as dqflush_all
- * and reclaim code. So, if we failed to grab the mplist lock,
- * giveup everything and start over.
- */
- if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
- restarts++;
- startagain = 1;
- goto qhunlock;
- }
+ /*
+ * Prevent lookups now that we are past the point of no return.
+ */
+ dqp->dq_flags |= XFS_DQ_FREEING;
+ xfs_dqunlock(dqp);
- ASSERT(dqp->q_nrefs == 0);
- list_del_init(&dqp->q_mplist);
- mp->m_quotainfo->qi_dquots--;
- mp->m_quotainfo->qi_dqreclaims++;
- list_del_init(&dqp->q_hashlist);
- dqp->q_hash->qh_version++;
- list_del_init(&dqp->q_freelist);
- xfs_Gqm->qm_dqfrlist_cnt--;
- dqpout = dqp;
- mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
-qhunlock:
- mutex_unlock(&dqp->q_hash->qh_lock);
-dqfunlock:
- xfs_dqfunlock(dqp);
-dqunlock:
- xfs_dqunlock(dqp);
- if (dqpout)
- break;
- if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
- break;
- if (startagain) {
- mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
- goto again;
- }
- }
- mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
- return dqpout;
-}
+ ASSERT(dqp->q_nrefs == 0);
+ list_move_tail(&dqp->q_lru, dispose_list);
+ qi->qi_lru_count--;
+ XFS_STATS_DEC(xs_qm_dquot_unused);
-/*
- * Traverse the freelist of dquots and attempt to reclaim a maximum of
- * 'howmany' dquots. This operation races with dqlookup(), and attempts to
- * favor the lookup function ...
- */
-STATIC int
-xfs_qm_shake_freelist(
- int howmany)
-{
- int nreclaimed = 0;
- xfs_dquot_t *dqp;
+ trace_xfs_dqreclaim_done(dqp);
+ XFS_STATS_INC(xs_qm_dqreclaims);
+ return;
- if (howmany <= 0)
- return 0;
+out_busy:
+ xfs_dqunlock(dqp);
- while (nreclaimed < howmany) {
- dqp = xfs_qm_dqreclaim_one();
- if (!dqp)
- return nreclaimed;
- xfs_qm_dqdestroy(dqp);
- nreclaimed++;
- }
- return nreclaimed;
+ /*
+ * Move the dquot to the tail of the list so that we don't spin on it.
+ */
+ list_move_tail(&dqp->q_lru, &qi->qi_lru_list);
+
+ trace_xfs_dqreclaim_busy(dqp);
+ XFS_STATS_INC(xs_qm_dqreclaim_misses);
}
-/*
- * The kmem_shake interface is invoked when memory is running low.
- */
-/* ARGSUSED */
STATIC int
xfs_qm_shake(
- struct shrinker *shrink,
- struct shrink_control *sc)
+ struct shrinker *shrink,
+ struct shrink_control *sc)
{
- int ndqused, nfree, n;
- gfp_t gfp_mask = sc->gfp_mask;
-
- if (!kmem_shake_allow(gfp_mask))
- return 0;
- if (!xfs_Gqm)
- return 0;
-
- nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */
- /* incore dquots in all f/s's */
- ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
-
- ASSERT(ndqused >= 0);
+ struct xfs_quotainfo *qi =
+ container_of(shrink, struct xfs_quotainfo, qi_shrinker);
+ int nr_to_scan = sc->nr_to_scan;
+ LIST_HEAD (buffer_list);
+ LIST_HEAD (dispose_list);
+ struct xfs_dquot *dqp;
+ int error;
- if (nfree <= ndqused && nfree < ndquot)
+ if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
return 0;
+ if (!nr_to_scan)
+ goto out;
- ndqused *= xfs_Gqm->qm_dqfree_ratio; /* target # of free dquots */
- n = nfree - ndqused - ndquot; /* # over target */
-
- return xfs_qm_shake_freelist(MAX(nfree, n));
-}
-
-
-/*------------------------------------------------------------------*/
+ mutex_lock(&qi->qi_lru_lock);
+ while (!list_empty(&qi->qi_lru_list)) {
+ if (nr_to_scan-- <= 0)
+ break;
+ dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot,
+ q_lru);
+ xfs_qm_dqreclaim_one(dqp, &buffer_list, &dispose_list);
+ }
+ mutex_unlock(&qi->qi_lru_lock);
-/*
- * Return a new incore dquot. Depending on the number of
- * dquots in the system, we either allocate a new one on the kernel heap,
- * or reclaim a free one.
- * Return value is B_TRUE if we allocated a new dquot, B_FALSE if we managed
- * to reclaim an existing one from the freelist.
- */
-boolean_t
-xfs_qm_dqalloc_incore(
- xfs_dquot_t **O_dqpp)
-{
- xfs_dquot_t *dqp;
+ error = xfs_buf_delwri_submit(&buffer_list);
+ if (error)
+ xfs_warn(NULL, "%s: dquot reclaim failed", __func__);
- /*
- * Check against high water mark to see if we want to pop
- * a nincompoop dquot off the freelist.
- */
- if (atomic_read(&xfs_Gqm->qm_totaldquots) >= ndquot) {
- /*
- * Try to recycle a dquot from the freelist.
- */
- if ((dqp = xfs_qm_dqreclaim_one())) {
- XQM_STATS_INC(xqmstats.xs_qm_dqreclaims);
- /*
- * Just zero the core here. The rest will get
- * reinitialized by caller. XXX we shouldn't even
- * do this zero ...
- */
- memset(&dqp->q_core, 0, sizeof(dqp->q_core));
- *O_dqpp = dqp;
- return B_FALSE;
- }
- XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses);
+ while (!list_empty(&dispose_list)) {
+ dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru);
+ list_del_init(&dqp->q_lru);
+ xfs_qm_dqfree_one(dqp);
}
- /*
- * Allocate a brand new dquot on the kernel heap and return it
- * to the caller to initialize.
- */
- ASSERT(xfs_Gqm->qm_dqzone != NULL);
- *O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP);
- atomic_inc(&xfs_Gqm->qm_totaldquots);
-
- return B_TRUE;
+out:
+ return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure;
}
-
/*
* Start a transaction and write the incore superblock changes to
* disk. flags parameter indicates which fields have changed.
@@ -2151,10 +1679,7 @@ xfs_qm_vop_dqalloc(
* this to caller
*/
ASSERT(ip->i_udquot);
- uq = ip->i_udquot;
- xfs_dqlock(uq);
- XFS_DQHOLD(uq);
- xfs_dqunlock(uq);
+ uq = xfs_qm_dqhold(ip->i_udquot);
}
}
if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
@@ -2175,10 +1700,7 @@ xfs_qm_vop_dqalloc(
xfs_ilock(ip, lockflags);
} else {
ASSERT(ip->i_gdquot);
- gq = ip->i_gdquot;
- xfs_dqlock(gq);
- XFS_DQHOLD(gq);
- xfs_dqunlock(gq);
+ gq = xfs_qm_dqhold(ip->i_gdquot);
}
} else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
if (xfs_get_projid(ip) != prid) {
@@ -2198,10 +1720,7 @@ xfs_qm_vop_dqalloc(
xfs_ilock(ip, lockflags);
} else {
ASSERT(ip->i_gdquot);
- gq = ip->i_gdquot;
- xfs_dqlock(gq);
- XFS_DQHOLD(gq);
- xfs_dqunlock(gq);
+ gq = xfs_qm_dqhold(ip->i_gdquot);
}
}
if (uq)
@@ -2251,14 +1770,10 @@ xfs_qm_vop_chown(
xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1);
/*
- * Take an extra reference, because the inode
- * is going to keep this dquot pointer even
- * after the trans_commit.
+ * Take an extra reference, because the inode is going to keep
+ * this dquot pointer even after the trans_commit.
*/
- xfs_dqlock(newdq);
- XFS_DQHOLD(newdq);
- xfs_dqunlock(newdq);
- *IO_olddq = newdq;
+ *IO_olddq = xfs_qm_dqhold(newdq);
return prevdq;
}
@@ -2390,25 +1905,21 @@ xfs_qm_vop_create_dqattach(
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
if (udqp) {
- xfs_dqlock(udqp);
- XFS_DQHOLD(udqp);
- xfs_dqunlock(udqp);
ASSERT(ip->i_udquot == NULL);
- ip->i_udquot = udqp;
ASSERT(XFS_IS_UQUOTA_ON(mp));
ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id));
+
+ ip->i_udquot = xfs_qm_dqhold(udqp);
xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
}
if (gdqp) {
- xfs_dqlock(gdqp);
- XFS_DQHOLD(gdqp);
- xfs_dqunlock(gdqp);
ASSERT(ip->i_gdquot == NULL);
- ip->i_gdquot = gdqp;
ASSERT(XFS_IS_OQUOTA_ON(mp));
ASSERT((XFS_IS_GQUOTA_ON(mp) ?
ip->i_d.di_gid : xfs_get_projid(ip)) ==
be32_to_cpu(gdqp->q_core.d_id));
+
+ ip->i_gdquot = xfs_qm_dqhold(gdqp);
xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
}
}
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 43b9abe1052..44b858b79d7 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -21,39 +21,10 @@
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_quota_priv.h"
-#include "xfs_qm_stats.h"
-struct xfs_qm;
struct xfs_inode;
-extern uint ndquot;
-extern struct mutex xfs_Gqm_lock;
-extern struct xfs_qm *xfs_Gqm;
-extern kmem_zone_t *qm_dqzone;
-extern kmem_zone_t *qm_dqtrxzone;
-
-/*
- * Used in xfs_qm_sync called by xfs_sync to count the max times that it can
- * iterate over the mountpt's dquot list in one call.
- */
-#define XFS_QM_SYNC_MAX_RESTARTS 7
-
-/*
- * Ditto, for xfs_qm_dqreclaim_one.
- */
-#define XFS_QM_RECLAIM_MAX_RESTARTS 4
-
-/*
- * Ideal ratio of free to in use dquots. Quota manager makes an attempt
- * to keep this balance.
- */
-#define XFS_QM_DQFREE_RATIO 2
-
-/*
- * Dquot hashtable constants/threshold values.
- */
-#define XFS_QM_HASHSIZE_LOW (PAGE_SIZE / sizeof(xfs_dqhash_t))
-#define XFS_QM_HASHSIZE_HIGH ((PAGE_SIZE * 4) / sizeof(xfs_dqhash_t))
+extern struct kmem_zone *xfs_qm_dqtrxzone;
/*
* This defines the unit of allocation of dquots.
@@ -66,37 +37,20 @@ extern kmem_zone_t *qm_dqtrxzone;
*/
#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
-typedef xfs_dqhash_t xfs_dqlist_t;
-
-/*
- * Quota Manager (global) structure. Lives only in core.
- */
-typedef struct xfs_qm {
- xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */
- xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */
- uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */
- struct list_head qm_dqfrlist; /* freelist of dquots */
- struct mutex qm_dqfrlist_lock;
- int qm_dqfrlist_cnt;
- atomic_t qm_totaldquots; /* total incore dquots */
- uint qm_nrefs; /* file systems with quota on */
- int qm_dqfree_ratio;/* ratio of free to inuse dquots */
- kmem_zone_t *qm_dqzone; /* dquot mem-alloc zone */
- kmem_zone_t *qm_dqtrxzone; /* t_dqinfo of transactions */
-} xfs_qm_t;
-
/*
* Various quota information for individual filesystems.
* The mount structure keeps a pointer to this.
*/
typedef struct xfs_quotainfo {
+ struct radix_tree_root qi_uquota_tree;
+ struct radix_tree_root qi_gquota_tree;
+ struct mutex qi_tree_lock;
xfs_inode_t *qi_uquotaip; /* user quota inode */
xfs_inode_t *qi_gquotaip; /* group quota inode */
- struct list_head qi_dqlist; /* all dquots in filesys */
- struct mutex qi_dqlist_lock;
+ struct list_head qi_lru_list;
+ struct mutex qi_lru_lock;
+ int qi_lru_count;
int qi_dquots;
- int qi_dqreclaims; /* a change here indicates
- a removal in the dqlist */
time_t qi_btimelimit; /* limit for blks timer */
time_t qi_itimelimit; /* limit for inodes timer */
time_t qi_rtbtimelimit;/* limit for rt blks timer */
@@ -112,8 +66,14 @@ typedef struct xfs_quotainfo {
xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */
xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */
xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */
+ struct shrinker qi_shrinker;
} xfs_quotainfo_t;
+#define XFS_DQUOT_TREE(qi, type) \
+ ((type & XFS_DQ_USER) ? \
+ &((qi)->qi_uquota_tree) : \
+ &((qi)->qi_gquota_tree))
+
extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
@@ -149,8 +109,7 @@ extern int xfs_qm_quotacheck(xfs_mount_t *);
extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
/* dquot stuff */
-extern boolean_t xfs_qm_dqalloc_incore(xfs_dquot_t **);
-extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint);
+extern void xfs_qm_dqpurge_all(xfs_mount_t *, uint);
extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
/* quota ops */
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index a0a829addca..6b39115bf14 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -17,9 +17,7 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -40,28 +38,28 @@
STATIC void
xfs_fill_statvfs_from_dquot(
struct kstatfs *statp,
- xfs_disk_dquot_t *dp)
+ struct xfs_dquot *dqp)
{
__uint64_t limit;
- limit = dp->d_blk_softlimit ?
- be64_to_cpu(dp->d_blk_softlimit) :
- be64_to_cpu(dp->d_blk_hardlimit);
+ limit = dqp->q_core.d_blk_softlimit ?
+ be64_to_cpu(dqp->q_core.d_blk_softlimit) :
+ be64_to_cpu(dqp->q_core.d_blk_hardlimit);
if (limit && statp->f_blocks > limit) {
statp->f_blocks = limit;
statp->f_bfree = statp->f_bavail =
- (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ?
- (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0;
+ (statp->f_blocks > dqp->q_res_bcount) ?
+ (statp->f_blocks - dqp->q_res_bcount) : 0;
}
- limit = dp->d_ino_softlimit ?
- be64_to_cpu(dp->d_ino_softlimit) :
- be64_to_cpu(dp->d_ino_hardlimit);
+ limit = dqp->q_core.d_ino_softlimit ?
+ be64_to_cpu(dqp->q_core.d_ino_softlimit) :
+ be64_to_cpu(dqp->q_core.d_ino_hardlimit);
if (limit && statp->f_files > limit) {
statp->f_files = limit;
statp->f_ffree =
- (statp->f_files > be64_to_cpu(dp->d_icount)) ?
- (statp->f_ffree - be64_to_cpu(dp->d_icount)) : 0;
+ (statp->f_files > dqp->q_res_icount) ?
+ (statp->f_ffree - dqp->q_res_icount) : 0;
}
}
@@ -82,7 +80,7 @@ xfs_qm_statvfs(
xfs_dquot_t *dqp;
if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) {
- xfs_fill_statvfs_from_dquot(statp, &dqp->q_core);
+ xfs_fill_statvfs_from_dquot(statp, dqp);
xfs_qm_dqput(dqp);
}
}
@@ -156,21 +154,3 @@ xfs_qm_newmount(
return 0;
}
-
-void __init
-xfs_qm_init(void)
-{
- printk(KERN_INFO "SGI XFS Quota Management subsystem\n");
- mutex_init(&xfs_Gqm_lock);
- xfs_qm_init_procfs();
-}
-
-void __exit
-xfs_qm_exit(void)
-{
- xfs_qm_cleanup_procfs();
- if (qm_dqzone)
- kmem_zone_destroy(qm_dqzone);
- if (qm_dqtrxzone)
- kmem_zone_destroy(qm_dqtrxzone);
-}
diff --git a/fs/xfs/xfs_qm_stats.c b/fs/xfs/xfs_qm_stats.c
deleted file mode 100644
index 8671a0b3264..00000000000
--- a/fs/xfs/xfs_qm_stats.c
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_itable.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_qm.h"
-
-struct xqmstats xqmstats;
-
-static int xqm_proc_show(struct seq_file *m, void *v)
-{
- /* maximum; incore; ratio free to inuse; freelist */
- seq_printf(m, "%d\t%d\t%d\t%u\n",
- ndquot,
- xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
- xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
- xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0);
- return 0;
-}
-
-static int xqm_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, xqm_proc_show, NULL);
-}
-
-static const struct file_operations xqm_proc_fops = {
- .owner = THIS_MODULE,
- .open = xqm_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static int xqmstat_proc_show(struct seq_file *m, void *v)
-{
- /* quota performance statistics */
- seq_printf(m, "qm %u %u %u %u %u %u %u %u\n",
- xqmstats.xs_qm_dqreclaims,
- xqmstats.xs_qm_dqreclaim_misses,
- xqmstats.xs_qm_dquot_dups,
- xqmstats.xs_qm_dqcachemisses,
- xqmstats.xs_qm_dqcachehits,
- xqmstats.xs_qm_dqwants,
- xqmstats.xs_qm_dqshake_reclaims,
- xqmstats.xs_qm_dqinact_reclaims);
- return 0;
-}
-
-static int xqmstat_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, xqmstat_proc_show, NULL);
-}
-
-static const struct file_operations xqmstat_proc_fops = {
- .owner = THIS_MODULE,
- .open = xqmstat_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-void
-xfs_qm_init_procfs(void)
-{
- proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops);
- proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops);
-}
-
-void
-xfs_qm_cleanup_procfs(void)
-{
- remove_proc_entry("fs/xfs/xqm", NULL);
- remove_proc_entry("fs/xfs/xqmstat", NULL);
-}
diff --git a/fs/xfs/xfs_qm_stats.h b/fs/xfs/xfs_qm_stats.h
deleted file mode 100644
index 5b964fc0dc0..00000000000
--- a/fs/xfs/xfs_qm_stats.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2002 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_QM_STATS_H__
-#define __XFS_QM_STATS_H__
-
-#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
-
-/*
- * XQM global statistics
- */
-struct xqmstats {
- __uint32_t xs_qm_dqreclaims;
- __uint32_t xs_qm_dqreclaim_misses;
- __uint32_t xs_qm_dquot_dups;
- __uint32_t xs_qm_dqcachemisses;
- __uint32_t xs_qm_dqcachehits;
- __uint32_t xs_qm_dqwants;
- __uint32_t xs_qm_dqshake_reclaims;
- __uint32_t xs_qm_dqinact_reclaims;
-};
-
-extern struct xqmstats xqmstats;
-
-# define XQM_STATS_INC(count) ( (count)++ )
-
-extern void xfs_qm_init_procfs(void);
-extern void xfs_qm_cleanup_procfs(void);
-
-#else
-
-# define XQM_STATS_INC(count) do { } while (0)
-
-static inline void xfs_qm_init_procfs(void) { };
-static inline void xfs_qm_cleanup_procfs(void) { };
-
-#endif
-
-#endif /* __XFS_QM_STATS_H__ */
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 5cc3dde1bc9..858a3b18611 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -22,7 +22,6 @@
#include "xfs_fs.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -31,6 +30,7 @@
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
+#include "xfs_inode_item.h"
#include "xfs_itable.h"
#include "xfs_bmap.h"
#include "xfs_rtalloc.h"
@@ -46,9 +46,6 @@ STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
uint);
STATIC uint xfs_qm_export_flags(uint);
STATIC uint xfs_qm_export_qtype_flags(uint);
-STATIC void xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *,
- fs_disk_quota_t *);
-
/*
* Turn off quota accounting and/or enforcement for all udquots and/or
@@ -68,7 +65,6 @@ xfs_qm_scall_quotaoff(
int error;
uint inactivate_flags;
xfs_qoff_logitem_t *qoffstart;
- int nculprits;
/*
* No file system can have quotas enabled on disk but not in core.
@@ -174,18 +170,13 @@ xfs_qm_scall_quotaoff(
* This isn't protected by a particular lock directly, because we
* don't want to take a mrlock every time we depend on quotas being on.
*/
- mp->m_qflags &= ~(flags);
+ mp->m_qflags &= ~flags;
/*
* Go through all the dquots of this file system and purge them,
- * according to what was turned off. We may not be able to get rid
- * of all dquots, because dquots can have temporary references that
- * are not attached to inodes. eg. xfs_setattr, xfs_create.
- * So, if we couldn't purge all the dquots from the filesystem,
- * we can't get rid of the incore data structures.
+ * according to what was turned off.
*/
- while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype)))
- delay(10 * nculprits);
+ xfs_qm_dqpurge_all(mp, dqtype);
/*
* Transactions that had started before ACTIVE state bit was cleared
@@ -263,13 +254,18 @@ xfs_qm_scall_trunc_qfile(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- error = xfs_itruncate_data(&tp, ip, 0);
+ ip->i_d.di_size = 0;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
if (error) {
xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
XFS_TRANS_ABORT);
goto out_unlock;
}
+ ASSERT(ip->i_d.di_nextents == 0);
+
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
@@ -629,42 +625,6 @@ xfs_qm_scall_setqlim(
return error;
}
-int
-xfs_qm_scall_getquota(
- xfs_mount_t *mp,
- xfs_dqid_t id,
- uint type,
- fs_disk_quota_t *out)
-{
- xfs_dquot_t *dqp;
- int error;
-
- /*
- * Try to get the dquot. We don't want it allocated on disk, so
- * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
- * exist, we'll get ENOENT back.
- */
- if ((error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp))) {
- return (error);
- }
-
- /*
- * If everything's NULL, this dquot doesn't quite exist as far as
- * our utility programs are concerned.
- */
- if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
- xfs_qm_dqput(dqp);
- return XFS_ERROR(ENOENT);
- }
- /*
- * Convert the disk dquot to the exportable format
- */
- xfs_qm_export_dquot(mp, &dqp->q_core, out);
- xfs_qm_dqput(dqp);
- return (error ? XFS_ERROR(EFAULT) : 0);
-}
-
-
STATIC int
xfs_qm_log_quotaoff_end(
xfs_mount_t *mp,
@@ -753,50 +713,66 @@ error0:
}
-/*
- * Translate an internal style on-disk-dquot to the exportable format.
- * The main differences are that the counters/limits are all in Basic
- * Blocks (BBs) instead of the internal FSBs, and all on-disk data has
- * to be converted to the native endianness.
- */
-STATIC void
-xfs_qm_export_dquot(
- xfs_mount_t *mp,
- xfs_disk_dquot_t *src,
+int
+xfs_qm_scall_getquota(
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ uint type,
struct fs_disk_quota *dst)
{
+ struct xfs_dquot *dqp;
+ int error;
+
+ /*
+ * Try to get the dquot. We don't want it allocated on disk, so
+ * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
+ * exist, we'll get ENOENT back.
+ */
+ error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp);
+ if (error)
+ return error;
+
+ /*
+ * If everything's NULL, this dquot doesn't quite exist as far as
+ * our utility programs are concerned.
+ */
+ if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
+ error = XFS_ERROR(ENOENT);
+ goto out_put;
+ }
+
memset(dst, 0, sizeof(*dst));
- dst->d_version = FS_DQUOT_VERSION; /* different from src->d_version */
- dst->d_flags = xfs_qm_export_qtype_flags(src->d_flags);
- dst->d_id = be32_to_cpu(src->d_id);
+ dst->d_version = FS_DQUOT_VERSION;
+ dst->d_flags = xfs_qm_export_qtype_flags(dqp->q_core.d_flags);
+ dst->d_id = be32_to_cpu(dqp->q_core.d_id);
dst->d_blk_hardlimit =
- XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_hardlimit));
+ XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
dst->d_blk_softlimit =
- XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_softlimit));
- dst->d_ino_hardlimit = be64_to_cpu(src->d_ino_hardlimit);
- dst->d_ino_softlimit = be64_to_cpu(src->d_ino_softlimit);
- dst->d_bcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_bcount));
- dst->d_icount = be64_to_cpu(src->d_icount);
- dst->d_btimer = be32_to_cpu(src->d_btimer);
- dst->d_itimer = be32_to_cpu(src->d_itimer);
- dst->d_iwarns = be16_to_cpu(src->d_iwarns);
- dst->d_bwarns = be16_to_cpu(src->d_bwarns);
+ XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit));
+ dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
+ dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
+ dst->d_bcount = XFS_FSB_TO_BB(mp, dqp->q_res_bcount);
+ dst->d_icount = dqp->q_res_icount;
+ dst->d_btimer = be32_to_cpu(dqp->q_core.d_btimer);
+ dst->d_itimer = be32_to_cpu(dqp->q_core.d_itimer);
+ dst->d_iwarns = be16_to_cpu(dqp->q_core.d_iwarns);
+ dst->d_bwarns = be16_to_cpu(dqp->q_core.d_bwarns);
dst->d_rtb_hardlimit =
- XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_hardlimit));
+ XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit));
dst->d_rtb_softlimit =
- XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_softlimit));
- dst->d_rtbcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtbcount));
- dst->d_rtbtimer = be32_to_cpu(src->d_rtbtimer);
- dst->d_rtbwarns = be16_to_cpu(src->d_rtbwarns);
+ XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit));
+ dst->d_rtbcount = XFS_FSB_TO_BB(mp, dqp->q_res_rtbcount);
+ dst->d_rtbtimer = be32_to_cpu(dqp->q_core.d_rtbtimer);
+ dst->d_rtbwarns = be16_to_cpu(dqp->q_core.d_rtbwarns);
/*
* Internally, we don't reset all the timers when quota enforcement
* gets turned off. No need to confuse the user level code,
* so return zeroes in that case.
*/
- if ((!XFS_IS_UQUOTA_ENFORCED(mp) && src->d_flags == XFS_DQ_USER) ||
+ if ((!XFS_IS_UQUOTA_ENFORCED(mp) && dqp->q_core.d_flags == XFS_DQ_USER) ||
(!XFS_IS_OQUOTA_ENFORCED(mp) &&
- (src->d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) {
+ (dqp->q_core.d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) {
dst->d_btimer = 0;
dst->d_itimer = 0;
dst->d_rtbtimer = 0;
@@ -807,16 +783,19 @@ xfs_qm_export_dquot(
(XFS_IS_OQUOTA_ENFORCED(mp) &&
(dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
dst->d_id != 0) {
- if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) &&
+ if (((int) dst->d_bcount > (int) dst->d_blk_softlimit) &&
(dst->d_blk_softlimit > 0)) {
ASSERT(dst->d_btimer != 0);
}
- if (((int) dst->d_icount >= (int) dst->d_ino_softlimit) &&
+ if (((int) dst->d_icount > (int) dst->d_ino_softlimit) &&
(dst->d_ino_softlimit > 0)) {
ASSERT(dst->d_itimer != 0);
}
}
#endif
+out_put:
+ xfs_qm_dqput(dqp);
+ return error;
}
STATIC uint
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index a595f29567f..b50ec5b95d5 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -87,8 +87,7 @@ typedef struct xfs_dqblk {
#define XFS_DQ_PROJ 0x0002 /* project quota */
#define XFS_DQ_GROUP 0x0004 /* a group quota */
#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */
-#define XFS_DQ_WANT 0x0010 /* for lookup/reclaim race */
-#define XFS_DQ_INACTIVE 0x0020 /* dq off mplist & hashlist */
+#define XFS_DQ_FREEING 0x0010 /* dquot is beeing torn down */
#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
@@ -97,8 +96,7 @@ typedef struct xfs_dqblk {
{ XFS_DQ_PROJ, "PROJ" }, \
{ XFS_DQ_GROUP, "GROUP" }, \
{ XFS_DQ_DIRTY, "DIRTY" }, \
- { XFS_DQ_WANT, "WANT" }, \
- { XFS_DQ_INACTIVE, "INACTIVE" }
+ { XFS_DQ_FREEING, "FREEING" }
/*
* In the worst case, when both user and group quotas are on,
@@ -176,6 +174,8 @@ typedef struct xfs_qoff_logformat {
#define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */
#define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */
#define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */
+#define XFS_ALL_QUOTA_ACTIVE \
+ (XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE)
/*
* Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
@@ -199,7 +199,6 @@ typedef struct xfs_qoff_logformat {
#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */
#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
-#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */
#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */
#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
@@ -326,7 +325,6 @@ extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint);
extern void xfs_qm_dqdetach(struct xfs_inode *);
extern void xfs_qm_dqrele(struct xfs_dquot *);
extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *);
-extern int xfs_qm_sync(struct xfs_mount *, int);
extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *);
extern void xfs_qm_mount_quotas(struct xfs_mount *);
extern void xfs_qm_unmount(struct xfs_mount *);
@@ -366,10 +364,6 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
#define xfs_qm_dqdetach(ip)
#define xfs_qm_dqrele(d)
#define xfs_qm_statvfs(ip, s)
-static inline int xfs_qm_sync(struct xfs_mount *mp, int flags)
-{
- return 0;
-}
#define xfs_qm_newmount(mp, a, b) (0)
#define xfs_qm_mount_quotas(mp)
#define xfs_qm_unmount(mp)
diff --git a/fs/xfs/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h
index 94a3d927d71..6d86219d93d 100644
--- a/fs/xfs/xfs_quota_priv.h
+++ b/fs/xfs/xfs_quota_priv.h
@@ -24,17 +24,6 @@
*/
#define XFS_DQITER_MAP_SIZE 10
-/*
- * Hash into a bucket in the dquot hash table, based on <mp, id>.
- */
-#define XFS_DQ_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
- (__psunsigned_t)(id)) & \
- (xfs_Gqm->qm_dqhashmask - 1))
-#define XFS_DQ_HASH(mp, id, type) (type == XFS_DQ_USER ? \
- (xfs_Gqm->qm_usr_dqhtable + \
- XFS_DQ_HASHVAL(mp, id)) : \
- (xfs_Gqm->qm_grp_dqhtable + \
- XFS_DQ_HASHVAL(mp, id)))
#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
!dqp->q_core.d_blk_hardlimit && \
!dqp->q_core.d_blk_softlimit && \
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 7e76f537abb..fed504fc299 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -17,7 +17,6 @@
*/
#include "xfs.h"
#include "xfs_sb.h"
-#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 866de277079..30ff5f401d2 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -19,7 +19,6 @@
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -118,17 +117,6 @@ xfs_rename(
new_parent = (src_dp != target_dp);
src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
- if (src_is_directory) {
- /*
- * Check for link count overflow on target_dp
- */
- if (target_ip == NULL && new_parent &&
- target_dp->i_d.di_nlink >= XFS_MAXLINK) {
- error = XFS_ERROR(EMLINK);
- goto std_return;
- }
- }
-
xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
inodes, &num_inodes);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 87323f1ded6..92d4331cd4f 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -20,7 +20,6 @@
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -34,7 +33,6 @@
#include "xfs_rtalloc.h"
#include "xfs_fsops.h"
#include "xfs_error.h"
-#include "xfs_rw.h"
#include "xfs_inode_item.h"
#include "xfs_trans_space.h"
#include "xfs_utils.h"
@@ -183,6 +181,7 @@ error_cancel:
oblocks = map.br_startoff + map.br_blockcount;
}
return 0;
+
error:
return error;
}
@@ -1871,9 +1870,9 @@ xfs_growfs_rt(
/*
* Read in the last block of the device, make sure it exists.
*/
- bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
+ bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
XFS_FSB_TO_BB(mp, nrblocks - 1),
- XFS_FSB_TO_B(mp, 1), 0);
+ XFS_FSB_TO_BB(mp, 1), 0);
if (!bp)
return EIO;
xfs_buf_relse(bp);
@@ -2139,11 +2138,9 @@ xfs_rtfree_extent(
xfs_buf_t *sumbp; /* summary file block buffer */
mp = tp->t_mountp;
- /*
- * Synchronize by locking the bitmap inode.
- */
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+
+ ASSERT(mp->m_rbmip->i_itemp != NULL);
+ ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
#if defined(__KERNEL__) && defined(DEBUG)
/*
@@ -2220,9 +2217,9 @@ xfs_rtmount_init(
(unsigned long long) mp->m_sb.sb_rblocks);
return XFS_ERROR(EFBIG);
}
- bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
+ bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
d - XFS_FSB_TO_BB(mp, 1),
- XFS_FSB_TO_B(mp, 1), 0);
+ XFS_FSB_TO_BB(mp, 1), 0);
if (!bp) {
xfs_warn(mp, "realtime device size check failed");
return EIO;
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
deleted file mode 100644
index 597d044a09a..00000000000
--- a/fs/xfs/xfs_rw.c
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_error.h"
-#include "xfs_rw.h"
-
-/*
- * Force a shutdown of the filesystem instantly while keeping
- * the filesystem consistent. We don't do an unmount here; just shutdown
- * the shop, make sure that absolutely nothing persistent happens to
- * this filesystem after this point.
- */
-void
-xfs_do_force_shutdown(
- xfs_mount_t *mp,
- int flags,
- char *fname,
- int lnnum)
-{
- int logerror;
-
- logerror = flags & SHUTDOWN_LOG_IO_ERROR;
-
- if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
- xfs_notice(mp,
- "%s(0x%x) called from line %d of file %s. Return address = 0x%p",
- __func__, flags, lnnum, fname, __return_address);
- }
- /*
- * No need to duplicate efforts.
- */
- if (XFS_FORCED_SHUTDOWN(mp) && !logerror)
- return;
-
- /*
- * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
- * queue up anybody new on the log reservations, and wakes up
- * everybody who's sleeping on log reservations to tell them
- * the bad news.
- */
- if (xfs_log_force_umount(mp, logerror))
- return;
-
- if (flags & SHUTDOWN_CORRUPT_INCORE) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
- "Corruption of in-memory data detected. Shutting down filesystem");
- if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
- xfs_stack_trace();
- } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
- if (logerror) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
- "Log I/O Error Detected. Shutting down filesystem");
- } else if (flags & SHUTDOWN_DEVICE_REQ) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
- "All device paths lost. Shutting down filesystem");
- } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
- "I/O Error Detected. Shutting down filesystem");
- }
- }
- if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
- xfs_alert(mp,
- "Please umount the filesystem and rectify the problem(s)");
- }
-}
-
-/*
- * This isn't an absolute requirement, but it is
- * just a good idea to call xfs_read_buf instead of
- * directly doing a read_buf call. For one, we shouldn't
- * be doing this disk read if we are in SHUTDOWN state anyway,
- * so this stops that from happening. Secondly, this does all
- * the error checking stuff and the brelse if appropriate for
- * the caller, so the code can be a little leaner.
- */
-
-int
-xfs_read_buf(
- struct xfs_mount *mp,
- xfs_buftarg_t *target,
- xfs_daddr_t blkno,
- int len,
- uint flags,
- xfs_buf_t **bpp)
-{
- xfs_buf_t *bp;
- int error;
-
- if (!flags)
- flags = XBF_LOCK | XBF_MAPPED;
-
- bp = xfs_buf_read(target, blkno, len, flags);
- if (!bp)
- return XFS_ERROR(EIO);
- error = bp->b_error;
- if (!error && !XFS_FORCED_SHUTDOWN(mp)) {
- *bpp = bp;
- } else {
- *bpp = NULL;
- if (error) {
- xfs_buf_ioerror_alert(bp, __func__);
- } else {
- error = XFS_ERROR(EIO);
- }
- if (bp) {
- XFS_BUF_UNDONE(bp);
- xfs_buf_stale(bp);
- /*
- * brelse clears B_ERROR and b_error
- */
- xfs_buf_relse(bp);
- }
- }
- return (error);
-}
-
-/*
- * helper function to extract extent size hint from inode
- */
-xfs_extlen_t
-xfs_get_extsz_hint(
- struct xfs_inode *ip)
-{
- if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
- return ip->i_d.di_extsize;
- if (XFS_IS_REALTIME_INODE(ip))
- return ip->i_mount->m_sb.sb_rextsize;
- return 0;
-}
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
deleted file mode 100644
index bbdb9ad6a4b..00000000000
--- a/fs/xfs/xfs_rw.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_RW_H__
-#define __XFS_RW_H__
-
-struct xfs_buf;
-struct xfs_inode;
-struct xfs_mount;
-
-/*
- * Convert the given file system block to a disk block.
- * We have to treat it differently based on whether the
- * file is a real time file or not, because the bmap code
- * does.
- */
-static inline xfs_daddr_t
-xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
-{
- return (XFS_IS_REALTIME_INODE(ip) ? \
- (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
- XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
-}
-
-/*
- * Prototypes for functions in xfs_rw.c.
- */
-extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
- xfs_daddr_t blkno, int len, uint flags,
- struct xfs_buf **bpp);
-extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
-
-#endif /* __XFS_RW_H__ */
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index cb6ae715814..f429d9d5d32 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -529,7 +529,6 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
#define XFS_BB_TO_FSB(mp,bb) \
(((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log)
#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log)
-#define XFS_BB_FSB_OFFSET(mp,bb) ((bb) & ((mp)->m_bsize - 1))
/*
* File system block to byte conversions.
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 76fdc586193..ce372b7d564 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -20,9 +20,18 @@
DEFINE_PER_CPU(struct xfsstats, xfsstats);
+static int counter_val(int idx)
+{
+ int val = 0, cpu;
+
+ for_each_possible_cpu(cpu)
+ val += *(((__u32 *)&per_cpu(xfsstats, cpu) + idx));
+ return val;
+}
+
static int xfs_stat_proc_show(struct seq_file *m, void *v)
{
- int c, i, j, val;
+ int i, j;
__uint64_t xs_xstrat_bytes = 0;
__uint64_t xs_write_bytes = 0;
__uint64_t xs_read_bytes = 0;
@@ -50,20 +59,16 @@ static int xfs_stat_proc_show(struct seq_file *m, void *v)
{ "abtc2", XFSSTAT_END_ABTC_V2 },
{ "bmbt2", XFSSTAT_END_BMBT_V2 },
{ "ibt2", XFSSTAT_END_IBT_V2 },
+ /* we print both series of quota information together */
+ { "qm", XFSSTAT_END_QM },
};
/* Loop over all stats groups */
- for (i=j = 0; i < ARRAY_SIZE(xstats); i++) {
+ for (i = j = 0; i < ARRAY_SIZE(xstats); i++) {
seq_printf(m, "%s", xstats[i].desc);
/* inner loop does each group */
- while (j < xstats[i].endpoint) {
- val = 0;
- /* sum over all cpus */
- for_each_possible_cpu(c)
- val += *(((__u32*)&per_cpu(xfsstats, c) + j));
- seq_printf(m, " %u", val);
- j++;
- }
+ for (; j < xstats[i].endpoint; j++)
+ seq_printf(m, " %u", counter_val(j));
seq_putc(m, '\n');
}
/* extra precision counters */
@@ -97,6 +102,58 @@ static const struct file_operations xfs_stat_proc_fops = {
.release = single_release,
};
+/* legacy quota interfaces */
+#ifdef CONFIG_XFS_QUOTA
+static int xqm_proc_show(struct seq_file *m, void *v)
+{
+ /* maximum; incore; ratio free to inuse; freelist */
+ seq_printf(m, "%d\t%d\t%d\t%u\n",
+ 0,
+ counter_val(XFSSTAT_END_XQMSTAT),
+ 0,
+ counter_val(XFSSTAT_END_XQMSTAT + 1));
+ return 0;
+}
+
+static int xqm_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, xqm_proc_show, NULL);
+}
+
+static const struct file_operations xqm_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = xqm_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/* legacy quota stats interface no 2 */
+static int xqmstat_proc_show(struct seq_file *m, void *v)
+{
+ int j;
+
+ seq_printf(m, "qm");
+ for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++)
+ seq_printf(m, " %u", counter_val(j));
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static int xqmstat_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, xqmstat_proc_show, NULL);
+}
+
+static const struct file_operations xqmstat_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = xqmstat_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif /* CONFIG_XFS_QUOTA */
+
int
xfs_init_procfs(void)
{
@@ -105,10 +162,24 @@ xfs_init_procfs(void)
if (!proc_create("fs/xfs/stat", 0, NULL,
&xfs_stat_proc_fops))
- goto out_remove_entry;
+ goto out_remove_xfs_dir;
+#ifdef CONFIG_XFS_QUOTA
+ if (!proc_create("fs/xfs/xqmstat", 0, NULL,
+ &xqmstat_proc_fops))
+ goto out_remove_stat_file;
+ if (!proc_create("fs/xfs/xqm", 0, NULL,
+ &xqm_proc_fops))
+ goto out_remove_xqmstat_file;
+#endif
return 0;
- out_remove_entry:
+#ifdef CONFIG_XFS_QUOTA
+ out_remove_xqmstat_file:
+ remove_proc_entry("fs/xfs/xqmstat", NULL);
+ out_remove_stat_file:
+ remove_proc_entry("fs/xfs/stat", NULL);
+#endif
+ out_remove_xfs_dir:
remove_proc_entry("fs/xfs", NULL);
out:
return -ENOMEM;
@@ -117,6 +188,10 @@ xfs_init_procfs(void)
void
xfs_cleanup_procfs(void)
{
+#ifdef CONFIG_XFS_QUOTA
+ remove_proc_entry("fs/xfs/xqm", NULL);
+ remove_proc_entry("fs/xfs/xqmstat", NULL);
+#endif
remove_proc_entry("fs/xfs/stat", NULL);
remove_proc_entry("fs/xfs", NULL);
}
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 736854b1ca1..c03ad38ceae 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -183,6 +183,16 @@ struct xfsstats {
__uint32_t xs_ibt_2_alloc;
__uint32_t xs_ibt_2_free;
__uint32_t xs_ibt_2_moves;
+#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_IBT_V2+6)
+ __uint32_t xs_qm_dqreclaims;
+ __uint32_t xs_qm_dqreclaim_misses;
+ __uint32_t xs_qm_dquot_dups;
+ __uint32_t xs_qm_dqcachemisses;
+ __uint32_t xs_qm_dqcachehits;
+ __uint32_t xs_qm_dqwants;
+#define XFSSTAT_END_QM (XFSSTAT_END_XQMSTAT+2)
+ __uint32_t xs_qm_dquot;
+ __uint32_t xs_qm_dquot_unused;
/* Extra precision counters */
__uint64_t xs_xstrat_bytes;
__uint64_t xs_write_bytes;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 8a899496fd5..0d9de41a715 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -17,7 +17,6 @@
*/
#include "xfs.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
@@ -199,7 +198,6 @@ xfs_parseargs(
mp->m_flags |= XFS_MOUNT_BARRIER;
mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
- mp->m_flags |= XFS_MOUNT_DELAYLOG;
/*
* These can be overridden by the mount option parsing.
@@ -325,10 +323,9 @@ xfs_parseargs(
} else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
mp->m_flags |= XFS_MOUNT_FILESTREAMS;
} else if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
- mp->m_qflags &= ~(XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
- XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
- XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
- XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD);
+ mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT;
+ mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD;
+ mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE;
} else if (!strcmp(this_char, MNTOPT_QUOTA) ||
!strcmp(this_char, MNTOPT_UQUOTA) ||
!strcmp(this_char, MNTOPT_USRQUOTA)) {
@@ -353,11 +350,11 @@ xfs_parseargs(
mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
mp->m_qflags &= ~XFS_OQUOTA_ENFD;
} else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
- mp->m_flags |= XFS_MOUNT_DELAYLOG;
+ xfs_warn(mp,
+ "delaylog is the default now, option is deprecated.");
} else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
- mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
xfs_warn(mp,
- "nodelaylog is deprecated and will be removed in Linux 3.3");
+ "nodelaylog support has been removed, option is deprecated.");
} else if (!strcmp(this_char, MNTOPT_DISCARD)) {
mp->m_flags |= XFS_MOUNT_DISCARD;
} else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
@@ -395,13 +392,6 @@ xfs_parseargs(
return EINVAL;
}
- if ((mp->m_flags & XFS_MOUNT_DISCARD) &&
- !(mp->m_flags & XFS_MOUNT_DELAYLOG)) {
- xfs_warn(mp,
- "the discard option is incompatible with the nodelaylog option");
- return EINVAL;
- }
-
#ifndef CONFIG_XFS_QUOTA
if (XFS_IS_QUOTA_RUNNING(mp)) {
xfs_warn(mp, "quota support not available in this kernel.");
@@ -501,7 +491,6 @@ xfs_showargs(
{ XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 },
{ XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
{ XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
- { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
{ XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD },
{ 0, NULL }
};
@@ -632,7 +621,7 @@ void
xfs_blkdev_issue_flush(
xfs_buftarg_t *buftarg)
{
- blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL);
+ blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS, NULL);
}
STATIC void
@@ -769,6 +758,43 @@ xfs_setup_devices(
return 0;
}
+STATIC int
+xfs_init_mount_workqueues(
+ struct xfs_mount *mp)
+{
+ mp->m_data_workqueue = alloc_workqueue("xfs-data/%s",
+ WQ_MEM_RECLAIM, 0, mp->m_fsname);
+ if (!mp->m_data_workqueue)
+ goto out;
+
+ mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
+ WQ_MEM_RECLAIM, 0, mp->m_fsname);
+ if (!mp->m_unwritten_workqueue)
+ goto out_destroy_data_iodone_queue;
+
+ mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
+ WQ_MEM_RECLAIM, 0, mp->m_fsname);
+ if (!mp->m_cil_workqueue)
+ goto out_destroy_unwritten;
+ return 0;
+
+out_destroy_unwritten:
+ destroy_workqueue(mp->m_unwritten_workqueue);
+out_destroy_data_iodone_queue:
+ destroy_workqueue(mp->m_data_workqueue);
+out:
+ return -ENOMEM;
+}
+
+STATIC void
+xfs_destroy_mount_workqueues(
+ struct xfs_mount *mp)
+{
+ destroy_workqueue(mp->m_cil_workqueue);
+ destroy_workqueue(mp->m_data_workqueue);
+ destroy_workqueue(mp->m_unwritten_workqueue);
+}
+
/* Catch misguided souls that try to use this interface on XFS */
STATIC struct inode *
xfs_fs_alloc_inode(
@@ -837,105 +863,64 @@ xfs_fs_inode_init_once(
/* xfs inode */
atomic_set(&ip->i_pincount, 0);
spin_lock_init(&ip->i_flags_lock);
- init_waitqueue_head(&ip->i_ipin_wait);
- /*
- * Because we want to use a counting completion, complete
- * the flush completion once to allow a single access to
- * the flush completion without blocking.
- */
- init_completion(&ip->i_flush);
- complete(&ip->i_flush);
mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
"xfsino", ip->i_ino);
}
/*
- * Dirty the XFS inode when mark_inode_dirty_sync() is called so that
- * we catch unlogged VFS level updates to the inode.
+ * This is called by the VFS when dirtying inode metadata. This can happen
+ * for a few reasons, but we only care about timestamp updates, given that
+ * we handled the rest ourselves. In theory no other calls should happen,
+ * but for example generic_write_end() keeps dirtying the inode after
+ * updating i_size. Thus we check that the flags are exactly I_DIRTY_SYNC,
+ * and skip this call otherwise.
*
- * We need the barrier() to maintain correct ordering between unlogged
- * updates and the transaction commit code that clears the i_update_core
- * field. This requires all updates to be completed before marking the
- * inode dirty.
+ * We'll hopefull get a different method just for updating timestamps soon,
+ * at which point this hack can go away, and maybe we'll also get real
+ * error handling here.
*/
STATIC void
xfs_fs_dirty_inode(
- struct inode *inode,
- int flags)
-{
- barrier();
- XFS_I(inode)->i_update_core = 1;
-}
-
-STATIC int
-xfs_fs_write_inode(
struct inode *inode,
- struct writeback_control *wbc)
+ int flags)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- int error = EAGAIN;
+ struct xfs_trans *tp;
+ int error;
- trace_xfs_write_inode(ip);
+ if (flags != I_DIRTY_SYNC)
+ return;
- if (XFS_FORCED_SHUTDOWN(mp))
- return -XFS_ERROR(EIO);
+ trace_xfs_dirty_inode(ip);
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) {
- /*
- * Make sure the inode has made it it into the log. Instead
- * of forcing it all the way to stable storage using a
- * synchronous transaction we let the log force inside the
- * ->sync_fs call do that for thus, which reduces the number
- * of synchronous log forces dramatically.
- */
- error = xfs_log_dirty_inode(ip, NULL, 0);
- if (error)
- goto out;
- return 0;
- } else {
- if (!ip->i_update_core)
- return 0;
-
- /*
- * We make this non-blocking if the inode is contended, return
- * EAGAIN to indicate to the caller that they did not succeed.
- * This prevents the flush path from blocking on inodes inside
- * another operation right now, they get caught later by
- * xfs_sync.
- */
- if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
- goto out;
-
- if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
- goto out_unlock;
-
- /*
- * Now we have the flush lock and the inode is not pinned, we
- * can check if the inode is really clean as we know that
- * there are no pending transaction completions, it is not
- * waiting on the delayed write queue and there is no IO in
- * progress.
- */
- if (xfs_inode_clean(ip)) {
- xfs_ifunlock(ip);
- error = 0;
- goto out_unlock;
- }
- error = xfs_iflush(ip, SYNC_TRYLOCK);
+ tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+ error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ goto trouble;
}
-
- out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- out:
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
/*
- * if we failed to write out the inode then mark
- * it dirty again so we'll try again later.
+ * Grab all the latest timestamps from the Linux inode.
*/
+ ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
+ ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
+ ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec;
+ ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec;
+ ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec;
+ ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec;
+
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
+ error = xfs_trans_commit(tp, 0);
if (error)
- xfs_mark_inode_dirty_sync(ip);
- return -error;
+ goto trouble;
+ return;
+
+trouble:
+ xfs_warn(mp, "failed to update timestamps for inode 0x%llx", ip->i_ino);
}
STATIC void
@@ -947,7 +932,7 @@ xfs_fs_evict_inode(
trace_xfs_evict_inode(ip);
truncate_inode_pages(&inode->i_data, 0);
- end_writeback(inode);
+ clear_inode(inode);
XFS_STATS_INC(vn_rele);
XFS_STATS_INC(vn_remove);
XFS_STATS_DEC(vn_active);
@@ -971,6 +956,22 @@ xfs_fs_evict_inode(
xfs_inactive(ip);
}
+/*
+ * We do an unlocked check for XFS_IDONTCACHE here because we are already
+ * serialised against cache hits here via the inode->i_lock and igrab() in
+ * xfs_iget_cache_hit(). Hence a lookup that might clear this flag will not be
+ * racing with us, and it avoids needing to grab a spinlock here for every inode
+ * we drop the final reference on.
+ */
+STATIC int
+xfs_fs_drop_inode(
+ struct inode *inode)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+
+ return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE);
+}
+
STATIC void
xfs_free_fsname(
struct xfs_mount *mp)
@@ -986,20 +987,12 @@ xfs_fs_put_super(
{
struct xfs_mount *mp = XFS_M(sb);
- xfs_syncd_stop(mp);
-
- /*
- * Blow away any referenced inode in the filestreams cache.
- * This can and will cause log traffic as inodes go inactive
- * here.
- */
xfs_filestream_unmount(mp);
-
- xfs_flush_buftarg(mp->m_ddev_targp, 1);
-
xfs_unmountfs(mp);
+ xfs_syncd_stop(mp);
xfs_freesb(mp);
xfs_icsb_destroy_counters(mp);
+ xfs_destroy_mount_workqueues(mp);
xfs_close_devices(mp);
xfs_free_fsname(mp);
kfree(mp);
@@ -1014,17 +1007,10 @@ xfs_fs_sync_fs(
int error;
/*
- * Not much we can do for the first async pass. Writing out the
- * superblock would be counter-productive as we are going to redirty
- * when writing out other data and metadata (and writing out a single
- * block is quite fast anyway).
- *
- * Try to asynchronously kick off quota syncing at least.
+ * Doing anything during the async pass would be counterproductive.
*/
- if (!wait) {
- xfs_qm_sync(mp, SYNC_TRYLOCK);
+ if (!wait)
return 0;
- }
error = xfs_quiesce_data(mp);
if (error)
@@ -1083,7 +1069,7 @@ xfs_fs_statfs(
spin_unlock(&mp->m_sb_lock);
- if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
+ if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) ==
(XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
xfs_qm_statvfs(ip, statp);
@@ -1238,9 +1224,9 @@ xfs_fs_unfreeze(
STATIC int
xfs_fs_show_options(
struct seq_file *m,
- struct vfsmount *mnt)
+ struct dentry *root)
{
- return -xfs_showargs(XFS_M(mnt->mnt_sb), m);
+ return -xfs_showargs(XFS_M(root->d_sb), m);
}
/*
@@ -1333,10 +1319,14 @@ xfs_fs_fill_super(
if (error)
goto out_free_fsname;
- error = xfs_icsb_init_counters(mp);
+ error = xfs_init_mount_workqueues(mp);
if (error)
goto out_close_devices;
+ error = xfs_icsb_init_counters(mp);
+ if (error)
+ goto out_destroy_workqueues;
+
error = xfs_readsb(mp, flags);
if (error)
goto out_destroy_counters;
@@ -1365,40 +1355,44 @@ xfs_fs_fill_super(
sb->s_blocksize = mp->m_sb.sb_blocksize;
sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
+ sb->s_max_links = XFS_MAXLINK;
sb->s_time_gran = 1;
set_posix_acl_flag(sb);
- error = xfs_mountfs(mp);
+ error = xfs_syncd_init(mp);
if (error)
goto out_filestream_unmount;
- error = xfs_syncd_init(mp);
+ error = xfs_mountfs(mp);
if (error)
- goto out_unmount;
+ goto out_syncd_stop;
root = igrab(VFS_I(mp->m_rootip));
if (!root) {
error = ENOENT;
- goto out_syncd_stop;
+ goto out_unmount;
}
if (is_bad_inode(root)) {
error = EINVAL;
- goto out_syncd_stop;
+ goto out_unmount;
}
- sb->s_root = d_alloc_root(root);
+ sb->s_root = d_make_root(root);
if (!sb->s_root) {
error = ENOMEM;
- goto out_iput;
+ goto out_unmount;
}
return 0;
-
+ out_syncd_stop:
+ xfs_syncd_stop(mp);
out_filestream_unmount:
xfs_filestream_unmount(mp);
out_free_sb:
xfs_freesb(mp);
out_destroy_counters:
xfs_icsb_destroy_counters(mp);
+out_destroy_workqueues:
+ xfs_destroy_mount_workqueues(mp);
out_close_devices:
xfs_close_devices(mp);
out_free_fsname:
@@ -1407,21 +1401,10 @@ xfs_fs_fill_super(
out:
return -error;
- out_iput:
- iput(root);
- out_syncd_stop:
- xfs_syncd_stop(mp);
out_unmount:
- /*
- * Blow away any referenced inode in the filestreams cache.
- * This can and will cause log traffic as inodes go inactive
- * here.
- */
xfs_filestream_unmount(mp);
-
- xfs_flush_buftarg(mp->m_ddev_targp, 1);
-
xfs_unmountfs(mp);
+ xfs_syncd_stop(mp);
goto out_free_sb;
}
@@ -1454,8 +1437,8 @@ static const struct super_operations xfs_super_operations = {
.alloc_inode = xfs_fs_alloc_inode,
.destroy_inode = xfs_fs_destroy_inode,
.dirty_inode = xfs_fs_dirty_inode,
- .write_inode = xfs_fs_write_inode,
.evict_inode = xfs_fs_evict_inode,
+ .drop_inode = xfs_fs_drop_inode,
.put_super = xfs_fs_put_super,
.sync_fs = xfs_fs_sync_fs,
.freeze_fs = xfs_fs_freeze,
@@ -1621,20 +1604,36 @@ STATIC int __init
xfs_init_workqueues(void)
{
/*
- * max_active is set to 8 to give enough concurency to allow
- * multiple work operations on each CPU to run. This allows multiple
- * filesystems to be running sync work concurrently, and scales with
- * the number of CPUs in the system.
+ * We never want to the same work item to run twice, reclaiming inodes
+ * or idling the log is not going to get any faster by multiple CPUs
+ * competing for ressources. Use the default large max_active value
+ * so that even lots of filesystems can perform these task in parallel.
*/
- xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8);
+ xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0);
if (!xfs_syncd_wq)
return -ENOMEM;
+
+ /*
+ * The allocation workqueue can be used in memory reclaim situations
+ * (writepage path), and parallelism is only limited by the number of
+ * AGs in all the filesystems mounted. Hence use the default large
+ * max_active value for this workqueue.
+ */
+ xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0);
+ if (!xfs_alloc_wq)
+ goto out_destroy_syncd;
+
return 0;
+
+out_destroy_syncd:
+ destroy_workqueue(xfs_syncd_wq);
+ return -ENOMEM;
}
STATIC void
xfs_destroy_workqueues(void)
{
+ destroy_workqueue(xfs_alloc_wq);
destroy_workqueue(xfs_syncd_wq);
}
@@ -1676,13 +1675,17 @@ init_xfs_fs(void)
if (error)
goto out_cleanup_procfs;
- vfs_initquota();
+ error = xfs_qm_init();
+ if (error)
+ goto out_sysctl_unregister;
error = register_filesystem(&xfs_fs_type);
if (error)
- goto out_sysctl_unregister;
+ goto out_qm_exit;
return 0;
+ out_qm_exit:
+ xfs_qm_exit();
out_sysctl_unregister:
xfs_sysctl_unregister();
out_cleanup_procfs:
@@ -1704,7 +1707,7 @@ init_xfs_fs(void)
STATIC void __exit
exit_xfs_fs(void)
{
- vfs_exitquota();
+ xfs_qm_exit();
unregister_filesystem(&xfs_fs_type);
xfs_sysctl_unregister();
xfs_cleanup_procfs();
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 50a3266c999..09b0c26b224 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -21,13 +21,11 @@
#include <linux/exportfs.h>
#ifdef CONFIG_XFS_QUOTA
-extern void xfs_qm_init(void);
+extern int xfs_qm_init(void);
extern void xfs_qm_exit(void);
-# define vfs_initquota() xfs_qm_init()
-# define vfs_exitquota() xfs_qm_exit()
#else
-# define vfs_initquota() do { } while (0)
-# define vfs_exitquota() do { } while (0)
+# define xfs_qm_init() (0)
+# define xfs_qm_exit() do { } while (0)
#endif
#ifdef CONFIG_XFS_POSIX_ACL
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index f0994aedcd1..1e9ee064dbb 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -18,7 +18,6 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
@@ -241,45 +240,6 @@ xfs_sync_inode_data(
return error;
}
-STATIC int
-xfs_sync_inode_attr(
- struct xfs_inode *ip,
- struct xfs_perag *pag,
- int flags)
-{
- int error = 0;
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_inode_clean(ip))
- goto out_unlock;
- if (!xfs_iflock_nowait(ip)) {
- if (!(flags & SYNC_WAIT))
- goto out_unlock;
- xfs_iflock(ip);
- }
-
- if (xfs_inode_clean(ip)) {
- xfs_ifunlock(ip);
- goto out_unlock;
- }
-
- error = xfs_iflush(ip, flags);
-
- /*
- * We don't want to try again on non-blocking flushes that can't run
- * again immediately. If an inode really must be written, then that's
- * what the SYNC_WAIT flag is for.
- */
- if (error == EAGAIN) {
- ASSERT(!(flags & SYNC_WAIT));
- error = 0;
- }
-
- out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- return error;
-}
-
/*
* Write out pagecache data for the whole filesystem.
*/
@@ -300,19 +260,6 @@ xfs_sync_data(
return 0;
}
-/*
- * Write out inode metadata (attributes) for the whole filesystem.
- */
-STATIC int
-xfs_sync_attr(
- struct xfs_mount *mp,
- int flags)
-{
- ASSERT((flags & ~SYNC_WAIT) == 0);
-
- return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags);
-}
-
STATIC int
xfs_sync_fsdata(
struct xfs_mount *mp)
@@ -336,32 +283,6 @@ xfs_sync_fsdata(
return error;
}
-int
-xfs_log_dirty_inode(
- struct xfs_inode *ip,
- struct xfs_perag *pag,
- int flags)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_trans *tp;
- int error;
-
- if (!ip->i_update_core)
- return 0;
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
- error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
- if (error) {
- xfs_trans_cancel(tp, 0);
- return error;
- }
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- return xfs_trans_commit(tp, 0);
-}
-
/*
* When remounting a filesystem read-only or freezing the filesystem, we have
* two phases to execute. This first phase is syncing the data before we
@@ -376,7 +297,7 @@ xfs_log_dirty_inode(
* First stage of freeze - no writers will make progress now we are here,
* so we flush delwri and delalloc buffers here, then wait for all I/O to
* complete. Data is frozen at that point. Metadata is not frozen,
- * transactions can still occur here so don't bother flushing the buftarg
+ * transactions can still occur here so don't bother emptying the AIL
* because it'll just get dirty again.
*/
int
@@ -385,66 +306,19 @@ xfs_quiesce_data(
{
int error, error2 = 0;
- /*
- * Log all pending size and timestamp updates. The vfs writeback
- * code is supposed to do this, but due to its overagressive
- * livelock detection it will skip inodes where appending writes
- * were written out in the first non-blocking sync phase if their
- * completion took long enough that it happened after taking the
- * timestamp for the cut-off in the blocking phase.
- */
- xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0);
-
- xfs_qm_sync(mp, SYNC_TRYLOCK);
- xfs_qm_sync(mp, SYNC_WAIT);
-
- /* force out the newly dirtied log buffers */
+ /* force out the log */
xfs_log_force(mp, XFS_LOG_SYNC);
/* write superblock and hoover up shutdown errors */
error = xfs_sync_fsdata(mp);
- /* make sure all delwri buffers are written out */
- xfs_flush_buftarg(mp->m_ddev_targp, 1);
-
/* mark the log as covered if needed */
if (xfs_log_need_covered(mp))
error2 = xfs_fs_log_dummy(mp);
- /* flush data-only devices */
- if (mp->m_rtdev_targp)
- xfs_flush_buftarg(mp->m_rtdev_targp, 1);
-
return error ? error : error2;
}
-STATIC void
-xfs_quiesce_fs(
- struct xfs_mount *mp)
-{
- int count = 0, pincount;
-
- xfs_reclaim_inodes(mp, 0);
- xfs_flush_buftarg(mp->m_ddev_targp, 0);
-
- /*
- * This loop must run at least twice. The first instance of the loop
- * will flush most meta data but that will generate more meta data
- * (typically directory updates). Which then must be flushed and
- * logged before we can write the unmount record. We also so sync
- * reclaim of inodes to catch any that the above delwri flush skipped.
- */
- do {
- xfs_reclaim_inodes(mp, SYNC_WAIT);
- xfs_sync_attr(mp, SYNC_WAIT);
- pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
- if (!pincount) {
- delay(50);
- count++;
- }
- } while (count < 2);
-}
-
/*
* Second stage of a quiesce. The data is already synced, now we have to take
* care of the metadata. New transactions are already blocked, so we need to
@@ -460,8 +334,12 @@ xfs_quiesce_attr(
while (atomic_read(&mp->m_active_trans) > 0)
delay(100);
- /* flush inodes and push all remaining buffers out to disk */
- xfs_quiesce_fs(mp);
+ /* reclaim inodes to do any IO before the freeze completes */
+ xfs_reclaim_inodes(mp, 0);
+ xfs_reclaim_inodes(mp, SYNC_WAIT);
+
+ /* flush all pending changes from the AIL */
+ xfs_ail_push_all_sync(mp->m_ail);
/*
* Just warn here till VFS can correctly support
@@ -475,7 +353,12 @@ xfs_quiesce_attr(
xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
"Frozen image may not be consistent.");
xfs_log_unmount_write(mp);
- xfs_unmountfs_writesb(mp);
+
+ /*
+ * At this point we might have modified the superblock again and thus
+ * added an item to the AIL, thus flush it again.
+ */
+ xfs_ail_push_all_sync(mp->m_ail);
}
static void
@@ -499,16 +382,26 @@ xfs_sync_worker(
struct xfs_mount, m_sync_work);
int error;
- if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+ /*
+ * We shouldn't write/force the log if we are in the mount/unmount
+ * process or on a read only filesystem. The workqueue still needs to be
+ * active in both cases, however, because it is used for inode reclaim
+ * during these times. Use the MS_ACTIVE flag to avoid doing anything
+ * during mount. Doing work during unmount is avoided by calling
+ * cancel_delayed_work_sync on this work queue before tearing down
+ * the ail and the log in xfs_log_unmount.
+ */
+ if (!(mp->m_super->s_flags & MS_ACTIVE) &&
+ !(mp->m_flags & XFS_MOUNT_RDONLY)) {
/* dgc: errors ignored here */
if (mp->m_super->s_frozen == SB_UNFROZEN &&
xfs_log_need_covered(mp))
error = xfs_fs_log_dummy(mp);
else
xfs_log_force(mp, 0);
- error = xfs_qm_sync(mp, SYNC_TRYLOCK);
- /* start pushing all the metadata that is currently dirty */
+ /* start pushing all the metadata that is currently
+ * dirty */
xfs_ail_push_all(mp->m_ail);
}
@@ -528,14 +421,6 @@ xfs_syncd_queue_reclaim(
struct xfs_mount *mp)
{
- /*
- * We can have inodes enter reclaim after we've shut down the syncd
- * workqueue during unmount, so don't allow reclaim work to be queued
- * during unmount.
- */
- if (!(mp->m_super->s_flags & MS_ACTIVE))
- return;
-
rcu_read_lock();
if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
@@ -604,7 +489,6 @@ xfs_syncd_init(
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
xfs_syncd_queue_sync(mp);
- xfs_syncd_queue_reclaim(mp);
return 0;
}
@@ -711,14 +595,13 @@ xfs_reclaim_inode_grab(
return 1;
/*
- * do some unlocked checks first to avoid unnecessary lock traffic.
- * The first is a flush lock check, the second is a already in reclaim
- * check. Only do these checks if we are not going to block on locks.
+ * If we are asked for non-blocking operation, do unlocked checks to
+ * see if the inode already is being flushed or in reclaim to avoid
+ * lock traffic.
*/
if ((flags & SYNC_TRYLOCK) &&
- (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
+ __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
return 1;
- }
/*
* The radix tree lock here protects a thread in xfs_iget from racing
@@ -743,11 +626,8 @@ xfs_reclaim_inode_grab(
}
/*
- * Inodes in different states need to be treated differently, and the return
- * value of xfs_iflush is not sufficient to get this right. The following table
- * lists the inode states and the reclaim actions necessary for non-blocking
- * reclaim:
- *
+ * Inodes in different states need to be treated differently. The following
+ * table lists the inode states and the reclaim actions necessary:
*
* inode state iflush ret required action
* --------------- ---------- ---------------
@@ -757,39 +637,31 @@ xfs_reclaim_inode_grab(
* stale, unpinned 0 reclaim
* clean, pinned(*) 0 requeue
* stale, pinned EAGAIN requeue
- * dirty, delwri ok 0 requeue
- * dirty, delwri blocked EAGAIN requeue
- * dirty, sync flush 0 reclaim
+ * dirty, async - requeue
+ * dirty, sync 0 reclaim
*
* (*) dgc: I don't think the clean, pinned state is possible but it gets
* handled anyway given the order of checks implemented.
*
- * As can be seen from the table, the return value of xfs_iflush() is not
- * sufficient to correctly decide the reclaim action here. The checks in
- * xfs_iflush() might look like duplicates, but they are not.
- *
* Also, because we get the flush lock first, we know that any inode that has
* been flushed delwri has had the flush completed by the time we check that
- * the inode is clean. The clean inode check needs to be done before flushing
- * the inode delwri otherwise we would loop forever requeuing clean inodes as
- * we cannot tell apart a successful delwri flush and a clean inode from the
- * return value of xfs_iflush().
+ * the inode is clean.
*
- * Note that because the inode is flushed delayed write by background
- * writeback, the flush lock may already be held here and waiting on it can
- * result in very long latencies. Hence for sync reclaims, where we wait on the
- * flush lock, the caller should push out delayed write inodes first before
- * trying to reclaim them to minimise the amount of time spent waiting. For
- * background relaim, we just requeue the inode for the next pass.
+ * Note that because the inode is flushed delayed write by AIL pushing, the
+ * flush lock may already be held here and waiting on it can result in very
+ * long latencies. Hence for sync reclaims, where we wait on the flush lock,
+ * the caller should push the AIL first before trying to reclaim inodes to
+ * minimise the amount of time spent waiting. For background relaim, we only
+ * bother to reclaim clean inodes anyway.
*
* Hence the order of actions after gaining the locks should be:
* bad => reclaim
* shutdown => unpin and reclaim
- * pinned, delwri => requeue
+ * pinned, async => requeue
* pinned, sync => unpin
* stale => reclaim
* clean => reclaim
- * dirty, delwri => flush and requeue
+ * dirty, async => requeue
* dirty, sync => flush, wait and reclaim
*/
STATIC int
@@ -798,7 +670,8 @@ xfs_reclaim_inode(
struct xfs_perag *pag,
int sync_mode)
{
- int error;
+ struct xfs_buf *bp = NULL;
+ int error;
restart:
error = 0;
@@ -806,17 +679,6 @@ restart:
if (!xfs_iflock_nowait(ip)) {
if (!(sync_mode & SYNC_WAIT))
goto out;
-
- /*
- * If we only have a single dirty inode in a cluster there is
- * a fair chance that the AIL push may have pushed it into
- * the buffer, but xfsbufd won't touch it until 30 seconds
- * from now, and thus we will lock up here.
- *
- * Promote the inode buffer to the front of the delwri list
- * and wake up xfsbufd now.
- */
- xfs_promote_inode(ip);
xfs_iflock(ip);
}
@@ -824,13 +686,12 @@ restart:
goto reclaim;
if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
xfs_iunpin_wait(ip);
+ xfs_iflush_abort(ip, false);
goto reclaim;
}
if (xfs_ipincount(ip)) {
- if (!(sync_mode & SYNC_WAIT)) {
- xfs_ifunlock(ip);
- goto out;
- }
+ if (!(sync_mode & SYNC_WAIT))
+ goto out_ifunlock;
xfs_iunpin_wait(ip);
}
if (xfs_iflags_test(ip, XFS_ISTALE))
@@ -839,60 +700,42 @@ restart:
goto reclaim;
/*
+ * Never flush out dirty data during non-blocking reclaim, as it would
+ * just contend with AIL pushing trying to do the same job.
+ */
+ if (!(sync_mode & SYNC_WAIT))
+ goto out_ifunlock;
+
+ /*
* Now we have an inode that needs flushing.
*
- * We do a nonblocking flush here even if we are doing a SYNC_WAIT
- * reclaim as we can deadlock with inode cluster removal.
+ * Note that xfs_iflush will never block on the inode buffer lock, as
* xfs_ifree_cluster() can lock the inode buffer before it locks the
- * ip->i_lock, and we are doing the exact opposite here. As a result,
- * doing a blocking xfs_itobp() to get the cluster buffer will result
+ * ip->i_lock, and we are doing the exact opposite here. As a result,
+ * doing a blocking xfs_itobp() to get the cluster buffer would result
* in an ABBA deadlock with xfs_ifree_cluster().
*
* As xfs_ifree_cluser() must gather all inodes that are active in the
* cache to mark them stale, if we hit this case we don't actually want
* to do IO here - we want the inode marked stale so we can simply
- * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
- * just unlock the inode, back off and try again. Hopefully the next
- * pass through will see the stale flag set on the inode.
+ * reclaim it. Hence if we get an EAGAIN error here, just unlock the
+ * inode, back off and try again. Hopefully the next pass through will
+ * see the stale flag set on the inode.
*/
- error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
- if (sync_mode & SYNC_WAIT) {
- if (error == EAGAIN) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- /* backoff longer than in xfs_ifree_cluster */
- delay(2);
- goto restart;
- }
- xfs_iflock(ip);
- goto reclaim;
+ error = xfs_iflush(ip, &bp);
+ if (error == EAGAIN) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ /* backoff longer than in xfs_ifree_cluster */
+ delay(2);
+ goto restart;
}
- /*
- * When we have to flush an inode but don't have SYNC_WAIT set, we
- * flush the inode out using a delwri buffer and wait for the next
- * call into reclaim to find it in a clean state instead of waiting for
- * it now. We also don't return errors here - if the error is transient
- * then the next reclaim pass will flush the inode, and if the error
- * is permanent then the next sync reclaim will reclaim the inode and
- * pass on the error.
- */
- if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- xfs_warn(ip->i_mount,
- "inode 0x%llx background reclaim flush failed with %d",
- (long long)ip->i_ino, error);
+ if (!error) {
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
}
-out:
- xfs_iflags_clear(ip, XFS_IRECLAIM);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- /*
- * We could return EAGAIN here to make reclaim rescan the inode tree in
- * a short while. However, this just burns CPU time scanning the tree
- * waiting for IO to complete and xfssyncd never goes back to the idle
- * state. Instead, return 0 to let the next scheduled background reclaim
- * attempt to reclaim the inode again.
- */
- return 0;
+ xfs_iflock(ip);
reclaim:
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -918,17 +761,28 @@ reclaim:
* can reference the inodes in the cache without taking references.
*
* We make that OK here by ensuring that we wait until the inode is
- * unlocked after the lookup before we go ahead and free it. We get
- * both the ilock and the iolock because the code may need to drop the
- * ilock one but will still hold the iolock.
+ * unlocked after the lookup before we go ahead and free it.
*/
- xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_qm_dqdetach(ip);
- xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_inode_free(ip);
return error;
+out_ifunlock:
+ xfs_ifunlock(ip);
+out:
+ xfs_iflags_clear(ip, XFS_IRECLAIM);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ /*
+ * We could return EAGAIN here to make reclaim rescan the inode tree in
+ * a short while. However, this just burns CPU time scanning the tree
+ * waiting for IO to complete and xfssyncd never goes back to the idle
+ * state. Instead, return 0 to let the next scheduled background reclaim
+ * attempt to reclaim the inode again.
+ */
+ return 0;
}
/*
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index fa965479d78..941202e7ac6 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -34,8 +34,6 @@ void xfs_quiesce_attr(struct xfs_mount *mp);
void xfs_flush_inodes(struct xfs_inode *ip);
-int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags);
-
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
int xfs_reclaim_inodes_count(struct xfs_mount *mp);
void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 9010ce885e6..624bedd8135 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -18,9 +18,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 49403579887..caf5dabfd55 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -32,7 +32,7 @@ struct xfs_da_node_entry;
struct xfs_dquot;
struct xfs_log_item;
struct xlog_ticket;
-struct log;
+struct xlog;
struct xlog_recover;
struct xlog_recover_item;
struct xfs_buf_log_format;
@@ -281,7 +281,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_daddr_t, bno)
- __field(size_t, buffer_length)
+ __field(int, nblks)
__field(int, hold)
__field(int, pincount)
__field(unsigned, lockval)
@@ -291,18 +291,18 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
TP_fast_assign(
__entry->dev = bp->b_target->bt_dev;
__entry->bno = bp->b_bn;
- __entry->buffer_length = bp->b_buffer_length;
+ __entry->nblks = bp->b_length;
__entry->hold = atomic_read(&bp->b_hold);
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->flags = bp->b_flags;
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
+ TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d "
"lock %d flags %s caller %pf",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->bno,
- __entry->buffer_length,
+ __entry->nblks,
__entry->hold,
__entry->pincount,
__entry->lockval,
@@ -328,7 +328,7 @@ DEFINE_BUF_EVENT(xfs_buf_unlock);
DEFINE_BUF_EVENT(xfs_buf_iowait);
DEFINE_BUF_EVENT(xfs_buf_iowait_done);
DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
-DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue);
+DEFINE_BUF_EVENT(xfs_buf_delwri_queued);
DEFINE_BUF_EVENT(xfs_buf_delwri_split);
DEFINE_BUF_EVENT(xfs_buf_get_uncached);
DEFINE_BUF_EVENT(xfs_bdstrat_shut);
@@ -362,7 +362,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
TP_fast_assign(
__entry->dev = bp->b_target->bt_dev;
__entry->bno = bp->b_bn;
- __entry->buffer_length = bp->b_buffer_length;
+ __entry->buffer_length = BBTOB(bp->b_length);
__entry->flags = flags;
__entry->hold = atomic_read(&bp->b_hold);
__entry->pincount = atomic_read(&bp->b_pin_count);
@@ -406,7 +406,7 @@ TRACE_EVENT(xfs_buf_ioerror,
TP_fast_assign(
__entry->dev = bp->b_target->bt_dev;
__entry->bno = bp->b_bn;
- __entry->buffer_length = bp->b_buffer_length;
+ __entry->buffer_length = BBTOB(bp->b_length);
__entry->hold = atomic_read(&bp->b_hold);
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
@@ -450,7 +450,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
__entry->bli_recur = bip->bli_recur;
__entry->bli_refcount = atomic_read(&bip->bli_refcount);
__entry->buf_bno = bip->bli_buf->b_bn;
- __entry->buf_len = bip->bli_buf->b_buffer_length;
+ __entry->buf_len = BBTOB(bip->bli_buf->b_length);
__entry->buf_flags = bip->bli_buf->b_flags;
__entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
__entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
@@ -486,12 +486,10 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_trylock);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf);
DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur);
DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb);
@@ -580,7 +578,7 @@ DEFINE_INODE_EVENT(xfs_ioctl_setattr);
DEFINE_INODE_EVENT(xfs_dir_fsync);
DEFINE_INODE_EVENT(xfs_file_fsync);
DEFINE_INODE_EVENT(xfs_destroy_inode);
-DEFINE_INODE_EVENT(xfs_write_inode);
+DEFINE_INODE_EVENT(xfs_dirty_inode);
DEFINE_INODE_EVENT(xfs_evict_inode);
DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
@@ -627,16 +625,19 @@ DECLARE_EVENT_CLASS(xfs_namespace_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, dp_ino)
+ __field(int, namelen)
__dynamic_array(char, name, name->len)
),
TP_fast_assign(
__entry->dev = VFS_I(dp)->i_sb->s_dev;
__entry->dp_ino = dp->i_ino;
+ __entry->namelen = name->len;
memcpy(__get_str(name), name->name, name->len);
),
- TP_printk("dev %d:%d dp ino 0x%llx name %s",
+ TP_printk("dev %d:%d dp ino 0x%llx name %.*s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->dp_ino,
+ __entry->namelen,
__get_str(name))
)
@@ -658,6 +659,8 @@ TRACE_EVENT(xfs_rename,
__field(dev_t, dev)
__field(xfs_ino_t, src_dp_ino)
__field(xfs_ino_t, target_dp_ino)
+ __field(int, src_namelen)
+ __field(int, target_namelen)
__dynamic_array(char, src_name, src_name->len)
__dynamic_array(char, target_name, target_name->len)
),
@@ -665,15 +668,20 @@ TRACE_EVENT(xfs_rename,
__entry->dev = VFS_I(src_dp)->i_sb->s_dev;
__entry->src_dp_ino = src_dp->i_ino;
__entry->target_dp_ino = target_dp->i_ino;
+ __entry->src_namelen = src_name->len;
+ __entry->target_namelen = target_name->len;
memcpy(__get_str(src_name), src_name->name, src_name->len);
- memcpy(__get_str(target_name), target_name->name, target_name->len);
+ memcpy(__get_str(target_name), target_name->name,
+ target_name->len);
),
TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx"
- " src name %s target name %s",
+ " src name %.*s target name %.*s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->src_dp_ino,
__entry->target_dp_ino,
+ __entry->src_namelen,
__get_str(src_name),
+ __entry->target_namelen,
__get_str(target_name))
)
@@ -733,21 +741,18 @@ DEFINE_EVENT(xfs_dquot_class, name, \
DEFINE_DQUOT_EVENT(xfs_dqadjust);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
-DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_busy);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_done);
DEFINE_DQUOT_EVENT(xfs_dqattach_found);
DEFINE_DQUOT_EVENT(xfs_dqattach_get);
-DEFINE_DQUOT_EVENT(xfs_dqinit);
-DEFINE_DQUOT_EVENT(xfs_dqreuse);
DEFINE_DQUOT_EVENT(xfs_dqalloc);
DEFINE_DQUOT_EVENT(xfs_dqtobp_read);
DEFINE_DQUOT_EVENT(xfs_dqread);
DEFINE_DQUOT_EVENT(xfs_dqread_fail);
-DEFINE_DQUOT_EVENT(xfs_dqlookup_found);
-DEFINE_DQUOT_EVENT(xfs_dqlookup_want);
-DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist);
-DEFINE_DQUOT_EVENT(xfs_dqlookup_done);
DEFINE_DQUOT_EVENT(xfs_dqget_hit);
DEFINE_DQUOT_EVENT(xfs_dqget_miss);
+DEFINE_DQUOT_EVENT(xfs_dqget_freeing);
+DEFINE_DQUOT_EVENT(xfs_dqget_dup);
DEFINE_DQUOT_EVENT(xfs_dqput);
DEFINE_DQUOT_EVENT(xfs_dqput_wait);
DEFINE_DQUOT_EVENT(xfs_dqput_free);
@@ -757,7 +762,7 @@ DEFINE_DQUOT_EVENT(xfs_dqflush_force);
DEFINE_DQUOT_EVENT(xfs_dqflush_done);
DECLARE_EVENT_CLASS(xfs_loggrant_class,
- TP_PROTO(struct log *log, struct xlog_ticket *tic),
+ TP_PROTO(struct xlog *log, struct xlog_ticket *tic),
TP_ARGS(log, tic),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -785,12 +790,12 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
__entry->curr_res = tic->t_curr_res;
__entry->unit_res = tic->t_unit_res;
__entry->flags = tic->t_flags;
- __entry->reserveq = list_empty(&log->l_reserveq);
- __entry->writeq = list_empty(&log->l_writeq);
- xlog_crack_grant_head(&log->l_grant_reserve_head,
+ __entry->reserveq = list_empty(&log->l_reserve_head.waiters);
+ __entry->writeq = list_empty(&log->l_write_head.waiters);
+ xlog_crack_grant_head(&log->l_reserve_head.grant,
&__entry->grant_reserve_cycle,
&__entry->grant_reserve_bytes);
- xlog_crack_grant_head(&log->l_grant_write_head,
+ xlog_crack_grant_head(&log->l_write_head.grant,
&__entry->grant_write_cycle,
&__entry->grant_write_bytes);
__entry->curr_cycle = log->l_curr_cycle;
@@ -825,24 +830,18 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
#define DEFINE_LOGGRANT_EVENT(name) \
DEFINE_EVENT(xfs_loggrant_class, name, \
- TP_PROTO(struct log *log, struct xlog_ticket *tic), \
+ TP_PROTO(struct xlog *log, struct xlog_ticket *tic), \
TP_ARGS(log, tic))
DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm);
DEFINE_LOGGRANT_EVENT(xfs_log_done_perm);
-DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_error);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
+DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
+DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
@@ -875,15 +874,30 @@ DECLARE_EVENT_CLASS(xfs_log_item_class,
__print_flags(__entry->flags, "|", XFS_LI_FLAGS))
)
+TRACE_EVENT(xfs_log_force,
+ TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn),
+ TP_ARGS(mp, lsn),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_lsn_t, lsn)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->lsn = lsn;
+ ),
+ TP_printk("dev %d:%d lsn 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->lsn)
+)
+
#define DEFINE_LOG_ITEM_EVENT(name) \
DEFINE_EVENT(xfs_log_item_class, name, \
TP_PROTO(struct xfs_log_item *lip), \
TP_ARGS(lip))
DEFINE_LOG_ITEM_EVENT(xfs_ail_push);
-DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf);
-DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf_pinned);
DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned);
DEFINE_LOG_ITEM_EVENT(xfs_ail_locked);
+DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing);
DECLARE_EVENT_CLASS(xfs_file_class,
@@ -893,7 +907,6 @@ DECLARE_EVENT_CLASS(xfs_file_class,
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(xfs_fsize_t, size)
- __field(xfs_fsize_t, new_size)
__field(loff_t, offset)
__field(size_t, count)
__field(int, flags)
@@ -902,17 +915,15 @@ DECLARE_EVENT_CLASS(xfs_file_class,
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
__entry->size = ip->i_d.di_size;
- __entry->new_size = ip->i_new_size;
__entry->offset = offset;
__entry->count = count;
__entry->flags = flags;
),
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx "
"offset 0x%llx count 0x%zx ioflags %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
- __entry->new_size,
__entry->offset,
__entry->count,
__print_flags(__entry->flags, "|", XFS_IO_FLAGS))
@@ -980,7 +991,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(loff_t, size)
- __field(loff_t, new_size)
__field(loff_t, offset)
__field(size_t, count)
__field(int, type)
@@ -992,7 +1002,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
__entry->size = ip->i_d.di_size;
- __entry->new_size = ip->i_new_size;
__entry->offset = offset;
__entry->count = count;
__entry->type = type;
@@ -1000,13 +1009,11 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
__entry->startblock = irec ? irec->br_startblock : 0;
__entry->blockcount = irec ? irec->br_blockcount : 0;
),
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
- "offset 0x%llx count %zd type %s "
- "startoff 0x%llx startblock %lld blockcount 0x%llx",
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
+ "type %s startoff 0x%llx startblock %lld blockcount 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
- __entry->new_size,
__entry->offset,
__entry->count,
__print_symbolic(__entry->type, XFS_IO_TYPES),
@@ -1033,26 +1040,23 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class,
__field(xfs_ino_t, ino)
__field(loff_t, isize)
__field(loff_t, disize)
- __field(loff_t, new_size)
__field(loff_t, offset)
__field(size_t, count)
),
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
- __entry->isize = ip->i_size;
+ __entry->isize = VFS_I(ip)->i_size;
__entry->disize = ip->i_d.di_size;
- __entry->new_size = ip->i_new_size;
__entry->offset = offset;
__entry->count = count;
),
- TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx new_size 0x%llx "
+ TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx "
"offset 0x%llx count %zd",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->isize,
__entry->disize,
- __entry->new_size,
__entry->offset,
__entry->count)
);
@@ -1092,8 +1096,8 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class,
DEFINE_EVENT(xfs_itrunc_class, name, \
TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \
TP_ARGS(ip, new_size))
-DEFINE_ITRUNC_EVENT(xfs_itruncate_data_start);
-DEFINE_ITRUNC_EVENT(xfs_itruncate_data_end);
+DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start);
+DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end);
TRACE_EVENT(xfs_pagecache_inval,
TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish),
@@ -1154,7 +1158,7 @@ TRACE_EVENT(xfs_bunmap,
);
-DECLARE_EVENT_CLASS(xfs_busy_class,
+DECLARE_EVENT_CLASS(xfs_extent_busy_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agblock_t agbno, xfs_extlen_t len),
TP_ARGS(mp, agno, agbno, len),
@@ -1177,17 +1181,17 @@ DECLARE_EVENT_CLASS(xfs_busy_class,
__entry->len)
);
#define DEFINE_BUSY_EVENT(name) \
-DEFINE_EVENT(xfs_busy_class, name, \
+DEFINE_EVENT(xfs_extent_busy_class, name, \
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
xfs_agblock_t agbno, xfs_extlen_t len), \
TP_ARGS(mp, agno, agbno, len))
-DEFINE_BUSY_EVENT(xfs_alloc_busy);
-DEFINE_BUSY_EVENT(xfs_alloc_busy_enomem);
-DEFINE_BUSY_EVENT(xfs_alloc_busy_force);
-DEFINE_BUSY_EVENT(xfs_alloc_busy_reuse);
-DEFINE_BUSY_EVENT(xfs_alloc_busy_clear);
+DEFINE_BUSY_EVENT(xfs_extent_busy);
+DEFINE_BUSY_EVENT(xfs_extent_busy_enomem);
+DEFINE_BUSY_EVENT(xfs_extent_busy_force);
+DEFINE_BUSY_EVENT(xfs_extent_busy_reuse);
+DEFINE_BUSY_EVENT(xfs_extent_busy_clear);
-TRACE_EVENT(xfs_alloc_busy_trim,
+TRACE_EVENT(xfs_extent_busy_trim,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agblock_t agbno, xfs_extlen_t len,
xfs_agblock_t tbno, xfs_extlen_t tlen),
@@ -1427,7 +1431,7 @@ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp);
DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed);
DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed);
-DECLARE_EVENT_CLASS(xfs_dir2_class,
+DECLARE_EVENT_CLASS(xfs_da_class,
TP_PROTO(struct xfs_da_args *args),
TP_ARGS(args),
TP_STRUCT__entry(
@@ -1462,7 +1466,7 @@ DECLARE_EVENT_CLASS(xfs_dir2_class,
)
#define DEFINE_DIR2_EVENT(name) \
-DEFINE_EVENT(xfs_dir2_class, name, \
+DEFINE_EVENT(xfs_da_class, name, \
TP_PROTO(struct xfs_da_args *args), \
TP_ARGS(args))
DEFINE_DIR2_EVENT(xfs_dir2_sf_addname);
@@ -1491,6 +1495,64 @@ DEFINE_DIR2_EVENT(xfs_dir2_node_replace);
DEFINE_DIR2_EVENT(xfs_dir2_node_removename);
DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf);
+#define DEFINE_ATTR_EVENT(name) \
+DEFINE_EVENT(xfs_da_class, name, \
+ TP_PROTO(struct xfs_da_args *args), \
+ TP_ARGS(args))
+DEFINE_ATTR_EVENT(xfs_attr_sf_add);
+DEFINE_ATTR_EVENT(xfs_attr_sf_addname);
+DEFINE_ATTR_EVENT(xfs_attr_sf_create);
+DEFINE_ATTR_EVENT(xfs_attr_sf_lookup);
+DEFINE_ATTR_EVENT(xfs_attr_sf_remove);
+DEFINE_ATTR_EVENT(xfs_attr_sf_removename);
+DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf);
+
+DEFINE_ATTR_EVENT(xfs_attr_leaf_add);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_addname);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_create);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_replace);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_removename);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_split);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_split_after);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_clearflag);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_setflag);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_flipflags);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance);
+
+DEFINE_ATTR_EVENT(xfs_attr_node_addname);
+DEFINE_ATTR_EVENT(xfs_attr_node_lookup);
+DEFINE_ATTR_EVENT(xfs_attr_node_replace);
+DEFINE_ATTR_EVENT(xfs_attr_node_removename);
+
+#define DEFINE_DA_EVENT(name) \
+DEFINE_EVENT(xfs_da_class, name, \
+ TP_PROTO(struct xfs_da_args *args), \
+ TP_ARGS(args))
+DEFINE_DA_EVENT(xfs_da_split);
+DEFINE_DA_EVENT(xfs_da_join);
+DEFINE_DA_EVENT(xfs_da_link_before);
+DEFINE_DA_EVENT(xfs_da_link_after);
+DEFINE_DA_EVENT(xfs_da_unlink_back);
+DEFINE_DA_EVENT(xfs_da_unlink_forward);
+DEFINE_DA_EVENT(xfs_da_root_split);
+DEFINE_DA_EVENT(xfs_da_root_join);
+DEFINE_DA_EVENT(xfs_da_node_add);
+DEFINE_DA_EVENT(xfs_da_node_create);
+DEFINE_DA_EVENT(xfs_da_node_split);
+DEFINE_DA_EVENT(xfs_da_node_remove);
+DEFINE_DA_EVENT(xfs_da_node_rebalance);
+DEFINE_DA_EVENT(xfs_da_node_unbalance);
+DEFINE_DA_EVENT(xfs_da_swap_lastblock);
+DEFINE_DA_EVENT(xfs_da_grow_inode);
+DEFINE_DA_EVENT(xfs_da_shrink_inode);
+
DECLARE_EVENT_CLASS(xfs_dir2_space_class,
TP_PROTO(struct xfs_da_args *args, int idx),
TP_ARGS(args, idx),
@@ -1570,7 +1632,6 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
__field(xfs_ino_t, ino)
__field(int, format)
__field(int, nex)
- __field(int, max_nex)
__field(int, broot_size)
__field(int, fork_off)
),
@@ -1580,18 +1641,16 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
__entry->ino = ip->i_ino;
__entry->format = ip->i_d.di_format;
__entry->nex = ip->i_d.di_nextents;
- __entry->max_nex = ip->i_df.if_ext_max;
__entry->broot_size = ip->i_df.if_broot_bytes;
__entry->fork_off = XFS_IFORK_BOFF(ip);
),
TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
- "Max in-fork extents %d, broot size %d, fork offset %d",
+ "broot size %d, fork offset %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
__print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
__entry->nex,
- __entry->max_nex,
__entry->broot_size,
__entry->fork_off)
)
@@ -1605,7 +1664,7 @@ DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
- TP_PROTO(struct log *log, struct xlog_recover *trans,
+ TP_PROTO(struct xlog *log, struct xlog_recover *trans,
struct xlog_recover_item *item, int pass),
TP_ARGS(log, trans, item, pass),
TP_STRUCT__entry(
@@ -1639,7 +1698,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
#define DEFINE_LOG_RECOVER_ITEM(name) \
DEFINE_EVENT(xfs_log_recover_item_class, name, \
- TP_PROTO(struct log *log, struct xlog_recover *trans, \
+ TP_PROTO(struct xlog *log, struct xlog_recover *trans, \
struct xlog_recover_item *item, int pass), \
TP_ARGS(log, trans, item, pass))
@@ -1650,7 +1709,7 @@ DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail);
DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover);
DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class,
- TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f),
+ TP_PROTO(struct xlog *log, struct xfs_buf_log_format *buf_f),
TP_ARGS(log, buf_f),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -1680,7 +1739,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class,
#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \
DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \
- TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), \
+ TP_PROTO(struct xlog *log, struct xfs_buf_log_format *buf_f), \
TP_ARGS(log, buf_f))
DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel);
@@ -1693,7 +1752,7 @@ DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf);
DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf);
DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class,
- TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f),
+ TP_PROTO(struct xlog *log, struct xfs_inode_log_format *in_f),
TP_ARGS(log, in_f),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -1731,7 +1790,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class,
)
#define DEFINE_LOG_RECOVER_INO_ITEM(name) \
DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \
- TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), \
+ TP_PROTO(struct xlog *log, struct xfs_inode_log_format *in_f), \
TP_ARGS(log, in_f))
DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 1f35b2feca9..fdf324508c5 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -19,9 +19,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
-#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -36,6 +34,7 @@
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
#include "xfs_bmap.h"
#include "xfs_quota.h"
#include "xfs_trans_priv.h"
@@ -585,7 +584,7 @@ xfs_trans_t *
_xfs_trans_alloc(
xfs_mount_t *mp,
uint type,
- uint memflags)
+ xfs_km_flags_t memflags)
{
xfs_trans_t *tp;
@@ -608,8 +607,8 @@ STATIC void
xfs_trans_free(
struct xfs_trans *tp)
{
- xfs_alloc_busy_sort(&tp->t_busy);
- xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy, false);
+ xfs_extent_busy_sort(&tp->t_busy);
+ xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
atomic_dec(&tp->t_mountp->m_active_trans);
xfs_trans_free_dqinfo(tp);
@@ -681,7 +680,6 @@ xfs_trans_reserve(
uint flags,
uint logcount)
{
- int log_flags;
int error = 0;
int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
@@ -707,24 +705,32 @@ xfs_trans_reserve(
* Reserve the log space needed for this transaction.
*/
if (logspace > 0) {
- ASSERT((tp->t_log_res == 0) || (tp->t_log_res == logspace));
- ASSERT((tp->t_log_count == 0) ||
- (tp->t_log_count == logcount));
+ bool permanent = false;
+
+ ASSERT(tp->t_log_res == 0 || tp->t_log_res == logspace);
+ ASSERT(tp->t_log_count == 0 || tp->t_log_count == logcount);
+
if (flags & XFS_TRANS_PERM_LOG_RES) {
- log_flags = XFS_LOG_PERM_RESERV;
tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
+ permanent = true;
} else {
ASSERT(tp->t_ticket == NULL);
ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES));
- log_flags = 0;
}
- error = xfs_log_reserve(tp->t_mountp, logspace, logcount,
- &tp->t_ticket,
- XFS_TRANSACTION, log_flags, tp->t_type);
- if (error) {
- goto undo_blocks;
+ if (tp->t_ticket != NULL) {
+ ASSERT(flags & XFS_TRANS_PERM_LOG_RES);
+ error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
+ } else {
+ error = xfs_log_reserve(tp->t_mountp, logspace,
+ logcount, &tp->t_ticket,
+ XFS_TRANSACTION, permanent,
+ tp->t_type);
}
+
+ if (error)
+ goto undo_blocks;
+
tp->t_log_res = logspace;
tp->t_log_count = logcount;
}
@@ -752,6 +758,8 @@ xfs_trans_reserve(
*/
undo_log:
if (logspace > 0) {
+ int log_flags;
+
if (flags & XFS_TRANS_PERM_LOG_RES) {
log_flags = XFS_LOG_REL_PERM_RESERV;
} else {
@@ -1151,14 +1159,13 @@ xfs_trans_add_item(
{
struct xfs_log_item_desc *lidp;
- ASSERT(lip->li_mountp = tp->t_mountp);
- ASSERT(lip->li_ailp = tp->t_mountp->m_ail);
+ ASSERT(lip->li_mountp == tp->t_mountp);
+ ASSERT(lip->li_ailp == tp->t_mountp->m_ail);
lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS);
lidp->lid_item = lip;
lidp->lid_flags = 0;
- lidp->lid_size = 0;
list_add_tail(&lidp->lid_trans, &tp->t_items);
lip->li_desc = lidp;
@@ -1210,219 +1217,6 @@ xfs_trans_free_items(
}
}
-/*
- * Unlock the items associated with a transaction.
- *
- * Items which were not logged should be freed. Those which were logged must
- * still be tracked so they can be unpinned when the transaction commits.
- */
-STATIC void
-xfs_trans_unlock_items(
- struct xfs_trans *tp,
- xfs_lsn_t commit_lsn)
-{
- struct xfs_log_item_desc *lidp, *next;
-
- list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
- struct xfs_log_item *lip = lidp->lid_item;
-
- lip->li_desc = NULL;
-
- if (commit_lsn != NULLCOMMITLSN)
- IOP_COMMITTING(lip, commit_lsn);
- IOP_UNLOCK(lip);
-
- /*
- * Free the descriptor if the item is not dirty
- * within this transaction.
- */
- if (!(lidp->lid_flags & XFS_LID_DIRTY))
- xfs_trans_free_item_desc(lidp);
- }
-}
-
-/*
- * Total up the number of log iovecs needed to commit this
- * transaction. The transaction itself needs one for the
- * transaction header. Ask each dirty item in turn how many
- * it needs to get the total.
- */
-static uint
-xfs_trans_count_vecs(
- struct xfs_trans *tp)
-{
- int nvecs;
- struct xfs_log_item_desc *lidp;
-
- nvecs = 1;
-
- /* In the non-debug case we need to start bailing out if we
- * didn't find a log_item here, return zero and let trans_commit
- * deal with it.
- */
- if (list_empty(&tp->t_items)) {
- ASSERT(0);
- return 0;
- }
-
- list_for_each_entry(lidp, &tp->t_items, lid_trans) {
- /*
- * Skip items which aren't dirty in this transaction.
- */
- if (!(lidp->lid_flags & XFS_LID_DIRTY))
- continue;
- lidp->lid_size = IOP_SIZE(lidp->lid_item);
- nvecs += lidp->lid_size;
- }
-
- return nvecs;
-}
-
-/*
- * Fill in the vector with pointers to data to be logged
- * by this transaction. The transaction header takes
- * the first vector, and then each dirty item takes the
- * number of vectors it indicated it needed in xfs_trans_count_vecs().
- *
- * As each item fills in the entries it needs, also pin the item
- * so that it cannot be flushed out until the log write completes.
- */
-static void
-xfs_trans_fill_vecs(
- struct xfs_trans *tp,
- struct xfs_log_iovec *log_vector)
-{
- struct xfs_log_item_desc *lidp;
- struct xfs_log_iovec *vecp;
- uint nitems;
-
- /*
- * Skip over the entry for the transaction header, we'll
- * fill that in at the end.
- */
- vecp = log_vector + 1;
-
- nitems = 0;
- ASSERT(!list_empty(&tp->t_items));
- list_for_each_entry(lidp, &tp->t_items, lid_trans) {
- /* Skip items which aren't dirty in this transaction. */
- if (!(lidp->lid_flags & XFS_LID_DIRTY))
- continue;
-
- /*
- * The item may be marked dirty but not log anything. This can
- * be used to get called when a transaction is committed.
- */
- if (lidp->lid_size)
- nitems++;
- IOP_FORMAT(lidp->lid_item, vecp);
- vecp += lidp->lid_size;
- IOP_PIN(lidp->lid_item);
- }
-
- /*
- * Now that we've counted the number of items in this transaction, fill
- * in the transaction header. Note that the transaction header does not
- * have a log item.
- */
- tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC;
- tp->t_header.th_type = tp->t_type;
- tp->t_header.th_num_items = nitems;
- log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
- log_vector->i_len = sizeof(xfs_trans_header_t);
- log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
-}
-
-/*
- * The committed item processing consists of calling the committed routine of
- * each logged item, updating the item's position in the AIL if necessary, and
- * unpinning each item. If the committed routine returns -1, then do nothing
- * further with the item because it may have been freed.
- *
- * Since items are unlocked when they are copied to the incore log, it is
- * possible for two transactions to be completing and manipulating the same
- * item simultaneously. The AIL lock will protect the lsn field of each item.
- * The value of this field can never go backwards.
- *
- * We unpin the items after repositioning them in the AIL, because otherwise
- * they could be immediately flushed and we'd have to race with the flusher
- * trying to pull the item from the AIL as we add it.
- */
-static void
-xfs_trans_item_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t commit_lsn,
- int aborted)
-{
- xfs_lsn_t item_lsn;
- struct xfs_ail *ailp;
-
- if (aborted)
- lip->li_flags |= XFS_LI_ABORTED;
- item_lsn = IOP_COMMITTED(lip, commit_lsn);
-
- /* item_lsn of -1 means the item needs no further processing */
- if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
- return;
-
- /*
- * If the returned lsn is greater than what it contained before, update
- * the location of the item in the AIL. If it is not, then do nothing.
- * Items can never move backwards in the AIL.
- *
- * While the new lsn should usually be greater, it is possible that a
- * later transaction completing simultaneously with an earlier one
- * using the same item could complete first with a higher lsn. This
- * would cause the earlier transaction to fail the test below.
- */
- ailp = lip->li_ailp;
- spin_lock(&ailp->xa_lock);
- if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
- /*
- * This will set the item's lsn to item_lsn and update the
- * position of the item in the AIL.
- *
- * xfs_trans_ail_update() drops the AIL lock.
-