diff options
author | Patrick McHardy <kaber@trash.net> | 2012-08-08 21:03:47 +0200 |
---|---|---|
committer | Patrick McHardy <kaber@trash.net> | 2012-08-08 21:03:47 +0200 |
commit | d53b4ed072d9779cdf53582c46436dec06d0961f (patch) | |
tree | ac95ecab33e31cd79aae69c475e8348adac51230 /fs/xfs | |
parent | 5d4dff7f1011a81a693a9c7b1f6a0b9c842eb60c (diff) | |
parent | 28a33cbc24e4256c143dce96c7d93bf423229f92 (diff) |
Merge tag 'v3.5' of 192.168.0.154:/repos/git/linux-2.6
Conflicts:
drivers/Kconfig
Signed-off-by: Patrick McHardy <kaber@trash.net>
Diffstat (limited to 'fs/xfs')
97 files changed, 4985 insertions, 7248 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 427a4e82a58..d2bf974b1a2 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -33,6 +33,7 @@ xfs-y += xfs_aops.o \ xfs_discard.o \ xfs_error.o \ xfs_export.o \ + xfs_extent_busy.o \ xfs_file.o \ xfs_filestream.o \ xfs_fsops.o \ @@ -49,7 +50,6 @@ xfs-y += xfs_aops.o \ xfs_sync.o \ xfs_xattr.o \ xfs_rename.o \ - xfs_rw.o \ xfs_utils.o \ xfs_vnodeops.o \ kmem.o \ @@ -96,9 +96,6 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ xfs_qm_bhv.o \ xfs_qm.o \ xfs_quotaops.o -ifeq ($(CONFIG_XFS_QUOTA),y) -xfs-$(CONFIG_PROC_FS) += xfs_qm_stats.o -endif xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o xfs-$(CONFIG_PROC_FS) += xfs_stats.o diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index a907de565db..4a7286c1dc8 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -46,7 +46,7 @@ kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize) } void * -kmem_alloc(size_t size, unsigned int __nocast flags) +kmem_alloc(size_t size, xfs_km_flags_t flags) { int retries = 0; gfp_t lflags = kmem_flags_convert(flags); @@ -65,7 +65,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags) } void * -kmem_zalloc(size_t size, unsigned int __nocast flags) +kmem_zalloc(size_t size, xfs_km_flags_t flags) { void *ptr; @@ -87,7 +87,7 @@ kmem_free(const void *ptr) void * kmem_realloc(const void *ptr, size_t newsize, size_t oldsize, - unsigned int __nocast flags) + xfs_km_flags_t flags) { void *new; @@ -102,7 +102,7 @@ kmem_realloc(const void *ptr, size_t newsize, size_t oldsize, } void * -kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags) +kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags) { int retries = 0; gfp_t lflags = kmem_flags_convert(flags); @@ -121,7 +121,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags) } void * -kmem_zone_zalloc(kmem_zone_t *zone, unsigned int __nocast flags) +kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags) { void *ptr; diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 292eff19803..b2f2620f9a8 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -27,10 +27,11 @@ * General memory allocation interfaces */ -#define KM_SLEEP 0x0001u -#define KM_NOSLEEP 0x0002u -#define KM_NOFS 0x0004u -#define KM_MAYFAIL 0x0008u +typedef unsigned __bitwise xfs_km_flags_t; +#define KM_SLEEP ((__force xfs_km_flags_t)0x0001u) +#define KM_NOSLEEP ((__force xfs_km_flags_t)0x0002u) +#define KM_NOFS ((__force xfs_km_flags_t)0x0004u) +#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u) /* * We use a special process flag to avoid recursive callbacks into @@ -38,7 +39,7 @@ * warnings, so we explicitly skip any generic ones (silly of us). */ static inline gfp_t -kmem_flags_convert(unsigned int __nocast flags) +kmem_flags_convert(xfs_km_flags_t flags) { gfp_t lflags; @@ -54,9 +55,9 @@ kmem_flags_convert(unsigned int __nocast flags) return lflags; } -extern void *kmem_alloc(size_t, unsigned int __nocast); -extern void *kmem_zalloc(size_t, unsigned int __nocast); -extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast); +extern void *kmem_alloc(size_t, xfs_km_flags_t); +extern void *kmem_zalloc(size_t, xfs_km_flags_t); +extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t); extern void kmem_free(const void *); static inline void *kmem_zalloc_large(size_t size) @@ -107,13 +108,7 @@ kmem_zone_destroy(kmem_zone_t *zone) kmem_cache_destroy(zone); } -extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast); -extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast); - -static inline int -kmem_shake_allow(gfp_t gfp_mask) -{ - return ((gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)); -} +extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t); +extern void *kmem_zone_zalloc(kmem_zone_t *, xfs_km_flags_t); #endif /* __XFS_SUPPORT_KMEM_H__ */ diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 76e4266d2e7..ac702a6eab9 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -39,7 +39,7 @@ xfs_acl_from_disk(struct xfs_acl *aclp) struct posix_acl_entry *acl_e; struct posix_acl *acl; struct xfs_acl_entry *ace; - int count, i; + unsigned int count, i; count = be32_to_cpu(aclp->acl_cnt); if (count > XFS_ACL_MAX_ENTRIES) diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 4805f009f92..44d65c1533c 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -175,24 +175,6 @@ typedef struct xfs_agfl { } xfs_agfl_t; /* - * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that - * have been freed but whose transactions aren't committed to disk yet. - * - * Note that we use the transaction ID to record the transaction, not the - * transaction structure itself. See xfs_alloc_busy_insert() for details. - */ -struct xfs_busy_extent { - struct rb_node rb_node; /* ag by-bno indexed search tree */ - struct list_head list; /* transaction busy extent list */ - xfs_agnumber_t agno; - xfs_agblock_t bno; - xfs_extlen_t length; - unsigned int flags; -#define XFS_ALLOC_BUSY_DISCARDED 0x01 /* undergoing a discard op. */ -#define XFS_ALLOC_BUSY_SKIP_DISCARD 0x02 /* do not discard */ -}; - -/* * Per-ag incore structure, copies of information in agf and agi, * to improve the performance of allocation group selection. */ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index ce84ffd0264..4f33c32affe 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -32,9 +31,11 @@ #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_alloc.h" +#include "xfs_extent_busy.h" #include "xfs_error.h" #include "xfs_trace.h" +struct workqueue_struct *xfs_alloc_wq; #define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b))) @@ -46,8 +47,6 @@ STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); -STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *, - xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *); /* * Lookup the record equal to [bno, len] in the btree given by cur. @@ -68,7 +67,7 @@ xfs_alloc_lookup_eq( * Lookup the first record greater than or equal to [bno, len] * in the btree given by cur. */ -STATIC int /* error */ +int /* error */ xfs_alloc_lookup_ge( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t bno, /* starting block of extent */ @@ -151,7 +150,7 @@ xfs_alloc_compute_aligned( xfs_extlen_t len; /* Trim busy sections out of found extent */ - xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len); + xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len); if (args->alignment > 1 && len >= args->minlen) { xfs_agblock_t aligned_bno = roundup(bno, args->alignment); @@ -535,7 +534,7 @@ xfs_alloc_ag_vextent( if (error) return error; - ASSERT(!xfs_alloc_busy_search(args->mp, args->agno, + ASSERT(!xfs_extent_busy_search(args->mp, args->agno, args->agbno, args->len)); } @@ -602,7 +601,7 @@ xfs_alloc_ag_vextent_exact( /* * Check for overlapping busy extents. */ - xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen); + xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen); /* * Give up if the start of the extent is busy, or the freespace isn't @@ -1075,12 +1074,13 @@ restart: * If we couldn't get anything, give up. */ if (bno_cur_lt == NULL && bno_cur_gt == NULL) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + if (!forced++) { trace_xfs_alloc_near_busy(args); xfs_log_force(args->mp, XFS_LOG_SYNC); goto restart; } - trace_xfs_alloc_size_neither(args); args->agbno = NULLAGBLOCK; return 0; @@ -1390,7 +1390,7 @@ xfs_alloc_ag_vextent_small( if (error) goto error0; if (fbno != NULLAGBLOCK) { - xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1, + xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1, args->userdata); if (args->userdata) { @@ -2207,7 +2207,7 @@ xfs_alloc_read_agf( * group or loop over the allocation groups to find the result. */ int /* error */ -xfs_alloc_vextent( +__xfs_alloc_vextent( xfs_alloc_arg_t *args) /* allocation argument structure */ { xfs_agblock_t agsize; /* allocation group size */ @@ -2417,6 +2417,46 @@ error0: return error; } +static void +xfs_alloc_vextent_worker( + struct work_struct *work) +{ + struct xfs_alloc_arg *args = container_of(work, + struct xfs_alloc_arg, work); + unsigned long pflags; + + /* we are in a transaction context here */ + current_set_flags_nested(&pflags, PF_FSTRANS); + + args->result = __xfs_alloc_vextent(args); + complete(args->done); + + current_restore_flags_nested(&pflags, PF_FSTRANS); +} + +/* + * Data allocation requests often come in with little stack to work on. Push + * them off to a worker thread so there is lots of stack to use. Metadata + * requests, OTOH, are generally from low stack usage paths, so avoid the + * context switch overhead here. + */ +int +xfs_alloc_vextent( + struct xfs_alloc_arg *args) +{ + DECLARE_COMPLETION_ONSTACK(done); + + if (!args->userdata) + return __xfs_alloc_vextent(args); + + + args->done = &done; + INIT_WORK_ONSTACK(&args->work, xfs_alloc_vextent_worker); + queue_work(xfs_alloc_wq, &args->work); + wait_for_completion(&done); + return args->result; +} + /* * Free an extent. * Just break up the extent address and hand off to xfs_free_ag_extent @@ -2464,579 +2504,8 @@ xfs_free_extent( error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); if (!error) - xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0); + xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0); error0: xfs_perag_put(args.pag); return error; } - -void -xfs_alloc_busy_insert( - struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len, - unsigned int flags) -{ - struct xfs_busy_extent *new; - struct xfs_busy_extent *busyp; - struct xfs_perag *pag; - struct rb_node **rbp; - struct rb_node *parent = NULL; - - new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); - if (!new) { - /* - * No Memory! Since it is now not possible to track the free - * block, make this a synchronous transaction to insure that - * the block is not reused before this transaction commits. - */ - trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len); - xfs_trans_set_sync(tp); - return; - } - - new->agno = agno; - new->bno = bno; - new->length = len; - INIT_LIST_HEAD(&new->list); - new->flags = flags; - - /* trace before insert to be able to see failed inserts */ - trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len); - - pag = xfs_perag_get(tp->t_mountp, new->agno); - spin_lock(&pag->pagb_lock); - rbp = &pag->pagb_tree.rb_node; - while (*rbp) { - parent = *rbp; - busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); - - if (new->bno < busyp->bno) { - rbp = &(*rbp)->rb_left; - ASSERT(new->bno + new->length <= busyp->bno); - } else if (new->bno > busyp->bno) { - rbp = &(*rbp)->rb_right; - ASSERT(bno >= busyp->bno + busyp->length); - } else { - ASSERT(0); - } - } - - rb_link_node(&new->rb_node, parent, rbp); - rb_insert_color(&new->rb_node, &pag->pagb_tree); - - list_add(&new->list, &tp->t_busy); - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); -} - -/* - * Search for a busy extent within the range of the extent we are about to - * allocate. You need to be holding the busy extent tree lock when calling - * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy - * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact - * match. This is done so that a non-zero return indicates an overlap that - * will require a synchronous transaction, but it can still be - * used to distinguish between a partial or exact match. - */ -int -xfs_alloc_busy_search( - struct xfs_mount *mp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len) -{ - struct xfs_perag *pag; - struct rb_node *rbp; - struct xfs_busy_extent *busyp; - int match = 0; - - pag = xfs_perag_get(mp, agno); - spin_lock(&pag->pagb_lock); - - rbp = pag->pagb_tree.rb_node; - - /* find closest start bno overlap */ - while (rbp) { - busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node); - if (bno < busyp->bno) { - /* may overlap, but exact start block is lower */ - if (bno + len > busyp->bno) - match = -1; - rbp = rbp->rb_left; - } else if (bno > busyp->bno) { - /* may overlap, but exact start block is higher */ - if (bno < busyp->bno + busyp->length) - match = -1; - rbp = rbp->rb_right; - } else { - /* bno matches busyp, length determines exact match */ - match = (busyp->length == len) ? 1 : -1; - break; - } - } - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); - return match; -} - -/* - * The found free extent [fbno, fend] overlaps part or all of the given busy - * extent. If the overlap covers the beginning, the end, or all of the busy - * extent, the overlapping portion can be made unbusy and used for the - * allocation. We can't split a busy extent because we can't modify a - * transaction/CIL context busy list, but we can update an entries block - * number or length. - * - * Returns true if the extent can safely be reused, or false if the search - * needs to be restarted. - */ -STATIC bool -xfs_alloc_busy_update_extent( - struct xfs_mount *mp, - struct xfs_perag *pag, - struct xfs_busy_extent *busyp, - xfs_agblock_t fbno, - xfs_extlen_t flen, - bool userdata) -{ - xfs_agblock_t fend = fbno + flen; - xfs_agblock_t bbno = busyp->bno; - xfs_agblock_t bend = bbno + busyp->length; - - /* - * This extent is currently being discarded. Give the thread - * performing the discard a chance to mark the extent unbusy - * and retry. - */ - if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) { - spin_unlock(&pag->pagb_lock); - delay(1); - spin_lock(&pag->pagb_lock); - return false; - } - - /* - * If there is a busy extent overlapping a user allocation, we have - * no choice but to force the log and retry the search. - * - * Fortunately this does not happen during normal operation, but - * only if the filesystem is very low on space and has to dip into - * the AGFL for normal allocations. - */ - if (userdata) - goto out_force_log; - - if (bbno < fbno && bend > fend) { - /* - * Case 1: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +---------+ - * fbno fend - */ - - /* - * We would have to split the busy extent to be able to track - * it correct, which we cannot do because we would have to - * modify the list of busy extents attached to the transaction - * or CIL context, which is immutable. - * - * Force out the log to clear the busy extent and retry the - * search. - */ - goto out_force_log; - } else if (bbno >= fbno && bend <= fend) { - /* - * Case 2: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-----------------+ - * fbno fend - * - * Case 3: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +--------------------------+ - * fbno fend - * - * Case 4: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +--------------------------+ - * fbno fend - * - * Case 5: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-----------------------------------+ - * fbno fend - * - */ - - /* - * The busy extent is fully covered by the extent we are - * allocating, and can simply be removed from the rbtree. - * However we cannot remove it from the immutable list - * tracking busy extents in the transaction or CIL context, - * so set the length to zero to mark it invalid. - * - * We also need to restart the busy extent search from the - * tree root, because erasing the node can rearrange the - * tree topology. - */ - rb_erase(&busyp->rb_node, &pag->pagb_tree); - busyp->length = 0; - return false; - } else if (fend < bend) { - /* - * Case 6: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +---------+ - * fbno fend - * - * Case 7: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +------------------+ - * fbno fend - * - */ - busyp->bno = fend; - } else if (bbno < fbno) { - /* - * Case 8: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-------------+ - * fbno fend - * - * Case 9: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +----------------------+ - * fbno fend - */ - busyp->length = fbno - busyp->bno; - } else { - ASSERT(0); - } - - trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen); - return true; - -out_force_log: - spin_unlock(&pag->pagb_lock); - xfs_log_force(mp, XFS_LOG_SYNC); - trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen); - spin_lock(&pag->pagb_lock); - return false; -} - - -/* - * For a given extent [fbno, flen], make sure we can reuse it safely. - */ -void -xfs_alloc_busy_reuse( - struct xfs_mount *mp, - xfs_agnumber_t agno, - xfs_agblock_t fbno, - xfs_extlen_t flen, - bool userdata) -{ - struct xfs_perag *pag; - struct rb_node *rbp; - - ASSERT(flen > 0); - - pag = xfs_perag_get(mp, agno); - spin_lock(&pag->pagb_lock); -restart: - rbp = pag->pagb_tree.rb_node; - while (rbp) { - struct xfs_busy_extent *busyp = - rb_entry(rbp, struct xfs_busy_extent, rb_node); - xfs_agblock_t bbno = busyp->bno; - xfs_agblock_t bend = bbno + busyp->length; - - if (fbno + flen <= bbno) { - rbp = rbp->rb_left; - continue; - } else if (fbno >= bend) { - rbp = rbp->rb_right; - continue; - } - - if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen, - userdata)) - goto restart; - } - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); -} - -/* - * For a given extent [fbno, flen], search the busy extent list to find a - * subset of the extent that is not busy. If *rlen is smaller than - * args->minlen no suitable extent could be found, and the higher level - * code needs to force out the log and retry the allocation. - */ -STATIC void -xfs_alloc_busy_trim( - struct xfs_alloc_arg *args, - xfs_agblock_t bno, - xfs_extlen_t len, - xfs_agblock_t *rbno, - xfs_extlen_t *rlen) -{ - xfs_agblock_t fbno; - xfs_extlen_t flen; - struct rb_node *rbp; - - ASSERT(len > 0); - - spin_lock(&args->pag->pagb_lock); -restart: - fbno = bno; - flen = len; - rbp = args->pag->pagb_tree.rb_node; - while (rbp && flen >= args->minlen) { - struct xfs_busy_extent *busyp = - rb_entry(rbp, struct xfs_busy_extent, rb_node); - xfs_agblock_t fend = fbno + flen; - xfs_agblock_t bbno = busyp->bno; - xfs_agblock_t bend = bbno + busyp->length; - - if (fend <= bbno) { - rbp = rbp->rb_left; - continue; - } else if (fbno >= bend) { - rbp = rbp->rb_right; - continue; - } - - /* - * If this is a metadata allocation, try to reuse the busy - * extent instead of trimming the allocation. - */ - if (!args->userdata && - !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) { - if (!xfs_alloc_busy_update_extent(args->mp, args->pag, - busyp, fbno, flen, - false)) - goto restart; - continue; - } - - if (bbno <= fbno) { - /* start overlap */ - - /* - * Case 1: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +---------+ - * fbno fend - * - * Case 2: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-------------+ - * fbno fend - * - * Case 3: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-------------+ - * fbno fend - * - * Case 4: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-----------------+ - * fbno fend - * - * No unbusy region in extent, return failure. - */ - if (fend <= bend) - goto fail; - - /* - * Case 5: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +----------------------+ - * fbno fend - * - * Case 6: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +--------------------------+ - * fbno fend - * - * Needs to be trimmed to: - * +-------+ - * fbno fend - */ - fbno = bend; - } else if (bend >= fend) { - /* end overlap */ - - /* - * Case 7: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +------------------+ - * fbno fend - * - * Case 8: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +--------------------------+ - * fbno fend - * - * Needs to be trimmed to: - * +-------+ - * fbno fend - */ - fend = bbno; - } else { - /* middle overlap */ - - /* - * Case 9: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-----------------------------------+ - * fbno fend - * - * Can be trimmed to: - * +-------+ OR +-------+ - * fbno fend fbno fend - * - * Backward allocation leads to significant - * fragmentation of directories, which degrades - * directory performance, therefore we always want to - * choose the option that produces forward allocation - * patterns. - * Preferring the lower bno extent will make the next - * request use "fend" as the start of the next - * allocation; if the segment is no longer busy at - * that point, we'll get a contiguous allocation, but - * even if it is still busy, we will get a forward - * allocation. - * We try to avoid choosing the segment at "bend", - * because that can lead to the next allocation - * taking the segment at "fbno", which would be a - * backward allocation. We only use the segment at - * "fbno" if it is much larger than the current - * requested size, because in that case there's a - * good chance subsequent allocations will be - * contiguous. - */ - if (bbno - fbno >= args->maxlen) { - /* left candidate fits perfect */ - fend = bbno; - } else if (fend - bend >= args->maxlen * 4) { - /* right candidate has enough free space */ - fbno = bend; - } else if (bbno - fbno >= args->minlen) { - /* left candidate fits minimum requirement */ - fend = bbno; - } else { - goto fail; - } - } - - flen = fend - fbno; - } - spin_unlock(&args->pag->pagb_lock); - - if (fbno != bno || flen != len) { - trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, - fbno, flen); - } - *rbno = fbno; - *rlen = flen; - return; -fail: - /* - * Return a zero extent length as failure indications. All callers - * re-check if the trimmed extent satisfies the minlen requirement. - */ - spin_unlock(&args->pag->pagb_lock); - trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0); - *rbno = fbno; - *rlen = 0; -} - -static void -xfs_alloc_busy_clear_one( - struct xfs_mount *mp, - struct xfs_perag *pag, - struct xfs_busy_extent *busyp) -{ - if (busyp->length) { - trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno, - busyp->length); - rb_erase(&busyp->rb_node, &pag->pagb_tree); - } - - list_del_init(&busyp->list); - kmem_free(busyp); -} - -/* - * Remove all extents on the passed in list from the busy extents tree. - * If do_discard is set skip extents that need to be discarded, and mark - * these as undergoing a discard operation instead. - */ -void -xfs_alloc_busy_clear( - struct xfs_mount *mp, - struct list_head *list, - bool do_discard) -{ - struct xfs_busy_extent *busyp, *n; - struct xfs_perag *pag = NULL; - xfs_agnumber_t agno = NULLAGNUMBER; - - list_for_each_entry_safe(busyp, n, list, list) { - if (busyp->agno != agno) { - if (pag) { - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); - } - pag = xfs_perag_get(mp, busyp->agno); - spin_lock(&pag->pagb_lock); - agno = busyp->agno; - } - - if (do_discard && busyp->length && - !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD)) - busyp->flags = XFS_ALLOC_BUSY_DISCARDED; - else - xfs_alloc_busy_clear_one(mp, pag, busyp); - } - - if (pag) { - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); - } -} - -/* - * Callback for list_sort to sort busy extents by the AG they reside in. - */ -int -xfs_busy_extent_ag_cmp( - void *priv, - struct list_head *a, - struct list_head *b) -{ - return container_of(a, struct xfs_busy_extent, list)->agno - - container_of(b, struct xfs_busy_extent, list)->agno; -} diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 2f52b924be7..93be4a667ca 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -23,7 +23,8 @@ struct xfs_btree_cur; struct xfs_mount; struct xfs_perag; struct xfs_trans; -struct xfs_busy_extent; + +extern struct workqueue_struct *xfs_alloc_wq; /* * Freespace allocation types. Argument to xfs_alloc_[v]extent. @@ -119,6 +120,9 @@ typedef struct xfs_alloc_arg { char isfl; /* set if is freelist blocks - !acctg */ char userdata; /* set if this is user data */ xfs_fsblock_t firstblock; /* io first block allocated */ + struct completion *done; + struct work_struct work; + int result; } xfs_alloc_arg_t; /* @@ -134,33 +138,6 @@ xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp, struct xfs_perag *pag); -#ifdef __KERNEL__ -void -xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); - -void -xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list, - bool do_discard); - -int -xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len); - -void -xfs_alloc_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); - -int -xfs_busy_extent_ag_cmp(void *priv, struct list_head *a, struct list_head *b); - -static inline void xfs_alloc_busy_sort(struct list_head *list) -{ - list_sort(NULL, list, xfs_busy_extent_ag_cmp); -} - -#endif /* __KERNEL__ */ - /* * Compute and fill in value of m_ag_maxlevels. */ @@ -243,6 +220,13 @@ xfs_alloc_lookup_le( xfs_extlen_t len, /* length of extent */ int *stat); /* success/failure */ +int /* error */ +xfs_alloc_lookup_ge( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* success/failure */ + int /* error */ xfs_alloc_get_rec( struct xfs_btree_cur *cur, /* btree cursor */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index ffb3386e45c..f1647caace8 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -32,6 +30,7 @@ #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_alloc.h" +#include "xfs_extent_busy.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -94,7 +93,7 @@ xfs_allocbt_alloc_block( return 0; } - xfs_alloc_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false); + xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false); xfs_trans_agbtree_delta(cur->bc_tp, 1); new->s = cpu_to_be32(bno); @@ -119,8 +118,8 @@ xfs_allocbt_free_block( if (error) return error; - xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, - XFS_ALLOC_BUSY_SKIP_DISCARD); + xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, + XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_trans_agbtree_delta(cur->bc_tp, -1); return 0; } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 574d4ee9b62..8dad722c004 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -16,9 +16,7 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_trans.h" @@ -26,9 +24,9 @@ #include "xfs_bmap_btree.h" #include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_inode_item.h" #include "xfs_alloc.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_iomap.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" @@ -99,24 +97,6 @@ xfs_destroy_ioend( } /* - * If the end of the current ioend is beyond the current EOF, - * return the new EOF value, otherwise zero. - */ -STATIC xfs_fsize_t -xfs_ioend_new_eof( - xfs_ioend_t *ioend) -{ - xfs_inode_t *ip = XFS_I(ioend->io_inode); - xfs_fsize_t isize; - xfs_fsize_t bsize; - - bsize = ioend->io_offset + ioend->io_size; - isize = MAX(ip->i_size, ip->i_new_size); - isize = MIN(isize, bsize); - return isize > ip->i_d.di_size ? isize : 0; -} - -/* * Fast and loose check if this write could update the on-disk inode size. */ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) @@ -125,36 +105,65 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) XFS_I(ioend->io_inode)->i_d.di_size; } +STATIC int +xfs_setfilesize_trans_alloc( + struct xfs_ioend *ioend) +{ + struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; + struct xfs_trans *tp; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + + error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + ioend->io_append_trans = tp; + + /* + * We hand off the transaction to the completion thread now, so + * clear the flag here. + */ + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + return 0; +} + /* - * Update on-disk file size now that data has been written to disk. The - * current in-memory file size is i_size. If a write is beyond eof i_new_size - * will be the intended file size until i_size is updated. If this write does - * not extend all the way to the valid file size then restrict this update to - * the end of the write. - * - * This function does not block as blocking on the inode lock in IO completion - * can lead to IO completion order dependency deadlocks.. If it can't get the - * inode ilock it will return EAGAIN. Callers must handle this. + * Update on-disk file size now that data has been written to disk. */ STATIC int xfs_setfilesize( - xfs_ioend_t *ioend) + struct xfs_ioend *ioend) { - xfs_inode_t *ip = XFS_I(ioend->io_inode); + struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_trans *tp = ioend->io_append_trans; xfs_fsize_t isize; - if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) - return EAGAIN; + /* + * The transaction was allocated in the I/O submission thread, + * thus we need to mark ourselves as beeing in a transaction + * manually. + */ + current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); - isize = xfs_ioend_new_eof(ioend); - if (isize) { - trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size); - ip->i_d.di_size = isize; - xfs_mark_inode_dirty(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); + isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size); + if (!isize) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_trans_cancel(tp, 0); + return 0; } - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return 0; + trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size); + + ip->i_d.di_size = isize; + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + return xfs_trans_commit(tp, 0); } /* @@ -168,10 +177,12 @@ xfs_finish_ioend( struct xfs_ioend *ioend) { if (atomic_dec_and_test(&ioend->io_remaining)) { + struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; + if (ioend->io_type == IO_UNWRITTEN) - queue_work(xfsconvertd_workqueue, &ioend->io_work); - else if (xfs_ioend_is_append(ioend)) - queue_work(xfsdatad_workqueue, &ioend->io_work); + queue_work(mp->m_unwritten_workqueue, &ioend->io_work); + else if (ioend->io_append_trans) + queue_work(mp->m_data_workqueue, &ioend->io_work); else xfs_destroy_ioend(ioend); } @@ -200,35 +211,36 @@ xfs_end_io( * range to normal written extens after the data I/O has finished. */ if (ioend->io_type == IO_UNWRITTEN) { + /* + * For buffered I/O we never preallocate a transaction when + * doing the unwritten extent conversion, but for direct I/O + * we do not know if we are converting an unwritten extent + * or not at the point where we preallocate the transaction. + */ + if (ioend->io_append_trans) { + ASSERT(ioend->io_isdirect); + + current_set_flags_nested( + &ioend->io_append_trans->t_pflags, PF_FSTRANS); + xfs_trans_cancel(ioend->io_append_trans, 0); + } + error = xfs_iomap_write_unwritten(ip, ioend->io_offset, ioend->io_size); if (error) { ioend->io_error = -error; goto done; } + } else if (ioend->io_append_trans) { + error = xfs_setfilesize(ioend); + if (error) + ioend->io_error = -error; + } else { + ASSERT(!xfs_ioend_is_append(ioend)); } - /* - * We might have to update the on-disk file size after extending - * writes. - */ - error = xfs_setfilesize(ioend); - ASSERT(!error || error == EAGAIN); - done: - /* - * If we didn't complete processing of the ioend, requeue it to the - * tail of the workqueue for another attempt later. Otherwise destroy - * it. - */ - if (error == EAGAIN) { - atomic_inc(&ioend->io_remaining); - xfs_finish_ioend(ioend); - /* ensure we don't spin on blocked ioends */ - delay(1); - } else { - xfs_destroy_ioend(ioend); - } + xfs_destroy_ioend(ioend); } /* @@ -264,6 +276,7 @@ xfs_alloc_ioend( */ atomic_set(&ioend->io_remaining, 1); ioend->io_isasync = 0; + ioend->io_isdirect = 0; ioend->io_error = 0; ioend->io_list = NULL; ioend->io_type = type; @@ -274,6 +287,7 @@ xfs_alloc_ioend( ioend->io_size = 0; ioend->io_iocb = NULL; ioend->io_result = 0; + ioend->io_append_trans = NULL; INIT_WORK(&ioend->io_work, xfs_end_io); return ioend; @@ -384,14 +398,6 @@ xfs_submit_ioend_bio( atomic_inc(&ioend->io_remaining); bio->bi_private = ioend; bio->bi_end_io = xfs_end_bio; - - /* - * If the I/O is beyond EOF we mark the inode dirty immediately - * but don't update the inode size until I/O completion. - */ - if (xfs_ioend_new_eof(ioend)) - xfs_mark_inode_dirty(XFS_I(ioend->io_inode)); - submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio); } @@ -614,7 +620,7 @@ xfs_map_at_offset( * or delayed allocate extent. */ STATIC int -xfs_is_delayed_page( +xfs_check_page_type( struct page *page, unsigned int type) { @@ -628,11 +634,11 @@ xfs_is_delayed_page( bh = head = page_buffers(page); do { if (buffer_unwritten(bh)) - acceptable = (type == IO_UNWRITTEN); + acceptable += (type == IO_UNWRITTEN); else if (buffer_delay(bh)) - acceptable = (type == IO_DELALLOC); + acceptable += (type == IO_DELALLOC); else if (buffer_dirty(bh) && buffer_mapped(bh)) - acceptable = (type == IO_OVERWRITE); + acceptable += (type == IO_OVERWRITE); else break; } while ((bh = bh->b_this_page) != head); @@ -675,7 +681,7 @@ xfs_convert_page( goto fail_unlock_page; if (page->mapping != inode->i_mapping) goto fail_unlock_page; - if (!xfs_is_delayed_page(page, (*ioendp)->io_type)) + if (!xfs_check_page_type(page, (*ioendp)->io_type)) goto fail_unlock_page; /* @@ -825,7 +831,7 @@ xfs_aops_discard_page( struct buffer_head *bh, *head; loff_t offset = page_offset(page); - if (!xfs_is_delayed_page(page, IO_DELALLOC)) + if (!xfs_check_page_type(page, IO_DELALLOC)) goto out_invalidate; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) @@ -975,10 +981,15 @@ xfs_vm_writepage( imap_valid = 0; } } else { - if (PageUptodate(page)) { + if (PageUptodate(page)) ASSERT(buffer_mapped(bh)); - imap_valid = 0; - } + /* + * This buffer is not uptodate and will not be + * written to disk. Ensure that we will put any + * subsequent writeable buffers into a new + * ioend. + */ + imap_valid = 0; continue; } @@ -1038,8 +1049,20 @@ xfs_vm_writepage( wbc, end_index); } - if (iohead) + if (iohead) { + /* + * Reserve log space if we might write beyond the on-disk + * inode size. + */ + if (ioend->io_type != IO_UNWRITTEN && + xfs_ioend_is_append(ioend)) { + err = xfs_setfilesize_trans_alloc(ioend); + if (err) + goto error; + } + xfs_submit_ioend(wbc, iohead); + } return 0; @@ -1125,7 +1148,14 @@ __xfs_get_blocks( if (!create && direct && offset >= i_size_read(inode)) return 0; - if (create) { + /* + * Direct I/O is usually done on preallocated files, so try getting + * a block mapping without an exclusive lock first. For buffered + * writes we already have the exclusive iolock anyway, so avoiding + * a lock roundtrip here by taking the ilock exclusive from the + * beginning is a useful micro optimization. + */ + if (create && !direct) { lockmode = XFS_ILOCK_EXCL; xfs_ilock(ip, lockmode); } else { @@ -1147,23 +1177,45 @@ __xfs_get_blocks( (!nimaps || (imap.br_startblock == HOLESTARTBLOCK || imap.br_startblock == DELAYSTARTBLOCK))) { - if (direct) { + if (direct || xfs_get_extsz_hint(ip)) { + /* + * Drop the ilock in preparation for starting the block + * allocation transaction. It will be retaken + * exclusively inside xfs_iomap_write_direct for the + * actual allocation. + */ + xfs_iunlock(ip, lockmode); error = xfs_iomap_write_direct(ip, offset, size, &imap, nimaps); + if (error) + return -error; + new = 1; } else { + /* + * Delalloc reservations do not require a transaction, + * we can go on without dropping the lock here. If we + * are allocating a new delalloc block, make sure that + * we set the new flag so that we mark the buffer new so + * that we know that it is newly allocated if the write + * fails. + */ + if (nimaps && imap.br_startblock == HOLESTARTBLOCK) + new = 1; error = xfs_iomap_write_delay(ip, offset, size, &imap); + if (error) + goto out_unlock; + + xfs_iunlock(ip, lockmode); } - if (error) - goto out_unlock; trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap); } else if (nimaps) { trace_xfs_get_blocks_found(ip, offset, size, 0, &imap); + xfs_iunlock(ip, lockmode); } else { trace_xfs_get_blocks_notfound(ip, offset, size); goto out_unlock; } - xfs_iunlock(ip, lockmode); if (imap.br_startblock != HOLESTARTBLOCK && imap.br_startblock != DELAYSTARTBLOCK) { @@ -1279,6 +1331,15 @@ xfs_end_io_direct_write( struct xfs_ioend *ioend = iocb->private; /* + * While the generic direct I/O code updates the inode size, it does + * so only after the end_io handler is called, which means our + * end_io handler thinks the on-disk size is outside the in-core + * size. To prevent this just update it a little bit earlier here. + */ + if (offset + size > i_size_read(ioend->io_inode)) + i_size_write(ioend->io_inode, offset + size); + + /* * blockdev_direct_IO can return an error even after the I/O * completion handler was called. Thus we need to protect * against double-freeing. @@ -1310,17 +1371,32 @@ xfs_vm_direct_IO( { struct inode *inode = iocb->ki_filp->f_mapping->host; struct block_device *bdev = xfs_find_bdev_for_inode(inode); + struct xfs_ioend *ioend = NULL; ssize_t ret; if (rw & WRITE) { - iocb->private = xfs_alloc_ioend(inode, IO_DIRECT); + size_t size = iov_length(iov, nr_segs); + + /* + * We need to preallocate a transaction for a size update + * here. In the case that this write both updates the size + * and converts at least on unwritten extent we will cancel + * the still clean transaction after the I/O has finished. + */ + iocb->private = ioend = xfs_alloc_ioend(inode, IO_DIRECT); + if (offset + size > XFS_I(inode)->i_d.di_size) { + ret = xfs_setfilesize_trans_alloc(ioend); + if (ret) + goto out_destroy_ioend; + ioend->io_isdirect = 1; + } ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, xfs_get_blocks_direct, xfs_end_io_direct_write, NULL, 0); if (ret != -EIOCBQUEUED && iocb->private) - xfs_destroy_ioend(iocb->private); + goto out_trans_cancel; } else { ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, @@ -1329,55 +1405,103 @@ xfs_vm_direct_IO( } return ret; + +out_trans_cancel: + if (ioend->io_append_trans) { + current_set_flags_nested(&ioend->io_append_trans->t_pflags, + PF_FSTRANS); + xfs_trans_cancel(ioend->io_append_trans, 0); + } +out_destroy_ioend: + xfs_destroy_ioend(ioend); + return ret; +} + +/* + * Punch out the delalloc blocks we have already allocated. + * + * Don't bother with xfs_setattr given that nothing can have made it to disk yet + * as the page is still locked at this point. + */ +STATIC void +xfs_vm_kill_delalloc_range( + struct inode *inode, + loff_t start, + loff_t end) +{ + struct xfs_inode *ip = XFS_I(inode); + xfs_fileoff_t start_fsb; + xfs_fileoff_t end_fsb; + int error; + + start_fsb = XFS_B_TO_FSB(ip->i_mount, start); + end_fsb = XFS_B_TO_FSB(ip->i_mount, end); + if (end_fsb <= start_fsb) + return; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, + end_fsb - start_fsb); + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_alert(ip->i_mount, + "xfs_vm_write_failed: unable to clean up ino %lld", + ip->i_ino); + } + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); } STATIC void xfs_vm_write_failed( - struct address_space *mapping, - loff_t to) + struct inode *inode, + struct page *page, + loff_t pos, + unsigned len) { - struct inode *inode = mapping->host; + loff_t block_offset = pos & PAGE_MASK; + loff_t block_start; + loff_t block_end; + loff_t from = pos & (PAGE_CACHE_SIZE - 1); + loff_t to = from + len; + struct buffer_head *bh, *head; - if (to > inode->i_size) { - /* - * punch out the delalloc blocks we have already allocated. We - * don't call xfs_setattr() to do this as we may be in the - * middle of a multi-iovec write and so the vfs inode->i_size - * will not match the xfs ip->i_size and so it will zero too - * much. Hence we jus truncate the page cache to zero what is - * necessary and punch the delalloc blocks directly. - */ - struct xfs_inode *ip = XFS_I(inode); - xfs_fileoff_t start_fsb; - xfs_fileoff_t end_fsb; - int error; + ASSERT(block_offset + from == pos); - truncate_pagecache(inode, to, inode->i_size); + head = page_buffers(page); + block_start = 0; + for (bh = head; bh != head || !block_start; + bh = bh->b_this_page, block_start = block_end, + block_offset += bh->b_size) { + block_end = block_start + bh->b_size; - /* - * Check if there are any blocks that are outside of i_size - * that need to be trimmed back. - */ - start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1; - end_fsb = XFS_B_TO_FSB(ip->i_mount, to); - if (end_fsb <= start_fsb) - return; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - end_fsb - start_fsb); - if (error) { - /* something screwed, just bail */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_alert(ip->i_mount, - "xfs_vm_write_failed: unable to clean up ino %lld", - ip->i_ino); - } - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* skip buffers before the write */ + if (block_end <= from) + continue; + + /* if the buffer is after the write, we're done */ + if (block_start >= to) + break; + + if (!buffer_delay(bh)) + continue; + + if (!buffer_new(bh) && block_offset < i_size_read(inode)) + continue; + + xfs_vm_kill_delalloc_range(inode, block_offset, + block_offset + bh->b_size); } + } +/* + * This used to call block_write_begin(), but it unlocks and releases the page + * on error, and we need that page to be able to punch stale delalloc blocks out + * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at + * the appropriate point. + */ STATIC int xfs_vm_write_begin( struct file *file, @@ -1388,15 +1512,40 @@ xfs_vm_write_begin( struct page **pagep, void **fsdata) { - int ret; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int status; - ret = block_write_begin(mapping, pos, len, flags | AOP_FLAG_NOFS, - pagep, xfs_get_blocks); - if (unlikely(ret)) - xfs_vm_write_failed(mapping, pos + len); - return ret; + ASSERT(len <= PAGE_CACHE_SIZE); + + page = grab_cache_page_write_begin(mapping, index, + flags | AOP_FLAG_NOFS); + if (!page) + return -ENOMEM; + + status = __block_write_begin(page, pos, len, xfs_get_blocks); + if (unlikely(status)) { + struct inode *inode = mapping->host; + + xfs_vm_write_failed(inode, page, pos, len); + unlock_page(page); + + if (pos + len > i_size_read(inode)) + truncate_pagecache(inode, pos + len, i_size_read(inode)); + + page_cache_release(page); + page = NULL; + } + + *pagep = page; + return status; } +/* + * On failure, we only need to kill delalloc blocks beyond EOF because they + * will never be written. For blocks within EOF, generic_write_end() zeros them + * so they are safe to leave alone and be written with all the other valid data. + */ STATIC int xfs_vm_write_end( struct file *file, @@ -1409,9 +1558,19 @@ xfs_vm_write_end( { int ret; + ASSERT(len <= PAGE_CACHE_SIZE); + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (unlikely(ret < len)) - xfs_vm_write_failed(mapping, pos + len); + if (unlikely(ret < len)) { + struct inode *inode = mapping->host; + size_t isize = i_size_read(inode); + loff_t to = pos + len; + + if (to > isize) { + truncate_pagecache(inode, to, isize); + xfs_vm_kill_delalloc_range(inode, isize, to); + } + } return ret; } diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 116dd5c3703..84eafbcb0d9 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -18,8 +18,6 @@ #ifndef __XFS_AOPS_H__ #define __XFS_AOPS_H__ -extern struct workqueue_struct *xfsdatad_workqueue; -extern struct workqueue_struct *xfsconvertd_workqueue; extern mempool_t *xfs_ioend_pool; /* @@ -48,12 +46,14 @@ typedef struct xfs_ioend { int io_error; /* I/O error code */ atomic_t io_remaining; /* hold count */ unsigned int io_isasync : 1; /* needs aio_complete */ + unsigned int io_isdirect : 1;/* direct I/O */ struct inode *io_inode; /* file being written to */ struct buffer_head *io_buffer_head;/* buffer linked list head */ struct buffer_head *io_buffer_tail;/* buffer linked list tail */ size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ struct work_struct io_work; /* xfsdatad work queue */ + struct xfs_trans *io_append_trans;/* xact. for size update */ struct kiocb *io_iocb; int io_result; } xfs_ioend_t; diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 1e5d97f86ea..a17ff01b5ad 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -21,7 +21,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -39,7 +38,6 @@ #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trans_space.h" -#include "xfs_rw.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" @@ -827,10 +825,6 @@ xfs_attr_inactive(xfs_inode_t *dp) if (error) goto out; - /* - * Commit the last in the sequence of transactions. - */ - xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); @@ -857,6 +851,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) { int newsize, forkoff, retval; + trace_xfs_attr_sf_addname(args); + retval = xfs_attr_shortform_lookup(args); if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { return(retval); @@ -900,6 +896,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) xfs_dabuf_t *bp; int retval, error, committed, forkoff; + trace_xfs_attr_leaf_addname(args); + /* * Read the (only) block in the attribute list in. */ @@ -924,6 +922,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) xfs_da_brelse(args->trans, bp); return(retval); } + + trace_xfs_attr_leaf_replace(args); + args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; @@ -1094,6 +1095,8 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) xfs_dabuf_t *bp; int error, committed, forkoff; + trace_xfs_attr_leaf_removename(args); + /* * Remove the attribute. */ @@ -1227,6 +1230,8 @@ xfs_attr_node_addname(xfs_da_args_t *args) xfs_mount_t *mp; int committed, retval, error; + trace_xfs_attr_node_addname(args); + /* * Fill in bucket of arguments/results/context to carry around. */ @@ -1253,6 +1258,9 @@ restart: } else if (retval == EEXIST) { if (args->flags & ATTR_CREATE) goto out; + + trace_xfs_attr_node_replace(args); + args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; @@ -1484,6 +1492,8 @@ xfs_attr_node_removename(xfs_da_args_t *args) xfs_dabuf_t *bp; int retval, error, committed, forkoff; + trace_xfs_attr_node_removename(args); + /* * Tie a string around our finger to remind us where we are. */ @@ -1975,14 +1985,12 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) (map[i].br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno, - blkcnt, XBF_LOCK | XBF_DONT_BLOCK, - &bp); + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + dblkno, blkcnt, 0, &bp); if (error) return(error); - tmp = (valuelen < XFS_BUF_SIZE(bp)) - ? valuelen : XFS_BUF_SIZE(bp); + tmp = min_t(int, valuelen, BBTOB(bp->b_length)); xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ); xfs_buf_relse(bp); dst += tmp; @@ -2085,6 +2093,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) lblkno = args->rmtblkno; valuelen = args->valuelen; while (valuelen > 0) { + int buflen; + /* * Try to remember where we decided to put the value. */ @@ -2102,15 +2112,16 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, - XBF_LOCK | XBF_DONT_BLOCK); + bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0); if (!bp) return ENOMEM; - tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : - XFS_BUF_SIZE(bp); + + buflen = BBTOB(bp->b_length); + tmp = min_t(int, valuelen, buflen); xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); - if (tmp < XFS_BUF_SIZE(bp)) - xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); + if (tmp < buflen) + xfs_buf_zero(bp, tmp, buflen - tmp); + error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ xfs_buf_relse(bp); if (error) diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index c1b55e59655..7d89d800f51 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -235,6 +234,8 @@ xfs_attr_shortform_create(xfs_da_args_t *args) xfs_inode_t *dp; xfs_ifork_t *ifp; + trace_xfs_attr_sf_create(args); + dp = args->dp; ASSERT(dp != NULL); ifp = dp->i_afp; @@ -268,13 +269,11 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) xfs_inode_t *dp; xfs_ifork_t *ifp; + trace_xfs_attr_sf_add(args); + dp = args->dp; mp = dp->i_mount; dp->i_d.di_forkoff = forkoff; - dp->i_df.if_ext_max = - XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); - dp->i_afp->if_ext_max = - XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); ifp = dp->i_afp; ASSERT(ifp->if_flags & XFS_IFINLINE); @@ -326,7 +325,6 @@ xfs_attr_fork_reset( ASSERT(ip->i_d.di_anextents == 0); ASSERT(ip->i_afp == NULL); - ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -342,6 +340,8 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) xfs_mount_t *mp; xfs_inode_t *dp; + trace_xfs_attr_sf_remove(args); + dp = args->dp; mp = dp->i_mount; base = sizeof(xfs_attr_sf_hdr_t); @@ -389,10 +389,6 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) (args->op_flags & XFS_DA_OP_ADDNAME) || !(mp->m_flags & XFS_MOUNT_ATTR2) || dp->i_d.di_format == XFS_DINODE_FMT_BTREE); - dp->i_afp->if_ext_max = - XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); - dp->i_df.if_ext_max = - XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); } @@ -414,6 +410,8 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) int i; xfs_ifork_t *ifp; + trace_xfs_attr_sf_lookup(args); + ifp = args->dp->i_afp; ASSERT(ifp->if_flags & XFS_IFINLINE); sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; @@ -485,6 +483,8 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) xfs_dabuf_t *bp; xfs_ifork_t *ifp; + trace_xfs_attr_sf_to_leaf(args); + dp = args->dp; ifp = dp->i_afp; sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; @@ -784,6 +784,8 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) char *tmpbuffer; int error, i; + trace_xfs_attr_leaf_to_sf(args); + dp = args->dp; tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP); ASSERT(tmpbuffer != NULL); @@ -857,6 +859,8 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) xfs_dablk_t blkno; int error; + trace_xfs_attr_leaf_to_node(args); + dp = args->dp; bp1 = bp2 = NULL; error = xfs_da_grow_inode(args, &blkno); @@ -920,6 +924,8 @@ xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp) xfs_dabuf_t *bp; int error; + trace_xfs_attr_leaf_create(args); + dp = args->dp; ASSERT(dp != NULL); error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp, @@ -957,6 +963,8 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, xfs_dablk_t blkno; int error; + trace_xfs_attr_leaf_split(state->args); + /* * Allocate space for a new leaf node. */ @@ -986,10 +994,13 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * * Insert the "new" entry in the correct block. */ - if (state->inleaf) + if (state->inleaf) { + trace_xfs_attr_leaf_add_old(state->args); error = xfs_attr_leaf_add(oldblk->bp, state->args); - else + } else { + trace_xfs_attr_leaf_add_new(state->args); error = xfs_attr_leaf_add(newblk->bp, state->args); + } /* * Update last hashval in each block since we added the name. @@ -1010,6 +1021,8 @@ xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args) xfs_attr_leaf_map_t *map; int tablesize, entsize, sum, tmp, i; + trace_xfs_attr_leaf_add(args); + leaf = bp->data; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT((args->index >= 0) @@ -1137,8 +1150,6 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval))); /* - * Copy the attribute name and value into the new space. - * * For "remote" attribute values, simply note that we need to * allocate space for the "remote" value. We can't actually * allocate the extents in this transaction, and we can't decide @@ -1274,6 +1285,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); args = state->args; + trace_xfs_attr_leaf_rebalance(args); + /* * Check ordering of blocks, reverse if it makes things simpler. * @@ -1819,6 +1832,8 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, xfs_mount_t *mp; char *tmpbuffer; + trace_xfs_attr_leaf_unbalance(state->args); + /* * Set up environment. */ @@ -1928,6 +1943,8 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) int probe, span; xfs_dahash_t hashval; + trace_xfs_attr_leaf_lookup(args); + leaf = bp->data; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(be16_to_cpu(leaf->hdr.count) @@ -2454,6 +2471,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) char *name; #endif /* DEBUG */ + trace_xfs_attr_leaf_clearflag(args); /* * Set up the operation. */ @@ -2518,6 +2536,8 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) xfs_dabuf_t *bp; int error; + trace_xfs_attr_leaf_setflag(args); + /* * Set up the operation. */ @@ -2574,6 +2594,8 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) char *name1, *name2; #endif /* DEBUG */ + trace_xfs_attr_leaf_flipflags(args); + /* * Read the block containing the "old" attr */ @@ -2960,7 +2982,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, map.br_blockcount); bp = xfs_trans_get_buf(*trans, dp->i_mount->m_ddev_targp, - dblkno, dblkcnt, XBF_LOCK); + dblkno, dblkcnt, 0); if (!bp) return ENOMEM; xfs_trans_binval(*trans, bp); diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index d0ab7883705..58b815ec8c9 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -41,7 +41,6 @@ #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_attr_leaf.h" -#include "xfs_rw.h" #include "xfs_quota.h" #include "xfs_trans_space.h" #include "xfs_buf_item.h" @@ -249,7 +248,27 @@ xfs_bmbt_lookup_ge( } /* -* Update the record referred to by cur to the value given + * Check if the inode needs to be converted to btree format. + */ +static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) +{ + return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) > + XFS_IFORK_MAXEXT(ip, whichfork); +} + +/* + * Check if the inode should be converted to extent format. + */ +static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork) +{ + return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + XFS_IFORK_NEXTENTS(ip, whichfork) <= + XFS_IFORK_MAXEXT(ip, whichfork); +} + +/* + * Update the record referred to by cur to the value given * by [off, bno, len, state]. * This either works (return 0) or gets an EFSCORRUPTED error. */ @@ -683,8 +702,8 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); @@ -767,8 +786,8 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); @@ -836,8 +855,8 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); @@ -884,8 +903,7 @@ xfs_bmap_add_extent_delay_real( } /* convert to a btree if necessary */ - if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) { + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { int tmp_logflags; /* partial log flag return val */ ASSERT(bma->cur == NULL); @@ -1421,8 +1439,7 @@ xfs_bmap_add_extent_unwritten_real( } /* convert to a btree if necessary */ - if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) { + if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) { int tmp_logflags; /* partial log flag return val */ ASSERT(cur == NULL); @@ -1812,8 +1829,7 @@ xfs_bmap_add_extent_hole_real( } /* convert to a btree if necessary */ - if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) { + if (xfs_bmap_needs_btree(bma->ip, whichfork)) { int tmp_logflags; /* partial log flag return val */ ASSERT(bma->cur == NULL); @@ -3037,8 +3053,7 @@ xfs_bmap_extents_to_btree( ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + /* * Make space in the inode incore. */ @@ -3184,13 +3199,8 @@ xfs_bmap_forkoff_reset( ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; - if (dfl_forkoff > ip->i_d.di_forkoff) { + if (dfl_forkoff > ip->i_d.di_forkoff) ip->i_d.di_forkoff = dfl_forkoff; - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t); - ip->i_afp->if_ext_max = - XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t); - } } } @@ -3430,8 +3440,6 @@ xfs_bmap_add_attrfork( int error; /* error return value */ ASSERT(XFS_IFORK_Q(ip) == 0); - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); mp = ip->i_mount; ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); @@ -3486,12 +3494,9 @@ xfs_bmap_add_attrfork( error = XFS_ERROR(EINVAL); goto error1; } - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(ip->i_afp == NULL); ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); - ip->i_afp->if_ext_max = - XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); ip->i_afp->if_flags = XFS_IFEXTENTS; logflags = 0; xfs_bmap_init(&flist, &firstblock); @@ -3535,20 +3540,17 @@ xfs_bmap_add_attrfork( } else spin_unlock(&mp->m_sb_lock); } - if ((error = xfs_bmap_finish(&tp, &flist, &committed))) + + error = xfs_bmap_finish(&tp, &flist, &committed); + if (error) goto error2; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); - return error; + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); error2: xfs_bmap_cancel(&flist); error1: xfs_iunlock(ip, XFS_ILOCK_EXCL); error0: xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); return error; } @@ -3994,11 +3996,8 @@ xfs_bmap_one_block( xfs_bmbt_irec_t s; /* internal version of extent */ #ifndef DEBUG - if (whichfork == XFS_DATA_FORK) { - return S_ISREG(ip->i_d.di_mode) ? - (ip->i_size == ip->i_mount->m_sb.sb_blocksize) : - (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize); - } + if (whichfork == XFS_DATA_FORK) + return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize; #endif /* !DEBUG */ if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) return 0; @@ -4010,7 +4009,7 @@ xfs_bmap_one_block( xfs_bmbt_get_all(ep, &s); rval = s.br_startoff == 0 && s.br_blockcount == 1; if (rval && whichfork == XFS_DATA_FORK) - ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize); + ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize); return rval; } @@ -4379,8 +4378,6 @@ xfs_bmapi_read( XFS_STATS_INC(xs_blk_mapr); ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(NULL, ip, whichfork); @@ -4529,7 +4526,7 @@ out_unreserve_blocks: xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0); out_unreserve_quota: if (XFS_IS_QUOTA_ON(mp)) - xfs_trans_unreserve_quota_nblks(NULL, ip, alen, 0, rt ? + xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); return error; } @@ -4871,8 +4868,6 @@ xfs_bmapi_write( return XFS_ERROR(EIO); ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); XFS_STATS_INC(xs_blk_mapw); @@ -4981,8 +4976,7 @@ xfs_bmapi_write( /* * Transform from btree to extents, give it cur. */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && - XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { + if (xfs_bmap_wants_extents(ip, whichfork)) { int tmp_logflags = 0; ASSERT(bma.cur); @@ -4992,10 +4986,10 @@ xfs_bmapi_write( if (error) goto error0; } - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || - XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max); + XFS_IFORK_NEXTENTS(ip, whichfork) > + XFS_IFORK_MAXEXT(ip, whichfork)); error = 0; error0: /* @@ -5095,8 +5089,7 @@ xfs_bunmapi( ASSERT(len > 0); ASSERT(nexts >= 0); - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + if (!(ifp->if_flags & XFS_IFEXTENTS) && (error = xfs_iread_extents(tp, ip, whichfork))) return error; @@ -5130,6 +5123,15 @@ xfs_bunmapi( cur->bc_private.b.flags = 0; } else cur = NULL; + + if (isrt) { + /* + * Synchronize by locking the bitmap inode. + */ + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + } + extno = 0; while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 && (nexts == 0 || extno < nexts)) { @@ -5322,7 +5324,8 @@ xfs_bunmapi( */ if (!wasdel && xfs_trans_get_block_res(tp) == 0 && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max && + XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */ + XFS_IFORK_MAXEXT(ip, whichfork) && del.br_startoff > got.br_startoff && del.br_startoff + del.br_blockcount < got.br_startoff + got.br_blockcount) { @@ -5353,13 +5356,11 @@ nodelete: } } *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + /* * Convert to a btree if necessary. */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { + if (xfs_bmap_needs_btree(ip, whichfork)) { ASSERT(cur == NULL); error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0, &tmp_logflags, whichfork); @@ -5370,8 +5371,7 @@ nodelete: /* * transform from btree to extents, give it cur */ - else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && - XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { + else if (xfs_bmap_wants_extents(ip, whichfork)) { ASSERT(cur != NULL); error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, whichfork); @@ -5382,8 +5382,6 @@ nodelete: /* * transform from extents to local? */ - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); error = 0; error0: /* @@ -5434,7 +5432,7 @@ xfs_getbmapx_fix_eof_hole( if (startblock == HOLESTARTBLOCK) { mp = ip->i_mount; out->bmv_block = -1; - fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, ip->i_size)); + fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip))); fixlen -= out->bmv_offset; if (prealloced && out->bmv_offset + out->bmv_length == end) { /* Came to hole at EOF. Trim it. */ @@ -5522,7 +5520,7 @@ xfs_getbmap( fixlen = XFS_MAXIOFFSET(mp); } else { prealloced = 0; - fixlen = ip->i_size; + fixlen = XFS_ISIZE(ip); } } @@ -5546,12 +5544,16 @@ xfs_getbmap( if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx)) return XFS_ERROR(ENOMEM); out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL); - if (!out) - return XFS_ERROR(ENOMEM); + if (!out) { + out = kmem_zalloc_large(bmv->bmv_count * + sizeof(struct getbmapx)); + if (!out) + return XFS_ERROR(ENOMEM); + } xfs_ilock(ip, XFS_IOLOCK_SHARED); if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { - if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) { + if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); if (error) goto out_unlock_iolock; @@ -5618,8 +5620,20 @@ xfs_getbmap( XFS_FSB_TO_BB(mp, map[i].br_blockcount); out[cur_ext].bmv_unused1 = 0; out[cur_ext].bmv_unused2 = 0; - ASSERT(((iflags & BMV_IF_DELALLOC) != 0) || - (map[i].br_startblock != DELAYSTARTBLOCK)); + + /* + * delayed allocation extents that start beyond EOF can + * occur due to speculative EOF allocation when the + * delalloc extent is larger than the largest freespace + * extent at conversion time. These extents cannot be + * converted by data writeback, so can exist here even + * if we are not supposed to be finding delalloc + * extents. + */ + if (map[i].br_startblock == DELAYSTARTBLOCK && + map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) + ASSERT((iflags & BMV_IF_DELALLOC) != 0); + if (map[i].br_startblock == HOLESTARTBLOCK && whichfork == XFS_ATTR_FORK) { /* came to the end of attribute fork */ @@ -5671,7 +5685,10 @@ xfs_getbmap( break; } - kmem_free(out); + if (is_vmalloc_addr(out)) + kmem_free_large(out); + else + kmem_free(out); return error; } @@ -6151,3 +6168,16 @@ next_block: return error; } + +/* + * Convert the given file system block to a disk block. We have to treat it + * differently based on whether the file is a real time file or not, because the + * bmap code does. + */ +xfs_daddr_t +xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) +{ + return (XFS_IS_REALTIME_INODE(ip) ? \ + (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ + XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); +} diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 89ee672d378..803b56d7ce1 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -211,6 +211,9 @@ int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, int *count); int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, xfs_fileoff_t start_fsb, xfs_fileoff_t length); + +xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb); + #endif /* __KERNEL__ */ #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index e2f5d59cbea..862084a47a7 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 1f19f03af9d..e53e317b158 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index cf0ac056815..269b35c084d 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -35,18 +35,14 @@ #include <linux/freezer.h> #include "xfs_sb.h" -#include "xfs_inum.h" #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trace.h" static kmem_zone_t *xfs_buf_zone; -STATIC int xfsbufd(void *); static struct workqueue_struct *xfslogd_workqueue; -struct workqueue_struct *xfsdatad_workqueue; -struct workqueue_struct *xfsconvertd_workqueue; #ifdef XFS_BUF_LOCK_TRACKING # define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) @@ -59,11 +55,7 @@ struct workqueue_struct *xfsconvertd_workqueue; #endif #define xb_to_gfp(flags) \ - ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \ - ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN) - -#define xb_to_km(flags) \ - (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) + ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN) static inline int @@ -73,11 +65,11 @@ xfs_buf_is_vmapped( /* * Return true if the buffer is vmapped. * - * The XBF_MAPPED flag is set if the buffer should be mapped, but the - * code is clever enough to know it doesn't have to map a single page, - * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1. + * b_addr is null if the buffer is not mapped, but the code is clever + * enough to know it doesn't have to map a single page, so the check has + * to be both for b_addr and bp->b_page_count > 1. */ - return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1; + return bp->b_addr && bp->b_page_count > 1; } static inline int @@ -146,8 +138,17 @@ void xfs_buf_stale( struct xfs_buf *bp) { + ASSERT(xfs_buf_islocked(bp)); + bp->b_flags |= XBF_STALE; - xfs_buf_delwri_dequeue(bp); + + /* + * Clear the delwri status so that a delwri queue walker will not + * flush this buffer to disk now that it is stale. The delwri queue has + * a reference to the buffer, so this is safe to do. + */ + bp->b_flags &= ~_XBF_DELWRI_Q; + atomic_set(&(bp)->b_lru_ref, 0); if (!list_empty(&bp->b_lru)) { struct xfs_buftarg *btp = bp->b_target; @@ -166,22 +167,22 @@ xfs_buf_stale( struct xfs_buf * xfs_buf_alloc( struct xfs_buftarg *target, - xfs_off_t range_base, - size_t range_length, + xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags) { struct xfs_buf *bp; - bp = kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)); + bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS); if (unlikely(!bp)) return NULL; /* - * We don't want certain flags to appear in b_flags. + * We don't want certain flags to appear in b_flags unless they are + * specifically set by later operations on the buffer. */ - flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD); + flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); - memset(bp, 0, sizeof(xfs_buf_t)); atomic_set(&bp->b_hold, 1); atomic_set(&bp->b_lru_ref, 1); init_completion(&bp->b_iowait); @@ -191,15 +192,16 @@ xfs_buf_alloc( sema_init(&bp->b_sema, 0); /* held, no waiters */ XB_SET_OWNER(bp); bp->b_target = target; - bp->b_file_offset = range_base; + /* - * Set buffer_length and count_desired to the same value initially. - * I/O routines should use count_desired, which will be the same in + * Set length and io_length to the same value initially. + * I/O routines should use io_length, which will be the same in * most cases but may be reset (e.g. XFS recovery). */ - bp->b_buffer_length = bp->b_count_desired = range_length; + bp->b_length = numblks; + bp->b_io_length = numblks; bp->b_flags = flags; - bp->b_bn = XFS_BUF_DADDR_NULL; + bp->b_bn = blkno; atomic_set(&bp->b_pin_count, 0); init_waitqueue_head(&bp->b_waiters); @@ -221,13 +223,12 @@ _xfs_buf_get_pages( { /* Make sure that we have a page list */ if (bp->b_pages == NULL) { - bp->b_offset = xfs_buf_poff(bp->b_file_offset); bp->b_page_count = page_count; if (page_count <= XB_PAGES) { bp->b_pages = bp->b_page_array; } else { bp->b_pages = kmem_alloc(sizeof(struct page *) * - page_count, xb_to_km(flags)); + page_count, KM_NOFS); if (bp->b_pages == NULL) return -ENOMEM; } @@ -290,11 +291,11 @@ xfs_buf_allocate_memory( xfs_buf_t *bp, uint flags) { - size_t size = bp->b_count_desired; + size_t size; size_t nbytes, offset; gfp_t gfp_mask = xb_to_gfp(flags); unsigned short page_count, i; - xfs_off_t end; + xfs_off_t start, end; int error; /* @@ -302,15 +303,15 @@ xfs_buf_allocate_memory( * the memory from the heap - there's no need for the complexity of * page arrays to keep allocation down to order 0. */ - if (bp->b_buffer_length < PAGE_SIZE) { - bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags)); + size = BBTOB(bp->b_length); + if (size < PAGE_SIZE) { + bp->b_addr = kmem_alloc(size, KM_NOFS); if (!bp->b_addr) { /* low memory - use alloc_page loop instead */ goto use_alloc_page; } - if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) & - PAGE_MASK) != + if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != ((unsigned long)bp->b_addr & PAGE_MASK)) { /* b_addr spans two pages - use alloc_page instead */ kmem_free(bp->b_addr); @@ -321,13 +322,14 @@ xfs_buf_allocate_memory( bp->b_pages = bp->b_page_array; bp->b_pages[0] = virt_to_page(bp->b_addr); bp->b_page_count = 1; - bp->b_flags |= XBF_MAPPED | _XBF_KMEM; + bp->b_flags |= _XBF_KMEM; return 0; } use_alloc_page: - end = bp->b_file_offset + bp->b_buffer_length; - page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); + start = BBTOB(bp->b_bn) >> PAGE_SHIFT; + end = (BBTOB(bp->b_bn + bp->b_length) + PAGE_SIZE - 1) >> PAGE_SHIFT; + page_count = end - start; error = _xfs_buf_get_pages(bp, page_count, flags); if (unlikely(error)) return error; @@ -390,8 +392,9 @@ _xfs_buf_map_pages( if (bp->b_page_count == 1) { /* A single page buffer is always mappable */ bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; - bp->b_flags |= XBF_MAPPED; - } else if (flags & XBF_MAPPED) { + } else if (flags & XBF_UNMAPPED) { + bp->b_addr = NULL; + } else { int retried = 0; do { @@ -405,7 +408,6 @@ _xfs_buf_map_pages( if (!bp->b_addr) return -ENOMEM; bp->b_addr += bp->b_offset; - bp->b_flags |= XBF_MAPPED; } return 0; @@ -422,29 +424,27 @@ _xfs_buf_map_pages( */ xfs_buf_t * _xfs_buf_find( - xfs_buftarg_t *btp, /* block device target */ - xfs_off_t ioff, /* starting offset of range */ - size_t isize, /* length of range */ + struct xfs_buftarg *btp, + xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags, xfs_buf_t *new_bp) { - xfs_off_t range_base; - size_t range_length; + size_t numbytes; struct xfs_perag *pag; struct rb_node **rbp; struct rb_node *parent; xfs_buf_t *bp; - range_base = (ioff << BBSHIFT); - range_length = (isize << BBSHIFT); + numbytes = BBTOB(numblks); /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(range_length < (1 << btp->bt_sshift))); - ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); + ASSERT(!(numbytes < (1 << btp->bt_sshift))); + ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); /* get tree root */ pag = xfs_perag_get(btp->bt_mount, - xfs_daddr_to_agno(btp->bt_mount, ioff)); + xfs_daddr_to_agno(btp->bt_mount, blkno)); /* walk tree */ spin_lock(&pag->pag_buf_lock); @@ -455,20 +455,20 @@ _xfs_buf_find( parent = *rbp; bp = rb_entry(parent, struct xfs_buf, b_rbnode); - if (range_base < bp->b_file_offset) + if (blkno < bp->b_bn) rbp = &(*rbp)->rb_left; - else if (range_base > bp->b_file_offset) + else if (blkno > bp->b_bn) rbp = &(*rbp)->rb_right; else { /* - * found a block offset match. If the range doesn't + * found a block number match. If the range doesn't * match, the only way this is allowed is if the buffer * in the cache is stale and the transaction that made * it stale has not yet committed. i.e. we are * reallocating a busy extent. Skip this buffer and * continue searching to the right for an exact match. */ - if (bp->b_buffer_length != range_length) { + if (bp->b_length != numblks) { ASSERT(bp->b_flags & XBF_STALE); rbp = &(*rbp)->rb_right; continue; @@ -513,7 +513,7 @@ found: */ if (bp->b_flags & XBF_STALE) { ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); - bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES; + bp->b_flags &= _XBF_KMEM | _XBF_PAGES; } trace_xfs_buf_find(bp, flags, _RET_IP_); @@ -528,63 +528,54 @@ found: */ struct xfs_buf * xfs_buf_get( - xfs_buftarg_t *target,/* target for buffer */ - xfs_off_t ioff, /* starting offset of range */ - size_t isize, /* length of range */ + xfs_buftarg_t *target, + xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags) { struct xfs_buf *bp; struct xfs_buf *new_bp; int error = 0; - bp = _xfs_buf_find(target, ioff, isize, flags, NULL); + bp = _xfs_buf_find(target, blkno, numblks, flags, NULL); if (likely(bp)) goto found; - new_bp = xfs_buf_alloc(target, ioff << BBSHIFT, isize << BBSHIFT, - flags); + new_bp = xfs_buf_alloc(target, blkno, numblks, flags); if (unlikely(!new_bp)) return NULL; - bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); - if (!bp) { + error = xfs_buf_allocate_memory(new_bp, flags); + if (error) { kmem_zone_free(xfs_buf_zone, new_bp); return NULL; } - if (bp == new_bp) { - error = xfs_buf_allocate_memory(bp, flags); - if (error) - goto no_buffer; - } else - kmem_zone_free(xfs_buf_zone, new_bp); + bp = _xfs_buf_find(target, blkno, numblks, flags, new_bp); + if (!bp) { + xfs_buf_free(new_bp); + return NULL; + } - /* - * Now we have a workable buffer, fill in the block number so - * that we can do IO on it. - */ - bp->b_bn = ioff; - bp->b_count_desired = bp->b_buffer_length; + if (bp != new_bp) + xfs_buf_free(new_bp); + + bp->b_io_length = bp->b_length; found: - if (!(bp->b_flags & XBF_MAPPED)) { + if (!bp->b_addr) { error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { xfs_warn(target->bt_mount, "%s: failed to map pages\n", __func__); - goto no_buffer; + xfs_buf_relse(bp); + return NULL; } } XFS_STATS_INC(xb_get); trace_xfs_buf_get(bp, flags, _RET_IP_); return bp; - -no_buffer: - if (flags & (XBF_LOCK | XBF_TRYLOCK)) - xfs_buf_unlock(bp); - xfs_buf_rele(bp); - return NULL; } STATIC int @@ -592,32 +583,30 @@ _xfs_buf_read( xfs_buf_t *bp, xfs_buf_flags_t flags) { - int status; - - ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE))); + ASSERT(!(flags & XBF_WRITE)); ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); - bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | XBF_READ_AHEAD); + bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); - status = xfs_buf_iorequest(bp); - if (status || bp->b_error || (flags & XBF_ASYNC)) - return status; + xfs_buf_iorequest(bp); + if (flags & XBF_ASYNC) + return 0; return xfs_buf_iowait(bp); } xfs_buf_t * xfs_buf_read( xfs_buftarg_t *target, - xfs_off_t ioff, - size_t isize, + xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags) { xfs_buf_t *bp; flags |= XBF_READ; - bp = xfs_buf_get(target, ioff, isize, flags); + bp = xfs_buf_get(target, blkno, numblks, flags); if (bp) { trace_xfs_buf_read(bp, flags, _RET_IP_); @@ -629,7 +618,8 @@ xfs_buf_read( * Read ahead call which is already satisfied, * drop the buffer */ - goto no_buffer; + xfs_buf_relse(bp); + return NULL; } else { /* We do not want read in the flags */ bp->b_flags &= ~XBF_READ; @@ -637,12 +627,6 @@ xfs_buf_read( } return bp; - - no_buffer: - if (flags & (XBF_LOCK | XBF_TRYLOCK)) - xfs_buf_unlock(bp); - xfs_buf_rele(bp); - return NULL; } /* @@ -652,14 +636,14 @@ xfs_buf_read( void xfs_buf_readahead( xfs_buftarg_t *target, - xfs_off_t ioff, - size_t isize) + xfs_daddr_t blkno, + size_t numblks) { if (bdi_read_congested(target->bt_bdi)) return; - xfs_buf_read(target, ioff, isize, - XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK); + xfs_buf_read(target, blkno, numblks, + XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); } /* @@ -668,16 +652,15 @@ xfs_buf_readahead( */ struct xfs_buf * xfs_buf_read_uncached( - struct xfs_mount *mp, struct xfs_buftarg *target, xfs_daddr_t daddr, - size_t length, + size_t numblks, int flags) { xfs_buf_t *bp; int error; - bp = xfs_buf_get_uncached(target, length, flags); + bp = xfs_buf_get_uncached(target, numblks, flags); if (!bp) return NULL; @@ -685,9 +668,9 @@ xfs_buf_read_uncached( XFS_BUF_SET_ADDR(bp, daddr); XFS_BUF_READ(bp); - xfsbdstrat(mp, bp); + xfsbdstrat(target->bt_mount, bp); error = xfs_buf_iowait(bp); - if (error || bp->b_error) { + if (error) { xfs_buf_relse(bp); return NULL; } @@ -701,7 +684,7 @@ xfs_buf_read_uncached( void xfs_buf_set_empty( struct xfs_buf *bp, - size_t len) + size_t numblks) { if (bp->b_pages) _xfs_buf_free_pages(bp); @@ -709,10 +692,9 @@ xfs_buf_set_empty( bp->b_pages = NULL; bp->b_page_count = 0; bp->b_addr = NULL; - bp->b_file_offset = 0; - bp->b_buffer_length = bp->b_count_desired = len; + bp->b_length = numblks; + bp->b_io_length = numblks; bp->b_bn = XFS_BUF_DADDR_NULL; - bp->b_flags &= ~XBF_MAPPED; } static inline struct page * @@ -751,7 +733,7 @@ xfs_buf_associate_memory( bp->b_pages = NULL; bp->b_addr = mem; - rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK); + rval = _xfs_buf_get_pages(bp, page_count, 0); if (rval) return rval; @@ -762,9 +744,8 @@ xfs_buf_associate_memory( pageaddr += PAGE_SIZE; } - bp->b_count_desired = len; - bp->b_buffer_length = buflen; - bp->b_flags |= XBF_MAPPED; + bp->b_io_length = BTOBB(len); + bp->b_length = BTOBB(buflen); return 0; } @@ -772,17 +753,18 @@ xfs_buf_associate_memory( xfs_buf_t * xfs_buf_get_uncached( struct xfs_buftarg *target, - size_t len, + size_t numblks, int flags) { - unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; + unsigned long page_count; int error, i; xfs_buf_t *bp; - bp = xfs_buf_alloc(target, 0, len, 0); + bp = xfs_buf_alloc(target, XFS_BUF_DADDR_NULL, numblks, 0); if (unlikely(bp == NULL)) goto fail; + page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT; error = _xfs_buf_get_pages(bp, page_count, 0); if (error) goto fail_free_buf; @@ -794,7 +776,7 @@ xfs_buf_get_uncached( } bp->b_flags |= _XBF_PAGES; - error = _xfs_buf_map_pages(bp, XBF_MAPPED); + error = _xfs_buf_map_pages(bp, 0); if (unlikely(error)) { xfs_warn(target->bt_mount, "%s: failed to map pages\n", __func__); @@ -857,7 +839,7 @@ xfs_buf_rele( spin_unlock(&pag->pag_buf_lock); } else { xfs_buf_lru_del(bp); - ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); spin_unlock(&pag->pag_buf_lock); xfs_perag_put(pag); @@ -917,13 +899,6 @@ xfs_buf_lock( trace_xfs_buf_lock_done(bp, _RET_IP_); } -/* - * Releases the lock on the buffer object. - * If the buffer is marked delwri but is not queued, do so before we - * unlock the buffer as we need to set flags correctly. We also need to - * take a reference for the delwri queue because the unlocker is going to - * drop their's and they don't know we just queued it. - */ void xfs_buf_unlock( struct xfs_buf *bp) @@ -1010,29 +985,8 @@ xfs_buf_ioerror_alert( const char *func) { xfs_alert(bp->b_target->bt_mount, -"metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zd", - (__uint64_t)XFS_BUF_ADDR(bp), func, - bp->b_error, XFS_BUF_COUNT(bp)); -} - -int -xfs_bwrite( - struct xfs_buf *bp) -{ - int error; - - bp->b_flags |= XBF_WRITE; - bp->b_flags &= ~(XBF_ASYNC | XBF_READ); - - xfs_buf_delwri_dequeue(bp); - xfs_bdstrat_cb(bp); - - error = xfs_buf_iowait(bp); - if (error) { - xfs_force_shutdown(bp->b_target->bt_mount, - SHUTDOWN_META_IO_ERROR); - } - return error; +"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d", + (__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length); } /* @@ -1104,14 +1058,7 @@ xfs_bioerror_relse( return EIO; } - -/* - * All xfs metadata buffers except log state machine buffers - * get this attached as their b_bdstrat callback function. - * This is so that we can catch a buffer - * after prematurely unpinning it to forcibly shutdown the filesystem. - */ -int +STATIC int xfs_bdstrat_cb( struct xfs_buf *bp) { @@ -1132,6 +1079,27 @@ xfs_bdstrat_cb( return 0; } +int +xfs_bwrite( + struct xfs_buf *bp) +{ + int error; + + ASSERT(xfs_buf_islocked(bp)); + + bp->b_flags |= XBF_WRITE; + bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q); + + xfs_bdstrat_cb(bp); + + error = xfs_buf_iowait(bp); + if (error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_META_IO_ERROR); + } + return error; +} + /* * Wrapper around bdstrat so that we can stop data from going to disk in case * we are shutting down the filesystem. Typically user data goes thru this @@ -1183,7 +1151,7 @@ _xfs_buf_ioapply( int rw, map_i, total_nr_pages, nr_pages; struct bio *bio; int offset = bp->b_offset; - int size = bp->b_count_desired; + int size = BBTOB(bp->b_io_length); sector_t sector = bp->b_bn; total_nr_pages = bp->b_page_count; @@ -1231,7 +1199,7 @@ next_chunk: break; offset = 0; - sector += nbytes >> BBSHIFT; + sector += BTOBB(nbytes); size -= nbytes; total_nr_pages--; } @@ -1250,13 +1218,13 @@ next_chunk: } } -int +void xfs_buf_iorequest( xfs_buf_t *bp) { trace_xfs_buf_iorequest(bp, _RET_IP_); - ASSERT(!(bp->b_flags & XBF_DELWRI)); + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); if (bp->b_flags & XBF_WRITE) xfs_buf_wait_unpin(bp); @@ -1268,16 +1236,15 @@ xfs_buf_iorequest( */ atomic_set(&bp->b_io_remaining, 1); _xfs_buf_ioapply(bp); - _xfs_buf_ioend(bp, 0); + _xfs_buf_ioend(bp, 1); xfs_buf_rele(bp); - return 0; } /* - * Waits for I/O to complete on the buffer supplied. - * It returns immediately if no I/O is pending. - * It returns the I/O error code, if any, or 0 if there was no error. + * Waits for I/O to complete on the buffer supplied. It returns immediately if + * no I/O is pending or there is already a pending error on the buffer. It + * returns the I/O error code, if any, or 0 if there was no error. */ int xfs_buf_iowait( @@ -1285,7 +1252,8 @@ xfs_buf_iowait( { trace_xfs_buf_iowait(bp, _RET_IP_); - wait_for_completion(&bp->b_iowait); + if (!bp->b_error) + wait_for_completion(&bp->b_iowait); trace_xfs_buf_iowait_done(bp, _RET_IP_); return bp->b_error; @@ -1298,7 +1266,7 @@ xfs_buf_offset( { struct page *page; - if (bp->b_flags & XBF_MAPPED) + if (bp->b_addr) return bp->b_addr + offset; offset += bp->b_offset; @@ -1317,27 +1285,30 @@ xfs_buf_iomove( void *data, /* data address */ xfs_buf_rw_t mode) /* read/write/zero flag */ { - size_t bend, cpoff, csize; - struct page *page; + size_t bend; bend = boff + bsize; while (boff < bend) { - page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; - cpoff = xfs_buf_poff(boff + bp->b_offset); - csize = min_t(size_t, - PAGE_SIZE-cpoff, bp->b_count_desired-boff); + struct page *page; + int page_index, page_offset, csize; + + page_index = (boff + bp->b_offset) >> PAGE_SHIFT; + page_offset = (boff + bp->b_offset) & ~PAGE_MASK; + page = bp->b_pages[page_index]; + csize = min_t(size_t, PAGE_SIZE - page_offset, + BBTOB(bp->b_io_length) - boff); - ASSERT(((csize + cpoff) <= PAGE_SIZE)); + ASSERT((csize + page_offset) <= PAGE_SIZE); switch (mode) { case XBRW_ZERO: - memset(page_address(page) + cpoff, 0, csize); + memset(page_address(page) + page_offset, 0, csize); break; case XBRW_READ: - memcpy(data, page_address(page) + cpoff, csize); + memcpy(data, page_address(page) + page_offset, csize); break; case XBRW_WRITE: - memcpy(page_address(page) + cpoff, data, csize); + memcpy(page_address(page) + page_offset, data, csize); } boff += csize; @@ -1370,7 +1341,7 @@ restart: goto restart; } /* - * clear the LRU reference count so the bufer doesn't get + * clear the LRU reference count so the buffer doesn't get * ignored in xfs_buf_rele(). */ atomic_set(&bp->b_lru_ref, 0); @@ -1437,11 +1408,9 @@ xfs_free_buftarg( { unregister_shrinker(&btp->bt_shrinker); - xfs_flush_buftarg(btp, 1); if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_blkdev_issue_flush(btp); - kthread_stop(btp->bt_task); kmem_free(btp); } @@ -1493,20 +1462,6 @@ xfs_setsize_buftarg( return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); } -STATIC int -xfs_alloc_delwri_queue( - xfs_buftarg_t *btp, - const char *fsname) -{ - INIT_LIST_HEAD(&btp->bt_delwri_queue); - spin_lock_init(&btp->bt_delwri_lock); - btp->bt_flags = 0; - btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); - if (IS_ERR(btp->bt_task)) - return PTR_ERR(btp->bt_task); - return 0; -} - xfs_buftarg_t * xfs_alloc_buftarg( struct xfs_mount *mp, @@ -1529,8 +1484,6 @@ xfs_alloc_buftarg( spin_lock_init(&btp->bt_lru_lock); if (xfs_setsize_buftarg_early(btp, bdev)) goto error; - if (xfs_alloc_delwri_queue(btp, fsname)) - goto error; btp->bt_shrinker.shrink = xfs_buftarg_shrink; btp->bt_shrinker.seeks = DEFAULT_SEEKS; register_shrinker(&btp->bt_shrinker); @@ -1541,125 +1494,52 @@ error: return NULL; } - /* - * Delayed write buffer handling + * Add a buffer to the delayed write list. + * + * This queues a buffer for writeout if it hasn't already been. Note that + * neither this routine nor the buffer list submission functions perform + * any internal synchronization. It is expected that the lists are thread-local + * to the callers. + * + * Returns true if we queued up the buffer, or false if it already had + * been on the buffer list. */ -void +bool xfs_buf_delwri_queue( - xfs_buf_t *bp) + struct xfs_buf *bp, + struct list_head *list) { - struct xfs_buftarg *btp = bp->b_target; - - trace_xfs_buf_delwri_queue(bp, _RET_IP_); - + ASSERT(xfs_buf_islocked(bp)); ASSERT(!(bp->b_flags & XBF_READ)); - spin_lock(&btp->bt_delwri_lock); - if (!list_empty(&bp->b_list)) { - /* if already in the queue, move it to the tail */ - ASSERT(bp->b_flags & _XBF_DELWRI_Q); - list_move_tail(&bp->b_list, &btp->bt_delwri_queue); - } else { - /* start xfsbufd as it is about to have something to do */ - if (list_empty(&btp->bt_delwri_queue)) - wake_up_process(bp->b_target->bt_task); - - atomic_inc(&bp->b_hold); - bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC; - list_add_tail(&bp->b_list, &btp->bt_delwri_queue); - } - bp->b_queuetime = jiffies; - spin_unlock(&btp->bt_delwri_lock); -} - -void -xfs_buf_delwri_dequeue( - xfs_buf_t *bp) -{ - int dequeued = 0; - - spin_lock(&bp->b_target->bt_delwri_lock); - if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) { - ASSERT(bp->b_flags & _XBF_DELWRI_Q); - list_del_init(&bp->b_list); - dequeued = 1; + /* + * If the buffer is already marked delwri it already is queued up + * by someone else for imediate writeout. Just ignore it in that + * case. + */ + if (bp->b_flags & _XBF_DELWRI_Q) { + trace_xfs_buf_delwri_queued(bp, _RET_IP_); + return false; } - bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); - spin_unlock(&bp->b_target->bt_delwri_lock); - - if (dequeued) - xfs_buf_rele(bp); - trace_xfs_buf_delwri_dequeue(bp, _RET_IP_); -} - -/* - * If a delwri buffer needs to be pushed before it has aged out, then promote - * it to the head of the delwri queue so that it will be flushed on the next - * xfsbufd run. We do this by resetting the queuetime of the buffer to be older - * than the age currently needed to flush the buffer. Hence the next time the - * xfsbufd sees it is guaranteed to be considered old enough to flush. - */ -void -xfs_buf_delwri_promote( - struct xfs_buf *bp) -{ - struct xfs_buftarg *btp = bp->b_target; - long age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1; - - ASSERT(bp->b_flags & XBF_DELWRI); - ASSERT(bp->b_flags & _XBF_DELWRI_Q); + trace_xfs_buf_delwri_queue(bp, _RET_IP_); /* - * Check the buffer age before locking the delayed write queue as we - * don't need to promote buffers that are already past the flush age. + * If a buffer gets written out synchronously or marked stale while it + * is on a delwri list we lazily remove it. To do this, the other party + * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone. + * It remains referenced and on the list. In a rare corner case it + * might get readded to a delwri list after the synchronous writeout, in + * which case we need just need to re-add the flag here. */ - if (bp->b_queuetime < jiffies - age) - return; - bp->b_queuetime = jiffies - age; - spin_lock(&btp->bt_delwri_lock); - list_move(&bp->b_list, &btp->bt_delwri_queue); - spin_unlock(&btp->bt_delwri_lock); -} - -/* - * Move as many buffers as specified to the supplied list - * idicating if we skipped any buffers to prevent deadlocks. - */ -STATIC int -xfs_buf_delwri_split( - xfs_buftarg_t *target, - struct list_head *list, - unsigned long age) -{ - xfs_buf_t *bp, *n; - int skipped = 0; - int force; - - force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); - INIT_LIST_HEAD(list); - spin_lock(&target->bt_delwri_lock); - list_for_each_entry_safe(bp, n, &target->bt_delwri_queue, b_list) { - ASSERT(bp->b_flags & XBF_DELWRI); - - if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) { - if (!force && - time_before(jiffies, bp->b_queuetime + age)) { - xfs_buf_unlock(bp); - break; - } - - bp->b_flags &= ~(XBF_DELWRI | _XBF_DELWRI_Q); - bp->b_flags |= XBF_WRITE; - list_move_tail(&bp->b_list, list); - trace_xfs_buf_delwri_split(bp, _RET_IP_); - } else - skipped++; + bp->b_flags |= _XBF_DELWRI_Q; + if (list_empty(&bp->b_list)) { + atomic_inc(&bp->b_hold); + list_add_tail(&bp->b_list, list); } - spin_unlock(&target->bt_delwri_lock); - return skipped; + return true; } /* @@ -1685,103 +1565,109 @@ xfs_buf_cmp( return 0; } -STATIC int -xfsbufd( - void *data) +static int +__xfs_buf_delwri_submit( + struct list_head *buffer_list, + struct list_head *io_list, + bool wait) { - xfs_buftarg_t *target = (xfs_buftarg_t *)data; - - current->flags |= PF_MEMALLOC; - - set_freezable(); - - do { - long age = xfs_buf_age_centisecs * msecs_to_jiffies(10); - long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10); - struct list_head tmp; - struct blk_plug plug; - - if (unlikely(freezing(current))) { - set_bit(XBT_FORCE_SLEEP, &target->bt_flags); - refrigerator(); + struct blk_plug plug; + struct xfs_buf *bp, *n; + int pinned = 0; + + list_for_each_entry_safe(bp, n, buffer_list, b_list) { + if (!wait) { + if (xfs_buf_ispinned(bp)) { + pinned++; + continue; + } + if (!xfs_buf_trylock(bp)) + continue; } else { - clear_bit(XBT_FORCE_SLEEP, &target->bt_flags); + xfs_buf_lock(bp); } - /* sleep for a long time if there is nothing to do. */ - if (list_empty(&target->bt_delwri_queue)) - tout = MAX_SCHEDULE_TIMEOUT; - schedule_timeout_interruptible(tout); + /* + * Someone else might have written the buffer synchronously or + * marked it stale in the meantime. In that case only the + * _XBF_DELWRI_Q flag got cleared, and we have to drop the + * reference and remove it from the list here. + */ + if (!(bp->b_flags & _XBF_DELWRI_Q)) { + list_del_init(&bp->b_list); + xfs_buf_relse(bp); + continue; + } + + list_move_tail(&bp->b_list, io_list); + trace_xfs_buf_delwri_split(bp, _RET_IP_); + } - xfs_buf_delwri_split(target, &tmp, age); - list_sort(NULL, &tmp, xfs_buf_cmp); + list_sort(NULL, io_list, xfs_buf_cmp); - blk_start_plug(&plug); - while (!list_empty(&tmp)) { - struct xfs_buf *bp; - bp = list_first_entry(&tmp, struct xfs_buf, b_list); + blk_start_plug(&plug); + list_for_each_entry_safe(bp, n, io_list, b_list) { + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC); + bp->b_flags |= XBF_WRITE; + + if (!wait) { + bp->b_flags |= XBF_ASYNC; list_del_init(&bp->b_list); - xfs_bdstrat_cb(bp); } - blk_finish_plug(&plug); - } while (!kthread_should_stop()); + xfs_bdstrat_cb(bp); + } + blk_finish_plug(&plug); - return 0; + return pinned; } /* - * Go through all incore buffers, and release buffers if they belong to - * the given device. This is used in filesystem error handling to - * preserve the consistency of its metadata. + * Write out a buffer list asynchronously. + * + * This will take the @buffer_list, write all non-locked and non-pinned buffers + * out and not wait for I/O completion on any of the buffers. This interface + * is only safely useable for callers that can track I/O completion by higher + * level means, e.g. AIL pushing as the @buffer_list is consumed in this + * function. */ int -xfs_flush_buftarg( - xfs_buftarg_t *target, - int wait) +xfs_buf_delwri_submit_nowait( + struct list_head *buffer_list) { - xfs_buf_t *bp; - int pincount = 0; - LIST_HEAD(tmp_list); - LIST_HEAD(wait_list); - struct blk_plug plug; + LIST_HEAD (io_list); + return __xfs_buf_delwri_submit(buffer_list, &io_list, false); +} - flush_workqueue(xfslogd_workqueue); +/* + * Write out a buffer list synchronously. + * + * This will take the @buffer_list, write all buffers out and wait for I/O + * completion on all of the buffers. @buffer_list is consumed by the function, + * so callers must have some other way of tracking buffers if they require such + * functionality. + */ +int +xfs_buf_delwri_submit( + struct list_head *buffer_list) +{ + LIST_HEAD (io_list); + int error = 0, error2; + struct xfs_buf *bp; - set_bit(XBT_FORCE_FLUSH, &target->bt_flags); - pincount = xfs_buf_delwri_split(target, &tmp_list, 0); + __xfs_buf_delwri_submit(buffer_list, &io_list, true); - /* - * Dropped the delayed write list lock, now walk the temporary list. - * All I/O is issued async and then if we need to wait for completion - * we do that after issuing all the IO. - */ - list_sort(NULL, &tmp_list, xfs_buf_cmp); + /* Wait for IO to complete. */ + while (!list_empty(&io_list)) { + bp = list_first_entry(&io_list, struct xfs_buf, b_list); - blk_start_plug(&plug); - while (!list_empty(&tmp_list)) { - bp = list_first_entry(&tmp_list, struct xfs_buf, b_list); - ASSERT(target == bp->b_target); list_del_init(&bp->b_list); - if (wait) { - bp->b_flags &= ~XBF_ASYNC; - list_add(&bp->b_list, &wait_list); - } - xfs_bdstrat_cb(bp); - } - blk_finish_plug(&plug); - - if (wait) { - /* Wait for IO to complete. */ - while (!list_empty(&wait_list)) { - bp = list_first_entry(&wait_list, struct xfs_buf, b_list); - - list_del_init(&bp->b_list); - xfs_buf_iowait(bp); - xfs_buf_relse(bp); - } + error2 = xfs_buf_iowait(bp); + xfs_buf_relse(bp); + if (!error) + error = error2; } - return pincount; + return error; } int __init @@ -1797,21 +1683,8 @@ xfs_buf_init(void) if (!xfslogd_workqueue) goto out_free_buf_zone; - xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1); - if (!xfsdatad_workqueue) - goto out_destroy_xfslogd_workqueue; - - xfsconvertd_workqueue = alloc_workqueue("xfsconvertd", - WQ_MEM_RECLAIM, 1); - if (!xfsconvertd_workqueue) - goto out_destroy_xfsdatad_workqueue; - return 0; - out_destroy_xfsdatad_workqueue: - destroy_workqueue(xfsdatad_workqueue); - out_destroy_xfslogd_workqueue: - destroy_workqueue(xfslogd_workqueue); out_free_buf_zone: kmem_zone_destroy(xfs_buf_zone); out: @@ -1821,8 +1694,6 @@ xfs_buf_init(void) void xfs_buf_terminate(void) { - destroy_workqueue(xfsconvertd_workqueue); - destroy_workqueue(xfsdatad_workqueue); destroy_workqueue(xfslogd_workqueue); kmem_zone_destroy(xfs_buf_zone); } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 5bab046e859..79344c48008 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -21,7 +21,6 @@ #include <linux/list.h> #include <linux/types.h> #include <linux/spinlock.h> -#include <asm/system.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/buffer_head.h> @@ -33,11 +32,6 @@ #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) -#define xfs_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE) -#define xfs_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -#define xfs_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT) -#define xfs_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK) - typedef enum { XBRW_READ = 1, /* transfer into target memory */ XBRW_WRITE = 2, /* transfer from target memory */ @@ -47,11 +41,9 @@ typedef enum { #define XBF_READ (1 << 0) /* buffer intended for reading from device */ #define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ #define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ -#define XBF_MAPPED (1 << 3) /* buffer mapped (b_addr valid) */ #define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ #define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ -#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */ -#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */ +#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ /* I/O hints for the BIO layer */ #define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ @@ -59,14 +51,13 @@ typedef enum { #define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */ /* flags used only as arguments to access routines */ -#define XBF_LOCK (1 << 15)/* lock requested */ #define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ -#define XBF_DONT_BLOCK (1 << 17)/* do not block in current thread */ +#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */ /* flags used only internally */ #define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ #define _XBF_KMEM (1 << 21)/* backed by heap memory */ -#define _XBF_DELWRI_Q (1 << 22)/* buffer on delwri queue */ +#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ typedef unsigned int xfs_buf_flags_t; @@ -74,26 +65,18 @@ typedef unsigned int xfs_buf_flags_t; { XBF_READ, "READ" }, \ { XBF_WRITE, "WRITE" }, \ { XBF_READ_AHEAD, "READ_AHEAD" }, \ - { XBF_MAPPED, "MAPPED" }, \ { XBF_ASYNC, "ASYNC" }, \ { XBF_DONE, "DONE" }, \ - { XBF_DELWRI, "DELWRI" }, \ { XBF_STALE, "STALE" }, \ { XBF_SYNCIO, "SYNCIO" }, \ { XBF_FUA, "FUA" }, \ { XBF_FLUSH, "FLUSH" }, \ - { XBF_LOCK, "LOCK" }, /* should never be set */\ - { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\ - { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\ + { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\ + { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\ { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" } -typedef enum { - XBT_FORCE_SLEEP = 0, - XBT_FORCE_FLUSH = 1, -} xfs_buftarg_flags_t; - typedef struct xfs_buftarg { dev_t bt_dev; struct block_device *bt_bdev; @@ -103,12 +86,6 @@ typedef struct xfs_buftarg { unsigned int bt_sshift; size_t bt_smask; - /* per device delwri queue */ - struct task_struct *bt_task; - struct list_head bt_delwri_queue; - spinlock_t bt_delwri_lock; - unsigned long bt_flags; - /* LRU control structures */ struct shrinker bt_shrinker; struct list_head bt_lru; @@ -130,8 +107,8 @@ typedef struct xfs_buf { * fast-path on locking. */ struct rb_node b_rbnode; /* rbtree node */ - xfs_off_t b_file_offset; /* offset in file */ - size_t b_buffer_length;/* size of buffer in bytes */ + xfs_daddr_t b_bn; /* block number for I/O */ + int b_length; /* size of buffer in BBs */ atomic_t b_hold; /* reference count */ atomic_t b_lru_ref; /* lru reclaim ref count */ xfs_buf_flags_t b_flags; /* status flags */ @@ -142,8 +119,6 @@ typedef struct xfs_buf { struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ xfs_buftarg_t *b_target; /* buffer target (device) */ - xfs_daddr_t b_bn; /* block number for I/O */ - size_t b_count_desired;/* desired transfer size */ void *b_addr; /* virtual address of buffer */ struct work_struct b_iodone_work; xfs_buf_iodone_t b_iodone; /* I/O completion function */ @@ -152,7 +127,7 @@ typedef struct xfs_buf { struct xfs_trans *b_transp; struct page **b_pages; /* array of page pointers */ struct page *b_page_array[XB_PAGES]; /* inline pages */ - unsigned long b_queuetime; /* time buffer was queued */ + int b_io_length; /* IO size in BBs */ atomic_t b_pin_count; /* pin count */ atomic_t b_io_remaining; /* #outstanding I/O requests */ unsigned int b_page_count; /* size of page array */ @@ -165,26 +140,30 @@ typedef struct xfs_buf { /* Finding and Reading Buffers */ -extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t, - xfs_buf_flags_t, xfs_buf_t *); +struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags, + struct xfs_buf *new_bp); #define xfs_incore(buftarg,blkno,len,lockit) \ _xfs_buf_find(buftarg, blkno ,len, lockit, NULL) -extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t, - xfs_buf_flags_t); -extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t, - xfs_buf_flags_t); - -struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *, xfs_off_t, size_t, - xfs_buf_flags_t); -extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len); -extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); -extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); -extern void xfs_buf_hold(xfs_buf_t *); -extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t); -struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp, - struct xfs_buftarg *target, - xfs_daddr_t daddr, size_t length, int flags); +struct xfs_buf *xfs_buf_get(struct xfs_buftarg *target, xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags); +struct xfs_buf *xfs_buf_read(struct xfs_buftarg *target, xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags); +void xfs_buf_readahead(struct xfs_buftarg *target, xfs_daddr_t blkno, + size_t numblks); + +struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); +struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *target, xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags); +void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks); +int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length); + +struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, + int flags); +struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, + xfs_daddr_t daddr, size_t numblks, int flags); +void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ extern void xfs_buf_free(xfs_buf_t *); @@ -201,12 +180,11 @@ extern void xfs_buf_unlock(xfs_buf_t *); extern int xfs_bwrite(struct xfs_buf *bp); extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); -extern int xfs_bdstrat_cb(struct xfs_buf *); extern void xfs_buf_ioend(xfs_buf_t *, int); extern void xfs_buf_ioerror(xfs_buf_t *, int); extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); -extern int xfs_buf_iorequest(xfs_buf_t *); +extern void xfs_buf_iorequest(xfs_buf_t *); extern int xfs_buf_iowait(xfs_buf_t *); extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, xfs_buf_rw_t); @@ -222,24 +200,22 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp) extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); /* Delayed Write Buffer Routines */ -extern void xfs_buf_delwri_queue(struct xfs_buf *); -extern void xfs_buf_delwri_dequeue(struct xfs_buf *); -extern void xfs_buf_delwri_promote(struct xfs_buf *); +extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); +extern int xfs_buf_delwri_submit(struct list_head *); +extern int xfs_buf_delwri_submit_nowait(struct list_head *); /* Buffer Daemon Setup Routines */ extern int xfs_buf_init(void); extern void xfs_buf_terminate(void); #define XFS_BUF_ZEROFLAGS(bp) \ - ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \ + ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \ XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) -#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) - #define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) #define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) #define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) @@ -258,12 +234,6 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_ADDR(bp) ((bp)->b_bn) #define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) -#define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset) -#define XFS_BUF_SET_OFFSET(bp, off) ((bp)->b_file_offset = (off)) -#define XFS_BUF_COUNT(bp) ((bp)->b_count_desired) -#define XFS_BUF_SET_COUNT(bp, cnt) ((bp)->b_count_desired = (cnt)) -#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) -#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) { @@ -289,7 +259,6 @@ extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); -extern int xfs_flush_buftarg(xfs_buftarg_t *, int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index eac97ef81e2..d9e451115f9 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -123,11 +122,11 @@ xfs_buf_item_log_check( ASSERT(bip->bli_logged != NULL); bp = bip->bli_buf; - ASSERT(XFS_BUF_COUNT(bp) > 0); + ASSERT(bp->b_length > 0); ASSERT(bp->b_addr != NULL); orig = bip->bli_orig; buffer = bp->b_addr; - for (x = 0; x < XFS_BUF_COUNT(bp); x++) { + for (x = 0; x < BBTOB(bp->b_length); x++) { if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) { xfs_emerg(bp->b_mount, "%s: bip %x buffer %x orig %x index %d", @@ -418,7 +417,6 @@ xfs_buf_item_unpin( if (freed && stale) { ASSERT(bip->bli_flags & XFS_BLI_STALE); ASSERT(xfs_buf_islocked(bp)); - ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); @@ -455,42 +453,42 @@ xfs_buf_item_unpin( bp->b_iodone = NULL; } else { spin_lock(&ailp->xa_lock); - xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); + xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); ASSERT(bp->b_fspriv == NULL); } xfs_buf_relse(bp); + } else if (freed && remove) { + xfs_buf_lock(bp); + xfs_buf_ioerror(bp, EIO); + XFS_BUF_UNDONE(bp); + xfs_buf_stale(bp); + xfs_buf_ioend(bp, 0); } } -/* - * This is called to attempt to lock the buffer associated with this - * buf log item. Don't sleep on the buffer lock. If we can't get - * the lock right away, return 0. If we can get the lock, take a - * reference to the buffer. If this is a delayed write buffer that - * needs AIL help to be written back, invoke the pushbuf routine - * rather than the normal success path. - */ STATIC uint -xfs_buf_item_trylock( - struct xfs_log_item *lip) +xfs_buf_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; + uint rval = XFS_ITEM_SUCCESS; if (xfs_buf_ispinned(bp)) return XFS_ITEM_PINNED; if (!xfs_buf_trylock(bp)) return XFS_ITEM_LOCKED; - /* take a reference to the buffer. */ - xfs_buf_hold(bp); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - trace_xfs_buf_item_trylock(bip); - if (XFS_BUF_ISDELAYWRITE(bp)) - return XFS_ITEM_PUSHBUF; - return XFS_ITEM_SUCCESS; + + trace_xfs_buf_item_push(bip); + + if (!xfs_buf_delwri_queue(bp, buffer_list)) + rval = XFS_ITEM_FLUSHING; + xfs_buf_unlock(bp); + return rval; } /* @@ -603,49 +601,6 @@ xfs_buf_item_committed( return lsn; } -/* - * The buffer is locked, but is not a delayed write buffer. This happens - * if we race with IO completion and hence we don't want to try to write it - * again. Just release the buffer. - */ -STATIC void -xfs_buf_item_push( - struct xfs_log_item *lip) -{ - struct xfs_buf_log_item *bip = BUF_ITEM(lip); - struct xfs_buf *bp = bip->bli_buf; - - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); - - trace_xfs_buf_item_push(bip); - - xfs_buf_relse(bp); -} - -/* - * The buffer is locked and is a delayed write buffer. Promote the buffer - * in the delayed write queue as the caller knows that they must invoke - * the xfsbufd to get this buffer written. We have to unlock the buffer - * to allow the xfsbufd to write it, too. - */ -STATIC bool -xfs_buf_item_pushbuf( - struct xfs_log_item *lip) -{ - struct xfs_buf_log_item *bip = BUF_ITEM(lip); - struct xfs_buf *bp = bip->bli_buf; - - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(XFS_BUF_ISDELAYWRITE(bp)); - - trace_xfs_buf_item_pushbuf(bip); - - xfs_buf_delwri_promote(bp); - xfs_buf_relse(bp); - return true; -} - STATIC void xfs_buf_item_committing( struct xfs_log_item *lip, @@ -661,11 +616,9 @@ static const struct xfs_item_ops xfs_buf_item_ops = { .iop_format = xfs_buf_item_format, .iop_pin = xfs_buf_item_pin, .iop_unpin = xfs_buf_item_unpin, - .iop_trylock = xfs_buf_item_trylock, .iop_unlock = xfs_buf_item_unlock, .iop_committed = xfs_buf_item_committed, .iop_push = xfs_buf_item_push, - .iop_pushbuf = xfs_buf_item_pushbuf, .iop_committing = xfs_buf_item_committing }; @@ -703,7 +656,8 @@ xfs_buf_item_init( * truncate any pieces. map_size is the size of the * bitmap needed to describe the chunks of the buffer. */ - chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT); + chunks = (int)((BBTOB(bp->b_length) + (XFS_BLF_CHUNK - 1)) >> + XFS_BLF_SHIFT); map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, @@ -713,7 +667,7 @@ xfs_buf_item_init( xfs_buf_hold(bp); bip->bli_format.blf_type = XFS_LI_BUF; bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); - bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); + bip->bli_format.blf_len = (ushort)bp->b_length; bip->bli_format.blf_map_size = map_size; #ifdef XFS_TRANS_DEBUG @@ -725,9 +679,9 @@ xfs_buf_item_init( * the buffer to indicate which bytes the callers have asked * to have logged. */ - bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP); - memcpy(bip->bli_orig, bp->b_addr, XFS_BUF_COUNT(bp)); - bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP); + bip->bli_orig = kmem_alloc(BBTOB(bp->b_length), KM_SLEEP); + memcpy(bip->bli_orig, bp->b_addr, BBTOB(bp->b_length)); + bip->bli_logged = kmem_zalloc(BBTOB(bp->b_length) / NBBY, KM_SLEEP); #endif /* @@ -984,20 +938,27 @@ xfs_buf_iodone_callbacks( * If the write was asynchronous then no one will be looking for the * error. Clear the error state and write the buffer out again. * - * During sync or umount we'll write all pending buffers again - * synchronous, which will catch these errors if they keep hanging - * around. + * XXX: This helps against transient write errors, but we need to find + * a way to shut the filesystem down if the writes keep failing. + * + * In practice we'll shut the filesystem down soon as non-transient + * erorrs tend to affect the whole device and a failing log write + * will make us give up. But we really ought to do better here. */ if (XFS_BUF_ISASYNC(bp)) { + ASSERT(bp->b_iodone != NULL); + + trace_xfs_buf_item_iodone_async(bp, _RET_IP_); + xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ if (!XFS_BUF_ISSTALE(bp)) { - xfs_buf_delwri_queue(bp); - XFS_BUF_DONE(bp); + bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE; + xfs_buf_iorequest(bp); + } else { + xfs_buf_relse(bp); } - ASSERT(bp->b_iodone != NULL); - trace_xfs_buf_item_iodone_async(bp, _RET_IP_); - xfs_buf_relse(bp); + return; } @@ -1045,6 +1006,6 @@ xfs_buf_iodone( * Either way, AIL is useless if we're forcing a shutdown. */ spin_lock(&ailp->xa_lock); - xfs_trans_ail_delete(ailp, lip); + xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); xfs_buf_item_free(BUF_ITEM(lip)); } diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 77c74257c2a..015b946c580 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -108,6 +107,8 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, int error; xfs_trans_t *tp; + trace_xfs_da_node_create(args); + tp = args->trans; error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork); if (error) @@ -140,6 +141,8 @@ xfs_da_split(xfs_da_state_t *state) xfs_dabuf_t *bp; int max, action, error, i; + trace_xfs_da_split(state->args); + /* * Walk back up the tree splitting/inserting/adjusting as necessary. * If we need to insert and there isn't room, split the node, then @@ -178,10 +181,12 @@ xfs_da_split(xfs_da_state_t *state) state->extravalid = 1; if (state->inleaf) { state->extraafter = 0; /* before newblk */ + trace_xfs_attr_leaf_split_before(state->args); error = xfs_attr_leaf_split(state, oldblk, &state->extrablk); } else { state->extraafter = 1; /* after newblk */ + trace_xfs_attr_leaf_split_after(state->args); error = xfs_attr_leaf_split(state, newblk, &state->extrablk); } @@ -300,6 +305,8 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_mount_t *mp; xfs_dir2_leaf_t *leaf; + trace_xfs_da_root_split(state->args); + /* * Copy the existing (incorrect) block from the root node position * to a free space somewhere. @@ -380,6 +387,8 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, int newcount, error; int useextra; + trace_xfs_da_node_split(state->args); + node = oldblk->bp->data; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); @@ -466,6 +475,8 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, int count, tmp; xfs_trans_t *tp; + trace_xfs_da_node_rebalance(state->args); + node1 = blk1->bp->data; node2 = blk2->bp->data; /* @@ -574,6 +585,8 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, xfs_da_node_entry_t *btree; int tmp; + trace_xfs_da_node_add(state->args); + node = oldblk->bp->data; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count))); @@ -619,6 +632,8 @@ xfs_da_join(xfs_da_state_t *state) xfs_da_state_blk_t *drop_blk, *save_blk; int action, error; + trace_xfs_da_join(state->args); + action = 0; drop_blk = &state->path.blk[ state->path.active-1 ]; save_blk = &state->altpath.blk[ state->path.active-1 ]; @@ -723,6 +738,8 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) xfs_dabuf_t *bp; int error; + trace_xfs_da_root_join(state->args); + args = state->args; ASSERT(args != NULL); ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC); @@ -941,6 +958,8 @@ xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk) xfs_da_node_entry_t *btree; int tmp; + trace_xfs_da_node_remove(state->args); + node = drop_blk->bp->data; ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count)); ASSERT(drop_blk->index >= 0); @@ -984,6 +1003,8 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, int tmp; xfs_trans_t *tp; + trace_xfs_da_node_unbalance(state->args); + drop_node = drop_blk->bp->data; save_node = save_blk->bp->data; ASSERT(drop_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); @@ -1230,6 +1251,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, /* * Link new block in before existing block. */ + trace_xfs_da_link_before(args); new_info->forw = cpu_to_be32(old_blk->blkno); new_info->back = old_info->back; if (old_info->back) { @@ -1251,6 +1273,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, /* * Link new block in after existing block. */ + trace_xfs_da_link_after(args); new_info->forw = old_info->forw; new_info->back = cpu_to_be32(old_blk->blkno); if (old_info->forw) { @@ -1348,6 +1371,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, * Unlink the leaf block from the doubly linked chain of leaves. */ if (be32_to_cpu(save_info->back) == drop_blk->blkno) { + trace_xfs_da_unlink_back(args); save_info->back = drop_info->back; if (drop_info->back) { error = xfs_da_read_buf(args->trans, args->dp, @@ -1365,6 +1389,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, xfs_da_buf_done(bp); } } else { + trace_xfs_da_unlink_forward(args); save_info->forw = drop_info->forw; if (drop_info->forw) { error = xfs_da_read_buf(args->trans, args->dp, @@ -1652,6 +1677,8 @@ xfs_da_grow_inode( int count; int error; + trace_xfs_da_grow_inode(args); + if (args->whichfork == XFS_DATA_FORK) { bno = args->dp->i_mount->m_dirleafblk; count = args->dp->i_mount->m_dirblkfsbs; @@ -1690,6 +1717,8 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, xfs_dir2_leaf_t *dead_leaf2; xfs_dahash_t dead_hash; + trace_xfs_da_swap_lastblock(args); + dead_buf = *dead_bufp; dead_blkno = *dead_blknop; tp = args->trans; @@ -1878,6 +1907,8 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, xfs_trans_t *tp; xfs_mount_t *mp; + trace_xfs_da_shrink_inode(args); + dp = args->dp; w = args->whichfork; tp = args->trans; @@ -2245,20 +2276,20 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps) if (nbuf == 1) { dabuf->nbuf = 1; bp = bps[0]; - dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp)); + dabuf->bbcount = bp->b_length; dabuf->data = bp->b_addr; dabuf->bps[0] = bp; } else { dabuf->nbuf = nbuf; for (i = 0, dabuf->bbcount = 0; i < nbuf; i++) { dabuf->bps[i] = bp = bps[i]; - dabuf->bbcount += BTOBB(XFS_BUF_COUNT(bp)); + dabuf->bbcount += bp->b_length; } dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP); - for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) { + for (i = off = 0; i < nbuf; i++, off += BBTOB(bp->b_length)) { bp = bps[i]; memcpy((char *)dabuf->data + off, bp->b_addr, - XFS_BUF_COUNT(bp)); + BBTOB(bp->b_length)); } } return dabuf; @@ -2278,10 +2309,10 @@ xfs_da_buf_clean(xfs_dabuf_t *dabuf) ASSERT(dabuf->nbuf > 1); dabuf->dirty = 0; for (i = off = 0; i < dabuf->nbuf; - i++, off += XFS_BUF_COUNT(bp)) { + i++, off += BBTOB(bp->b_length)) { bp = dabuf->bps[i]; memcpy(bp->b_addr, dabuf->data + off, - XFS_BUF_COUNT(bp)); + BBTOB(bp->b_length)); } } } @@ -2324,10 +2355,10 @@ xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last) } dabuf->dirty = 1; ASSERT(first <= last); - for (i = off = 0; i < dabuf->nbuf; i++, off += XFS_BUF_COUNT(bp)) { + for (i = off = 0; i < dabuf->nbuf; i++, off += BBTOB(bp->b_length)) { bp = dabuf->bps[i]; f = off; - l = f + XFS_BUF_COUNT(bp) - 1; + l = f + BBTOB(bp->b_length) - 1; if (f < first) f = first; if (l > last) diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 654dc6f05ba..e00de08dc8a 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -163,12 +161,14 @@ xfs_swap_extents_check_format( /* Check temp in extent form to max in target */ if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max) + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > + XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) return EINVAL; /* Check target in extent form to max in temp */ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > + XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) return EINVAL; /* @@ -180,18 +180,25 @@ xfs_swap_extents_check_format( * (a common defrag case) which will occur when the temp inode is in * extent format... */ - if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && - ((XFS_IFORK_BOFF(ip) && - tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) || - XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max)) - return EINVAL; + if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + if (XFS_IFORK_BOFF(ip) && + tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) + return EINVAL; + if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= + XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) + return EINVAL; + } /* Reciprocal target->temp btree format checks */ - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && - ((XFS_IFORK_BOFF(tip) && - ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) || - XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max)) - return EINVAL; + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + if (XFS_IFORK_BOFF(tip) && + ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) + return EINVAL; + + if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= + XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) + return EINVAL; + } return 0; } @@ -206,7 +213,7 @@ xfs_swap_extents( xfs_trans_t *tp; xfs_bstat_t *sbp = &sxp->sx_stat; xfs_ifork_t *tempifp, *ifp, *tifp; - int ilf_fields, tilf_fields; + int src_log_flags, target_log_flags; int error = 0; int aforkblks = 0; int taforkblks = 0; @@ -349,16 +356,6 @@ xfs_swap_extents( *tifp = *tempifp; /* struct copy */ /* - * Fix the in-memory data fork values that are dependent on the fork - * offset in the inode. We can't assume they remain the same as attr2 - * has dynamic fork offsets. - */ - ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) / - (uint)sizeof(xfs_bmbt_rec_t); - tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) / - (uint)sizeof(xfs_bmbt_rec_t); - - /* * Fix the on-disk inode values */ tmp = (__uint64_t)ip->i_d.di_nblocks; @@ -386,9 +383,8 @@ xfs_swap_extents( tip->i_delayed_blks = ip->i_delayed_blks; ip->i_delayed_blks = 0; - ilf_fields = XFS_ILOG_CORE; - - switch(ip->i_d.di_format) { + src_log_flags = XFS_ILOG_CORE; + switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: /* If the extents fit in the inode, fix the * pointer. Otherwise it's already NULL or @@ -398,16 +394,15 @@ xfs_swap_extents( ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; } - ilf_fields |= XFS_ILOG_DEXT; + src_log_flags |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: - ilf_fields |= XFS_ILOG_DBROOT; + src_log_flags |= XFS_ILOG_DBROOT; break; } - tilf_fields = XFS_ILOG_CORE; - - switch(tip->i_d.di_format) { + target_log_flags = XFS_ILOG_CORE; + switch (tip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: /* If the extents fit in the inode, fix the * pointer. Otherwise it's already NULL or @@ -417,10 +412,10 @@ xfs_swap_extents( tifp->if_u1.if_extents = tifp->if_u2.if_inline_ext; } - tilf_fields |= XFS_ILOG_DEXT; + target_log_flags |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: - tilf_fields |= XFS_ILOG_DBROOT; + target_log_flags |= XFS_ILOG_DBROOT; break; } @@ -428,8 +423,8 @@ xfs_swap_extents( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_log_inode(tp, ip, ilf_fields); - xfs_trans_log_inode(tp, tip, tilf_fields); + xfs_trans_log_inode(tp, ip, src_log_flags); + xfs_trans_log_inode(tp, tip, target_log_flags); /* * If this is a synchronous mount, make sure that the diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index a2e27010c7f..67a250c36d4 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -18,7 +18,6 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 9245e029b8e..586732f2d80 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -29,6 +28,7 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" +#include "xfs_dir2.h" #include "xfs_dir2_format.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index 5bbe2a8a023..2046988e9eb 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 66e108f561a..397ffbcbab1 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 0179a41d9e5..b0f26780449 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 79d05e84e29..19bf0c5e38f 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 8a24f0c6c86..f9c3fe304a1 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -17,7 +17,6 @@ */ #include "xfs.h" #include "xfs_sb.h" -#include "xfs_inum.h" #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" @@ -30,6 +29,7 @@ #include "xfs_inode.h" #include "xfs_alloc.h" #include "xfs_error.h" +#include "xfs_extent_busy.h" #include "xfs_discard.h" #include "xfs_trace.h" @@ -37,9 +37,9 @@ STATIC int xfs_trim_extents( struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_fsblock_t start, - xfs_fsblock_t end, - xfs_fsblock_t minlen, + xfs_daddr_t start, + xfs_daddr_t end, + xfs_daddr_t minlen, __uint64_t *blocks_trimmed) { struct block_device *bdev = mp->m_ddev_targp->bt_bdev; @@ -67,8 +67,8 @@ xfs_trim_extents( /* * Look up the longest btree in the AGF and start with it. */ - error = xfs_alloc_lookup_le(cur, 0, - XFS_BUF_TO_AGF(agbp)->agf_longest, &i); + error = xfs_alloc_lookup_ge(cur, 0, + be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i); if (error) goto out_del_cursor; @@ -77,19 +77,29 @@ xfs_trim_extents( * enough to be worth discarding. */ while (i) { - xfs_agblock_t fbno; - xfs_extlen_t flen; + xfs_agblock_t fbno; + xfs_extlen_t flen; + xfs_daddr_t dbno; + xfs_extlen_t dlen; error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); if (error) goto out_del_cursor; XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); - ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest); + ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); + + /* + * use daddr format for all range/len calculations as that is + * the format the range/len variables are supplied in by + * userspace. + */ + dbno = XFS_AGB_TO_DADDR(mp, agno, fbno); + dlen = XFS_FSB_TO_BB(mp, flen); /* * Too small? Give up. */ - if (flen < minlen) { + if (dlen < minlen) { trace_xfs_discard_toosmall(mp, agno, fbno, flen); goto out_del_cursor; } @@ -99,8 +109,7 @@ xfs_trim_extents( * supposed to discard skip it. Do not bother to trim * down partially overlapping ranges for now. */ - if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || - XFS_AGB_TO_FSB(mp, agno, fbno) > end) { + if (dbno + dlen < start || dbno > end) { trace_xfs_discard_exclude(mp, agno, fbno, flen); goto next_extent; } @@ -109,16 +118,13 @@ xfs_trim_extents( * If any blocks in the range are still busy, skip the * discard and try again the next time. */ - if (xfs_alloc_busy_search(mp, agno, fbno, flen)) { + if (xfs_extent_busy_search(mp, agno, fbno, flen)) { trace_xfs_discard_busy(mp, agno, fbno, flen); goto next_extent; } trace_xfs_discard_extent(mp, agno, fbno, flen); - error = -blkdev_issue_discard(bdev, - XFS_AGB_TO_DADDR(mp, agno, fbno), - XFS_FSB_TO_BB(mp, flen), - GFP_NOFS, 0); + error = -blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0); if (error) goto out_del_cursor; *blocks_trimmed += flen; @@ -137,6 +143,15 @@ out_put_perag: return error; } +/* + * trim a range of the filesystem. + * + * Note: the parameters passed from userspace are byte ranges into the + * filesystem which does not match to the format we use for filesystem block + * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format + * is a linear address range. Hence we need to use DADDR based conversions and + * comparisons for determining the correct offset and regions to trim. + */ int xfs_ioc_trim( struct xfs_mount *mp, @@ -145,7 +160,7 @@ xfs_ioc_trim( struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; unsigned int granularity = q->limits.discard_granularity; struct fstrim_range range; - xfs_fsblock_t start, end, minlen; + xfs_daddr_t start, end, minlen; xfs_agnumber_t start_agno, end_agno, agno; __uint64_t blocks_trimmed = 0; int error, last_error = 0; @@ -159,22 +174,22 @@ xfs_ioc_trim( /* * Truncating down the len isn't actually quite correct, but using - * XFS_B_TO_FSB would mean we trivially get overflows for values + * BBTOB would mean we trivially get overflows for values * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default * used by the fstrim application. In the end it really doesn't * matter as trimming blocks is an advisory interface. */ - start = XFS_B_TO_FSBT(mp, range.start); - end = start + XFS_B_TO_FSBT(mp, range.len) - 1; - minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen)); + start = BTOBB(range.start); + end = start + BTOBBT(range.len) - 1; + minlen = BTOBB(max_t(u64, granularity, range.minlen)); - if (start >= mp->m_sb.sb_dblocks) + if (XFS_BB_TO_FSB(mp, start) >= mp->m_sb.sb_dblocks) return -XFS_ERROR(EINVAL); - if (end > mp->m_sb.sb_dblocks - 1) - end = mp->m_sb.sb_dblocks - 1; + if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1) + end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1; - start_agno = XFS_FSB_TO_AGNO(mp, start); - end_agno = XFS_FSB_TO_AGNO(mp, end); + start_agno = xfs_daddr_to_agno(mp, start); + end_agno = xfs_daddr_to_agno(mp, end); for (agno = start_agno; agno <= end_agno; agno++) { error = -xfs_trim_extents(mp, agno, start, end, minlen, @@ -197,7 +212,7 @@ xfs_discard_extents( struct xfs_mount *mp, struct list_head *list) { - struct xfs_busy_extent *busyp; + struct xfs_extent_busy *busyp; int error = 0; list_for_each_entry(busyp, list, list) { diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 25d7280e9f6..bf27fcca484 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -39,20 +38,18 @@ #include "xfs_qm.h" #include "xfs_trace.h" - /* - LOCK ORDER - - inode lock (ilock) - dquot hash-chain lock (hashlock) - xqm dquot freelist lock (freelistlock - mount's dquot list lock (mplistlock) - user dquot lock - lock ordering among dquots is based on the uid or gid - group dquot lock - similar to udquots. Between the two dquots, the udquot - has to be locked first. - pin lock - the dquot lock must be held to take this lock. - flush lock - ditto. -*/ + * Lock order: + * + * ip->i_lock + * qi->qi_tree_lock + * dquot->q_qlock (xfs_dqlock() and friends) + * dquot->q_flush (xfs_dqflock() and friends) + * qi->qi_lru_lock + * + * If two dquots need to be locked the order is user before group/project, + * otherwise by the lowest id first, see xfs_dqlock2. + */ #ifdef DEBUG xfs_buftarg_t *xfs_dqerror_target; @@ -61,83 +58,10 @@ int xfs_dqreq_num; int xfs_dqerror_mod = 33; #endif -static struct lock_class_key xfs_dquot_other_class; - -/* - * Allocate and initialize a dquot. We don't always allocate fresh memory; - * we try to reclaim a free dquot if the number of incore dquots are above - * a threshold. - * The only field inside the core that gets initialized at this point - * is the d_id field. The idea is to fill in the entire q_core - * when we read in the on disk dquot. - */ -STATIC xfs_dquot_t * -xfs_qm_dqinit( - xfs_mount_t *mp, - xfs_dqid_t id, - uint type) -{ - xfs_dquot_t *dqp; - boolean_t brandnewdquot; - - brandnewdquot = xfs_qm_dqalloc_incore(&dqp); - dqp->dq_flags = type; - dqp->q_core.d_id = cpu_to_be32(id); - dqp->q_mount = mp; - - /* - * No need to re-initialize these if this is a reclaimed dquot. - */ - if (brandnewdquot) { - INIT_LIST_HEAD(&dqp->q_freelist); - mutex_init(&dqp->q_qlock); - init_waitqueue_head(&dqp->q_pinwait); - - /* - * Because we want to use a counting completion, complete - * the flush completion once to allow a single access to - * the flush completion without blocking. - */ - init_completion(&dqp->q_flush); - complete(&dqp->q_flush); - - trace_xfs_dqinit(dqp); - } else { - /* - * Only the q_core portion was zeroed in dqreclaim_one(). - * So, we need to reset others. - */ - dqp->q_nrefs = 0; - dqp->q_blkno = 0; - INIT_LIST_HEAD(&dqp->q_mplist); - INIT_LIST_HEAD(&dqp->q_hashlist); - dqp->q_bufoffset = 0; - dqp->q_fileoffset = 0; - dqp->q_transp = NULL; - dqp->q_gdquot = NULL; - dqp->q_res_bcount = 0; - dqp->q_res_icount = 0; - dqp->q_res_rtbcount = 0; - atomic_set(&dqp->q_pincount, 0); - dqp->q_hash = NULL; - ASSERT(list_empty(&dqp->q_freelist)); - - trace_xfs_dqreuse(dqp); - } - - /* - * In either case we need to make sure group quotas have a different - * lock class than user quotas, to make sure lockdep knows we can - * locks of one of each at the same time. - */ - if (!(type & XFS_DQ_USER)) - lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class); +struct kmem_zone *xfs_qm_dqtrxzone; +static struct kmem_zone *xfs_qm_dqzone; - /* - * log item gets initialized later - */ - return (dqp); -} +static struct lock_class_key xfs_dquot_other_class; /* * This is called to free all the memory associated with a dquot @@ -146,30 +70,12 @@ void xfs_qm_dqdestroy( xfs_dquot_t *dqp) { - ASSERT(list_empty(&dqp->q_freelist)); + ASSERT(list_empty(&dqp->q_lru)); mutex_destroy(&dqp->q_qlock); - kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); + kmem_zone_free(xfs_qm_dqzone, dqp); - atomic_dec(&xfs_Gqm->qm_totaldquots); -} - -/* - * This is what a 'fresh' dquot inside a dquot chunk looks like on disk. - */ -STATIC void -xfs_qm_dqinit_core( - xfs_dqid_t id, - uint type, - xfs_dqblk_t *d) -{ - /* - * Caller has zero'd the entire dquot 'chunk' already. - */ - d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); - d->dd_diskdq.d_version = XFS_DQUOT_VERSION; - d->dd_diskdq.d_id = cpu_to_be32(id); - d->dd_diskdq.d_flags = type; + XFS_STATS_DEC(xs_qm_dquot); } /* @@ -234,10 +140,10 @@ xfs_qm_adjust_dqtimers( if (!d->d_btimer) { if ((d->d_blk_softlimit && - (be64_to_cpu(d->d_bcount) >= + (be64_to_cpu(d->d_bcount) > be64_to_cpu(d->d_blk_softlimit))) || (d->d_blk_hardlimit && - (be64_to_cpu(d->d_bcount) >= + (be64_to_cpu(d->d_bcount) > be64_to_cpu(d->d_blk_hardlimit)))) { d->d_btimer = cpu_to_be32(get_seconds() + mp->m_quotainfo->qi_btimelimit); @@ -246,10 +152,10 @@ xfs_qm_adjust_dqtimers( } } else { if ((!d->d_blk_softlimit || - (be64_to_cpu(d->d_bcount) < + (be64_to_cpu(d->d_bcount) <= be64_to_cpu(d->d_blk_softlimit))) && (!d->d_blk_hardlimit || - (be64_to_cpu(d->d_bcount) < + (be64_to_cpu(d->d_bcount) <= be64_to_cpu(d->d_blk_hardlimit)))) { d->d_btimer = 0; } @@ -257,10 +163,10 @@ xfs_qm_adjust_dqtimers( if (!d->d_itimer) { if ((d->d_ino_softlimit && - (be64_to_cpu(d->d_icount) >= + (be64_to_cpu(d->d_icount) > be64_to_cpu(d->d_ino_softlimit))) || (d->d_ino_hardlimit && - (be64_to_cpu(d->d_icount) >= + (be64_to_cpu(d->d_icount) > be64_to_cpu(d->d_ino_hardlimit)))) { d->d_itimer = cpu_to_be32(get_seconds() + mp->m_quotainfo->qi_itimelimit); @@ -269,10 +175,10 @@ xfs_qm_adjust_dqtimers( } } else { if ((!d->d_ino_softlimit || - (be64_to_cpu(d->d_icount) < + (be64_to_cpu(d->d_icount) <= be64_to_cpu(d->d_ino_softlimit))) && (!d->d_ino_hardlimit || - (be64_to_cpu(d->d_icount) < + (be64_to_cpu(d->d_icount) <= be64_to_cpu(d->d_ino_hardlimit)))) { d->d_itimer = 0; } @@ -280,10 +186,10 @@ xfs_qm_adjust_dqtimers( if (!d->d_rtbtimer) { if ((d->d_rtb_softlimit && - (be64_to_cpu(d->d_rtbcount) >= + (be64_to_cpu(d->d_rtbcount) > be64_to_cpu(d->d_rtb_softlimit))) || (d->d_rtb_hardlimit && - (be64_to_cpu(d->d_rtbcount) >= + (be64_to_cpu(d->d_rtbcount) > be64_to_cpu(d->d_rtb_hardlimit)))) { d->d_rtbtimer = cpu_to_be32(get_seconds() + mp->m_quotainfo->qi_rtbtimelimit); @@ -292,10 +198,10 @@ xfs_qm_adjust_dqtimers( } } else { if ((!d->d_rtb_softlimit || - (be64_to_cpu(d->d_rtbcount) < + (be64_to_cpu(d->d_rtbcount) <= be64_to_cpu(d->d_rtb_softlimit))) && (!d->d_rtb_hardlimit || - (be64_to_cpu(d->d_rtbcount) < + (be64_to_cpu(d->d_rtbcount) <= be64_to_cpu(d->d_rtb_hardlimit)))) { d->d_rtbtimer = 0; } @@ -328,8 +234,13 @@ xfs_qm_init_dquot_blk( curid = id - (id % q->qi_dqperchunk); ASSERT(curid >= 0); memset(d, 0, BBTOB(q->qi_dqchunklen)); - for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) - xfs_qm_dqinit_core(curid, type, d); + for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) { + d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + d->dd_diskdq.d_version = XFS_DQUOT_VERSION; + d->dd_diskdq.d_id = cpu_to_be32(curid); + d->dd_diskdq.d_flags = type; + } + xfs_trans_dquot_buf(tp, bp, (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF : ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF : @@ -372,7 +283,7 @@ xfs_qm_dqalloc( * Return if this type of quotas is turned off while we didn't * have an inode lock */ - if (XFS_IS_THIS_QUOTA_OFF(dqp)) { + if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { xfs_iunlock(quotip, XFS_ILOCK_EXCL); return (ESRCH); } @@ -474,7 +385,7 @@ xfs_qm_dqtobp( dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; xfs_ilock(quotip, XFS_ILOCK_SHARED); - if (XFS_IS_THIS_QUOTA_OFF(dqp)) { + if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { /* * Return if this type of quotas is turned off while we * didn't have the quota inode lock. @@ -564,36 +475,87 @@ xfs_qm_dqtobp( * Read in the ondisk dquot using dqtobp() then copy it to an incore version, * and release the buffer immediately. * + * If XFS_QMOPT_DQALLOC is set, allocate a dquot on disk if it needed. */ -/* ARGSUSED */ -STATIC int +int xfs_qm_dqread( - xfs_trans_t **tpp, - xfs_dqid_t id, - xfs_dquot_t *dqp, /* dquot to get filled in */ - uint flags) + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + uint flags, + struct xfs_dquot **O_dqpp) { - xfs_disk_dquot_t *ddqp; - xfs_buf_t *bp; - int error; - xfs_trans_t *tp; + struct xfs_dquot *dqp; + struct xfs_disk_dquot *ddqp; + struct xfs_buf *bp; + struct xfs_trans *tp = NULL; + int error; + int cancelflags = 0; + + + dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); + + dqp->dq_flags = type; + dqp->q_core.d_id = cpu_to_be32(id); + dqp->q_mount = mp; + INIT_LIST_HEAD(&dqp->q_lru); + mutex_init(&dqp->q_qlock); + init_waitqueue_head(&dqp->q_pinwait); - ASSERT(tpp); + /* + * Because we want to use a counting completion, complete + * the flush completion once to allow a single access to + * the flush completion without blocking. + */ + init_completion(&dqp->q_flush); + complete(&dqp->q_flush); + + /* + * Make sure group quotas have a different lock class than user + * quotas. + */ + if (!(type & XFS_DQ_USER)) + lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class); + + XFS_STATS_INC(xs_qm_dquot); trace_xfs_dqread(dqp); + if (flags & XFS_QMOPT_DQALLOC) { + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); + error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), + XFS_WRITE_LOG_RES(mp) + + /* + * Round the chunklen up to the next multiple + * of 128 (buf log item chunk size)). + */ + BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + 128, + 0, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + if (error) + goto error1; + cancelflags = XFS_TRANS_RELEASE_LOG_RES; + } + /* * get a pointer to the on-disk dquot and the buffer containing it * dqp already knows its own type (GROUP/USER). */ - if ((error = xfs_qm_dqtobp(tpp, dqp, &ddqp, &bp, flags))) { - return (error); + error = xfs_qm_dqtobp(&tp, dqp, &ddqp, &bp, flags); + if (error) { + /* + * This can happen if quotas got turned off (ESRCH), + * or if the dquot didn't exist on disk and we ask to + * allocate (ENOENT). + */ + trace_xfs_dqread_fail(dqp); + cancelflags |= XFS_TRANS_ABORT; + goto error1; } - tp = *tpp; /* copy everything from disk dquot to the incore dquot */ memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t)); - ASSERT(be32_to_cpu(dqp->q_core.d_id) == id); xfs_qm_dquot_logitem_init(dqp); /* @@ -622,173 +584,22 @@ xfs_qm_dqread( ASSERT(xfs_buf_islocked(bp)); xfs_trans_brelse(tp, bp); - return (error); -} - - -/* - * allocate an incore dquot from the kernel heap, - * and fill its core with quota information kept on disk. - * If XFS_QMOPT_DQALLOC is set, it'll allocate a dquot on disk - * if it wasn't already allocated. - */ -STATIC int -xfs_qm_idtodq( - xfs_mount_t *mp, - xfs_dqid_t id, /* gid or uid, depending on type */ - uint type, /* UDQUOT or GDQUOT */ - uint flags, /* DQALLOC, DQREPAIR */ - xfs_dquot_t **O_dqpp)/* OUT : incore dquot, not locked */ -{ - xfs_dquot_t *dqp; - int error; - xfs_trans_t *tp; - int cancelflags=0; - - dqp = xfs_qm_dqinit(mp, id, type); - tp = NULL; - if (flags & XFS_QMOPT_DQALLOC) { - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); - error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), - XFS_WRITE_LOG_RES(mp) + - BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + - 128, - 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); - if (error) { - cancelflags = 0; - goto error0; - } - cancelflags = XFS_TRANS_RELEASE_LOG_RES; - } - - /* - * Read it from disk; xfs_dqread() takes care of - * all the necessary initialization of dquot's fields (locks, etc) - */ - if ((error = xfs_qm_dqread(&tp, id, dqp, flags))) { - /* - * This can happen if quotas got turned off (ESRCH), - * or if the dquot didn't exist on disk and we ask to - * allocate (ENOENT). - */ - trace_xfs_dqread_fail(dqp); - cancelflags |= XFS_TRANS_ABORT; - goto error0; - } if (tp) { - if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) - goto error1; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error0; } *O_dqpp = dqp; - return (0); + return error; - error0: - ASSERT(error); +error1: if (tp) xfs_trans_cancel(tp, cancelflags); - error1: +error0: xfs_qm_dqdestroy(dqp); *O_dqpp = NULL; - return (error); -} - -/* - * Lookup a dquot in the incore dquot hashtable. We keep two separate - * hashtables for user and group dquots; and, these are global tables - * inside the XQM, not per-filesystem tables. - * The hash chain must be locked by caller, and it is left locked - * on return. Returning dquot is locked. - */ -STATIC int -xfs_qm_dqlookup( - xfs_mount_t *mp, - xfs_dqid_t id, - xfs_dqhash_t *qh, - xfs_dquot_t **O_dqpp) -{ - xfs_dquot_t *dqp; - uint flist_locked; - - ASSERT(mutex_is_locked(&qh->qh_lock)); - - flist_locked = B_FALSE; - - /* - * Traverse the hashchain looking for a match - */ - list_for_each_entry(dqp, &qh->qh_list, q_hashlist) { - /* - * We already have the hashlock. We don't need the - * dqlock to look at the id field of the dquot, since the - * id can't be modified without the hashlock anyway. - */ - if (be32_to_cpu(dqp->q_core.d_id) == id && dqp->q_mount == mp) { - trace_xfs_dqlookup_found(dqp); - - /* - * All in core dquots must be on the dqlist of mp - */ - ASSERT(!list_empty(&dqp->q_mplist)); - - xfs_dqlock(dqp); - if (dqp->q_nrefs == 0) { - ASSERT(!list_empty(&dqp->q_freelist)); - if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) { - trace_xfs_dqlookup_want(dqp); - - /* - * We may have raced with dqreclaim_one() - * (and lost). So, flag that we don't - * want the dquot to be reclaimed. - */ - dqp->dq_flags |= XFS_DQ_WANT; - xfs_dqunlock(dqp); - mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - xfs_dqlock(dqp); - dqp->dq_flags &= ~(XFS_DQ_WANT); - } - flist_locked = B_TRUE; - } - - /* - * id couldn't have changed; we had the hashlock all - * along - */ - ASSERT(be32_to_cpu(dqp->q_core.d_id) == id); - - if (flist_locked) { - if (dqp->q_nrefs != 0) { - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - flist_locked = B_FALSE; - } else { - /* take it off the freelist */ - trace_xfs_dqlookup_freelist(dqp); - list_del_init(&dqp->q_freelist); - xfs_Gqm->qm_dqfrlist_cnt--; - } - } - - XFS_DQHOLD(dqp); - - if (flist_locked) - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - /* - * move the dquot to the front of the hashchain - */ - ASSERT(mutex_is_locked(&qh->qh_lock)); - list_move(&dqp->q_hashlist, &qh->qh_list); - trace_xfs_dqlookup_done(dqp); - *O_dqpp = dqp; - return 0; - } - } - - *O_dqpp = NULL; - ASSERT(mutex_is_locked(&qh->qh_lock)); - return (1); + return error; } /* @@ -808,10 +619,10 @@ xfs_qm_dqget( uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */ xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */ { - xfs_dquot_t *dqp; - xfs_dqhash_t *h; - uint version; - int error; + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); + struct xfs_dquot *dqp; + int error; ASSERT(XFS_IS_QUOTA_RUNNING(mp)); if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) || @@ -819,7 +630,6 @@ xfs_qm_dqget( (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) { return (ESRCH); } - h = XFS_DQ_HASH(mp, id, type); #ifdef DEBUG if (xfs_do_dqerror) { @@ -829,42 +639,39 @@ xfs_qm_dqget( return (EIO); } } -#endif - again: - -#ifdef DEBUG ASSERT(type == XFS_DQ_USER || type == XFS_DQ_PROJ || type == XFS_DQ_GROUP); if (ip) { ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - if (type == XFS_DQ_USER) - ASSERT(ip->i_udquot == NULL); - else - ASSERT(ip->i_gdquot == NULL); + ASSERT(xfs_inode_dquot(ip, type) == NULL); } #endif - mutex_lock(&h->qh_lock); - /* - * Look in the cache (hashtable). - * The chain is kept locked during lookup. - */ - if (xfs_qm_dqlookup(mp, id, h, O_dqpp) == 0) { - XQM_STATS_INC(xqmstats.xs_qm_dqcachehits); - /* - * The dquot was found, moved to the front of the chain, - * taken off the freelist if it was on it, and locked - * at this point. Just unlock the hashchain and return. - */ - ASSERT(*O_dqpp); - ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp)); - mutex_unlock(&h->qh_lock); - trace_xfs_dqget_hit(*O_dqpp); - return (0); /* success */ +restart: + mutex_lock(&qi->qi_tree_lock); + dqp = radix_tree_lookup(tree, id); + if (dqp) { + xfs_dqlock(dqp); + if (dqp->dq_flags & XFS_DQ_FREEING) { + xfs_dqunlock(dqp); + mutex_unlock(&qi->qi_tree_lock); + trace_xfs_dqget_freeing(dqp); + delay(1); + goto restart; + } + + dqp->q_nrefs++; + mutex_unlock(&qi->qi_tree_lock); + + trace_xfs_dqget_hit(dqp); + XFS_STATS_INC(xs_qm_dqcachehits); + *O_dqpp = dqp; + return 0; } - XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses); + mutex_unlock(&qi->qi_tree_lock); + XFS_STATS_INC(xs_qm_dqcachemisses); /* * Dquot cache miss. We don't want to keep the inode lock across @@ -875,130 +682,62 @@ xfs_qm_dqget( */ if (ip) xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* - * Save the hashchain version stamp, and unlock the chain, so that - * we don't keep the lock across a disk read - */ - version = h->qh_version; - mutex_unlock(&h->qh_lock); - /* - * Allocate the dquot on the kernel heap, and read the ondisk - * portion off the disk. Also, do all the necessary initialization - * This can return ENOENT if dquot didn't exist on disk and we didn't - * ask it to allocate; ESRCH if quotas got turned off suddenly. - */ - if ((error = xfs_qm_idtodq(mp, id, type, - flags & (XFS_QMOPT_DQALLOC|XFS_QMOPT_DQREPAIR| - XFS_QMOPT_DOWARN), - &dqp))) { - if (ip) - xfs_ilock(ip, XFS_ILOCK_EXCL); - return (error); - } - - /* - * See if this is mount code calling to look at the overall quota limits - * which are stored in the id == 0 user or group's dquot. - * Since we may not have done a quotacheck by this point, just return - * the dquot without attaching it to any hashtables, lists, etc, or even - * taking a reference. - * The caller must dqdestroy this once done. - */ - if (flags & XFS_QMOPT_DQSUSER) { - ASSERT(id == 0); - ASSERT(! ip); - goto dqret; - } + error = xfs_qm_dqread(mp, id, type, flags, &dqp); - /* - * Dquot lock comes after hashlock in the lock ordering - */ - if (ip) { + if (ip) xfs_ilock(ip, XFS_ILOCK_EXCL); + if (error) + return error; + + if (ip) { /* * A dquot could be attached to this inode by now, since * we had dropped the ilock. */ - if (type == XFS_DQ_USER) { - if (!XFS_IS_UQUOTA_ON(mp)) { - /* inode stays locked on return */ - xfs_qm_dqdestroy(dqp); - return XFS_ERROR(ESRCH); - } - if (ip->i_udquot) { + if (xfs_this_quota_on(mp, type)) { + struct xfs_dquot *dqp1; + + dqp1 = xfs_inode_dquot(ip, type); + if (dqp1) { xfs_qm_dqdestroy(dqp); - dqp = ip->i_udquot; + dqp = dqp1; xfs_dqlock(dqp); goto dqret; } } else { - if (!XFS_IS_OQUOTA_ON(mp)) { - /* inode stays locked on return */ - xfs_qm_dqdestroy(dqp); - return XFS_ERROR(ESRCH); - } - if (ip->i_gdquot) { - xfs_qm_dqdestroy(dqp); - dqp = ip->i_gdquot; - xfs_dqlock(dqp); - goto dqret; - } + /* inode stays locked on return */ + xfs_qm_dqdestroy(dqp); + return XFS_ERROR(ESRCH); } } - /* - * Hashlock comes after ilock in lock order - */ - mutex_lock(&h->qh_lock); - if (version != h->qh_version) { - xfs_dquot_t *tmpdqp; + mutex_lock(&qi->qi_tree_lock); + error = -radix_tree_insert(tree, id, dqp); + if (unlikely(error)) { + WARN_ON(error != EEXIST); + /* - * Now, see if somebody else put the dquot in the - * hashtable before us. This can happen because we didn't - * keep the hashchain lock. We don't have to worry about - * lock order between the two dquots here since dqp isn't - * on any findable lists yet. + * Duplicate found. Just throw away the new dquot and start + * over. */ - if (xfs_qm_dqlookup(mp, id, h, &tmpdqp) == 0) { - /* - * Duplicate found. Just throw away the new dquot - * and start over. - */ - xfs_qm_dqput(tmpdqp); - mutex_unlock(&h->qh_lock); - xfs_qm_dqdestroy(dqp); - XQM_STATS_INC(xqmstats.xs_qm_dquot_dups); - goto again; - } + mutex_unlock(&qi->qi_tree_lock); + trace_xfs_dqget_dup(dqp); + xfs_qm_dqdestroy(dqp); + XFS_STATS_INC(xs_qm_dquot_dups); + goto restart; } /* - * Put the dquot at the beginning of the hash-chain and mp's list - * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock .. - */ - ASSERT(mutex_is_locked(&h->qh_lock)); - dqp->q_hash = h; - list_add(&dqp->q_hashlist, &h->qh_list); - h->qh_version++; - - /* - * Attach this dquot to this filesystem's list of all dquots, - * kept inside the mount structure in m_quotainfo field - */ - mutex_lock(&mp->m_quotainfo->qi_dqlist_lock); - - /* * We return a locked dquot to the caller, with a reference taken */ xfs_dqlock(dqp); dqp->q_nrefs = 1; - list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist); - mp->m_quotainfo->qi_dquots++; - mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); - mutex_unlock(&h->qh_lock); + qi->qi_dquots++; + mutex_unlock(&qi->qi_tree_lock); + dqret: ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); trace_xfs_dqget_miss(dqp); @@ -1007,75 +746,61 @@ xfs_qm_dqget( } -/* - * Release a reference to the dquot (decrement ref-count) - * and unlock it. If there is a group quota attached to this - * dquot, carefully release that too without tripping over - * deadlocks'n'stuff. - */ -void -xfs_qm_dqput( - xfs_dquot_t *dqp) +STATIC void +xfs_qm_dqput_final( + struct xfs_dquot *dqp) { - xfs_dquot_t *gdqp; + struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; + struct xfs_dquot *gdqp; - ASSERT(dqp->q_nrefs > 0); - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - - trace_xfs_dqput(dqp); + trace_xfs_dqput_free(dqp); - if (dqp->q_nrefs != 1) { - dqp->q_nrefs--; - xfs_dqunlock(dqp); - return; + mutex_lock(&qi->qi_lru_lock); + if (list_empty(&dqp->q_lru)) { + list_add_tail(&dqp->q_lru, &qi->qi_lru_list); + qi->qi_lru_count++; + XFS_STATS_INC(xs_qm_dquot_unused); } + mutex_unlock(&qi->qi_lru_lock); /* - * drop the dqlock and acquire the freelist and dqlock - * in the right order; but try to get it out-of-order first + * If we just added a udquot to the freelist, then we want to release + * the gdquot reference that it (probably) has. Otherwise it'll keep + * the gdquot from getting reclaimed. */ - if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) { - trace_xfs_dqput_wait(dqp); - xfs_dqunlock(dqp); - mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - xfs_dqlock(dqp); + gdqp = dqp->q_gdquot; + if (gdqp) { + xfs_dqlock(gdqp); + dqp->q_gdquot = NULL; } + xfs_dqunlock(dqp); - while (1) { - gdqp = NULL; + /* + * If we had a group quota hint, release it now. + */ + if (gdqp) + xfs_qm_dqput(gdqp); +} - /* We can't depend on nrefs being == 1 here */ - if (--dqp->q_nrefs == 0) { - trace_xfs_dqput_free(dqp); +/* + * Release a reference to the dquot (decrement ref-count) and unlock it. + * + * If there is a group quota attached to this dquot, carefully release that + * too without tripping over deadlocks'n'stuff. + */ +void +xfs_qm_dqput( + struct xfs_dquot *dqp) +{ + ASSERT(dqp->q_nrefs > 0); + ASSERT(XFS_DQ_IS_LOCKED(dqp)); - list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist); - xfs_Gqm->qm_dqfrlist_cnt++; + trace_xfs_dqput(dqp); - /* - * If we just added a udquot to the freelist, then - * we want to release the gdquot reference that - * it (probably) has. Otherwise it'll keep the - * gdquot from getting reclaimed. - */ - if ((gdqp = dqp->q_gdquot)) { - /* - * Avoid a recursive dqput call - */ - xfs_dqlock(gdqp); - dqp->q_gdquot = NULL; - } - } + if (--dqp->q_nrefs > 0) xfs_dqunlock(dqp); - - /* - * If we had a group quota inside the user quota as a hint, - * release it now. - */ - if (! gdqp) - break; - dqp = gdqp; - } - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + else + xfs_qm_dqput_final(dqp); } /* @@ -1131,7 +856,7 @@ xfs_qm_dqflush_done( /* xfs_trans_ail_delete() drops the AIL lock. */ spin_lock(&ailp->xa_lock); if (lip->li_lsn == qip->qli_flush_lsn) - xfs_trans_ail_delete(ailp, lip); + xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); else spin_unlock(&ailp->xa_lock); } @@ -1152,8 +877,8 @@ xfs_qm_dqflush_done( */ int xfs_qm_dqflush( - xfs_dquot_t *dqp, - uint flags) + struct xfs_dquot *dqp, + struct xfs_buf **bpp) { struct xfs_mount *mp = dqp->q_mount; struct xfs_buf *bp; @@ -1165,25 +890,30 @@ xfs_qm_dqflush( trace_xfs_dqflush(dqp); - /* - * If not dirty, or it's pinned and we are not supposed to block, nada. - */ - if (!XFS_DQ_IS_DIRTY(dqp) || - (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) { - xfs_dqfunlock(dqp); - return 0; - } + *bpp = NULL; + xfs_qm_dqunpin_wait(dqp); /* * This may have been unpinned because the filesystem is shutting * down forcibly. If that's the case we must not write this dquot - * to disk, because the log record didn't make it to disk! + * to disk, because the log record didn't make it to disk. + * + * We also have to remove the log item from the AIL in this case, + * as we wait for an emptry AIL as part of the unmount process. */ if (XFS_FORCED_SHUTDOWN(mp)) { + struct xfs_log_item *lip = &dqp->q_logitem.qli_item; dqp->dq_flags &= ~XFS_DQ_DIRTY; - xfs_dqfunlock(dqp); - return XFS_ERROR(EIO); + + spin_lock(&mp->m_ail->xa_lock); + if (lip->li_flags & XFS_LI_IN_AIL) + xfs_trans_ail_delete(mp->m_ail, lip, + SHUTDOWN_CORRUPT_INCORE); + else + spin_unlock(&mp->m_ail->xa_lock); + error = XFS_ERROR(EIO); + goto out_unlock; } /* @@ -1191,11 +921,8 @@ xfs_qm_dqflush( */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0, &bp); - if (error) { - ASSERT(error != ENOENT); - xfs_dqfunlock(dqp); - return error; - } + if (error) + goto out_unlock; /* * Calculate the location of the dquot inside the buffer. @@ -1241,54 +968,13 @@ xfs_qm_dqflush( xfs_log_force(mp, 0); } - if (flags & SYNC_WAIT) - error = xfs_bwrite(bp); - else - xfs_buf_delwri_queue(bp); - - xfs_buf_relse(bp); - trace_xfs_dqflush_done(dqp); + *bpp = bp; + return 0; - /* - * dqp is still locked, but caller is free to unlock it now. - */ - return error; - -} - -int -xfs_qm_dqlock_nowait( - xfs_dquot_t *dqp) -{ - return mutex_trylock(&dqp->q_qlock); -} - -void -xfs_dqlock( - xfs_dquot_t *dqp) -{ - mutex_lock(&dqp->q_qlock); -} - -void -xfs_dqunlock( - xfs_dquot_t *dqp) -{ - mutex_unlock(&(dqp->q_qlock)); - if (dqp->q_logitem.qli_dquot == dqp) { - /* Once was dqp->q_mount, but might just have been cleared */ - xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp, - (xfs_log_item_t*)&(dqp->q_logitem)); - } -} - - -void -xfs_dqunlock_nonotify( - xfs_dquot_t *dqp) -{ - mutex_unlock(&(dqp->q_qlock)); +out_unlock: + xfs_dqfunlock(dqp); + return XFS_ERROR(EIO); } /* @@ -1319,138 +1005,30 @@ xfs_dqlock2( } } - -/* - * Take a dquot out of the mount's dqlist as well as the hashlist. - * This is called via unmount as well as quotaoff, and the purge - * will always succeed unless there are soft (temp) references - * outstanding. - * - * This returns 0 if it was purged, 1 if it wasn't. It's not an error code - * that we're returning! XXXsup - not cool. - */ -/* ARGSUSED */ -int -xfs_qm_dqpurge( - xfs_dquot_t *dqp) +int __init +xfs_qm_init(void) { - xfs_dqhash_t *qh = dqp->q_hash; - xfs_mount_t *mp = dqp->q_mount; - - ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock)); - ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock)); + xfs_qm_dqzone = + kmem_zone_init(sizeof(struct xfs_dquot), "xfs_dquot"); + if (!xfs_qm_dqzone) + goto out; - xfs_dqlock(dqp); - /* - * We really can't afford to purge a dquot that is - * referenced, because these are hard refs. - * It shouldn't happen in general because we went thru _all_ inodes in - * dqrele_all_inodes before calling this and didn't let the mountlock go. - * However it is possible that we have dquots with temporary - * references that are not attached to an inode. e.g. see xfs_setattr(). - */ - if (dqp->q_nrefs != 0) { - xfs_dqunlock(dqp); - mutex_unlock(&dqp->q_hash->qh_lock); - return (1); - } - - ASSERT(!list_empty(&dqp->q_freelist)); - - /* - * If we're turning off quotas, we have to make sure that, for - * example, we don't delete quota disk blocks while dquots are - * in the process of getting written to those disk blocks. - * This dquot might well be on AIL, and we can't leave it there - * if we're turning off quotas. Basically, we need this flush - * lock, and are willing to block on it. - */ - if (!xfs_dqflock_nowait(dqp)) { - /* - * Block on the flush lock after nudging dquot buffer, - * if it is incore. - */ - xfs_qm_dqflock_pushbuf_wait(dqp); - } - - /* - * XXXIf we're turning this type of quotas off, we don't care - * about the dirty metadata sitting in this dquot. OTOH, if - * we're unmounting, we do care, so we flush it and wait. - */ - if (XFS_DQ_IS_DIRTY(dqp)) { - int error; + xfs_qm_dqtrxzone = + kmem_zone_init(sizeof(struct xfs_dquot_acct), "xfs_dqtrx"); + if (!xfs_qm_dqtrxzone) + goto out_free_dqzone; - /* dqflush unlocks dqflock */ - /* - * Given that dqpurge is a very rare occurrence, it is OK - * that we're holding the hashlist and mplist locks - * across the disk write. But, ... XXXsup - * - * We don't care about getting disk errors here. We need - * to purge this dquot anyway, so we go ahead regardless. - */ - error = xfs_qm_dqflush(dqp, SYNC_WAIT); - if (error) - xfs_warn(mp, "%s: dquot %p flush failed", - __func__, dqp); - xfs_dqflock(dqp); - } - ASSERT(atomic_read(&dqp->q_pincount) == 0); - ASSERT(XFS_FORCED_SHUTDOWN(mp) || - !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); - - list_del_init(&dqp->q_hashlist); - qh->qh_version++; - list_del_init(&dqp->q_mplist); - mp->m_quotainfo->qi_dqreclaims++; - mp->m_quotainfo->qi_dquots--; - /* - * XXX Move this to the front of the freelist, if we can get the - * freelist lock. - */ - ASSERT(!list_empty(&dqp->q_freelist)); + return 0; - dqp->q_mount = NULL; - dqp->q_hash = NULL; - dqp->dq_flags = XFS_DQ_INACTIVE; - memset(&dqp->q_core, 0, sizeof(dqp->q_core)); - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - mutex_unlock(&qh->qh_lock); - return (0); +out_free_dqzone: + kmem_zone_destroy(xfs_qm_dqzone); +out: + return -ENOMEM; } - -/* - * Give the buffer a little push if it is incore and - * wait on the flush lock. - */ void -xfs_qm_dqflock_pushbuf_wait( - xfs_dquot_t *dqp) +xfs_qm_exit(void) { - xfs_mount_t *mp = dqp->q_mount; - xfs_buf_t *bp; - - /* - * Check to see if the dquot has been flushed delayed - * write. If so, grab its buffer and send it - * out immediately. We'll be able to acquire - * the flush lock when the I/O completes. - */ - bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); - if (!bp) - goto out_lock; - - if (XFS_BUF_ISDELAYWRITE(bp)) { - if (xfs_buf_ispinned(bp)) - xfs_log_force(mp, 0); - xfs_buf_delwri_promote(bp); - wake_up_process(bp->b_target->bt_task); - } - xfs_buf_relse(bp); -out_lock: - xfs_dqflock(dqp); + kmem_zone_destroy(xfs_qm_dqtrxzone); + kmem_zone_destroy(xfs_qm_dqzone); } diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 34b7e945dbf..7d20af27346 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -29,16 +29,6 @@ * when quotas are off. */ -/* - * The hash chain headers (hash buckets) - */ -typedef struct xfs_dqhash { - struct list_head qh_list; - struct mutex qh_lock; - uint qh_version; /* ever increasing version */ - uint qh_nelems; /* number of dquots on the list */ -} xfs_dqhash_t; - struct xfs_mount; struct xfs_trans; @@ -47,10 +37,7 @@ struct xfs_trans; */ typedef struct xfs_dquot { uint dq_flags; /* various flags (XFS_DQ_*) */ - struct list_head q_freelist; /* global free list of dquots */ - struct list_head q_mplist; /* mount's list of dquots */ - struct list_head q_hashlist; /* gloabl hash list of dquots */ - xfs_dqhash_t *q_hash; /* the hashchain header */ + struct list_head q_lru; /* global free list of dquots */ struct xfs_mount*q_mount; /* filesystem this relates to */ struct xfs_trans*q_transp; /* trans this belongs to currently */ uint q_nrefs; /* # active refs from inodes */ @@ -80,8 +67,6 @@ enum { XFS_QLOCK_NESTED, }; -#define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++) - /* * Manage the q_flush completion queue embedded in the dquot. This completion * queue synchronizes processes attempting to flush the in-core dquot back to @@ -102,6 +87,47 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp) complete(&dqp->q_flush); } +static inline int xfs_dqlock_nowait(struct xfs_dquot *dqp) +{ + return mutex_trylock(&dqp->q_qlock); +} + +static inline void xfs_dqlock(struct xfs_dquot *dqp) +{ + mutex_lock(&dqp->q_qlock); +} + +static inline void xfs_dqunlock(struct xfs_dquot *dqp) +{ + mutex_unlock(&dqp->q_qlock); +} + +static inline int xfs_this_quota_on(struct xfs_mount *mp, int type) +{ + switch (type & XFS_DQ_ALLTYPES) { + case XFS_DQ_USER: + return XFS_IS_UQUOTA_ON(mp); + case XFS_DQ_GROUP: + case XFS_DQ_PROJ: + return XFS_IS_OQUOTA_ON(mp); + default: + return 0; + } +} + +static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) +{ + switch (type & XFS_DQ_ALLTYPES) { + case XFS_DQ_USER: + return ip->i_udquot; + case XFS_DQ_GROUP: + case XFS_DQ_PROJ: + return ip->i_gdquot; + default: + return NULL; + } +} + #define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) @@ -112,16 +138,11 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp) XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \ XFS_DQ_TO_QINF(dqp)->qi_gquotaip) -#define XFS_IS_THIS_QUOTA_OFF(d) (! (XFS_QM_ISUDQ(d) ? \ - (XFS_IS_UQUOTA_ON((d)->q_mount)) : \ - (XFS_IS_OQUOTA_ON((d)->q_mount)))) - +extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, + uint, struct xfs_dquot **); extern void xfs_qm_dqdestroy(xfs_dquot_t *); -extern int xfs_qm_dqflush(xfs_dquot_t *, uint); -extern int xfs_qm_dqpurge(xfs_dquot_t *); +extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); -extern int xfs_qm_dqlock_nowait(xfs_dquot_t *); -extern void xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp); extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, xfs_disk_dquot_t *); extern void xfs_qm_adjust_dqlimits(xfs_mount_t *, @@ -129,9 +150,15 @@ extern void xfs_qm_adjust_dqlimits(xfs_mount_t *, extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, xfs_dqid_t, uint, uint, xfs_dquot_t **); extern void xfs_qm_dqput(xfs_dquot_t *); -extern void xfs_dqlock(xfs_dquot_t *); -extern void xfs_dqlock2(xfs_dquot_t *, xfs_dquot_t *); -extern void xfs_dqunlock(xfs_dquot_t *); -extern void xfs_dqunlock_nonotify(xfs_dquot_t *); + +extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); + +static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) +{ + xfs_dqlock(dqp); + dqp->q_nrefs++; + xfs_dqunlock(dqp); + return dqp; +} #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 0dee0b71029..57aa4b03720 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -17,9 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -73,7 +71,6 @@ xfs_qm_dquot_logitem_format( logvec->i_len = sizeof(xfs_disk_dquot_t); logvec->i_type = XLOG_REG_TYPE_DQUOT; - ASSERT(2 == lip->li_desc->lid_size); qlip->qli_format.qlf_size = 2; } @@ -109,38 +106,6 @@ xfs_qm_dquot_logitem_unpin( wake_up(&dqp->q_pinwait); } -/* - * Given the logitem, this writes the corresponding dquot entry to disk - * asynchronously. This is called with the dquot entry securely locked; - * we simply get xfs_qm_dqflush() to do the work, and unlock the dquot - * at the end. - */ -STATIC void -xfs_qm_dquot_logitem_push( - struct xfs_log_item *lip) -{ - struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; - int error; - - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - ASSERT(!completion_done(&dqp->q_flush)); - - /* - * Since we were able to lock the dquot's flush lock and - * we found it on the AIL, the dquot must be dirty. This - * is because the dquot is removed from the AIL while still - * holding the flush lock in xfs_dqflush_done(). Thus, if - * we found it in the AIL and were able to obtain the flush - * lock without sleeping, then there must not have been - * anyone in the process of flushing the dquot. - */ - error = xfs_qm_dqflush(dqp, 0); - if (error) - xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p", - __func__, error, dqp); - xfs_dqunlock(dqp); -} - STATIC xfs_lsn_t xfs_qm_dquot_logitem_committed( struct xfs_log_item *lip, @@ -172,84 +137,57 @@ xfs_qm_dqunpin_wait( wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); } -/* - * This is called when IOP_TRYLOCK returns XFS_ITEM_PUSHBUF to indicate that - * the dquot is locked by us, but the flush lock isn't. So, here we are - * going to see if the relevant dquot buffer is incore, waiting on DELWRI. - * If so, we want to push it out to help us take this item off the AIL as soon - * as possible. - * - * We must not be holding the AIL lock at this point. Calling incore() to - * search the buffer cache can be a time consuming thing, and AIL lock is a - * spinlock. - */ -STATIC bool -xfs_qm_dquot_logitem_pushbuf( - struct xfs_log_item *lip) -{ - struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); - struct xfs_dquot *dqp = qlip->qli_dquot; - struct xfs_buf *bp; - bool ret = true; - - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - - /* - * If flushlock isn't locked anymore, chances are that the - * inode flush completed and the inode was taken off the AIL. - * So, just get out. - */ - if (completion_done(&dqp->q_flush) || - !(lip->li_flags & XFS_LI_IN_AIL)) { - xfs_dqunlock(dqp); - return true; - } - - bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno, - dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); - xfs_dqunlock(dqp); - if (!bp) - return true; - if (XFS_BUF_ISDELAYWRITE(bp)) - xfs_buf_delwri_promote(bp); - if (xfs_buf_ispinned(bp)) - ret = false; - xfs_buf_relse(bp); - return ret; -} - -/* - * This is called to attempt to lock the dquot associated with this - * dquot log item. Don't sleep on the dquot lock or the flush lock. - * If the flush lock is already held, indicating that the dquot has - * been or is in the process of being flushed, then see if we can - * find the dquot's buffer in the buffer cache without sleeping. If - * we can and it is marked delayed write, then we want to send it out. - * We delay doing so until the push routine, though, to avoid sleeping - * in any device strategy routines. - */ STATIC uint -xfs_qm_dquot_logitem_trylock( - struct xfs_log_item *lip) +xfs_qm_dquot_logitem_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + struct xfs_buf *bp = NULL; + uint rval = XFS_ITEM_SUCCESS; + int error; if (atomic_read(&dqp->q_pincount) > 0) return XFS_ITEM_PINNED; - if (!xfs_qm_dqlock_nowait(dqp)) + if (!xfs_dqlock_nowait(dqp)) return XFS_ITEM_LOCKED; + /* + * Re-check the pincount now that we stabilized the value by + * taking the quota lock. + */ + if (atomic_read(&dqp->q_pincount) > 0) { + rval = XFS_ITEM_PINNED; + goto out_unlock; + } + + /* + * Someone else is already flushing the dquot. Nothing we can do + * here but wait for the flush to finish and remove the item from + * the AIL. + */ if (!xfs_dqflock_nowait(dqp)) { - /* - * dquot has already been flushed to the backing buffer, - * leave it locked, pushbuf routine will unlock it. - */ - return XFS_ITEM_PUSHBUF; + rval = XFS_ITEM_FLUSHING; + goto out_unlock; } - ASSERT(lip->li_flags & XFS_LI_IN_AIL); - return XFS_ITEM_SUCCESS; + spin_unlock(&lip->li_ailp->xa_lock); + + error = xfs_qm_dqflush(dqp, &bp); + if (error) { + xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p", + __func__, error, dqp); + } else { + if (!xfs_buf_delwri_queue(bp, buffer_list)) + rval = XFS_ITEM_FLUSHING; + xfs_buf_relse(bp); + } + + spin_lock(&lip->li_ailp->xa_lock); +out_unlock: + xfs_dqunlock(dqp); + return rval; } /* @@ -300,11 +238,9 @@ static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_format = xfs_qm_dquot_logitem_format, .iop_pin = xfs_qm_dquot_logitem_pin, .iop_unpin = xfs_qm_dquot_logitem_unpin, - .iop_trylock = xfs_qm_dquot_logitem_trylock, .iop_unlock = xfs_qm_dquot_logitem_unlock, .iop_committed = xfs_qm_dquot_logitem_committed, .iop_push = xfs_qm_dquot_logitem_push, - .iop_pushbuf = xfs_qm_dquot_logitem_pushbuf, .iop_committing = xfs_qm_dquot_logitem_committing }; @@ -399,11 +335,13 @@ xfs_qm_qoff_logitem_unpin( } /* - * Quotaoff items have no locking, so just return success. + * There isn't much you can do to push a quotaoff item. It is simply + * stuck waiting for the log to be flushed to disk. */ STATIC uint -xfs_qm_qoff_logitem_trylock( - struct xfs_log_item *lip) +xfs_qm_qoff_logitem_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { return XFS_ITEM_LOCKED; } @@ -430,17 +368,6 @@ xfs_qm_qoff_logitem_committed( return lsn; } -/* - * There isn't much you can do to push on an quotaoff item. It is simply - * stuck waiting for the log to be flushed to disk. - */ -STATIC void -xfs_qm_qoff_logitem_push( - struct xfs_log_item *lip) -{ -} - - STATIC xfs_lsn_t xfs_qm_qoffend_logitem_committed( struct xfs_log_item *lip, @@ -455,7 +382,7 @@ xfs_qm_qoffend_logitem_committed( * xfs_trans_ail_delete() drops the AIL lock. */ spin_lock(&ailp->xa_lock); - xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs); + xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR); kmem_free(qfs); kmem_free(qfe); @@ -488,7 +415,6 @@ static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_format = xfs_qm_qoff_logitem_format, .iop_pin = xfs_qm_qoff_logitem_pin, .iop_unpin = xfs_qm_qoff_logitem_unpin, - .iop_trylock = xfs_qm_qoff_logitem_trylock, .iop_unlock = xfs_qm_qoff_logitem_unlock, .iop_committed = xfs_qm_qoffend_logitem_committed, .iop_push = xfs_qm_qoff_logitem_push, @@ -503,7 +429,6 @@ static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_format = xfs_qm_qoff_logitem_format, .iop_pin = xfs_qm_qoff_logitem_pin, .iop_unpin = xfs_qm_qoff_logitem_unpin, - .iop_trylock = xfs_qm_qoff_logitem_trylock, .iop_unlock = xfs_qm_qoff_logitem_unlock, .iop_committed = xfs_qm_qoff_logitem_committed, .iop_push = xfs_qm_qoff_logitem_push, diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 39f06336b99..610456054dc 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 558910f5e3c..42679223a0f 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -17,7 +17,6 @@ */ #include "xfs.h" #include "xfs_types.h" -#include "xfs_inum.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_sb.h" @@ -53,19 +52,18 @@ static int xfs_fileid_length(int fileid_type) STATIC int xfs_fs_encode_fh( - struct dentry *dentry, - __u32 *fh, - int *max_len, - int connectable) + struct inode *inode, + __u32 *fh, + int *max_len, + struct inode *parent) { struct fid *fid = (struct fid *)fh; struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fh; - struct inode *inode = dentry->d_inode; int fileid_type; int len; /* Directories don't need their parent encoded, they have ".." */ - if (S_ISDIR(inode->i_mode) || !connectable) + if (!parent) fileid_type = FILEID_INO32_GEN; else fileid_type = FILEID_INO32_GEN_PARENT; @@ -97,20 +95,16 @@ xfs_fs_encode_fh( switch (fileid_type) { case FILEID_INO32_GEN_PARENT: - spin_lock(&dentry->d_lock); - fid->i32.parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino; - fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation; - spin_unlock(&dentry->d_lock); + fid->i32.parent_ino = XFS_I(parent)->i_ino; + fid->i32.parent_gen = parent->i_generation; /*FALLTHRU*/ case FILEID_INO32_GEN: fid->i32.ino = XFS_I(inode)->i_ino; fid->i32.gen = inode->i_generation; break; case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: - spin_lock(&dentry->d_lock); - fid64->parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino; - fid64->parent_gen = dentry->d_parent->d_inode->i_generation; - spin_unlock(&dentry->d_lock); + fid64->parent_ino = XFS_I(parent)->i_ino; + fid64->parent_gen = parent->i_generation; /*FALLTHRU*/ case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: fid64->ino = XFS_I(inode)->i_ino; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c new file mode 100644 index 00000000000..85e9f87a1a7 --- /dev/null +++ b/fs/xfs/xfs_extent_busy.c @@ -0,0 +1,603 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2010 David Chinner. + * Copyright (c) 2011 Christoph Hellwig. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc.h" +#include "xfs_inode.h" +#include "xfs_extent_busy.h" +#include "xfs_trace.h" + +void +xfs_extent_busy_insert( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + unsigned int flags) +{ + struct xfs_extent_busy *new; + struct xfs_extent_busy *busyp; + struct xfs_perag *pag; + struct rb_node **rbp; + struct rb_node *parent = NULL; + + new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL); + if (!new) { + /* + * No Memory! Since it is now not possible to track the free + * block, make this a synchronous transaction to insure that + * the block is not reused before this transaction commits. + */ + trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len); + xfs_trans_set_sync(tp); + return; + } + + new->agno = agno; + new->bno = bno; + new->length = len; + INIT_LIST_HEAD(&new->list); + new->flags = flags; + + /* trace before insert to be able to see failed inserts */ + trace_xfs_extent_busy(tp->t_mountp, agno, bno, len); + + pag = xfs_perag_get(tp->t_mountp, new->agno); + spin_lock(&pag->pagb_lock); + rbp = &pag->pagb_tree.rb_node; + while (*rbp) { + parent = *rbp; + busyp = rb_entry(parent, struct xfs_extent_busy, rb_node); + + if (new->bno < busyp->bno) { + rbp = &(*rbp)->rb_left; + ASSERT(new->bno + new->length <= busyp->bno); + } else if (new->bno > busyp->bno) { + rbp = &(*rbp)->rb_right; + ASSERT(bno >= busyp->bno + busyp->length); + } else { + ASSERT(0); + } + } + + rb_link_node(&new->rb_node, parent, rbp); + rb_insert_color(&new->rb_node, &pag->pagb_tree); + + list_add(&new->list, &tp->t_busy); + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); +} + +/* + * Search for a busy extent within the range of the extent we are about to + * allocate. You need to be holding the busy extent tree lock when calling + * xfs_extent_busy_search(). This function returns 0 for no overlapping busy + * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact + * match. This is done so that a non-zero return indicates an overlap that + * will require a synchronous transaction, but it can still be + * used to distinguish between a partial or exact match. + */ +int +xfs_extent_busy_search( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) +{ + struct xfs_perag *pag; + struct rb_node *rbp; + struct xfs_extent_busy *busyp; + int match = 0; + + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); + + rbp = pag->pagb_tree.rb_node; + + /* find closest start bno overlap */ + while (rbp) { + busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node); + if (bno < busyp->bno) { + /* may overlap, but exact start block is lower */ + if (bno + len > busyp->bno) + match = -1; + rbp = rbp->rb_left; + } else if (bno > busyp->bno) { + /* may overlap, but exact start block is higher */ + if (bno < busyp->bno + busyp->length) + match = -1; + rbp = rbp->rb_right; + } else { + /* bno matches busyp, length determines exact match */ + match = (busyp->length == len) ? 1 : -1; + break; + } + } + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + return match; +} + +/* + * The found free extent [fbno, fend] overlaps part or all of the given busy + * extent. If the overlap covers the beginning, the end, or all of the busy + * extent, the overlapping portion can be made unbusy and used for the + * allocation. We can't split a busy extent because we can't modify a + * transaction/CIL context busy list, but we can update an entries block + * number or length. + * + * Returns true if the extent can safely be reused, or false if the search + * needs to be restarted. + */ +STATIC bool +xfs_extent_busy_update_extent( + struct xfs_mount *mp, + struct xfs_perag *pag, + struct xfs_extent_busy *busyp, + xfs_agblock_t fbno, + xfs_extlen_t flen, + bool userdata) +{ + xfs_agblock_t fend = fbno + flen; + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + /* + * This extent is currently being discarded. Give the thread + * performing the discard a chance to mark the extent unbusy + * and retry. + */ + if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) { + spin_unlock(&pag->pagb_lock); + delay(1); + spin_lock(&pag->pagb_lock); + return false; + } + + /* + * If there is a busy extent overlapping a user allocation, we have + * no choice but to force the log and retry the search. + * + * Fortunately this does not happen during normal operation, but + * only if the filesystem is very low on space and has to dip into + * the AGFL for normal allocations. + */ + if (userdata) + goto out_force_log; + + if (bbno < fbno && bend > fend) { + /* + * Case 1: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + */ + + /* + * We would have to split the busy extent to be able to track + * it correct, which we cannot do because we would have to + * modify the list of busy extents attached to the transaction + * or CIL context, which is immutable. + * + * Force out the log to clear the busy extent and retry the + * search. + */ + goto out_force_log; + } else if (bbno >= fbno && bend <= fend) { + /* + * Case 2: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------+ + * fbno fend + * + * Case 3: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Case 4: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Case 5: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------------------------+ + * fbno fend + * + */ + + /* + * The busy extent is fully covered by the extent we are + * allocating, and can simply be removed from the rbtree. + * However we cannot remove it from the immutable list + * tracking busy extents in the transaction or CIL context, + * so set the length to zero to mark it invalid. + * + * We also need to restart the busy extent search from the + * tree root, because erasing the node can rearrange the + * tree topology. + */ + rb_erase(&busyp->rb_node, &pag->pagb_tree); + busyp->length = 0; + return false; + } else if (fend < bend) { + /* + * Case 6: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + * + * Case 7: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +------------------+ + * fbno fend + * + */ + busyp->bno = fend; + } else if (bbno < fbno) { + /* + * Case 8: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 9: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +----------------------+ + * fbno fend + */ + busyp->length = fbno - busyp->bno; + } else { + ASSERT(0); + } + + trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen); + return true; + +out_force_log: + spin_unlock(&pag->pagb_lock); + xfs_log_force(mp, XFS_LOG_SYNC); + trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen); + spin_lock(&pag->pagb_lock); + return false; +} + + +/* + * For a given extent [fbno, flen], make sure we can reuse it safely. + */ +void +xfs_extent_busy_reuse( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t fbno, + xfs_extlen_t flen, + bool userdata) +{ + struct xfs_perag *pag; + struct rb_node *rbp; + + ASSERT(flen > 0); + + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); +restart: + rbp = pag->pagb_tree.rb_node; + while (rbp) { + struct xfs_extent_busy *busyp = + rb_entry(rbp, struct xfs_extent_busy, rb_node); + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + if (fbno + flen <= bbno) { + rbp = rbp->rb_left; + continue; + } else if (fbno >= bend) { + rbp = rbp->rb_right; + continue; + } + + if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen, + userdata)) + goto restart; + } + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); +} + +/* + * For a given extent [fbno, flen], search the busy extent list to find a + * subset of the extent that is not busy. If *rlen is smaller than + * args->minlen no suitable extent could be found, and the higher level + * code needs to force out the log and retry the allocation. + */ +void +xfs_extent_busy_trim( + struct xfs_alloc_arg *args, + xfs_agblock_t bno, + xfs_extlen_t len, + xfs_agblock_t *rbno, + xfs_extlen_t *rlen) +{ + xfs_agblock_t fbno; + xfs_extlen_t flen; + struct rb_node *rbp; + + ASSERT(len > 0); + + spin_lock(&args->pag->pagb_lock); +restart: + fbno = bno; + flen = len; + rbp = args->pag->pagb_tree.rb_node; + while (rbp && flen >= args->minlen) { + struct xfs_extent_busy *busyp = + rb_entry(rbp, struct xfs_extent_busy, rb_node); + xfs_agblock_t fend = fbno + flen; + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + if (fend <= bbno) { + rbp = rbp->rb_left; + continue; + } else if (fbno >= bend) { + rbp = rbp->rb_right; + continue; + } + + /* + * If this is a metadata allocation, try to reuse the busy + * extent instead of trimming the allocation. + */ + if (!args->userdata && + !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) { + if (!xfs_extent_busy_update_extent(args->mp, args->pag, + busyp, fbno, flen, + false)) + goto restart; + continue; + } + + if (bbno <= fbno) { + /* start overlap */ + + /* + * Case 1: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + * + * Case 2: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 3: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 4: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------+ + * fbno fend + * + * No unbusy region in extent, return failure. + */ + if (fend <= bend) + goto fail; + + /* + * Case 5: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +----------------------+ + * fbno fend + * + * Case 6: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Needs to be trimmed to: + * +-------+ + * fbno fend + */ + fbno = bend; + } else if (bend >= fend) { + /* end overlap */ + + /* + * Case 7: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +------------------+ + * fbno fend + * + * Case 8: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Needs to be trimmed to: + * +-------+ + * fbno fend + */ + fend = bbno; + } else { + /* middle overlap */ + + /* + * Case 9: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------------------------+ + * fbno fend + * + * Can be trimmed to: + * +-------+ OR +-------+ + * fbno fend fbno fend + * + * Backward allocation leads to significant + * fragmentation of directories, which degrades + * directory performance, therefore we always want to + * choose the option that produces forward allocation + * patterns. + * Preferring the lower bno extent will make the next + * request use "fend" as the start of the next + * allocation; if the segment is no longer busy at + * that point, we'll get a contiguous allocation, but + * even if it is still busy, we will get a forward + * allocation. + * We try to avoid choosing the segment at "bend", + * because that can lead to the next allocation + * taking the segment at "fbno", which would be a + * backward allocation. We only use the segment at + * "fbno" if it is much larger than the current + * requested size, because in that case there's a + * good chance subsequent allocations will be + * contiguous. + */ + if (bbno - fbno >= args->maxlen) { + /* left candidate fits perfect */ + fend = bbno; + } else if (fend - bend >= args->maxlen * 4) { + /* right candidate has enough free space */ + fbno = bend; + } else if (bbno - fbno >= args->minlen) { + /* left candidate fits minimum requirement */ + fend = bbno; + } else { + goto fail; + } + } + + flen = fend - fbno; + } + spin_unlock(&args->pag->pagb_lock); + + if (fbno != bno || flen != len) { + trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, + fbno, flen); + } + *rbno = fbno; + *rlen = flen; + return; +fail: + /* + * Return a zero extent length as failure indications. All callers + * re-check if the trimmed extent satisfies the minlen requirement. + */ + spin_unlock(&args->pag->pagb_lock); + trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0); + *rbno = fbno; + *rlen = 0; +} + +STATIC void +xfs_extent_busy_clear_one( + struct xfs_mount *mp, + struct xfs_perag *pag, + struct xfs_extent_busy *busyp) +{ + if (busyp->length) { + trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno, + busyp->length); + rb_erase(&busyp->rb_node, &pag->pagb_tree); + } + + list_del_init(&busyp->list); + kmem_free(busyp); +} + +/* + * Remove all extents on the passed in list from the busy extents tree. + * If do_discard is set skip extents that need to be discarded, and mark + * these as undergoing a discard operation instead. + */ +void +xfs_extent_busy_clear( + struct xfs_mount *mp, + struct list_head *list, + bool do_discard) +{ + struct xfs_extent_busy *busyp, *n; + struct xfs_perag *pag = NULL; + xfs_agnumber_t agno = NULLAGNUMBER; + + list_for_each_entry_safe(busyp, n, list, list) { + if (busyp->agno != agno) { + if (pag) { + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } + pag = xfs_perag_get(mp, busyp->agno); + spin_lock(&pag->pagb_lock); + agno = busyp->agno; + } + + if (do_discard && busyp->length && + !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) + busyp->flags = XFS_EXTENT_BUSY_DISCARDED; + else + xfs_extent_busy_clear_one(mp, pag, busyp); + } + + if (pag) { + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } +} + +/* + * Callback for list_sort to sort busy extents by the AG they reside in. + */ +int +xfs_extent_busy_ag_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + return container_of(a, struct xfs_extent_busy, list)->agno - + container_of(b, struct xfs_extent_busy, list)->agno; +} diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h new file mode 100644 index 00000000000..985412d65ba --- /dev/null +++ b/fs/xfs/xfs_extent_busy.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2010 David Chinner. + * Copyright (c) 2011 Christoph Hellwig. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_EXTENT_BUSY_H__ +#define __XFS_EXTENT_BUSY_H__ + +/* + * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that + * have been freed but whose transactions aren't committed to disk yet. + * + * Note that we use the transaction ID to record the transaction, not the + * transaction structure itself. See xfs_extent_busy_insert() for details. + */ +struct xfs_extent_busy { + struct rb_node rb_node; /* ag by-bno indexed search tree */ + struct list_head list; /* transaction busy extent list */ + xfs_agnumber_t agno; + xfs_agblock_t bno; + xfs_extlen_t length; + unsigned int flags; +#define XFS_EXTENT_BUSY_DISCARDED 0x01 /* undergoing a discard op. */ +#define XFS_EXTENT_BUSY_SKIP_DISCARD 0x02 /* do not discard */ +}; + +void +xfs_extent_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); + +void +xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list, + bool do_discard); + +int +xfs_extent_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); + +void +xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); + +void +xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t bno, + xfs_extlen_t len, xfs_agblock_t *rbno, xfs_extlen_t *rlen); + +int +xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b); + +static inline void xfs_extent_busy_sort(struct list_head *list) +{ + list_sort(NULL, list, xfs_extent_busy_ag_cmp); +} + +#endif /* __XFS_EXTENT_BUSY_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 35c2aff38b2..feb36d7551a 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_sb.h" @@ -64,7 +63,8 @@ __xfs_efi_release( if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) { spin_lock(&ailp->xa_lock); /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, &efip->efi_item); + xfs_trans_ail_delete(ailp, &efip->efi_item, + SHUTDOWN_LOG_IO_ERROR); xfs_efi_item_free(efip); } } @@ -147,22 +147,20 @@ xfs_efi_item_unpin( } /* - * Efi items have no locking or pushing. However, since EFIs are - * pulled from the AIL when their corresponding EFDs are committed - * to disk, their situation is very similar to being pinned. Return - * XFS_ITEM_PINNED so that the caller will eventually flush the log. - * This should help in getting the EFI out of the AIL. + * Efi items have no locking or pushing. However, since EFIs are pulled from + * the AIL when their corresponding EFDs are committed to disk, their situation + * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller + * will eventually flush the log. This should help in getting the EFI out of + * the AIL. */ STATIC uint -xfs_efi_item_trylock( - struct xfs_log_item *lip) +xfs_efi_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { return XFS_ITEM_PINNED; } -/* - * Efi items have no locking, so just return. - */ STATIC void xfs_efi_item_unlock( struct xfs_log_item *lip) @@ -190,17 +188,6 @@ xfs_efi_item_committed( } /* - * There isn't much you can do to push on an efi item. It is simply - * stuck waiting for all of its corresponding efd items to be - * committed to disk. - */ -STATIC void -xfs_efi_item_push( - struct xfs_log_item *lip) -{ -} - -/* * The EFI dependency tracking op doesn't do squat. It can't because * it doesn't know where the free extent is coming from. The dependency * tracking has to be handled by the "enclosing" metadata object. For @@ -222,7 +209,6 @@ static const struct xfs_item_ops xfs_efi_item_ops = { .iop_format = xfs_efi_item_format, .iop_pin = xfs_efi_item_pin, .iop_unpin = xfs_efi_item_unpin, - .iop_trylock = xfs_efi_item_trylock, .iop_unlock = xfs_efi_item_unlock, .iop_committed = xfs_efi_item_committed, .iop_push = xfs_efi_item_push, @@ -404,19 +390,17 @@ xfs_efd_item_unpin( } /* - * Efd items have no locking, so just return success. + * There isn't much you can do to push on an efd item. It is simply stuck + * waiting for the log to be flushed to disk. */ STATIC uint -xfs_efd_item_trylock( - struct xfs_log_item *lip) +xfs_efd_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { - return XFS_ITEM_LOCKED; + return XFS_ITEM_PINNED; } -/* - * Efd items have no locking or pushing, so return failure - * so that the caller doesn't bother with us. - */ STATIC void xfs_efd_item_unlock( struct xfs_log_item *lip) @@ -451,16 +435,6 @@ xfs_efd_item_committed( } /* - * There isn't much you can do to push on an efd item. It is simply - * stuck waiting for the log to be flushed to disk. - */ -STATIC void -xfs_efd_item_push( - struct xfs_log_item *lip) -{ -} - -/* * The EFD dependency tracking op doesn't do squat. It can't because * it doesn't know where the free extent is coming from. The dependency * tracking has to be handled by the "enclosing" metadata object. For @@ -482,7 +456,6 @@ static const struct xfs_item_ops xfs_efd_item_ops = { .iop_format = xfs_efd_item_format, .iop_pin = xfs_efd_item_pin, .iop_unpin = xfs_efd_item_unpin, - .iop_trylock = xfs_efd_item_trylock, .iop_unlock = xfs_efd_item_unlock, .iop_committed = xfs_efd_item_committed, .iop_push = xfs_efd_item_push, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 753ed9b5c70..9f7ec15a652 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -17,9 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_trans.h" @@ -163,7 +161,6 @@ xfs_file_fsync( struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; int error = 0; int log_flushed = 0; xfs_lsn_t lsn = 0; @@ -194,75 +191,18 @@ xfs_file_fsync( } /* - * We always need to make sure that the required inode state is safe on - * disk. The inode might be clean but we still might need to force the - * log because of committed transactions that haven't hit the disk yet. - * Likewise, there could be unflushed non-transactional changes to the - * inode core that have to go to disk and this requires us to issue - * a synchronous transaction to capture these changes correctly. - * - * This code relies on the assumption that if the i_update_core field - * of the inode is clear and the inode is unpinned then it is clean - * and no action is required. + * All metadata updates are logged, which means that we just have + * to flush the log up to the latest LSN that touched the inode. */ xfs_ilock(ip, XFS_ILOCK_SHARED); - - /* - * First check if the VFS inode is marked dirty. All the dirtying - * of non-transactional updates no goes through mark_inode_dirty*, - * which allows us to distinguish beteeen pure timestamp updates - * and i_size updates which need to be caught for fdatasync. - * After that also theck for the dirty state in the XFS inode, which - * might gets cleared when the inode gets written out via the AIL - * or xfs_iflush_cluster. - */ - if (((inode->i_state & I_DIRTY_DATASYNC) || - ((inode->i_state & I_DIRTY_SYNC) && !datasync)) && - ip->i_update_core) { - /* - * Kick off a transaction to log the inode core to get the - * updates. The sync transaction will also force the log. - */ - xfs_iunlock(ip, XFS_ILOCK_SHARED); - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, 0, - XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return -error; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - - /* - * Note - it's possible that we might have pushed ourselves out - * of the way during trans_reserve which would flush the inode. - * But there's no guarantee that the inode buffer has actually - * gone out yet (it's delwri). Plus the buffer could be pinned - * anyway if it's part of an inode in another recent - * transaction. So we play it safe and fire off the - * transaction anyway. - */ - xfs_trans_ijoin(tp, ip, 0); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_trans_commit(tp, 0); - - lsn = ip->i_itemp->ili_last_lsn; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } else { - /* - * Timestamps/size haven't changed since last inode flush or - * inode transaction commit. That means either nothing got - * written or a transaction committed which caught the updates. - * If the latter happened and the transaction hasn't hit the - * disk yet, the inode will be still be pinned. If it is, - * force the log. - */ - if (xfs_ipincount(ip)) + if (xfs_ipincount(ip)) { + if (!datasync || + (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP)) lsn = ip->i_itemp->ili_last_lsn; - xfs_iunlock(ip, XFS_ILOCK_SHARED); } + xfs_iunlock(ip, XFS_ILOCK_SHARED); - if (!error && lsn) + if (lsn) error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); /* @@ -327,7 +267,7 @@ xfs_file_aio_read( mp->m_rtdev_targp : mp->m_ddev_targp; if ((iocb->ki_pos & target->bt_smask) || (size & target->bt_smask)) { - if (iocb->ki_pos == ip->i_size) + if (iocb->ki_pos == i_size_read(inode)) return 0; return -XFS_ERROR(EINVAL); } @@ -412,51 +352,6 @@ xfs_file_splice_read( return ret; } -STATIC void -xfs_aio_write_isize_update( - struct inode *inode, - loff_t *ppos, - ssize_t bytes_written) -{ - struct xfs_inode *ip = XFS_I(inode); - xfs_fsize_t isize = i_size_read(inode); - - if (bytes_written > 0) - XFS_STATS_ADD(xs_write_bytes, bytes_written); - - if (unlikely(bytes_written < 0 && bytes_written != -EFAULT && - *ppos > isize)) - *ppos = isize; - - if (*ppos > ip->i_size) { - xfs_rw_ilock(ip, XFS_ILOCK_EXCL); - if (*ppos > ip->i_size) - ip->i_size = *ppos; - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); - } -} - -/* - * If this was a direct or synchronous I/O that failed (such as ENOSPC) then - * part of the I/O may have been written to disk before the error occurred. In - * this case the on-disk file size may have been adjusted beyond the in-memory - * file size and now needs to be truncated back. - */ -STATIC void -xfs_aio_write_newsize_update( - struct xfs_inode *ip, - xfs_fsize_t new_size) -{ - if (new_size == ip->i_new_size) { - xfs_rw_ilock(ip, XFS_ILOCK_EXCL); - if (new_size == ip->i_new_size) - ip->i_new_size = 0; - if (ip->i_d.di_size > ip->i_size) - ip->i_d.di_size = ip->i_size; - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); - } -} - /* * xfs_file_splice_write() does not use xfs_rw_ilock() because * generic_file_splice_write() takes the i_mutex itself. This, in theory, @@ -475,7 +370,6 @@ xfs_file_splice_write( { struct inode *inode = outfilp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - xfs_fsize_t new_size; int ioflags = 0; ssize_t ret; @@ -489,132 +383,107 @@ xfs_file_splice_write( xfs_ilock(ip, XFS_IOLOCK_EXCL); - new_size = *ppos + count; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (new_size > ip->i_size) - ip->i_new_size = new_size; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - trace_xfs_file_splice_write(ip, count, *ppos, ioflags); ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); + if (ret > 0) + XFS_STATS_ADD(xs_write_bytes, ret); - xfs_aio_write_isize_update(inode, ppos, ret); - xfs_aio_write_newsize_update(ip, new_size); xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } /* - * This routine is called to handle zeroing any space in the last - * block of the file that is beyond the EOF. We do this since the - * size is being increased without writing anything to that block - * and we don't want anyone to read the garbage on the disk. + * This routine is called to handle zeroing any space in the last block of the + * file that is beyond the EOF. We do this since the size is being increased + * without writing anything to that block and we don't want to read the + * garbage on the disk. */ STATIC int /* error (positive) */ xfs_zero_last_block( - xfs_inode_t *ip, - xfs_fsize_t offset, - xfs_fsize_t isize) + struct xfs_inode *ip, + xfs_fsize_t offset, + xfs_fsize_t isize) { - xfs_fileoff_t last_fsb; - xfs_mount_t *mp = ip->i_mount; - int nimaps; - int zero_offset; - int zero_len; - int error = 0; - xfs_bmbt_irec_t imap; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - zero_offset = XFS_B_FSB_OFFSET(mp, isize); - if (zero_offset == 0) { - /* - * There are no extra bytes in the last block on disk to - * zero, so return. - */ - return 0; - } + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize); + int zero_offset = XFS_B_FSB_OFFSET(mp, isize); + int zero_len; + int nimaps = 1; + int error = 0; + struct xfs_bmbt_irec imap; - last_fsb = XFS_B_TO_FSBT(mp, isize); - nimaps = 1; + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0); + xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; + ASSERT(nimaps > 0); + /* * If the block underlying isize is just a hole, then there * is nothing to zero. */ - if (imap.br_startblock == HOLESTARTBLOCK) { + if (imap.br_startblock == HOLESTARTBLOCK) return 0; - } - /* - * Zero the part of the last block beyond the EOF, and write it - * out sync. We need to drop the ilock while we do this so we - * don't deadlock when the buffer cache calls back to us. - */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); zero_len = mp->m_sb.sb_blocksize - zero_offset; if (isize + zero_len > offset) zero_len = offset - isize; - error = xfs_iozero(ip, isize, zero_len); - - xfs_ilock(ip, XFS_ILOCK_EXCL); - ASSERT(error >= 0); - return error; + return xfs_iozero(ip, isize, zero_len); } /* - * Zero any on disk space between the current EOF and the new, - * larger EOF. This handles the normal case of zeroing the remainder - * of the last block in the file and the unusual case of zeroing blocks - * out beyond the size of the file. This second case only happens - * with fixed size extents and when the system crashes before the inode - * size was updated but after blocks were allocated. If fill is set, - * then any holes in the range are filled and zeroed. If not, the holes - * are left alone as holes. + * Zero any on disk space between the current EOF and the new, larger EOF. + * + * This handles the normal case of zeroing the remainder of the last block in + * the file and the unusual case of zeroing blocks out beyond the size of the + * file. This second case only happens with fixed size extents and when the + * system crashes before the inode size was updated but after blocks were + * allocated. + * + * Expects the iolock to be held exclusive, and will take the ilock internally. */ - int /* error (positive) */ xfs_zero_eof( - xfs_inode_t *ip, - xfs_off_t offset, /* starting I/O offset */ - xfs_fsize_t isize) /* current inode size */ + struct xfs_inode *ip, + xfs_off_t offset, /* starting I/O offset */ + xfs_fsize_t isize) /* current inode size */ { - xfs_mount_t *mp = ip->i_mount; - xfs_fileoff_t start_zero_fsb; - xfs_fileoff_t end_zero_fsb; - xfs_fileoff_t zero_count_fsb; - xfs_fileoff_t last_fsb; - xfs_fileoff_t zero_off; - xfs_fsize_t zero_len; - int nimaps; - int error = 0; - xfs_bmbt_irec_t imap; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t start_zero_fsb; + xfs_fileoff_t end_zero_fsb; + xfs_fileoff_t zero_count_fsb; + xfs_fileoff_t last_fsb; + xfs_fileoff_t zero_off; + xfs_fsize_t zero_len; + int nimaps; + int error = 0; + struct xfs_bmbt_irec imap; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(offset > isize); /* * First handle zeroing the block on which isize resides. + * * We only zero a part of that block so it is handled specially. */ - error = xfs_zero_last_block(ip, offset, isize); - if (error) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); - return error; + if (XFS_B_FSB_OFFSET(mp, isize) != 0) { + error = xfs_zero_last_block(ip, offset, isize); + if (error) + return error; } /* - * Calculate the range between the new size and the old - * where blocks needing to be zeroed may exist. To get the - * block where the last byte in the file currently resides, - * we need to subtract one from the size and truncate back - * to a block boundary. We subtract 1 in case the size is - * exactly on a block boundary. + * Calculate the range between the new size and the old where blocks + * needing to be zeroed may exist. + * + * To get the block where the last byte in the file currently resides, + * we need to subtract one from the size and truncate back to a block + * boundary. We subtract 1 in case the size is exactly on a block + * boundary. */ last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); @@ -632,23 +501,18 @@ xfs_zero_eof( while (start_zero_fsb <= end_zero_fsb) { nimaps = 1; zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; + + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb, &imap, &nimaps, 0); - if (error) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) return error; - } + ASSERT(nimaps > 0); if (imap.br_state == XFS_EXT_UNWRITTEN || imap.br_startblock == HOLESTARTBLOCK) { - /* - * This loop handles initializing pages that were - * partially initialized by the code below this - * loop. It basically zeroes the part of the page - * that sits on a hole and sets the page as P_HOLE - * and calls remapf if it is a mapped file. - */ start_zero_fsb = imap.br_startoff + imap.br_blockcount; ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); continue; @@ -656,11 +520,7 @@ xfs_zero_eof( /* * There are blocks we need to zero. - * Drop the inode lock while we're doing the I/O. - * We'll still have the iolock to protect us. */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); - zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); @@ -668,99 +528,76 @@ xfs_zero_eof( zero_len = offset - zero_off; error = xfs_iozero(ip, zero_off, zero_len); - if (error) { - goto out_lock; - } + if (error) + return error; start_zero_fsb = imap.br_startoff + imap.br_blockcount; ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - - xfs_ilock(ip, XFS_ILOCK_EXCL); } return 0; - -out_lock: - xfs_ilock(ip, XFS_ILOCK_EXCL); - ASSERT(error >= 0); - return error; } /* * Common pre-write limit and setup checks. * - * Returns with iolock held according to @iolock. + * Called with the iolocked held either shared and exclusive according to + * @iolock, and returns with it held. Might upgrade the iolock to exclusive + * if called for a direct write beyond i_size. */ STATIC ssize_t xfs_file_aio_write_checks( struct file *file, loff_t *pos, size_t *count, - xfs_fsize_t *new_sizep, int *iolock) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - xfs_fsize_t new_size; int error = 0; - xfs_rw_ilock(ip, XFS_ILOCK_EXCL); - *new_sizep = 0; restart: error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); - if (error) { - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); - *iolock = 0; + if (error) return error; - } - - if (likely(!(file->f_mode & FMODE_NOCMTIME))) - file_update_time(file); /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this - * write. There is no need to issue zeroing if another in-flght IO ends - * at or before this one If zeronig is needed and we are currently - * holding the iolock shared, we need to update it to exclusive which - * involves dropping all locks and relocking to maintain correct locking - * order. If we do this, restart the function to ensure all checks and - * values are still valid. + * write. If zeroing is needed and we are currently holding the + * iolock shared, we need to update it to exclusive which implies + * having to redo all checks before. */ - if ((ip->i_new_size && *pos > ip->i_new_size) || - (!ip->i_new_size && *pos > ip->i_size)) { + if (*pos > i_size_read(inode)) { if (*iolock == XFS_IOLOCK_SHARED) { - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); + xfs_rw_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + xfs_rw_ilock(ip, *iolock); goto restart; } - error = -xfs_zero_eof(ip, *pos, ip->i_size); + error = -xfs_zero_eof(ip, *pos, i_size_read(inode)); + if (error) + return error; } /* - * If this IO extends beyond EOF, we may need to update ip->i_new_size. - * We have already zeroed space beyond EOF (if necessary). Only update - * ip->i_new_size if this IO ends beyond any other in-flight writes. + * Updating the timestamps will grab the ilock again from + * xfs_fs_dirty_inode, so we have to call it after dropping the + * lock above. Eventually we should look into a way to avoid + * the pointless lock roundtrip. */ - new_size = *pos + *count; - if (new_size > ip->i_size) { - if (new_size > ip->i_new_size) - ip->i_new_size = new_size; - *new_sizep = new_size; + if (likely(!(file->f_mode & FMODE_NOCMTIME))) { + error = file_update_time(file); + if (error) + return error; } - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - return error; - /* * If we're writing the file then make sure to clear the setuid and * setgid bits if the process is not being run by root. This keeps * people from modifying setuid and setgid binaries. */ return file_remove_suid(file); - } /* @@ -794,9 +631,7 @@ xfs_file_dio_aio_write( const struct iovec *iovp, unsigned long nr_segs, loff_t pos, - size_t ocount, - xfs_fsize_t *new_size, - int *iolock) + size_t ocount) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -806,10 +641,10 @@ xfs_file_dio_aio_write( ssize_t ret = 0; size_t count = ocount; int unaligned_io = 0; + int iolock; struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - *iolock = 0; if ((pos & target->bt_smask) || (count & target->bt_smask)) return -XFS_ERROR(EINVAL); @@ -824,31 +659,31 @@ xfs_file_dio_aio_write( * EOF zeroing cases and fill out the new inode size as appropriate. */ if (unaligned_io || mapping->nrpages) - *iolock = XFS_IOLOCK_EXCL; + iolock = XFS_IOLOCK_EXCL; else - *iolock = XFS_IOLOCK_SHARED; - xfs_rw_ilock(ip, *iolock); + iolock = XFS_IOLOCK_SHARED; + xfs_rw_ilock(ip, iolock); /* * Recheck if there are cached pages that need invalidate after we got * the iolock to protect against other threads adding new pages while * we were waiting for the iolock. */ - if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) { - xfs_rw_iunlock(ip, *iolock); - *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, *iolock); + if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, iolock); + iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, iolock); } - ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); + ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); if (ret) - return ret; + goto out; if (mapping->nrpages) { ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); if (ret) - return ret; + goto out; } /* @@ -857,15 +692,18 @@ xfs_file_dio_aio_write( */ if (unaligned_io) inode_dio_wait(inode); - else if (*iolock == XFS_IOLOCK_EXCL) { + else if (iolock == XFS_IOLOCK_EXCL) { xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); - *iolock = XFS_IOLOCK_SHARED; + iolock = XFS_IOLOCK_SHARED; } trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); ret = generic_file_direct_write(iocb, iovp, &nr_segs, pos, &iocb->ki_pos, count, ocount); +out: + xfs_rw_iunlock(ip, iolock); + /* No fallback to buffered IO on errors for XFS. */ ASSERT(ret < 0 || ret == count); return ret; @@ -877,9 +715,7 @@ xfs_file_buffered_aio_write( const struct iovec *iovp, unsigned long nr_segs, loff_t pos, - size_t ocount, - xfs_fsize_t *new_size, - int *iolock) + size_t ocount) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -887,14 +723,14 @@ xfs_file_buffered_aio_write( struct xfs_inode *ip = XFS_I(inode); ssize_t ret; int enospc = 0; + int iolock = XFS_IOLOCK_EXCL; size_t count = ocount; - *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, *iolock); + xfs_rw_ilock(ip, iolock); - ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); + ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); if (ret) - return ret; + goto out; /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; @@ -908,13 +744,15 @@ write_retry: * page locks and retry *once* */ if (ret == -ENOSPC && !enospc) { - ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); - if (ret) - return ret; enospc = 1; - goto write_retry; + ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); + if (!ret) + goto write_retry; } + current->backing_dev_info = NULL; +out: + xfs_rw_iunlock(ip, iolock); return ret; } @@ -930,9 +768,7 @@ xfs_file_aio_write( struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; - int iolock; size_t ocount = 0; - xfs_fsize_t new_size = 0; XFS_STATS_INC(xs_write_calls); @@ -951,33 +787,22 @@ xfs_file_aio_write( return -EIO; if (unlikely(file->f_flags & O_DIRECT)) - ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, - ocount, &new_size, &iolock); + ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount); else ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, - ocount, &new_size, &iolock); - - xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); + ocount); - if (ret <= 0) - goto out_unlock; + if (ret > 0) { + ssize_t err; - /* Handle various SYNC-type writes */ - if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { - loff_t end = pos + ret - 1; - int error; + XFS_STATS_ADD(xs_write_bytes, ret); - xfs_rw_iunlock(ip, iolock); - error = xfs_file_fsync(file, pos, end, - (file->f_flags & __O_SYNC) ? 0 : 1); - xfs_rw_ilock(ip, iolock); - if (error) - ret = error; + /* Handle various SYNC-type writes */ + err = generic_write_sync(file, pos, ret); + if (err < 0) + ret = err; } -out_unlock: - xfs_aio_write_newsize_update(ip, new_size); - xfs_rw_iunlock(ip, iolock); return ret; } @@ -1141,8 +966,149 @@ xfs_vm_page_mkwrite( return block_page_mkwrite(vma, vmf, xfs_get_blocks); } +STATIC loff_t +xfs_seek_data( + struct file *file, + loff_t start, + u32 type) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec map[2]; + int nmap = 2; + loff_t uninitialized_var(offset); + xfs_fsize_t isize; + xfs_fileoff_t fsbno; + xfs_filblks_t end; + uint lock; + int error; + + lock = xfs_ilock_map_shared(ip); + + isize = i_size_read(inode); + if (start >= isize) { + error = ENXIO; + goto out_unlock; + } + + fsbno = XFS_B_TO_FSBT(mp, start); + + /* + * Try to read extents from the first block indicated + * by fsbno to the end block of the file. + */ + end = XFS_B_TO_FSB(mp, isize); + + error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap, + XFS_BMAPI_ENTIRE); + if (error) + goto out_unlock; + + /* + * Treat unwritten extent as data extent since it might + * contains dirty data in page cache. + */ + if (map[0].br_startblock != HOLESTARTBLOCK) { + offset = max_t(loff_t, start, + XFS_FSB_TO_B(mp, map[0].br_startoff)); + } else { + if (nmap == 1) { + error = ENXIO; + goto out_unlock; + } + + offset = max_t(loff_t, start, + XFS_FSB_TO_B(mp, map[1].br_startoff)); + } + + if (offset != file->f_pos) + file->f_pos = offset; + +out_unlock: + xfs_iunlock_map_shared(ip, lock); + + if (error) + return -error; + return offset; +} + +STATIC loff_t +xfs_seek_hole( + struct file *file, + loff_t start, + u32 type) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + loff_t uninitialized_var(offset); + loff_t holeoff; + xfs_fsize_t isize; + xfs_fileoff_t fsbno; + uint lock; + int error; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + lock = xfs_ilock_map_shared(ip); + + isize = i_size_read(inode); + if (start >= isize) { + error = ENXIO; + goto out_unlock; + } + + fsbno = XFS_B_TO_FSBT(mp, start); + error = xfs_bmap_first_unused(NULL, ip, 1, &fsbno, XFS_DATA_FORK); + if (error) + goto out_unlock; + + holeoff = XFS_FSB_TO_B(mp, fsbno); + if (holeoff <= start) + offset = start; + else { + /* + * xfs_bmap_first_unused() could return a value bigger than + * isize if there are no more holes past the supplied offset. + */ + offset = min_t(loff_t, holeoff, isize); + } + + if (offset != file->f_pos) + file->f_pos = offset; + +out_unlock: + xfs_iunlock_map_shared(ip, lock); + + if (error) + return -error; + return offset; +} + +STATIC loff_t +xfs_file_llseek( + struct file *file, + loff_t offset, + int origin) +{ + switch (origin) { + case SEEK_END: + case SEEK_CUR: + case SEEK_SET: + return generic_file_llseek(file, offset, origin); + case SEEK_DATA: + return xfs_seek_data(file, offset, origin); + case SEEK_HOLE: + return xfs_seek_hole(file, offset, origin); + default: + return -EINVAL; + } +} + const struct file_operations xfs_file_operations = { - .llseek = generic_file_llseek, + .llseek = xfs_file_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = xfs_file_aio_read, diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c index ed88ed16811..652b875a9d4 100644 --- a/fs/xfs/xfs_fs_subr.c +++ b/fs/xfs/xfs_fs_subr.c @@ -90,7 +90,7 @@ xfs_wait_on_pages( if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { return -filemap_fdatawait_range(mapping, first, - last == -1 ? ip->i_size - 1 : last); + last == -1 ? XFS_ISIZE(ip) - 1 : last); } return 0; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 1c6fdeb702f..c25b094efbf 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -18,8 +18,6 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_inum.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_sb.h" @@ -39,7 +37,6 @@ #include "xfs_itable.h" #include "xfs_trans_space.h" #include "xfs_rtalloc.h" -#include "xfs_rw.h" #include "xfs_filestream.h" #include "xfs_trace.h" @@ -147,9 +144,9 @@ xfs_growfs_data_private( if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) return error; dpct = pct - mp->m_sb.sb_imax_pct; - bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, + bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), - BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); + XFS_FSS_TO_BB(mp, 1), 0); if (!bp) return EIO; xfs_buf_relse(bp); @@ -193,7 +190,7 @@ xfs_growfs_data_private( */ bp = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); + XFS_FSS_TO_BB(mp, 1), 0); if (!bp) { error = ENOMEM; goto error0; @@ -230,7 +227,7 @@ xfs_growfs_data_private( */ bp = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); + XFS_FSS_TO_BB(mp, 1), 0); if (!bp) { error = ENOMEM; goto error0; @@ -259,8 +256,7 @@ xfs_growfs_data_private( */ bp = xfs_buf_get(mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), - XBF_LOCK | XBF_MAPPED); + BTOBB(mp->m_sb.sb_blocksize), 0); if (!bp) { error = ENOMEM; goto error0; @@ -286,8 +282,7 @@ xfs_growfs_data_private( */ bp = xfs_buf_get(mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), - XBF_LOCK | XBF_MAPPED); + BTOBB(mp->m_sb.sb_blocksize), 0); if (!bp) { error = ENOMEM; goto error0; @@ -314,8 +309,7 @@ xfs_growfs_data_private( */ bp = xfs_buf_get(mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), - XBF_LOCK | XBF_MAPPED); + BTOBB(mp->m_sb.sb_blocksize), 0); if (!bp) { error = ENOMEM; goto error0; @@ -405,7 +399,7 @@ xfs_growfs_data_private( /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { - error = xfs_read_buf(mp, mp->m_ddev_targp, + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp); if (error) { @@ -693,3 +687,63 @@ xfs_fs_goingdown( return 0; } + +/* + * Force a shutdown of the filesystem instantly while keeping the filesystem + * consistent. We don't do an unmount here; just shutdown the shop, make sure + * that absolutely nothing persistent happens to this filesystem after this + * point. + */ +void +xfs_do_force_shutdown( + xfs_mount_t *mp, + int flags, + char *fname, + int lnnum) +{ + int logerror; + + logerror = flags & SHUTDOWN_LOG_IO_ERROR; + + if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + xfs_notice(mp, + "%s(0x%x) called from line %d of file %s. Return address = 0x%p", + __func__, flags, lnnum, fname, __return_address); + } + /* + * No need to duplicate efforts. + */ + if (XFS_FORCED_SHUTDOWN(mp) && !logerror) + return; + + /* + * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't + * queue up anybody new on the log reservations, and wakes up + * everybody who's sleeping on log reservations to tell them + * the bad news. + */ + if (xfs_log_force_umount(mp, logerror)) + return; + + if (flags & SHUTDOWN_CORRUPT_INCORE) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT, + "Corruption of in-memory data detected. Shutting down filesystem"); + if (XFS_ERRLEVEL_HIGH <= xfs_error_level) + xfs_stack_trace(); + } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + if (logerror) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, + "Log I/O Error Detected. Shutting down filesystem"); + } else if (flags & SHUTDOWN_DEVICE_REQ) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "All device paths lost. Shutting down filesystem"); + } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "I/O Error Detected. Shutting down filesystem"); + } + } + if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + xfs_alert(mp, + "Please umount the filesystem and rectify the problem(s)"); + } +} diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 169380e6605..177a21a7ac4 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -200,8 +200,7 @@ xfs_ialloc_inode_init( */ d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - mp->m_bsize * blks_per_cluster, - XBF_LOCK); + mp->m_bsize * blks_per_cluster, 0); if (!fbuf) return ENOMEM; /* @@ -447,7 +446,7 @@ STATIC xfs_buf_t * /* allocation group buffer */ xfs_ialloc_ag_select( xfs_trans_t *tp, /* transaction pointer */ xfs_ino_t parent, /* parent directory inode number */ - mode_t mode, /* bits set to indicate file type */ + umode_t mode, /* bits set to indicate file type */ int okalloc) /* ok to allocate more space */ { xfs_buf_t *agbp; /* allocation group header buffer */ @@ -610,6 +609,13 @@ xfs_ialloc_get_rec( /* * Visible inode allocation functions. */ +/* + * Find a free (set) bit in the inode bitmask. + */ +static inline int xfs_ialloc_find_free(xfs_inofree_t *fp) +{ + return xfs_lowbit64(*fp); +} /* * Allocate an inode on disk. @@ -640,7 +646,7 @@ int xfs_dialloc( xfs_trans_t *tp, /* transaction pointer */ xfs_ino_t parent, /* parent inode (directory) */ - mode_t mode, /* mode bits for new inode */ + umode_t mode, /* mode bits for new inode */ int okalloc, /* ok to allocate more space */ xfs_buf_t **IO_agbp, /* in/out ag header's buffer */ boolean_t *alloc_done, /* true if we needed to replenish diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index bb5385475e1..65ac57c8063 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -47,15 +47,6 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) } /* - * Find a free (set) bit in the inode bitmask. - */ -static inline int xfs_ialloc_find_free(xfs_inofree_t *fp) -{ - return xfs_lowbit64(*fp); -} - - -/* * Allocate an inode on disk. * Mode is used to tell whether the new inode will need space, and whether * it is a directory. @@ -81,7 +72,7 @@ int /* error */ xfs_dialloc( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t parent, /* parent inode (directory) */ - mode_t mode, /* mode bits for new inode */ + umode_t mode, /* mode bits for new inode */ int okalloc, /* ok to allocate more space */ struct xfs_buf **agbp, /* buf for a.g. inode header */ boolean_t *alloc_done, /* an allocation was done to replenish diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index c6a75815aea..2b8b7a37aa1 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 0fa98b1c70e..1bb4365e8c2 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_acl.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" @@ -77,7 +76,7 @@ xfs_inode_alloc( ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(completion_done(&ip->i_flush)); + ASSERT(!xfs_isiflocked(ip)); ASSERT(ip->i_ino == 0); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); @@ -91,11 +90,8 @@ xfs_inode_alloc( ip->i_afp = NULL; memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); ip->i_flags = 0; - ip->i_update_core = 0; ip->i_delayed_blks = 0; memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); - ip->i_size = 0; - ip->i_new_size = 0; return ip; } @@ -107,7 +103,6 @@ xfs_inode_free_callback( struct inode *inode = container_of(head, struct inode, i_rcu); struct xfs_inode *ip = XFS_I(inode); - INIT_LIST_HEAD(&inode->i_dentry); kmem_zone_free(xfs_inode_zone, ip); } @@ -127,23 +122,7 @@ xfs_inode_free( xfs_idestroy_fork(ip, XFS_ATTR_FORK); if (ip->i_itemp) { - /* - * Only if we are shutting down the fs will we see an - * inode still in the AIL. If it is there, we should remove - * it to prevent a use-after-free from occurring. - */ - xfs_log_item_t *lip = &ip->i_itemp->ili_item; - struct xfs_ail *ailp = lip->li_ailp; - - ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) || - XFS_FORCED_SHUTDOWN(ip->i_mount)); - if (lip->li_flags & XFS_LI_IN_AIL) { - spin_lock(&ailp->xa_lock); - if (lip->li_flags & XFS_LI_IN_AIL) - xfs_trans_ail_delete(ailp, lip); - else - spin_unlock(&ailp->xa_lock); - } + ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); xfs_inode_item_destroy(ip); ip->i_itemp = NULL; } @@ -151,7 +130,7 @@ xfs_inode_free( /* asserts to verify all state is correct here */ ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(completion_done(&ip->i_flush)); + ASSERT(!xfs_isiflocked(ip)); /* * Because we use RCU freeing we need to ensure the inode always @@ -293,7 +272,7 @@ xfs_iget_cache_hit( if (lock_flags != 0) xfs_ilock(ip, lock_flags); - xfs_iflags_clear(ip, XFS_ISTALE); + xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); XFS_STATS_INC(xs_ig_found); return 0; @@ -318,6 +297,7 @@ xfs_iget_cache_miss( struct xfs_inode *ip; int error; xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); + int iflags; ip = xfs_inode_alloc(mp, ino); if (!ip) @@ -337,9 +317,10 @@ xfs_iget_cache_miss( /* * Preload the radix tree so we can insert safely under the * write spinlock. Note that we cannot sleep inside the preload - * region. + * region. Since we can be called from transaction context, don't + * recurse into the file system. */ - if (radix_tree_preload(GFP_KERNEL)) { + if (radix_tree_preload(GFP_NOFS)) { error = EAGAIN; goto out_destroy; } @@ -353,9 +334,23 @@ xfs_iget_cache_miss( BUG(); } - spin_lock(&pag->pag_ici_lock); + /* + * These values must be set before inserting the inode into the radix + * tree as the moment it is inserted a concurrent lookup (allowed by the + * RCU locking mechanism) can find it and that lookup must see that this + * is an inode currently under construction (i.e. that XFS_INEW is set). + * The ip->i_flags_lock that protects the XFS_INEW flag forms the + * memory barrier that ensures this detection works correctly at lookup + * time. + */ + iflags = XFS_INEW; + if (flags & XFS_IGET_DONTCACHE) + iflags |= XFS_IDONTCACHE; + ip->i_udquot = ip->i_gdquot = NULL; + xfs_iflags_set(ip, iflags); /* insert the new inode */ + spin_lock(&pag->pag_ici_lock); error = radix_tree_insert(&pag->pag_ici_root, agino, ip); if (unlikely(error)) { WARN_ON(error != -EEXIST); @@ -363,11 +358,6 @@ xfs_iget_cache_miss( error = EAGAIN; goto out_preload_end; } - - /* These values _must_ be set before releasing the radix tree lock! */ - ip->i_udquot = ip->i_gdquot = NULL; - xfs_iflags_set(ip, XFS_INEW); - spin_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); @@ -421,6 +411,15 @@ xfs_iget( xfs_perag_t *pag; xfs_agino_t agino; + /* + * xfs_reclaim_inode() uses the ILOCK to ensure an inode + * doesn't get freed while it's being referenced during a + * radix tree traversal here. It assumes this function + * aqcuires only the ILOCK (and therefore it has no need to + * involve the IOLOCK in this synchronization). + */ + ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); + /* reject inode numbers outside existing AGs */ if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) return EINVAL; @@ -451,8 +450,6 @@ again: *ipp = ip; - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t)); /* * If we have a real type for an on-disk inode, we can set ops(&unlock) * now. If it's a new inode being created, xfs_ialloc will handle it. @@ -647,8 +644,7 @@ xfs_iunlock( (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY | - XFS_LOCK_DEP_MASK)) == 0); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); ASSERT(lock_flags != 0); if (lock_flags & XFS_IOLOCK_EXCL) @@ -661,16 +657,6 @@ xfs_iunlock( else if (lock_flags & XFS_ILOCK_SHARED) mrunlock_shared(&ip->i_lock); - if ((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) && - !(lock_flags & XFS_IUNLOCK_NONOTIFY) && ip->i_itemp) { - /* - * Let the AIL know that this item has been unlocked in case - * it is in the AIL and anyone is waiting on it. Don't do - * this if the caller has asked us not to. - */ - xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp, - (xfs_log_item_t*)(ip->i_itemp)); - } trace_xfs_iunlock(ip, lock_flags, _RET_IP_); } @@ -716,3 +702,19 @@ xfs_isilocked( return 0; } #endif + +void +__xfs_iflock( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); + + do { + prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (xfs_isiflocked(ip)) + io_schedule(); + } while (!xfs_iflock_nowait(ip)); + + finish_wait(wq, &wait.wait); +} diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 755ee816488..a59eea09930 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -20,7 +20,6 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" @@ -61,6 +60,20 @@ STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); +/* + * helper function to extract extent size hint from inode + */ +xfs_extlen_t +xfs_get_extsz_hint( + struct xfs_inode *ip) +{ + if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize) + return ip->i_d.di_extsize; + if (XFS_IS_REALTIME_INODE(ip)) + return ip->i_mount->m_sb.sb_rextsize; + return 0; +} + #ifdef DEBUG /* * Make sure that the extents in the given memory buffer @@ -137,6 +150,7 @@ xfs_imap_to_bp( int ni; xfs_buf_t *bp; + buf_flags |= XBF_UNMAPPED; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, (int)imap->im_len, buf_flags, &bp); if (error) { @@ -226,7 +240,7 @@ xfs_inotobp( if (error) return error; - error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags); + error = xfs_imap_to_bp(mp, tp, &imap, &bp, 0, imap_flags); if (error) return error; @@ -299,11 +313,8 @@ xfs_iformat( { xfs_attr_shortform_t *atp; int size; - int error; + int error = 0; xfs_fsize_t di_size; - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); - error = 0; if (unlikely(be32_to_cpu(dip->di_nextents) + be16_to_cpu(dip->di_anextents) > @@ -350,7 +361,6 @@ xfs_iformat( return XFS_ERROR(EFSCORRUPTED); } ip->i_d.di_size = 0; - ip->i_size = 0; ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); break; @@ -409,10 +419,10 @@ xfs_iformat( } if (!XFS_DFORK_Q(dip)) return 0; + ASSERT(ip->i_afp == NULL); ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); - ip->i_afp->if_ext_max = - XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + switch (dip->di_aformat) { case XFS_DINODE_FMT_LOCAL: atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); @@ -604,10 +614,11 @@ xfs_iformat_btree( * or the number of extents is greater than the number of * blocks. */ - if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max - || XFS_BMDR_SPACE_CALC(nrecs) > - XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) - || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { + if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= + XFS_IFORK_MAXEXT(ip, whichfork) || + XFS_BMDR_SPACE_CALC(nrecs) > + XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) || + XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).", (unsigned long long) ip->i_ino); XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, @@ -785,8 +796,7 @@ xfs_iread( /* * Get pointers to the on-disk inode and the buffer containing it. */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, - XBF_LOCK, iget_flags); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, 0, iget_flags); if (error) return error; dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); @@ -835,12 +845,6 @@ xfs_iread( * with the uninitialized part of it. */ ip->i_d.di_mode = 0; - /* - * Initialize the per-fork minima and maxima for a new - * inode here. xfs_iformat will do it for old inodes. - */ - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); } /* @@ -861,7 +865,6 @@ xfs_iread( } ip->i_delayed_blks = 0; - ip->i_size = ip->i_d.di_size; /* * Mark the buffer containing the inode as something to keep @@ -961,7 +964,7 @@ int xfs_ialloc( xfs_trans_t *tp, xfs_inode_t *pip, - mode_t mode, + umode_t mode, xfs_nlink_t nlink, xfs_dev_t rdev, prid_t prid, @@ -1002,7 +1005,7 @@ xfs_ialloc( return error; ASSERT(ip != NULL); - ip->i_d.di_mode = (__uint16_t)mode; + ip->i_d.di_mode = mode; ip->i_d.di_onlink = 0; ip->i_d.di_nlink = nlink; ASSERT(ip->i_d.di_nlink == nlink); @@ -1051,7 +1054,6 @@ xfs_ialloc( } ip->i_d.di_size = 0; - ip->i_size = 0; ip->i_d.di_nextents = 0; ASSERT(ip->i_d.di_nblocks == 0); @@ -1166,52 +1168,6 @@ xfs_ialloc( } /* - * Check to make sure that there are no blocks allocated to the - * file beyond the size of the file. We don't check this for - * files with fixed size extents or real time extents, but we - * at least do it for regular files. - */ -#ifdef DEBUG -STATIC void -xfs_isize_check( - struct xfs_inode *ip, - xfs_fsize_t isize) -{ - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t map_first; - int nimaps; - xfs_bmbt_irec_t imaps[2]; - int error; - - if (!S_ISREG(ip->i_d.di_mode)) - return; - - if (XFS_IS_REALTIME_INODE(ip)) - return; - - if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) - return; - - nimaps = 2; - map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); - /* - * The filesystem could be shutting down, so bmapi may return - * an error. - */ - error = xfs_bmapi_read(ip, map_first, - (XFS_B_TO_FSB(mp, - (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first), - imaps, &nimaps, XFS_BMAPI_ENTIRE); - if (error) - return; - ASSERT(nimaps == 1); - ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); -} -#else /* DEBUG */ -#define xfs_isize_check(ip, isize) -#endif /* DEBUG */ - -/* * Free up the underlying blocks past new_size. The new size must be smaller * than the current size. This routine can be used both for the attribute and * data fork, and does not modify the inode size, which is left to the caller. @@ -1252,12 +1208,14 @@ xfs_itruncate_extents( int done = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); - ASSERT(new_size <= ip->i_size); + ASSERT(new_size <= XFS_ISIZE(ip)); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(ip->i_itemp != NULL); ASSERT(ip->i_itemp->ili_lock_flags == 0); ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); + trace_xfs_itruncate_extents_start(ip, new_size); + /* * Since it is possible for space to become allocated beyond * the end of the file (in a crash where the space is allocated @@ -1325,6 +1283,14 @@ xfs_itruncate_extents( goto out; } + /* + * Always re-log the inode so that our permanent transaction can keep + * on rolling it forward in the log. + */ + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + trace_xfs_itruncate_extents_end(ip, new_size); + out: *tpp = tp; return error; @@ -1338,74 +1304,6 @@ out_bmap_cancel: goto out; } -int -xfs_itruncate_data( - struct xfs_trans **tpp, - struct xfs_inode *ip, - xfs_fsize_t new_size) -{ - int error; - - trace_xfs_itruncate_data_start(ip, new_size); - - /* - * The first thing we do is set the size to new_size permanently on - * disk. This way we don't have to worry about anyone ever being able - * to look at the data being freed even in the face of a crash. - * What we're getting around here is the case where we free a block, it - * is allocated to another file, it is written to, and then we crash. - * If the new data gets written to the file but the log buffers - * containing the free and reallocation don't, then we'd end up with - * garbage in the blocks being freed. As long as we make the new_size - * permanent before actually freeing any blocks it doesn't matter if - * they get written to. - */ - if (ip->i_d.di_nextents > 0) { - /* - * If we are not changing the file size then do not update - * the on-disk file size - we may be called from - * xfs_inactive_free_eofblocks(). If we update the on-disk - * file size and then the system crashes before the contents - * of the file are flushed to disk then the files may be - * full of holes (ie NULL files bug). - */ - if (ip->i_size != new_size) { - ip->i_d.di_size = new_size; - ip->i_size = new_size; - xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); - } - } - - error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size); - if (error) - return error; - - /* - * If we are not changing the file size then do not update the on-disk - * file size - we may be called from xfs_inactive_free_eofblocks(). - * If we update the on-disk file size and then the system crashes - * before the contents of the file are flushed to disk then the files - * may be full of holes (ie NULL files bug). - */ - xfs_isize_check(ip, new_size); - if (ip->i_size != new_size) { - ip->i_d.di_size = new_size; - ip->i_size = new_size; - } - - ASSERT(new_size != 0 || ip->i_delayed_blks == 0); - ASSERT(new_size != 0 || ip->i_d.di_nextents == 0); - - /* - * Always re-log the inode so that our permanent transaction can keep - * on rolling it forward in the log. - */ - xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); - - trace_xfs_itruncate_data_end(ip, new_size); - return 0; -} - /* * This is called when the inode's link count goes to 0. * We place the on-disk inode on a list in the AGI. It @@ -1457,7 +1355,7 @@ xfs_iunlink( * Here we put the head pointer into our next pointer, * and then we fall through to point the head at us. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); if (error) return error; @@ -1538,7 +1436,7 @@ xfs_iunlink_remove( * of dealing with the buffer when there is no need to * change it. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); if (error) { xfs_warn(mp, "%s: xfs_itobp() returned error %d.", __func__, error); @@ -1599,7 +1497,7 @@ xfs_iunlink_remove( * Now last_ibp points to the buffer previous to us on * the unlinked list. Pull us from the list. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); if (error) { xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.", __func__, error); @@ -1681,8 +1579,7 @@ xfs_ifree_cluster( * to mark all the active inodes on the buffer stale. */ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * blks_per_cluster, - XBF_LOCK); + mp->m_bsize * blks_per_cluster, 0); if (!bp) return ENOMEM; @@ -1771,14 +1668,13 @@ retry: iip = ip->i_itemp; if (!iip || xfs_inode_clean(ip)) { ASSERT(ip != free_ip); - ip->i_update_core = 0; xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); continue; } - iip->ili_last_fields = iip->ili_format.ilf_fields; - iip->ili_format.ilf_fields = 0; + iip->ili_last_fields = iip->ili_fields; + iip->ili_fields = 0; iip->ili_logged = 1; xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); @@ -1824,8 +1720,7 @@ xfs_ifree( ASSERT(ip->i_d.di_nlink == 0); ASSERT(ip->i_d.di_nextents == 0); ASSERT(ip->i_d.di_anextents == 0); - ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) || - (!S_ISREG(ip->i_d.di_mode))); + ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode)); ASSERT(ip->i_d.di_nblocks == 0); /* @@ -1844,8 +1739,6 @@ xfs_ifree( ip->i_d.di_flags = 0; ip->i_d.di_dmevmask = 0; ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; /* @@ -1856,7 +1749,7 @@ xfs_ifree( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0); if (error) return error; @@ -2151,7 +2044,7 @@ xfs_idestroy_fork( * once someone is waiting for it to be unpinned. */ static void -xfs_iunpin_nowait( +xfs_iunpin( struct xfs_inode *ip) { ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); @@ -2163,14 +2056,29 @@ xfs_iunpin_nowait( } +static void +__xfs_iunpin_wait( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT); + + xfs_iunpin(ip); + + do { + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (xfs_ipincount(ip)) + io_schedule(); + } while (xfs_ipincount(ip)); + finish_wait(wq, &wait.wait); +} + void xfs_iunpin_wait( struct xfs_inode *ip) { - if (xfs_ipincount(ip)) { - xfs_iunpin_nowait(ip); - wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0)); - } + if (xfs_ipincount(ip)) + __xfs_iunpin_wait(ip); } /* @@ -2280,7 +2188,7 @@ xfs_iflush_fork( mp = ip->i_mount; switch (XFS_IFORK_FORMAT(ip, whichfork)) { case XFS_DINODE_FMT_LOCAL: - if ((iip->ili_format.ilf_fields & dataflag[whichfork]) && + if ((iip->ili_fields & dataflag[whichfork]) && (ifp->if_bytes > 0)) { ASSERT(ifp->if_u1.if_data != NULL); ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); @@ -2290,8 +2198,8 @@ xfs_iflush_fork( case XFS_DINODE_FMT_EXTENTS: ASSERT((ifp->if_flags & XFS_IFEXTENTS) || - !(iip->ili_format.ilf_fields & extflag[whichfork])); - if ((iip->ili_format.ilf_fields & extflag[whichfork]) && + !(iip->ili_fields & extflag[whichfork])); + if ((iip->ili_fields & extflag[whichfork]) && (ifp->if_bytes > 0)) { ASSERT(xfs_iext_get_ext(ifp, 0)); ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); @@ -2301,7 +2209,7 @@ xfs_iflush_fork( break; case XFS_DINODE_FMT_BTREE: - if ((iip->ili_format.ilf_fields & brootflag[whichfork]) && + if ((iip->ili_fields & brootflag[whichfork]) && (ifp->if_broot_bytes > 0)) { ASSERT(ifp->if_broot != NULL); ASSERT(ifp->if_broot_bytes <= @@ -2314,14 +2222,14 @@ xfs_iflush_fork( break; case XFS_DINODE_FMT_DEV: - if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { + if (iip->ili_fields & XFS_ILOG_DEV) { ASSERT(whichfork == XFS_DATA_FORK); xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev); } break; case XFS_DINODE_FMT_UUID: - if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { + if (iip->ili_fields & XFS_ILOG_UUID) { ASSERT(whichfork == XFS_DATA_FORK); memcpy(XFS_DFORK_DPTR(dip), &ip->i_df.if_u2.if_uuid, @@ -2451,11 +2359,11 @@ cluster_corrupt_out: */ rcu_read_unlock(); /* - * Clean up the buffer. If it was B_DELWRI, just release it -- + * Clean up the buffer. If it was delwri, just release it -- * brelse can handle it with no problems. If not, shut down the * filesystem before releasing the buffer. */ - bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp); + bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q); if (bufwasdelwri) xfs_buf_relse(bp); @@ -2481,58 +2389,40 @@ cluster_corrupt_out: /* * Unlocks the flush lock */ - xfs_iflush_abort(iq); + xfs_iflush_abort(iq, false); kmem_free(ilist); xfs_perag_put(pag); return XFS_ERROR(EFSCORRUPTED); } /* - * xfs_iflush() will write a modified inode's changes out to the - * inode's on disk home. The caller must have the inode lock held - * in at least shared mode and the inode flush completion must be - * active as well. The inode lock will still be held upon return from - * the call and the caller is free to unlock it. - * The inode flush will be completed when the inode reaches the disk. - * The flags indicate how the inode's buffer should be written out. + * Flush dirty inode metadata into the backing buffer. + * + * The caller must have the inode lock and the inode flush lock held. The + * inode lock will still be held upon return to the caller, and the inode + * flush lock will be released after the inode has reached the disk. + * + * The caller must write out the buffer returned in *bpp and release it. */ int xfs_iflush( - xfs_inode_t *ip, - uint flags) + struct xfs_inode *ip, + struct xfs_buf **bpp) { - xfs_inode_log_item_t *iip; - xfs_buf_t *bp; - xfs_dinode_t *dip; - xfs_mount_t *mp; + struct xfs_mount *mp = ip->i_mount; + struct xfs_buf *bp; + struct xfs_dinode *dip; int error; XFS_STATS_INC(xs_iflush_count); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(!completion_done(&ip->i_flush)); + ASSERT(xfs_isiflocked(ip)); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > ip->i_df.if_ext_max); + ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - iip = ip->i_itemp; - mp = ip->i_mount; + *bpp = NULL; - /* - * We can't flush the inode until it is unpinned, so wait for it if we - * are allowed to block. We know no one new can pin it, because we are - * holding the inode lock shared and you need to hold it exclusively to - * pin the inode. - * - * If we are not allowed to block, force the log out asynchronously so - * that when we come back the inode will be unpinned. If other inodes - * in the same cluster are dirty, they will probably write the inode - * out for us if they occur after the log force completes. - */ - if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) { - xfs_iunpin_nowait(ip); - xfs_ifunlock(ip); - return EAGAIN; - } xfs_iunpin_wait(ip); /* @@ -2551,21 +2441,20 @@ xfs_iflush( /* * This may have been unpinned because the filesystem is shutting * down forcibly. If that's the case we must not write this inode - * to disk, because the log record didn't make it to disk! + * to disk, because the log record didn't make it to disk. + * + * We also have to remove the log item from the AIL in this case, + * as we wait for an empty AIL as part of the unmount process. */ if (XFS_FORCED_SHUTDOWN(mp)) { - ip->i_update_core = 0; - if (iip) - iip->ili_format.ilf_fields = 0; - xfs_ifunlock(ip); - return XFS_ERROR(EIO); + error = XFS_ERROR(EIO); + goto abort_out; } /* * Get the buffer containing the on-disk inode. */ - error = xfs_itobp(mp, NULL, ip, &dip, &bp, - (flags & SYNC_TRYLOCK) ? XBF_TRYLOCK : XBF_LOCK); + error = xfs_itobp(mp, NULL, ip, &dip, &bp, XBF_TRYLOCK); if (error || !bp) { xfs_ifunlock(ip); return error; @@ -2593,23 +2482,20 @@ xfs_iflush( if (error) goto cluster_corrupt_out; - if (flags & SYNC_WAIT) - error = xfs_bwrite(bp); - else - xfs_buf_delwri_queue(bp); - - xfs_buf_relse(bp); - return error; + *bpp = bp; + return 0; corrupt_out: xfs_buf_relse(bp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); cluster_corrupt_out: + error = XFS_ERROR(EFSCORRUPTED); +abort_out: /* * Unlocks the flush lock */ - xfs_iflush_abort(ip); - return XFS_ERROR(EFSCORRUPTED); + xfs_iflush_abort(ip, false); + return error; } @@ -2626,9 +2512,9 @@ xfs_iflush_int( #endif ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(!completion_done(&ip->i_flush)); + ASSERT(xfs_isiflocked(ip)); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > ip->i_df.if_ext_max); + ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); iip = ip->i_itemp; mp = ip->i_mount; @@ -2636,26 +2522,6 @@ xfs_iflush_int( /* set *dip = inode's place in the buffer */ dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); - /* - * Clear i_update_core before copying out the data. - * This is for coordination with our timestamp updates - * that don't hold the inode lock. They will always - * update the timestamps BEFORE setting i_update_core, - * so if we clear i_update_core after they set it we - * are guaranteed to see their updates to the timestamps. - * I believe that this depends on strongly ordered memory - * semantics, but we have that. We use the SYNCHRONIZE - * macro to make sure that the compiler does not reorder - * the i_update_core access below the data copy below. - */ - ip->i_update_core = 0; - SYNCHRONIZE(); - - /* - * Make sure to get the latest timestamps from the Linux inode. - */ - xfs_synchronize_times(ip); - if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, @@ -2766,36 +2632,33 @@ xfs_iflush_int( xfs_inobp_check(mp, bp); /* - * We've recorded everything logged in the inode, so we'd - * like to clear the ilf_fields bits so we don't log and - * flush things unnecessarily. However, we can't stop - * logging all this information until the data we've copied - * into the disk buffer is written to disk. If we did we might - * overwrite the copy of the inode in the log with all the - * data after re-logging only part of it, and in the face of - * a crash we wouldn't have all the data we need to recover. + * We've recorded everything logged in the inode, so we'd like to clear + * the ili_fields bits so we don't log and flush things unnecessarily. + * However, we can't stop logging all this information until the data + * we've copied into the disk buffer is written to disk. If we did we + * might overwrite the copy of the inode in the log with all the data + * after re-logging only part of it, and in the face of a crash we + * wouldn't have all the data we need to recover. * - * What we do is move the bits to the ili_last_fields field. - * When logging the inode, these bits are moved back to the - * ilf_fields field. In the xfs_iflush_done() routine we - * clear ili_last_fields, since we know that the information - * those bits represent is permanently on disk. As long as - * the flush completes before the inode is logged again, then - * both ilf_fields and ili_last_fields will be cleared. + * What we do is move the bits to the ili_last_fields field. When + * logging the inode, these bits are moved back to the ili_fields field. + * In the xfs_iflush_done() routine we clear ili_last_fields, since we + * know that the information those bits represent is permanently on + * disk. As long as the flush completes before the inode is logged + * again, then both ili_fields and ili_last_fields will be cleared. * - * We can play with the ilf_fields bits here, because the inode - * lock must be held exclusively in order to set bits there - * and the flush lock protects the ili_last_fields bits. - * Set ili_logged so the flush done - * routine can tell whether or not to look in the AIL. - * Also, store the current LSN of the inode so that we can tell - * whether the item has moved in the AIL from xfs_iflush_done(). - * In order to read the lsn we need the AIL lock, because - * it is a 64 bit value that cannot be read atomically. - */ - if (iip != NULL && iip->ili_format.ilf_fields != 0) { - iip->ili_last_fields = iip->ili_format.ilf_fields; - iip->ili_format.ilf_fields = 0; + * We can play with the ili_fields bits here, because the inode lock + * must be held exclusively in order to set bits there and the flush + * lock protects the ili_last_fields bits. Set ili_logged so the flush + * done routine can tell whether or not to look in the AIL. Also, store + * the current LSN of the inode so that we can tell whether the item has + * moved in the AIL from xfs_iflush_done(). In order to read the lsn we + * need the AIL lock, because it is a 64 bit value that cannot be read + * atomically. + */ + if (iip != NULL && iip->ili_fields != 0) { + iip->ili_last_fields = iip->ili_fields; + iip->ili_fields = 0; iip->ili_logged = 1; xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, @@ -2814,8 +2677,7 @@ xfs_iflush_int( } else { /* * We're flushing an inode which is not in the AIL and has - * not been logged but has i_update_core set. For this - * case we can use a B_DELWRI flush and immediately drop + * not been logged. For this case we can immediately drop * the inode flush lock because we can avoid the whole * AIL state thing. It's OK to drop the flush lock now, * because we've already locked the buffer and to do anything @@ -2835,27 +2697,6 @@ corrupt_out: return XFS_ERROR(EFSCORRUPTED); } -void -xfs_promote_inode( - struct xfs_inode *ip) -{ - struct xfs_buf *bp; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - - bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno, - ip->i_imap.im_len, XBF_TRYLOCK); - if (!bp) - return; - - if (XFS_BUF_ISDELAYWRITE(bp)) { - xfs_buf_delwri_promote(bp); - wake_up_process(ip->i_mount->m_ddev_targp->bt_task); - } - - xfs_buf_relse(bp); -} - /* * Return a pointer to the extent record at file index idx. */ diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index b4cd4739f98..1efff36a75b 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -66,7 +66,6 @@ typedef struct xfs_ifork { struct xfs_btree_block *if_broot; /* file's incore btree root */ short if_broot_bytes; /* bytes allocated for root */ unsigned char if_flags; /* per-fork flags */ - unsigned char if_ext_max; /* max # of extent records */ union { xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ @@ -206,12 +205,12 @@ typedef struct xfs_icdinode { ((w) == XFS_DATA_FORK ? \ ((ip)->i_d.di_nextents = (n)) : \ ((ip)->i_d.di_anextents = (n))) - +#define XFS_IFORK_MAXEXT(ip, w) \ + (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) #ifdef __KERNEL__ -struct bhv_desc; struct xfs_buf; struct xfs_bmap_free; struct xfs_bmbt_irec; @@ -220,12 +219,6 @@ struct xfs_mount; struct xfs_trans; struct xfs_dquot; -typedef struct dm_attrs_s { - __uint32_t da_dmevmask; /* DMIG event mask */ - __uint16_t da_dmstate; /* DMIG state info */ - __uint16_t da_pad; /* DMIG extra padding */ -} dm_attrs_t; - typedef struct xfs_inode { /* Inode linking and identification information. */ struct xfs_mount *i_mount; /* fs mount struct ptr */ @@ -244,27 +237,18 @@ typedef struct xfs_inode { struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ mrlock_t i_iolock; /* inode IO lock */ - struct completion i_flush; /* inode flush completion q */ atomic_t i_pincount; /* inode pin count */ - wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ spinlock_t i_flags_lock; /* inode i_flags lock */ /* Miscellaneous state. */ - unsigned short i_flags; /* see defined flags below */ - unsigned char i_update_core; /* timestamps/size is dirty */ + unsigned long i_flags; /* see defined flags below */ unsigned int i_delayed_blks; /* count of delay alloc blks */ xfs_icdinode_t i_d; /* most of ondisk inode */ - xfs_fsize_t i_size; /* in-memory size */ - xfs_fsize_t i_new_size; /* size when write completes */ - /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ } xfs_inode_t; -#define XFS_ISIZE(ip) S_ISREG((ip)->i_d.di_mode) ? \ - (ip)->i_size : (ip)->i_d.di_size; - /* Convert from vfs inode to xfs inode */ static inline struct xfs_inode *XFS_I(struct inode *inode) { @@ -278,6 +262,32 @@ static inline struct inode *VFS_I(struct xfs_inode *ip) } /* + * For regular files we only update the on-disk filesize when actually + * writing data back to disk. Until then only the copy in the VFS inode + * is uptodate. + */ +static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip) +{ + if (S_ISREG(ip->i_d.di_mode)) + return i_size_read(VFS_I(ip)); + return ip->i_d.di_size; +} + +/* + * If this I/O goes past the on-disk inode size update it unless it would + * be past the current in-core inode size. + */ +static inline xfs_fsize_t +xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size) +{ + xfs_fsize_t i_size = i_size_read(VFS_I(ip)); + + if (new_size > i_size) + new_size = i_size; + return new_size > ip->i_d.di_size ? new_size : 0; +} + +/* * i_flags helper functions */ static inline void @@ -331,6 +341,19 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) return ret; } +static inline int +xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags) +{ + int ret; + + spin_lock(&ip->i_flags_lock); + ret = ip->i_flags & flags; + if (!ret) + ip->i_flags |= flags; + spin_unlock(&ip->i_flags_lock); + return ret; +} + /* * Project quota id helpers (previously projid was 16bit only * and using two 16bit values to hold new 32bit projid was chosen @@ -351,39 +374,24 @@ xfs_set_projid(struct xfs_inode *ip, } /* - * Manage the i_flush queue embedded in the inode. This completion - * queue synchronizes processes attempting to flush the in-core - * inode back to disk. - */ -static inline void xfs_iflock(xfs_inode_t *ip) -{ - wait_for_completion(&ip->i_flush); -} - -static inline int xfs_iflock_nowait(xfs_inode_t *ip) -{ - return try_wait_for_completion(&ip->i_flush); -} - -static inline void xfs_ifunlock(xfs_inode_t *ip) -{ - complete(&ip->i_flush); -} - -/* * In-core inode flags. */ -#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */ -#define XFS_ISTALE 0x0002 /* inode has been staled */ -#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ -#define XFS_INEW 0x0008 /* inode has just been allocated */ -#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ -#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ -#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */ +#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */ +#define XFS_ISTALE (1 << 1) /* inode has been staled */ +#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ +#define XFS_INEW (1 << 3) /* inode has just been allocated */ +#define XFS_IFILESTREAM (1 << 4) /* inode is in a filestream dir. */ +#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ +#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ +#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ +#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT) +#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ +#define XFS_IPINNED (1 << __XFS_IPINNED_BIT) +#define XFS_IDONTCACHE (1 << 9) /* don't cache the inode long term */ /* * Per-lifetime flags need to be reset when re-using a reclaimable inode during - * inode lookup. Thi prevents unintended behaviour on the new inode from + * inode lookup. This prevents unintended behaviour on the new inode from * ocurring. */ #define XFS_IRECLAIM_RESET_FLAGS \ @@ -392,6 +400,34 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) XFS_IFILESTREAM); /* + * Synchronize processes attempting to flush the in-core inode back to disk. + */ + +extern void __xfs_iflock(struct xfs_inode *ip); + +static inline int xfs_iflock_nowait(struct xfs_inode *ip) +{ + return !xfs_iflags_test_and_set(ip, XFS_IFLOCK); +} + +static inline void xfs_iflock(struct xfs_inode *ip) +{ + if (!xfs_iflock_nowait(ip)) + __xfs_iflock(ip); +} + +static inline void xfs_ifunlock(struct xfs_inode *ip) +{ + xfs_iflags_clear(ip, XFS_IFLOCK); + wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); +} + +static inline int xfs_isiflocked(struct xfs_inode *ip) +{ + return xfs_iflags_test(ip, XFS_IFLOCK); +} + +/* * Flags for inode locking. * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) * 1<<16 - 1<<32-1 -- lockdep annotation (integers) @@ -400,7 +436,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) #define XFS_IOLOCK_SHARED (1<<1) #define XFS_ILOCK_EXCL (1<<2) #define XFS_ILOCK_SHARED (1<<3) -#define XFS_IUNLOCK_NONOTIFY (1<<4) #define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED) @@ -409,8 +444,7 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) { XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \ { XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \ { XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \ - { XFS_ILOCK_SHARED, "ILOCK_SHARED" }, \ - { XFS_IUNLOCK_NONOTIFY, "IUNLOCK_NONOTIFY" } + { XFS_ILOCK_SHARED, "ILOCK_SHARED" } /* @@ -481,7 +515,7 @@ void xfs_inode_free(struct xfs_inode *ip); /* * xfs_inode.c prototypes. */ -int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, +int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, struct xfs_buf **, boolean_t *, xfs_inode_t **); @@ -491,20 +525,15 @@ int xfs_ifree(struct xfs_trans *, xfs_inode_t *, struct xfs_bmap_free *); int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, int, xfs_fsize_t); -int xfs_itruncate_data(struct xfs_trans **, struct xfs_inode *, - xfs_fsize_t); int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); void xfs_iext_realloc(xfs_inode_t *, int, int); void xfs_iunpin_wait(xfs_inode_t *); -int xfs_iflush(xfs_inode_t *, uint); -void xfs_promote_inode(struct xfs_inode *); +int xfs_iflush(struct xfs_inode *, struct xfs_buf **); void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); -void xfs_synchronize_times(xfs_inode_t *); -void xfs_mark_inode_dirty(xfs_inode_t *); -void xfs_mark_inode_dirty_sync(xfs_inode_t *); +xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); #define IHOLD(ip) \ do { \ @@ -526,6 +555,7 @@ do { \ */ #define XFS_IGET_CREATE 0x1 #define XFS_IGET_UNTRUSTED 0x2 +#define XFS_IGET_DONTCACHE 0x4 int xfs_inotobp(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, struct xfs_dinode **, diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index abaafdbb3e6..d041d47d9d8 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -57,79 +55,28 @@ xfs_inode_item_size( struct xfs_inode *ip = iip->ili_inode; uint nvecs = 2; - /* - * Only log the data/extents/b-tree root if there is something - * left to log. - */ - iip->ili_format.ilf_fields |= XFS_ILOG_CORE; - switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | - XFS_ILOG_DEV | XFS_ILOG_UUID); - if ((iip->ili_format.ilf_fields & XFS_ILOG_DEXT) && - (ip->i_d.di_nextents > 0) && - (ip->i_df.if_bytes > 0)) { - ASSERT(ip->i_df.if_u1.if_extents != NULL); + if ((iip->ili_fields & XFS_ILOG_DEXT) && + ip->i_d.di_nextents > 0 && + ip->i_df.if_bytes > 0) nvecs++; - } else { - iip->ili_format.ilf_fields &= ~XFS_ILOG_DEXT; - } break; case XFS_DINODE_FMT_BTREE: - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | - XFS_ILOG_DEV | XFS_ILOG_UUID); - if ((iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) && - (ip->i_df.if_broot_bytes > 0)) { - ASSERT(ip->i_df.if_broot != NULL); + if ((iip->ili_fields & XFS_ILOG_DBROOT) && + ip->i_df.if_broot_bytes > 0) nvecs++; - } else { - ASSERT(!(iip->ili_format.ilf_fields & - XFS_ILOG_DBROOT)); -#ifdef XFS_TRANS_DEBUG - if (iip->ili_root_size > 0) { - ASSERT(iip->ili_root_size == - ip->i_df.if_broot_bytes); - ASSERT(memcmp(iip->ili_orig_root, - ip->i_df.if_broot, - iip->ili_root_size) == 0); - } else { - ASSERT(ip->i_df.if_broot_bytes == 0); - } -#endif - iip->ili_format.ilf_fields &= ~XFS_ILOG_DBROOT; - } break; case XFS_DINODE_FMT_LOCAL: - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | - XFS_ILOG_DEV | XFS_ILOG_UUID); - if ((iip->ili_format.ilf_fields & XFS_ILOG_DDATA) && - (ip->i_df.if_bytes > 0)) { - ASSERT(ip->i_df.if_u1.if_data != NULL); - ASSERT(ip->i_d.di_size > 0); + if ((iip->ili_fields & XFS_ILOG_DDATA) && + ip->i_df.if_bytes > 0) nvecs++; - } else { - iip->ili_format.ilf_fields &= ~XFS_ILOG_DDATA; - } break; case XFS_DINODE_FMT_DEV: - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | - XFS_ILOG_DEXT | XFS_ILOG_UUID); - break; - case XFS_DINODE_FMT_UUID: - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | - XFS_ILOG_DEXT | XFS_ILOG_DEV); break; default: @@ -137,56 +84,31 @@ xfs_inode_item_size( break; } - /* - * If there are no attributes associated with this file, - * then there cannot be anything more to log. - * Clear all attribute-related log flags. - */ - if (!XFS_IFORK_Q(ip)) { - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); + if (!XFS_IFORK_Q(ip)) return nvecs; - } + /* * Log any necessary attribute data. */ switch (ip->i_d.di_aformat) { case XFS_DINODE_FMT_EXTENTS: - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT); - if ((iip->ili_format.ilf_fields & XFS_ILOG_AEXT) && - (ip->i_d.di_anextents > 0) && - (ip->i_afp->if_bytes > 0)) { - ASSERT(ip->i_afp->if_u1.if_extents != NULL); + if ((iip->ili_fields & XFS_ILOG_AEXT) && + ip->i_d.di_anextents > 0 && + ip->i_afp->if_bytes > 0) nvecs++; - } else { - iip->ili_format.ilf_fields &= ~XFS_ILOG_AEXT; - } break; case XFS_DINODE_FMT_BTREE: - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); - if ((iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) && - (ip->i_afp->if_broot_bytes > 0)) { - ASSERT(ip->i_afp->if_broot != NULL); + if ((iip->ili_fields & XFS_ILOG_ABROOT) && + ip->i_afp->if_broot_bytes > 0) nvecs++; - } else { - iip->ili_format.ilf_fields &= ~XFS_ILOG_ABROOT; - } break; case XFS_DINODE_FMT_LOCAL: - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); - if ((iip->ili_format.ilf_fields & XFS_ILOG_ADATA) && - (ip->i_afp->if_bytes > 0)) { - ASSERT(ip->i_afp->if_u1.if_data != NULL); + if ((iip->ili_fields & XFS_ILOG_ADATA) && + ip->i_afp->if_bytes > 0) nvecs++; - } else { - iip->ili_format.ilf_fields &= ~XFS_ILOG_ADATA; - } break; default: @@ -256,48 +178,11 @@ xfs_inode_item_format( vecp++; nvecs = 1; - /* - * Clear i_update_core if the timestamps (or any other - * non-transactional modification) need flushing/logging - * and we're about to log them with the rest of the core. - * - * This is the same logic as xfs_iflush() but this code can't - * run at the same time as xfs_iflush because we're in commit - * processing here and so we have the inode lock held in - * exclusive mode. Although it doesn't really matter - * for the timestamps if both routines were to grab the - * timestamps or not. That would be ok. - * - * We clear i_update_core before copying out the data. - * This is for coordination with our timestamp updates - * that don't hold the inode lock. They will always - * update the timestamps BEFORE setting i_update_core, - * so if we clear i_update_core after they set it we - * are guaranteed to see their updates to the timestamps - * either here. Likewise, if they set it after we clear it - * here, we'll see it either on the next commit of this - * inode or the next time the inode gets flushed via - * xfs_iflush(). This depends on strongly ordered memory - * semantics, but we have that. We use the SYNCHRONIZE - * macro to make sure that the compiler does not reorder - * the i_update_core access below the data copy below. - */ - if (ip->i_update_core) { - ip->i_update_core = 0; - SYNCHRONIZE(); - } - - /* - * Make sure to get the latest timestamps from the Linux inode. - */ - xfs_synchronize_times(ip); - vecp->i_addr = &ip->i_d; vecp->i_len = sizeof(struct xfs_icdinode); vecp->i_type = XLOG_REG_TYPE_ICORE; vecp++; nvecs++; - iip->ili_format.ilf_fields |= XFS_ILOG_CORE; /* * If this is really an old format inode, then we need to @@ -330,16 +215,17 @@ xfs_inode_item_format( switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_DDATA | XFS_ILOG_DBROOT | - XFS_ILOG_DEV | XFS_ILOG_UUID))); - if (iip->ili_format.ilf_fields & XFS_ILOG_DEXT) { - ASSERT(ip->i_df.if_bytes > 0); + iip->ili_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEV | XFS_ILOG_UUID); + + if ((iip->ili_fields & XFS_ILOG_DEXT) && + ip->i_d.di_nextents > 0 && + ip->i_df.if_bytes > 0) { ASSERT(ip->i_df.if_u1.if_extents != NULL); - ASSERT(ip->i_d.di_nextents > 0); + ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0); ASSERT(iip->ili_extents_buf == NULL); - ASSERT((ip->i_df.if_bytes / - (uint)sizeof(xfs_bmbt_rec_t)) > 0); + #ifdef XFS_NATIVE_HOST if (ip->i_d.di_nextents == ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { @@ -361,15 +247,18 @@ xfs_inode_item_format( iip->ili_format.ilf_dsize = vecp->i_len; vecp++; nvecs++; + } else { + iip->ili_fields &= ~XFS_ILOG_DEXT; } break; case XFS_DINODE_FMT_BTREE: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_DDATA | XFS_ILOG_DEXT | - XFS_ILOG_DEV | XFS_ILOG_UUID))); - if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) { - ASSERT(ip->i_df.if_broot_bytes > 0); + iip->ili_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | + XFS_ILOG_DEV | XFS_ILOG_UUID); + + if ((iip->ili_fields & XFS_ILOG_DBROOT) && + ip->i_df.if_broot_bytes > 0) { ASSERT(ip->i_df.if_broot != NULL); vecp->i_addr = ip->i_df.if_broot; vecp->i_len = ip->i_df.if_broot_bytes; @@ -377,15 +266,30 @@ xfs_inode_item_format( vecp++; nvecs++; iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; + } else { + ASSERT(!(iip->ili_fields & + XFS_ILOG_DBROOT)); +#ifdef XFS_TRANS_DEBUG + if (iip->ili_root_size > 0) { + ASSERT(iip->ili_root_size == + ip->i_df.if_broot_bytes); + ASSERT(memcmp(iip->ili_orig_root, + ip->i_df.if_broot, + iip->ili_root_size) == 0); + } else { + ASSERT(ip->i_df.if_broot_bytes == 0); + } +#endif + iip->ili_fields &= ~XFS_ILOG_DBROOT; } break; case XFS_DINODE_FMT_LOCAL: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | - XFS_ILOG_DEV | XFS_ILOG_UUID))); - if (iip->ili_format.ilf_fields & XFS_ILOG_DDATA) { - ASSERT(ip->i_df.if_bytes > 0); + iip->ili_fields &= + ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | + XFS_ILOG_DEV | XFS_ILOG_UUID); + if ((iip->ili_fields & XFS_ILOG_DDATA) && + ip->i_df.if_bytes > 0) { ASSERT(ip->i_df.if_u1.if_data != NULL); ASSERT(ip->i_d.di_size > 0); @@ -403,24 +307,26 @@ xfs_inode_item_format( vecp++; nvecs++; iip->ili_format.ilf_dsize = (unsigned)data_bytes; + } else { + iip->ili_fields &= ~XFS_ILOG_DDATA; } break; case XFS_DINODE_FMT_DEV: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | - XFS_ILOG_DDATA | XFS_ILOG_UUID))); - if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { + iip->ili_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEXT | XFS_ILOG_UUID); + if (iip->ili_fields & XFS_ILOG_DEV) { iip->ili_format.ilf_u.ilfu_rdev = ip->i_df.if_u2.if_rdev; } break; case XFS_DINODE_FMT_UUID: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | - XFS_ILOG_DDATA | XFS_ILOG_DEV))); - if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { + iip->ili_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEXT | XFS_ILOG_DEV); + if (iip->ili_fields & XFS_ILOG_UUID) { iip->ili_format.ilf_u.ilfu_uuid = ip->i_df.if_u2.if_uuid; } @@ -432,32 +338,25 @@ xfs_inode_item_format( } /* - * If there are no attributes associated with the file, - * then we're done. - * Assert that no attribute-related log flags are set. + * If there are no attributes associated with the file, then we're done. */ if (!XFS_IFORK_Q(ip)) { - ASSERT(nvecs == lip->li_desc->lid_size); - iip->ili_format.ilf_size = nvecs; - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); - return; + iip->ili_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); + goto out; } switch (ip->i_d.di_aformat) { case XFS_DINODE_FMT_EXTENTS: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_ADATA | XFS_ILOG_ABROOT))); - if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) { -#ifdef DEBUG - int nrecs = ip->i_afp->if_bytes / - (uint)sizeof(xfs_bmbt_rec_t); - ASSERT(nrecs > 0); - ASSERT(nrecs == ip->i_d.di_anextents); - ASSERT(ip->i_afp->if_bytes > 0); + iip->ili_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT); + + if ((iip->ili_fields & XFS_ILOG_AEXT) && + ip->i_d.di_anextents > 0 && + ip->i_afp->if_bytes > 0) { + ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) == + ip->i_d.di_anextents); ASSERT(ip->i_afp->if_u1.if_extents != NULL); - ASSERT(ip->i_d.di_anextents > 0); -#endif #ifdef XFS_NATIVE_HOST /* * There are not delayed allocation extents @@ -474,29 +373,36 @@ xfs_inode_item_format( iip->ili_format.ilf_asize = vecp->i_len; vecp++; nvecs++; + } else { + iip->ili_fields &= ~XFS_ILOG_AEXT; } break; case XFS_DINODE_FMT_BTREE: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_ADATA | XFS_ILOG_AEXT))); - if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) { - ASSERT(ip->i_afp->if_broot_bytes > 0); + iip->ili_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); + + if ((iip->ili_fields & XFS_ILOG_ABROOT) && + ip->i_afp->if_broot_bytes > 0) { ASSERT(ip->i_afp->if_broot != NULL); + vecp->i_addr = ip->i_afp->if_broot; vecp->i_len = ip->i_afp->if_broot_bytes; vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT; vecp++; nvecs++; iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; + } else { + iip->ili_fields &= ~XFS_ILOG_ABROOT; } break; case XFS_DINODE_FMT_LOCAL: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); - if (iip->ili_format.ilf_fields & XFS_ILOG_ADATA) { - ASSERT(ip->i_afp->if_bytes > 0); + iip->ili_fields &= + ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); + + if ((iip->ili_fields & XFS_ILOG_ADATA) && + ip->i_afp->if_bytes > 0) { ASSERT(ip->i_afp->if_u1.if_data != NULL); vecp->i_addr = ip->i_afp->if_u1.if_data; @@ -513,6 +419,8 @@ xfs_inode_item_format( vecp++; nvecs++; iip->ili_format.ilf_asize = (unsigned)data_bytes; + } else { + iip->ili_fields &= ~XFS_ILOG_ADATA; } break; @@ -521,7 +429,15 @@ xfs_inode_item_format( break; } - ASSERT(nvecs == lip->li_desc->lid_size); +out: + /* + * Now update the log format that goes out to disk from the in-core + * values. We always write the inode core to make the arithmetic + * games in recovery easier, which isn't a big deal as just about any + * transaction would dirty it anyway. + */ + iip->ili_format.ilf_fields = XFS_ILOG_CORE | + (iip->ili_fields & ~XFS_ILOG_TIMESTAMP); iip->ili_format.ilf_size = nvecs; } @@ -559,28 +475,19 @@ xfs_inode_item_unpin( trace_xfs_inode_unpin(ip, _RET_IP_); ASSERT(atomic_read(&ip->i_pincount) > 0); if (atomic_dec_and_test(&ip->i_pincount)) - wake_up(&ip->i_ipin_wait); + wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); } -/* - * This is called to attempt to lock the inode associated with this - * inode log item, in preparation for the push routine which does the actual - * iflush. Don't sleep on the inode lock or the flush lock. - * - * If the flush lock is already held, indicating that the inode has - * been or is in the process of being flushed, then (ideally) we'd like to - * see if the inode's buffer is still incore, and if so give it a nudge. - * We delay doing so until the pushbuf routine, though, to avoid holding - * the AIL lock across a call to the blackhole which is the buffer cache. - * Also we don't want to sleep in any device strategy routines, which can happen - * if we do the subsequent bawrite in here. - */ STATIC uint -xfs_inode_item_trylock( - struct xfs_log_item *lip) +xfs_inode_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; + struct xfs_buf *bp = NULL; + uint rval = XFS_ITEM_SUCCESS; + int error; if (xfs_ipincount(ip) > 0) return XFS_ITEM_PINNED; @@ -588,34 +495,49 @@ xfs_inode_item_trylock( if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) return XFS_ITEM_LOCKED; - if (!xfs_iflock_nowait(ip)) { - /* - * inode has already been flushed to the backing buffer, - * leave it locked in shared mode, pushbuf routine will - * unlock it. - */ - return XFS_ITEM_PUSHBUF; + /* + * Re-check the pincount now that we stabilized the value by + * taking the ilock. + */ + if (xfs_ipincount(ip) > 0) { + rval = XFS_ITEM_PINNED; + goto out_unlock; } - /* Stale items should force out the iclog */ + /* + * Stale inode items should force out the iclog. + */ if (ip->i_flags & XFS_ISTALE) { - xfs_ifunlock(ip); - /* - * we hold the AIL lock - notify the unlock routine of this - * so it doesn't try to get the lock again. - */ - xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY); - return XFS_ITEM_PINNED; + rval = XFS_ITEM_PINNED; + goto out_unlock; } -#ifdef DEBUG - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - ASSERT(iip->ili_format.ilf_fields != 0); - ASSERT(iip->ili_logged == 0); - ASSERT(lip->li_flags & XFS_LI_IN_AIL); + /* + * Someone else is already flushing the inode. Nothing we can do + * here but wait for the flush to finish and remove the item from + * the AIL. + */ + if (!xfs_iflock_nowait(ip)) { + rval = XFS_ITEM_FLUSHING; + goto out_unlock; } -#endif - return XFS_ITEM_SUCCESS; + + ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); + ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); + + spin_unlock(&lip->li_ailp->xa_lock); + + error = xfs_iflush(ip, &bp); + if (!error) { + if (!xfs_buf_delwri_queue(bp, buffer_list)) + rval = XFS_ITEM_FLUSHING; + xfs_buf_relse(bp); + } + + spin_lock(&lip->li_ailp->xa_lock); +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return rval; } /* @@ -642,7 +564,7 @@ xfs_inode_item_unlock( if (iip->ili_extents_buf != NULL) { ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS); ASSERT(ip->i_d.di_nextents > 0); - ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT); + ASSERT(iip->ili_fields & XFS_ILOG_DEXT); ASSERT(ip->i_df.if_bytes > 0); kmem_free(iip->ili_extents_buf); iip->ili_extents_buf = NULL; @@ -650,7 +572,7 @@ xfs_inode_item_unlock( if (iip->ili_aextents_buf != NULL) { ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS); ASSERT(ip->i_d.di_anextents > 0); - ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT); + ASSERT(iip->ili_fields & XFS_ILOG_AEXT); ASSERT(ip->i_afp->if_bytes > 0); kmem_free(iip->ili_aextents_buf); iip->ili_aextents_buf = NULL; @@ -700,87 +622,6 @@ xfs_inode_item_committed( } /* - * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK - * failed to get the inode flush lock but did get the inode locked SHARED. - * Here we're trying to see if the inode buffer is incore, and if so whether it's - * marked delayed write. If that's the case, we'll promote it and that will - * allow the caller to write the buffer by triggering the xfsbufd to run. - */ -STATIC bool -xfs_inode_item_pushbuf( - struct xfs_log_item *lip) -{ - struct xfs_inode_log_item *iip = INODE_ITEM(lip); - struct xfs_inode *ip = iip->ili_inode; - struct xfs_buf *bp; - bool ret = true; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); - - /* - * If a flush is not in progress anymore, chances are that the - * inode was taken off the AIL. So, just get out. - */ - if (completion_done(&ip->i_flush) || - !(lip->li_flags & XFS_LI_IN_AIL)) { - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return true; - } - - bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno, - iip->ili_format.ilf_len, XBF_TRYLOCK); - - xfs_iunlock(ip, XFS_ILOCK_SHARED); - if (!bp) - return true; - if (XFS_BUF_ISDELAYWRITE(bp)) - xfs_buf_delwri_promote(bp); - if (xfs_buf_ispinned(bp)) - ret = false; - xfs_buf_relse(bp); - return ret; -} - -/* - * This is called to asynchronously write the inode associated with this - * inode log item out to disk. The inode will already have been locked by - * a successful call to xfs_inode_item_trylock(). - */ -STATIC void -xfs_inode_item_push( - struct xfs_log_item *lip) -{ - struct xfs_inode_log_item *iip = INODE_ITEM(lip); - struct xfs_inode *ip = iip->ili_inode; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); - ASSERT(!completion_done(&ip->i_flush)); - - /* - * Since we were able to lock the inode's flush lock and - * we found it on the AIL, the inode must be dirty. This - * is because the inode is removed from the AIL while still - * holding the flush lock in xfs_iflush_done(). Thus, if - * we found it in the AIL and were able to obtain the flush - * lock without sleeping, then there must not have been - * anyone in the process of flushing the inode. - */ - ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || - iip->ili_format.ilf_fields != 0); - - /* - * Push the inode to it's backing buffer. This will not remove the - * inode from the AIL - a further push will be required to trigger a - * buffer push. However, this allows all the dirty inodes to be pushed - * to the buffer before it is pushed to disk. The buffer IO completion - * will pull the inode from the AIL, mark it clean and unlock the flush - * lock. - */ - (void) xfs_iflush(ip, SYNC_TRYLOCK); - xfs_iunlock(ip, XFS_ILOCK_SHARED); -} - -/* * XXX rcc - this one really has to do something. Probably needs * to stamp in a new field in the incore inode. */ @@ -800,11 +641,9 @@ static const struct xfs_item_ops xfs_inode_item_ops = { .iop_format = xfs_inode_item_format, .iop_pin = xfs_inode_item_pin, .iop_unpin = xfs_inode_item_unpin, - .iop_trylock = xfs_inode_item_trylock, .iop_unlock = xfs_inode_item_unlock, .iop_committed = xfs_inode_item_committed, .iop_push = xfs_inode_item_push, - .iop_pushbuf = xfs_inode_item_pushbuf, .iop_committing = xfs_inode_item_committing }; @@ -935,7 +774,8 @@ xfs_iflush_done( ASSERT(i <= need_ail); } /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ - xfs_trans_ail_delete_bulk(ailp, log_items, i); + xfs_trans_ail_delete_bulk(ailp, log_items, i, + SHUTDOWN_CORRUPT_INCORE); } @@ -956,16 +796,15 @@ xfs_iflush_done( } /* - * This is the inode flushing abort routine. It is called - * from xfs_iflush when the filesystem is shutting down to clean - * up the inode state. - * It is responsible for removing the inode item - * from the AIL if it has not been re-logged, and unlocking the inode's - * flush lock. + * This is the inode flushing abort routine. It is called from xfs_iflush when + * the filesystem is shutting down to clean up the inode state. It is + * responsible for removing the inode item from the AIL if it has not been + * re-logged, and unlocking the inode's flush lock. */ void xfs_iflush_abort( - xfs_inode_t *ip) + xfs_inode_t *ip, + bool stale) { xfs_inode_log_item_t *iip = ip->i_itemp; @@ -975,7 +814,10 @@ xfs_iflush_abort( spin_lock(&ailp->xa_lock); if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, (xfs_log_item_t *)iip); + xfs_trans_ail_delete(ailp, &iip->ili_item, + stale ? + SHUTDOWN_LOG_IO_ERROR : + SHUTDOWN_CORRUPT_INCORE); } else spin_unlock(&ailp->xa_lock); } @@ -989,7 +831,7 @@ xfs_iflush_abort( * Clear the inode logging fields so no more flushes are * attempted. */ - iip->ili_format.ilf_fields = 0; + iip->ili_fields = 0; } /* * Release the inode's flush lock since we're done with it. @@ -1002,7 +844,7 @@ xfs_istale_done( struct xfs_buf *bp, struct xfs_log_item *lip) { - xfs_iflush_abort(INODE_ITEM(lip)->ili_inode); + xfs_iflush_abort(INODE_ITEM(lip)->ili_inode, true); } /* diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index d3dee61e6d9..376d4d0b263 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -86,6 +86,15 @@ typedef struct xfs_inode_log_format_64 { #define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */ #define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */ + +/* + * The timestamps are dirty, but not necessarily anything else in the inode + * core. Unlike the other fields above this one must never make it to disk + * in the ilf_fields of the inode_log_format, but is purely store in-memory in + * ili_fields in the inode_log_item. + */ +#define XFS_ILOG_TIMESTAMP 0x4000 + #define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ XFS_ILOG_UUID | XFS_ILOG_ADATA | \ @@ -101,7 +110,7 @@ typedef struct xfs_inode_log_format_64 { XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \ XFS_ILOG_DEV | XFS_ILOG_UUID | \ XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ - XFS_ILOG_ABROOT) + XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP) static inline int xfs_ilog_fbroot(int w) { @@ -134,6 +143,7 @@ typedef struct xfs_inode_log_item { unsigned short ili_lock_flags; /* lock flags */ unsigned short ili_logged; /* flushed logged data */ unsigned int ili_last_fields; /* fields when flushed */ + unsigned int ili_fields; /* fields to be logged */ struct xfs_bmbt_rec *ili_extents_buf; /* array of logged data exts */ struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged @@ -148,16 +158,14 @@ typedef struct xfs_inode_log_item { static inline int xfs_inode_clean(xfs_inode_t *ip) { - return (!ip->i_itemp || - !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) && - !ip->i_update_core; + return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL); } extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *); extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *); -extern void xfs_iflush_abort(struct xfs_inode *); +extern void xfs_iflush_abort(struct xfs_inode *, bool); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, xfs_inode_log_format_t *); diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h index b253c0ea5be..90efdaf1706 100644 --- a/fs/xfs/xfs_inum.h +++ b/fs/xfs/xfs_inum.h @@ -26,11 +26,6 @@ * high agno_log-agblklog-inopblog bits - 0 */ -typedef __uint32_t xfs_agino_t; /* within allocation grp inode number */ - -#define NULLFSINO ((xfs_ino_t)-1) -#define NULLAGINO ((xfs_agino_t)-1) - struct xfs_mount; #define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index d99a9051890..3a05a41b5d7 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -17,9 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -209,6 +207,7 @@ xfs_open_by_handle( struct file *filp; struct inode *inode; struct dentry *dentry; + fmode_t fmode; if (!capable(CAP_SYS_ADMIN)) return -XFS_ERROR(EPERM); @@ -228,26 +227,21 @@ xfs_open_by_handle( hreq->oflags |= O_LARGEFILE; #endif - /* Put open permission in namei format. */ permflag = hreq->oflags; - if ((permflag+1) & O_ACCMODE) - permflag++; - if (permflag & O_TRUNC) - permflag |= 2; - + fmode = OPEN_FMODE(permflag); if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && - (permflag & FMODE_WRITE) && IS_APPEND(inode)) { + (fmode & FMODE_WRITE) && IS_APPEND(inode)) { error = -XFS_ERROR(EPERM); goto out_dput; } - if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) { + if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) { error = -XFS_ERROR(EACCES); goto out_dput; } /* Can't write directories. */ - if (S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) { + if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) { error = -XFS_ERROR(EISDIR); goto out_dput; } @@ -450,9 +444,12 @@ xfs_attrmulti_attr_get( if (*len > XATTR_SIZE_MAX) return EINVAL; - kbuf = kmalloc(*len, GFP_KERNEL); - if (!kbuf) - return ENOMEM; + kbuf = kmem_zalloc(*len, KM_SLEEP | KM_MAYFAIL); + if (!kbuf) { + kbuf = kmem_zalloc_large(*len); + if (!kbuf) + return ENOMEM; + } error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); if (error) @@ -462,7 +459,10 @@ xfs_attrmulti_attr_get( error = EFAULT; out_kfree: - kfree(kbuf); + if (is_vmalloc_addr(kbuf)) + kmem_free_large(kbuf); + else + kmem_free(kbuf); return error; } @@ -559,23 +559,23 @@ xfs_attrmulti_by_handle( ops[i].am_flags); break; case ATTR_OP_SET: - ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + ops[i].am_error = mnt_want_write_file(parfilp); if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_set( dentry->d_inode, attr_name, ops[i].am_attrvalue, ops[i].am_length, ops[i].am_flags); - mnt_drop_write(parfilp->f_path.mnt); + mnt_drop_write_file(parfilp); break; case ATTR_OP_REMOVE: - ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + ops[i].am_error = mnt_want_write_file(parfilp); if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_remove( dentry->d_inode, attr_name, ops[i].am_flags); - mnt_drop_write(parfilp->f_path.mnt); + mnt_drop_write_file(parfilp); break; default: ops[i].am_error = EINVAL; diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 54e623bfbb8..c4f2da0d2bf 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -22,9 +22,7 @@ #include <asm/uaccess.h> #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -293,7 +291,7 @@ xfs_compat_ioc_bulkstat( int res; error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer, - sizeof(compat_xfs_bstat_t), 0, &res); + sizeof(compat_xfs_bstat_t), NULL, &res); } else if (cmd == XFS_IOC_FSBULKSTAT_32) { error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t), @@ -454,23 +452,23 @@ xfs_compat_attrmulti_by_handle( &ops[i].am_length, ops[i].am_flags); break; case ATTR_OP_SET: - ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + ops[i].am_error = mnt_want_write_file(parfilp); if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_set( dentry->d_inode, attr_name, compat_ptr(ops[i].am_attrvalue), ops[i].am_length, ops[i].am_flags); - mnt_drop_write(parfilp->f_path.mnt); + mnt_drop_write_file(parfilp); break; case ATTR_OP_REMOVE: - ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + ops[i].am_error = mnt_want_write_file(parfilp); if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_remove( dentry->d_inode, attr_name, ops[i].am_flags); - mnt_drop_write(parfilp->f_path.mnt); + mnt_drop_write_file(parfilp); break; default: ops[i].am_error = EINVAL; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 9afa282aa93..aadfce6681e 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -17,9 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -31,12 +29,12 @@ #include "xfs_ialloc_btree.h" #include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_inode_item.h" #include "xfs_btree.h" #include "xfs_bmap.h" #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_itable.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_space.h" @@ -57,26 +55,26 @@ xfs_iomap_eof_align_last_fsb( xfs_fileoff_t *last_fsb) { xfs_fileoff_t new_last_fsb = 0; - xfs_extlen_t align; + xfs_extlen_t align = 0; int eof, error; - if (XFS_IS_REALTIME_INODE(ip)) - ; - /* - * If mounted with the "-o swalloc" option, roundup the allocation - * request to a stripe width boundary if the file size is >= - * stripe width and we are allocating past the allocation eof. - */ - else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) && - (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth))) - new_last_fsb = roundup_64(*last_fsb, mp->m_swidth); - /* - * Roundup the allocation request to a stripe unit (m_dalign) boundary - * if the file size is >= stripe unit size, and we are allocating past - * the allocation eof. - */ - else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign))) - new_last_fsb = roundup_64(*last_fsb, mp->m_dalign); + if (!XFS_IS_REALTIME_INODE(ip)) { + /* + * Round up the allocation request to a stripe unit + * (m_dalign) boundary if the file size is >= stripe unit + * size, and we are allocating past the allocation eof. + * + * If mounted with the "-o swalloc" option the alignment is + * increased from the strip unit size to the stripe width. + */ + if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) + align = mp->m_swidth; + else if (mp->m_dalign) + align = mp->m_dalign; + + if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align)) + new_last_fsb = roundup_64(*last_fsb, align); + } /* * Always round up the allocation request to an extent boundary @@ -141,11 +139,7 @@ xfs_iomap_write_direct( int committed; int error; - /* - * Make sure that the dquots are there. This doesn't hold - * the ilock across a disk read. - */ - error = xfs_qm_dqattach_locked(ip, 0); + error = xfs_qm_dqattach(ip, 0); if (error) return XFS_ERROR(error); @@ -154,10 +148,10 @@ xfs_iomap_write_direct( offset_fsb = XFS_B_TO_FSBT(mp, offset); last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); - if ((offset + count) > ip->i_size) { + if ((offset + count) > XFS_ISIZE(ip)) { error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); if (error) - goto error_out; + return XFS_ERROR(error); } else { if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) last_fsb = MIN(last_fsb, (xfs_fileoff_t) @@ -189,7 +183,6 @@ xfs_iomap_write_direct( /* * Allocate and setup the transaction */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); error = xfs_trans_reserve(tp, resblks, XFS_WRITE_LOG_RES(mp), resrtextents, @@ -198,20 +191,21 @@ xfs_iomap_write_direct( /* * Check for running out of space, note: need lock to return */ - if (error) + if (error) { xfs_trans_cancel(tp, 0); + return XFS_ERROR(error); + } + xfs_ilock(ip, XFS_ILOCK_EXCL); - if (error) - goto error_out; error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); if (error) - goto error1; + goto out_trans_cancel; xfs_trans_ijoin(tp, ip, 0); bmapi_flag = 0; - if (offset < ip->i_size || extsz) + if (offset < XFS_ISIZE(ip) || extsz) bmapi_flag |= XFS_BMAPI_PREALLOC; /* @@ -223,42 +217,39 @@ xfs_iomap_write_direct( error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag, &firstfsb, 0, imap, &nimaps, &free_list); if (error) - goto error0; + goto out_bmap_cancel; /* * Complete the transaction */ error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) - goto error0; + goto out_bmap_cancel; error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); if (error) - goto error_out; + goto out_unlock; /* * Copy any maps to caller's array and return any error. */ if (nimaps == 0) { - error = ENOSPC; - goto error_out; + error = XFS_ERROR(ENOSPC); + goto out_unlock; } - if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) { + if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) error = xfs_alert_fsblock_zero(ip, imap); - goto error_out; - } - return 0; +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; -error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ +out_bmap_cancel: xfs_bmap_cancel(&free_list); - xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); - -error1: /* Just cancel transaction */ + xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); +out_trans_cancel: xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - -error_out: - return XFS_ERROR(error); + goto out_unlock; } /* @@ -286,7 +277,7 @@ xfs_iomap_eof_want_preallocate( int found_delalloc = 0; *prealloc = 0; - if ((offset + count) <= ip->i_size) + if (offset + count <= XFS_ISIZE(ip)) return 0; /* @@ -340,7 +331,7 @@ xfs_iomap_prealloc_size( * if we pass in alloc_blocks = 0. Hence the "+ 1" to * ensure we always pass in a non-zero value. */ - alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1; + alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1; alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, rounddown_pow_of_two(alloc_blocks)); @@ -421,6 +412,15 @@ retry: return error; } + /* + * Make sure preallocation does not create extents beyond the range we + * actually support in this filesystem. + */ + if (last_fsb > XFS_B_TO_FSB(mp, mp->m_maxioffset)) + last_fsb = XFS_B_TO_FSB(mp, mp->m_maxioffset); + + ASSERT(last_fsb > offset_fsb); + nimaps = XFS_WRITE_IMAPS; error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb, imap, &nimaps, XFS_BMAPI_ENTIRE); @@ -564,7 +564,7 @@ xfs_iomap_write_allocate( * back.... */ nimaps = 1; - end_fsb = XFS_B_TO_FSB(mp, ip->i_size); + end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); error = xfs_bmap_last_offset(NULL, ip, &last_block, XFS_DATA_FORK); if (error) @@ -645,6 +645,7 @@ xfs_iomap_write_unwritten( xfs_trans_t *tp; xfs_bmbt_irec_t imap; xfs_bmap_free_t free_list; + xfs_fsize_t i_size; uint resblks; int committed; int error; @@ -705,7 +706,22 @@ xfs_iomap_write_unwritten( if (error) goto error_on_bmapi_transaction; - error = xfs_bmap_finish(&(tp), &(free_list), &committed); + /* + * Log the updated inode size as we go. We have to be careful + * to only log it up to the actual write offset if it is + * halfway into a block. + */ + i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb); + if (i_size > offset + count) + i_size = offset + count; + + i_size = xfs_new_eof(ip, i_size); + if (i_size) { + ip->i_d.di_size = i_size; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + } + + error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) goto error_on_bmapi_transaction; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 23ce927973a..1a25fd80279 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_acl.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -34,7 +32,6 @@ #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_itable.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_utils.h" @@ -50,65 +47,15 @@ #include <linux/fiemap.h> #include <linux/slab.h> -/* - * Bring the timestamps in the XFS inode uptodate. - * - * Used before writing the inode to disk. - */ -void -xfs_synchronize_times( - xfs_inode_t *ip) -{ - struct inode *inode = VFS_I(ip); - - ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; - ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; - ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec; - ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec; - ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec; -} - -/* - * If the linux inode is valid, mark it dirty, else mark the dirty state - * in the XFS inode to make sure we pick it up when reclaiming the inode. - */ -void -xfs_mark_inode_dirty_sync( - xfs_inode_t *ip) -{ - struct inode *inode = VFS_I(ip); - - if (!(inode->i_state & (I_WILL_FREE|I_FREEING))) - mark_inode_dirty_sync(inode); - else { - barrier(); - ip->i_update_core = 1; - } -} - -void -xfs_mark_inode_dirty( - xfs_inode_t *ip) -{ - struct inode *inode = VFS_I(ip); - - if (!(inode->i_state & (I_WILL_FREE|I_FREEING))) - mark_inode_dirty(inode); - else { - barrier(); - ip->i_update_core = 1; - } - -} - - -int xfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *fs_info) +static int +xfs_initxattrs( + struct inode *inode, + const struct xattr *xattr_array, + void *fs_info) { - const struct xattr *xattr; - struct xfs_inode *ip = XFS_I(inode); - int error = 0; + const struct xattr *xattr; + struct xfs_inode *ip = XFS_I(inode); + int error = 0; for (xattr = xattr_array; xattr->name != NULL; xattr++) { error = xfs_attr_set(ip, xattr->name, xattr->value, @@ -168,7 +115,7 @@ STATIC int xfs_vn_mknod( struct inode *dir, struct dentry *dentry, - int mode, + umode_t mode, dev_t rdev) { struct inode *inode; @@ -231,7 +178,7 @@ STATIC int xfs_vn_create( struct inode *dir, struct dentry *dentry, - int mode, + umode_t mode, struct nameidata *nd) { return xfs_vn_mknod(dir, dentry, mode, 0); @@ -241,7 +188,7 @@ STATIC int xfs_vn_mkdir( struct inode *dir, struct dentry *dentry, - int mode) + umode_t mode) { return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0); } @@ -366,7 +313,7 @@ xfs_vn_symlink( struct xfs_inode *cip = NULL; struct xfs_name name; int error; - mode_t mode; + umode_t mode; mode = S_IFLNK | (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); @@ -678,19 +625,16 @@ xfs_setattr_nonsize( inode->i_atime = iattr->ia_atime; ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; - ip->i_update_core = 1; } if (mask & ATTR_CTIME) { inode->i_ctime = iattr->ia_ctime; ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; - ip->i_update_core = 1; } if (mask & ATTR_MTIME) { inode->i_mtime = iattr->ia_mtime; ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; - ip->i_update_core = 1; } xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -750,9 +694,10 @@ xfs_setattr_size( struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); int mask = iattr->ia_valid; + xfs_off_t oldsize, newsize; struct xfs_trans *tp; int error; - uint lock_flags; + uint lock_flags = 0; uint commit_flags = 0; trace_xfs_setattr(ip); @@ -772,16 +717,18 @@ xfs_setattr_size( ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID| ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); - lock_flags = XFS_ILOCK_EXCL; - if (!(flags & XFS_ATTR_NOLOCK)) + if (!(flags & XFS_ATTR_NOLOCK)) { lock_flags |= XFS_IOLOCK_EXCL; - xfs_ilock(ip, lock_flags); + xfs_ilock(ip, lock_flags); + } + + oldsize = inode->i_size; + newsize = iattr->ia_size; /* * Short circuit the truncate case for zero length files. */ - if (iattr->ia_size == 0 && - ip->i_size == 0 && ip->i_d.di_nextents == 0) { + if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { if (!(mask & (ATTR_CTIME|ATTR_MTIME))) goto out_unlock; @@ -796,7 +743,7 @@ xfs_setattr_size( /* * Make sure that the dquots are attached to the inode. */ - error = xfs_qm_dqattach_locked(ip, 0); + error = xfs_qm_dqattach(ip, 0); if (error) goto out_unlock; @@ -807,19 +754,17 @@ xfs_setattr_size( * the inode to the transaction, because the inode cannot be unlocked * once it is a part of the transaction. */ - if (iattr->ia_size > ip->i_size) { + if (newsize > oldsize) { /* * Do the first part of growing a file: zero any data in the * last block that is beyond the old EOF. We need to do this * before the inode is joined to the transaction to modify * i_size. */ - error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size); + error = xfs_zero_eof(ip, newsize, oldsize); if (error) goto out_unlock; } - xfs_iunlock(ip, XFS_ILOCK_EXCL); - lock_flags &= ~XFS_ILOCK_EXCL; /* * We are going to log the inode size change in this transaction so @@ -833,8 +778,8 @@ xfs_setattr_size( * here and prevents waiting for other data not within the range we * care about here. */ - if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) { - error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0, + if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { + error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0, FI_NONE); if (error) goto out_unlock; @@ -845,8 +790,7 @@ xfs_setattr_size( */ inode_dio_wait(inode); - error = -block_truncate_page(inode->i_mapping, iattr->ia_size, - xfs_get_blocks); + error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); if (error) goto out_unlock; @@ -857,7 +801,7 @@ xfs_setattr_size( if (error) goto out_trans_cancel; - truncate_setsize(inode, iattr->ia_size); + truncate_setsize(inode, newsize); commit_flags = XFS_TRANS_RELEASE_LOG_RES; lock_flags |= XFS_ILOCK_EXCL; @@ -876,19 +820,29 @@ xfs_setattr_size( * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ - if (iattr->ia_size != ip->i_size && - (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { + if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { iattr->ia_ctime = iattr->ia_mtime = current_fs_time(inode->i_sb); mask |= ATTR_CTIME | ATTR_MTIME; } - if (iattr->ia_size > ip->i_size) { - ip->i_d.di_size = iattr->ia_size; - ip->i_size = iattr->ia_size; - } else if (iattr->ia_size <= ip->i_size || - (iattr->ia_size == 0 && ip->i_d.di_nextents)) { - error = xfs_itruncate_data(&tp, ip, iattr->ia_size); + /* + * The first thing we do is set the size to new_size permanently on + * disk. This way we don't have to worry about anyone ever being able + * to look at the data being freed even in the face of a crash. + * What we're getting around here is the case where we free a block, it + * is allocated to another file, it is written to, and then we crash. + * If the new data gets written to the file but the log buffers + * containing the free and reallocation don't, then we'd end up with + * garbage in the blocks being freed. As long as we make the new size + * permanent before actually freeing any blocks it doesn't matter if + * they get written to. + */ + ip->i_d.di_size = newsize; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + if (newsize <= oldsize) { + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize); if (error) goto out_trans_abort; @@ -906,13 +860,11 @@ xfs_setattr_size( inode->i_ctime = iattr->ia_ctime; ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; - ip->i_update_core = 1; } if (mask & ATTR_MTIME) { inode->i_mtime = iattr->ia_mtime; ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; - ip->i_update_core = 1; } xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 751e94fe1f7..eff577a9b67 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -18,7 +18,6 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" @@ -62,7 +61,6 @@ xfs_bulkstat_one_int( { struct xfs_icdinode *dic; /* dinode core info pointer */ struct xfs_inode *ip; /* incore inode pointer */ - struct inode *inode; struct xfs_bstat *buf; /* return buffer */ int error = 0; /* error value */ @@ -76,7 +74,8 @@ xfs_bulkstat_one_int( return XFS_ERROR(ENOMEM); error = xfs_iget(mp, NULL, ino, - XFS_IGET_UNTRUSTED, XFS_ILOCK_SHARED, &ip); + (XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED), + XFS_ILOCK_SHARED, &ip); if (error) { *stat = BULKSTAT_RV_NOTHING; goto out_free; @@ -86,7 +85,6 @@ xfs_bulkstat_one_int( ASSERT(ip->i_imap.im_blkno != 0); dic = &ip->i_d; - inode = VFS_I(ip); /* xfs_iget returns the following without needing * further change. @@ -99,19 +97,12 @@ xfs_bulkstat_one_int( buf->bs_uid = dic->di_uid; buf->bs_gid = dic->di_gid; buf->bs_size = dic->di_size; - - /* - * We need to read the timestamps from the Linux inode because - * the VFS keeps writing directly into the inode structure instead - * of telling us about the updates. - */ - buf->bs_atime.tv_sec = inode->i_atime.tv_sec; - buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec; - buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec; - buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec; - buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec; - buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec; - + buf->bs_atime.tv_sec = dic->di_atime.t_sec; + buf->bs_atime.tv_nsec = dic->di_atime.t_nsec; + buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; + buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; + buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; + buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec; buf->bs_xflags = xfs_ip2xflags(ip); buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog; buf->bs_extents = dic->di_nextents; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 34817adf4b9..d90d4a38860 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -35,19 +33,26 @@ #include "xfs_trans_priv.h" #include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_rw.h" #include "xfs_trace.h" kmem_zone_t *xfs_log_ticket_zone; /* Local miscellaneous function prototypes */ -STATIC int xlog_commit_record(struct log *log, struct xlog_ticket *ticket, - xlog_in_core_t **, xfs_lsn_t *); +STATIC int +xlog_commit_record( + struct xlog *log, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, + xfs_lsn_t *commitlsnp); + STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, xfs_buftarg_t *log_target, xfs_daddr_t blk_offset, int num_bblks); -STATIC int xlog_space_left(struct log *log, atomic64_t *head); +STATIC int +xlog_space_left( + struct xlog *log, + atomic64_t *head); STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); STATIC void xlog_dealloc_log(xlog_t *log); @@ -67,21 +72,20 @@ STATIC void xlog_state_switch_iclogs(xlog_t *log, int eventual_size); STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); -/* local functions to manipulate grant head */ -STATIC int xlog_grant_log_space(xlog_t *log, - xlog_ticket_t *xtic); -STATIC void xlog_grant_push_ail(struct log *log, - int need_bytes); +STATIC void +xlog_grant_push_ail( + struct xlog *log, + int need_bytes); STATIC void xlog_regrant_reserve_log_space(xlog_t *log, xlog_ticket_t *ticket); -STATIC int xlog_regrant_write_log_space(xlog_t *log, - xlog_ticket_t *ticket); STATIC void xlog_ungrant_log_space(xlog_t *log, xlog_ticket_t *ticket); #if defined(DEBUG) STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); -STATIC void xlog_verify_grant_tail(struct log *log); +STATIC void +xlog_verify_grant_tail( + struct xlog *log); STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, int count, boolean_t syncing); STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, @@ -97,9 +101,9 @@ STATIC int xlog_iclogs_empty(xlog_t *log); static void xlog_grant_sub_space( - struct log *log, - atomic64_t *head, - int bytes) + struct xlog *log, + atomic64_t *head, + int bytes) { int64_t head_val = atomic64_read(head); int64_t new, old; @@ -123,9 +127,9 @@ xlog_grant_sub_space( static void xlog_grant_add_space( - struct log *log, - atomic64_t *head, - int bytes) + struct xlog *log, + atomic64_t *head, + int bytes) { int64_t head_val = atomic64_read(head); int64_t new, old; @@ -150,78 +154,93 @@ xlog_grant_add_space( } while (head_val != old); } -STATIC bool -xlog_reserveq_wake( - struct log *log, - int *free_bytes) +STATIC void +xlog_grant_head_init( + struct xlog_grant_head *head) +{ + xlog_assign_grant_head(&head->grant, 1, 0); + INIT_LIST_HEAD(&head->waiters); + spin_lock_init(&head->lock); +} + +STATIC void +xlog_grant_head_wake_all( + struct xlog_grant_head *head) { struct xlog_ticket *tic; - int need_bytes; - list_for_each_entry(tic, &log->l_reserveq, t_queue) { + spin_lock(&head->lock); + list_for_each_entry(tic, &head->waiters, t_queue) + wake_up_process(tic->t_task); + spin_unlock(&head->lock); +} + +static inline int +xlog_ticket_reservation( + struct xlog *log, + struct xlog_grant_head *head, + struct xlog_ticket *tic) +{ + if (head == &log->l_write_head) { + ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); + return tic->t_unit_res; + } else { if (tic->t_flags & XLOG_TIC_PERM_RESERV) - need_bytes = tic->t_unit_res * tic->t_cnt; + return tic->t_unit_res * tic->t_cnt; else - need_bytes = tic->t_unit_res; - - if (*free_bytes < need_bytes) - return false; - *free_bytes -= need_bytes; - - trace_xfs_log_grant_wake_up(log, tic); - wake_up(&tic->t_wait); + return tic->t_unit_res; } - - return true; } STATIC bool -xlog_writeq_wake( - struct log *log, +xlog_grant_head_wake( + struct xlog *log, + struct xlog_grant_head *head, int *free_bytes) { struct xlog_ticket *tic; int need_bytes; - list_for_each_entry(tic, &log->l_writeq, t_queue) { - ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); - - need_bytes = tic->t_unit_res; - + list_for_each_entry(tic, &head->waiters, t_queue) { + need_bytes = xlog_ticket_reservation(log, head, tic); if (*free_bytes < need_bytes) return false; - *free_bytes -= need_bytes; - trace_xfs_log_regrant_write_wake_up(log, tic); - wake_up(&tic->t_wait); + *free_bytes -= need_bytes; + trace_xfs_log_grant_wake_up(log, tic); + wake_up_process(tic->t_task); } return true; } STATIC int -xlog_reserveq_wait( - struct log *log, +xlog_grant_head_wait( + struct xlog *log, + struct xlog_grant_head *head, struct xlog_ticket *tic, int need_bytes) { - list_add_tail(&tic->t_queue, &log->l_reserveq); + list_add_tail(&tic->t_queue, &head->waiters); do { if (XLOG_FORCED_SHUTDOWN(log)) goto shutdown; xlog_grant_push_ail(log, need_bytes); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&head->lock); + XFS_STATS_INC(xs_sleep_logspace); - trace_xfs_log_grant_sleep(log, tic); - xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); + trace_xfs_log_grant_sleep(log, tic); + schedule(); trace_xfs_log_grant_wake(log, tic); - spin_lock(&log->l_grant_reserve_lock); + spin_lock(&head->lock); if (XLOG_FORCED_SHUTDOWN(log)) goto shutdown; - } while (xlog_space_left(log, &log->l_grant_reserve_head) < need_bytes); + } while (xlog_space_left(log, &head->grant) < need_bytes); list_del_init(&tic->t_queue); return 0; @@ -230,35 +249,58 @@ shutdown: return XFS_ERROR(EIO); } +/* + * Atomically get the log space required for a log ticket. + * + * Once a ticket gets put onto head->waiters, it will only return after the + * needed reservation is satisfied. + * + * This function is structured so that it has a lock free fast path. This is + * necessary because every new transaction reservation will come through this + * path. Hence any lock will be globally hot if we take it unconditionally on + * every pass. + * + * As tickets are only ever moved on and off head->waiters under head->lock, we + * only need to take that lock if we are going to add the ticket to the queue + * and sleep. We can avoid taking the lock if the ticket was never added to + * head->waiters because the t_queue list head will be empty and we hold the + * only reference to it so it can safely be checked unlocked. + */ STATIC int -xlog_writeq_wait( - struct log *log, +xlog_grant_head_check( + struct xlog *log, + struct xlog_grant_head *head, struct xlog_ticket *tic, - int need_bytes) + int *need_bytes) { - list_add_tail(&tic->t_queue, &log->l_writeq); - - do { - if (XLOG_FORCED_SHUTDOWN(log)) - goto shutdown; - xlog_grant_push_ail(log, need_bytes); - - XFS_STATS_INC(xs_sleep_logspace); - trace_xfs_log_regrant_write_sleep(log, tic); + int free_bytes; + int error = 0; - xlog_wait(&tic->t_wait, &log->l_grant_write_lock); - trace_xfs_log_regrant_write_wake(log, tic); + ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); - spin_lock(&log->l_grant_write_lock); - if (XLOG_FORCED_SHUTDOWN(log)) - goto shutdown; - } while (xlog_space_left(log, &log->l_grant_write_head) < need_bytes); + /* + * If there are other waiters on the queue then give them a chance at + * logspace before us. Wake up the first waiters, if we do not wake + * up all the waiters then go to sleep waiting for more free space, + * otherwise try to get some space for this transaction. + */ + *need_bytes = xlog_ticket_reservation(log, head, tic); + free_bytes = xlog_space_left(log, &head->grant); + if (!list_empty_careful(&head->waiters)) { + spin_lock(&head->lock); + if (!xlog_grant_head_wake(log, head, &free_bytes) || + free_bytes < *need_bytes) { + error = xlog_grant_head_wait(log, head, tic, + *need_bytes); + } + spin_unlock(&head->lock); + } else if (free_bytes < *need_bytes) { + spin_lock(&head->lock); + error = xlog_grant_head_wait(log, head, tic, *need_bytes); + spin_unlock(&head->lock); + } - list_del_init(&tic->t_queue); - return 0; -shutdown: - list_del_init(&tic->t_queue); - return XFS_ERROR(EIO); + return error; } static void @@ -286,6 +328,128 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type) } /* + * Replenish the byte reservation required by moving the grant write head. + */ +int +xfs_log_regrant( + struct xfs_mount *mp, + struct xlog_ticket *tic) +{ + struct xlog *log = mp->m_log; + int need_bytes; + int error = 0; + + if (XLOG_FORCED_SHUTDOWN(log)) + return XFS_ERROR(EIO); + + XFS_STATS_INC(xs_try_logspace); + + /* + * This is a new transaction on the ticket, so we need to change the + * transaction ID so that the next transaction has a different TID in + * the log. Just add one to the existing tid so that we can see chains + * of rolling transactions in the log easily. + */ + tic->t_tid++; + + xlog_grant_push_ail(log, tic->t_unit_res); + + tic->t_curr_res = tic->t_unit_res; + xlog_tic_reset_res(tic); + + if (tic->t_cnt > 0) + return 0; + + trace_xfs_log_regrant(log, tic); + + error = xlog_grant_head_check(log, &log->l_write_head, tic, + &need_bytes); + if (error) + goto out_error; + + xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes); + trace_xfs_log_regrant_exit(log, tic); + xlog_verify_grant_tail(log); + return 0; + +out_error: + /* + * If we are failing, make sure the ticket doesn't have any current + * reservations. We don't want to add this back when the ticket/ + * transaction gets cancelled. + */ + tic->t_curr_res = 0; + tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ + return error; +} + +/* + * Reserve log space and return a ticket corresponding the reservation. + * + * Each reservation is going to reserve extra space for a log record header. + * When writes happen to the on-disk log, we don't subtract the length of the + * log record header from any reservation. By wasting space in each + * reservation, we prevent over allocation problems. + */ +int +xfs_log_reserve( + struct xfs_mount *mp, + int unit_bytes, + int cnt, + struct xlog_ticket **ticp, + __uint8_t client, + bool permanent, + uint t_type) +{ + struct xlog *log = mp->m_log; + struct xlog_ticket *tic; + int need_bytes; + int error = 0; + + ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); + + if (XLOG_FORCED_SHUTDOWN(log)) + return XFS_ERROR(EIO); + + XFS_STATS_INC(xs_try_logspace); + + ASSERT(*ticp == NULL); + tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, + KM_SLEEP | KM_MAYFAIL); + if (!tic) + return XFS_ERROR(ENOMEM); + + tic->t_trans_type = t_type; + *ticp = tic; + + xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt); + + trace_xfs_log_reserve(log, tic); + + error = xlog_grant_head_check(log, &log->l_reserve_head, tic, + &need_bytes); + if (error) + goto out_error; + + xlog_grant_add_space(log, &log->l_reserve_head.grant, need_bytes); + xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes); + trace_xfs_log_reserve_exit(log, tic); + xlog_verify_grant_tail(log); + return 0; + +out_error: + /* + * If we are failing, make sure the ticket doesn't have any current + * reservations. We don't want to add this back when the ticket/ + * transaction gets cancelled. + */ + tic->t_curr_res = 0; + tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ + return error; +} + + +/* * NOTES: * * 1. currblock field gets updated at startup and after in-core logs @@ -313,7 +477,7 @@ xfs_log_done( struct xlog_in_core **iclog, uint flags) { - struct log *log = mp->m_log; + struct xlog *log = mp->m_log; xfs_lsn_t lsn = 0; if (XLOG_FORCED_SHUTDOWN(log) || @@ -395,88 +559,6 @@ xfs_log_release_iclog( } /* - * 1. Reserve an amount of on-disk log space and return a ticket corresponding - * to the reservation. - * 2. Potentially, push buffers at tail of log to disk. - * - * Each reservation is going to reserve extra space for a log record header. - * When writes happen to the on-disk log, we don't subtract the length of the - * log record header from any reservation. By wasting space in each - * reservation, we prevent over allocation problems. - */ -int -xfs_log_reserve( - struct xfs_mount *mp, - int unit_bytes, - int cnt, - struct xlog_ticket **ticket, - __uint8_t client, - uint flags, - uint t_type) -{ - struct log *log = mp->m_log; - struct xlog_ticket *internal_ticket; - int retval = 0; - - ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); - - if (XLOG_FORCED_SHUTDOWN(log)) - return XFS_ERROR(EIO); - - XFS_STATS_INC(xs_try_logspace); - - - if (*ticket != NULL) { - ASSERT(flags & XFS_LOG_PERM_RESERV); - internal_ticket = *ticket; - - /* - * this is a new transaction on the ticket, so we need to - * change the transaction ID so that the next transaction has a - * different TID in the log. Just add one to the existing tid - * so that we can see chains of rolling transactions in the log - * easily. - */ - internal_ticket->t_tid++; - - trace_xfs_log_reserve(log, internal_ticket); - - xlog_grant_push_ail(log, internal_ticket->t_unit_res); - retval = xlog_regrant_write_log_space(log, internal_ticket); - } else { - /* may sleep if need to allocate more tickets */ - internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt, - client, flags, - KM_SLEEP|KM_MAYFAIL); - if (!internal_ticket) - return XFS_ERROR(ENOMEM); - internal_ticket->t_trans_type = t_type; - *ticket = internal_ticket; - - trace_xfs_log_reserve(log, internal_ticket); - - xlog_grant_push_ail(log, - (internal_ticket->t_unit_res * - internal_ticket->t_cnt)); - retval = xlog_grant_log_space(log, internal_ticket); - } - - if (unlikely(retval)) { - /* - * If we are failing, make sure the ticket doesn't have any - * current reservations. We don't want to add this back - * when the ticket/ transaction gets cancelled. - */ - internal_ticket->t_curr_res = 0; - /* ungrant will give back unit_res * t_cnt. */ - internal_ticket->t_cnt = 0; - } - - return retval; -} - - -/* * Mount a log filesystem * * mp - ubiquitous xfs mount point structure @@ -653,8 +735,9 @@ xfs_log_unmount_write(xfs_mount_t *mp) .lv_iovecp = ®, }; - /* remove inited flag */ + /* remove inited flag, and account for space used */ tic->t_flags = 0; + tic->t_curr_res -= sizeof(magic); error = xlog_write(log, &vec, tic, &lsn, NULL, XLOG_UNMOUNT_TRANS); /* @@ -739,6 +822,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) void xfs_log_unmount(xfs_mount_t *mp) { + cancel_delayed_work_sync(&mp->m_sync_work); xfs_trans_ail_destroy(mp); xlog_dealloc_log(mp->m_log); } @@ -761,95 +845,34 @@ xfs_log_item_init( } /* - * Write region vectors to log. The write happens using the space reservation - * of the ticket (tic). It is not a requirement that all writes for a given - * transaction occur with one call to xfs_log_write(). However, it is important - * to note that the transaction reservation code makes an assumption about the - * number of log headers a transaction requires that may be violated if you - * don't pass all the transaction vectors in one call.... + * Wake up processes waiting for log space after we have moved the log tail. */ -int -xfs_log_write( - struct xfs_mount *mp, - struct xfs_log_iovec reg[], - int nentries, - struct xlog_ticket *tic, - xfs_lsn_t *start_lsn) -{ - struct log *log = mp->m_log; - int error; - struct xfs_log_vec vec = { - .lv_niovecs = nentries, - .lv_iovecp = reg, - }; - - if (XLOG_FORCED_SHUTDOWN(log)) - return XFS_ERROR(EIO); - - error = xlog_write(log, &vec, tic, start_lsn, NULL, 0); - if (error) - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - return error; -} - void -xfs_log_move_tail(xfs_mount_t *mp, - xfs_lsn_t tail_lsn) +xfs_log_space_wake( + struct xfs_mount *mp) { - xlog_ticket_t *tic; - xlog_t *log = mp->m_log; - int need_bytes, free_bytes; + struct xlog *log = mp->m_log; + int free_bytes; if (XLOG_FORCED_SHUTDOWN(log)) return; - if (tail_lsn == 0) - tail_lsn = atomic64_read(&log->l_last_sync_lsn); + if (!list_empty_careful(&log->l_write_head.waiters)) { + ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); - /* tail_lsn == 1 implies that we weren't passed a valid value. */ - if (tail_lsn != 1) - atomic64_set(&log->l_tail_lsn, tail_lsn); - - if (!list_empty_careful(&log->l_writeq)) { -#ifdef DEBUG - if (log->l_flags & XLOG_ACTIVE_RECOVERY) - panic("Recovery problem"); -#endif - spin_lock(&log->l_grant_write_lock); - free_bytes = xlog_space_left(log, &log->l_grant_write_head); - list_for_each_entry(tic, &log->l_writeq, t_queue) { - ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); - - if (free_bytes < tic->t_unit_res && tail_lsn != 1) - break; - tail_lsn = 0; - free_bytes -= tic->t_unit_res; - trace_xfs_log_regrant_write_wake_up(log, tic); - wake_up(&tic->t_wait); - } - spin_unlock(&log->l_grant_write_lock); + spin_lock(&log->l_write_head.lock); + free_bytes = xlog_space_left(log, &log->l_write_head.grant); + xlog_grant_head_wake(log, &log->l_write_head, &free_bytes); + spin_unlock(&log->l_write_head.lock); } - if (!list_empty_careful(&log->l_reserveq)) { -#ifdef DEBUG - if (log->l_flags & XLOG_ACTIVE_RECOVERY) - panic("Recovery problem"); -#endif - spin_lock(&log->l_grant_reserve_lock); - free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); - list_for_each_entry(tic, &log->l_reserveq, t_queue) { - if (tic->t_flags & XLOG_TIC_PERM_RESERV) - need_bytes = tic->t_unit_res*tic->t_cnt; - else - need_bytes = tic->t_unit_res; - if (free_bytes < need_bytes && tail_lsn != 1) - break; - tail_lsn = 0; - free_bytes -= need_bytes; - trace_xfs_log_grant_wake_up(log, tic); - wake_up(&tic->t_wait); - } - spin_unlock(&log->l_grant_reserve_lock); + if (!list_empty_careful(&log->l_reserve_head.waiters)) { + ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); + + spin_lock(&log->l_reserve_head.lock); + free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); + xlog_grant_head_wake(log, &log->l_reserve_head, &free_bytes); + spin_unlock(&log->l_reserve_head.lock); } } @@ -899,38 +922,46 @@ xfs_log_need_covered(xfs_mount_t *mp) return needed; } -/****************************************************************************** - * - * local routines - * - ****************************************************************************** - */ - -/* xfs_trans_tail_ail returns 0 when there is nothing in the list. - * The log manager must keep track of the last LR which was committed - * to disk. The lsn of this LR will become the new tail_lsn whenever - * xfs_trans_tail_ail returns 0. If we don't do this, we run into - * the situation where stuff could be written into the log but nothing - * was ever in the AIL when asked. Eventually, we panic since the - * tail hits the head. - * +/* * We may be holding the log iclog lock upon entering this routine. */ xfs_lsn_t -xlog_assign_tail_lsn( +xlog_assign_tail_lsn_locked( struct xfs_mount *mp) { + struct xlog *log = mp->m_log; + struct xfs_log_item *lip; xfs_lsn_t tail_lsn; - struct log *log = mp->m_log; - tail_lsn = xfs_ail_min_lsn(mp->m_ail); - if (!tail_lsn) - tail_lsn = atomic64_read(&log->l_last_sync_lsn); + assert_spin_locked(&mp->m_ail->xa_lock); + /* + * To make sure we always have a valid LSN for the log tail we keep + * track of the last LSN which was committed in log->l_last_sync_lsn, + * and use that when the AIL was empty. + */ + lip = xfs_ail_min(mp->m_ail); + if (lip) + tail_lsn = lip->li_lsn; + else + tail_lsn = atomic64_read(&log->l_last_sync_lsn); atomic64_set(&log->l_tail_lsn, tail_lsn); return tail_lsn; } +xfs_lsn_t +xlog_assign_tail_lsn( + struct xfs_mount *mp) +{ + xfs_lsn_t tail_lsn; + + spin_lock(&mp->m_ail->xa_lock); + tail_lsn = xlog_assign_tail_lsn_locked(mp); + spin_unlock(&mp->m_ail->xa_lock); + + return tail_lsn; +} + /* * Return the space in the log between the tail and the head. The head * is passed in the cycle/bytes formal parms. In the special case where @@ -947,7 +978,7 @@ xlog_assign_tail_lsn( */ STATIC int xlog_space_left( - struct log *log, + struct xlog *log, atomic64_t *head) { int free_bytes; @@ -1132,12 +1163,9 @@ xlog_alloc_log(xfs_mount_t *mp, xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0); xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ - xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0); - xlog_assign_grant_head(&log->l_grant_write_head, 1, 0); - INIT_LIST_HEAD(&log->l_reserveq); - INIT_LIST_HEAD(&log->l_writeq); - spin_lock_init(&log->l_grant_reserve_lock); - spin_lock_init(&log->l_grant_write_lock); + + xlog_grant_head_init(&log->l_reserve_head); + xlog_grant_head_init(&log->l_write_head); error = EFSCORRUPTED; if (xfs_sb_version_hassector(&mp->m_sb)) { @@ -1169,7 +1197,7 @@ xlog_alloc_log(xfs_mount_t *mp, xlog_get_iclog_buffer_size(mp, log); error = ENOMEM; - bp = xfs_buf_alloc(mp->m_logdev_targp, 0, log->l_iclog_size, 0); + bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); if (!bp) goto out_free_log; bp->b_iodone = xlog_iodone; @@ -1179,9 +1207,6 @@ xlog_alloc_log(xfs_mount_t *mp, spin_lock_init(&log->l_icloglock); init_waitqueue_head(&log->l_flush_wait); - /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ - ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); - iclogp = &log->l_iclog; /* * The amount of memory to allocate for the iclog structure is @@ -1201,7 +1226,7 @@ xlog_alloc_log(xfs_mount_t *mp, prev_iclog = iclog; bp = xfs_buf_get_uncached(mp->m_logdev_targp, - log->l_iclog_size, 0); + BTOBB(log->l_iclog_size), 0); if (!bp) goto out_free_iclog; @@ -1221,7 +1246,7 @@ xlog_alloc_log(xfs_mount_t *mp, head->h_fmt = cpu_to_be32(XLOG_FMT); memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); - iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize; + iclog->ic_size = BBTOB(bp->b_length) - log->l_iclog_hsize; iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_log = log; atomic_set(&iclog->ic_refcnt, 0); @@ -1265,7 +1290,7 @@ out: */ STATIC int xlog_commit_record( - struct log *log, + struct xlog *log, struct xlog_ticket *ticket, struct xlog_in_core **iclog, xfs_lsn_t *commitlsnp) @@ -1299,7 +1324,7 @@ xlog_commit_record( */ STATIC void xlog_grant_push_ail( - struct log *log, + struct xlog *log, int need_bytes) { xfs_lsn_t threshold_lsn = 0; @@ -1312,7 +1337,7 @@ xlog_grant_push_ail( ASSERT(BTOBB(need_bytes) < log->l_logBBsize); - free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); + free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); free_blocks = BTOBBT(free_bytes); /* @@ -1444,8 +1469,8 @@ xlog_sync(xlog_t *log, roundoff < BBTOB(1))); /* move grant heads by roundoff in sync */ - xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff); - xlog_grant_add_space(log, &log->l_grant_write_head, roundoff); + xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff); + xlog_grant_add_space(log, &log->l_write_head.grant, roundoff); /* put cycle number in every block */ xlog_pack_data(log, iclog, roundoff); @@ -1472,7 +1497,7 @@ xlog_sync(xlog_t *log, } else { iclog->ic_bwritecnt = 1; } - XFS_BUF_SET_COUNT(bp, count); + bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); XFS_BUF_ASYNC(bp); @@ -1570,7 +1595,7 @@ xlog_dealloc_log(xlog_t *log) * always need to ensure that the extra buffer does not point to memory * owned by another log buffer before we free it. */ - xfs_buf_set_empty(log->l_xbuf, log->l_iclog_size); + xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); xfs_buf_free(log->l_xbuf); iclog = log->l_iclog; @@ -1685,7 +1710,7 @@ xlog_print_tic_res( }; xfs_warn(mp, - "xfs_log_write: reservation summary:\n" + "xlog_write: reservation summary:\n" " trans type = %s (%u)\n" " unit res = %d bytes\n" " current res = %d bytes\n" @@ -1714,7 +1739,7 @@ xlog_print_tic_res( } xfs_alert_tag(mp, XFS_PTAG_LOGRES, - "xfs_log_write: reservation ran out. Need to up reservation"); + "xlog_write: reservation ran out. Need to up reservation"); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } @@ -1778,7 +1803,7 @@ xlog_write_start_rec( static xlog_op_header_t * xlog_write_setup_ophdr( - struct log *log, + struct xlog *log, struct xlog_op_header *ophdr, struct xlog_ticket *ticket, uint flags) @@ -1861,7 +1886,7 @@ xlog_write_setup_copy( static int xlog_write_copy_finish( - struct log *log, + struct xlog *log, struct xlog_in_core *iclog, uint flags, int *record_cnt, @@ -1946,7 +1971,7 @@ xlog_write_copy_finish( */ int xlog_write( - struct log *log, + struct xlog *log, struct xfs_log_vec *log_vector, struct xlog_ticket *ticket, xfs_lsn_t *start_lsn, @@ -1968,23 +1993,21 @@ xlog_write( *start_lsn = 0; len = xlog_write_calc_vec_length(ticket, log_vector); - if (log->l_cilp) { - /* - * Region headers and bytes are already accounted for. - * We only need to take into account start records and - * split regions in this function. - */ - if (ticket->t_flags & XLOG_TIC_INITED) - ticket->t_curr_res -= sizeof(xlog_op_header_t); - /* - * Commit record headers need to be accounted for. These - * come in as separate writes so are easy to detect. - */ - if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) - ticket->t_curr_res -= sizeof(xlog_op_header_t); - } else - ticket->t_curr_res -= len; + /* + * Region headers and bytes are already accounted for. + * We only need to take into account start records and + * split regions in this function. + */ + if (ticket->t_flags & XLOG_TIC_INITED) + ticket->t_curr_res -= sizeof(xlog_op_header_t); + + /* + * Commit record headers need to be accounted for. These + * come in as separate writes so are easy to detect. + */ + if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) + ticket->t_curr_res -= sizeof(xlog_op_header_t); if (ticket->t_curr_res < 0) xlog_print_tic_res(log->l_mp, ticket); @@ -2600,119 +2623,6 @@ restart: return 0; } /* xlog_state_get_iclog_space */ -/* - * Atomically get the log space required for a log ticket. - * - * Once a ticket gets put onto the reserveq, it will only return after the - * needed reservation is satisfied. - * - * This function is structured so that it has a lock free fast path. This is - * necessary because every new transaction reservation will come through this - * path. Hence any lock will be globally hot if we take it unconditionally on - * every pass. - * - * As tickets are only ever moved on and off the reserveq under the - * l_grant_reserve_lock, we only need to take that lock if we are going to add - * the ticket to the queue and sleep. We can avoid taking the lock if the ticket - * was never added to the reserveq because the t_queue list head will be empty - * and we hold the only reference to it so it can safely be checked unlocked. - */ -STATIC int -xlog_grant_log_space( - struct log *log, - struct xlog_ticket *tic) -{ - int free_bytes, need_bytes; - int error = 0; - - ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); - - trace_xfs_log_grant_enter(log, tic); - - /* - * If there are other waiters on the queue then give them a chance at - * logspace before us. Wake up the first waiters, if we do not wake - * up all the waiters then go to sleep waiting for more free space, - * otherwise try to get some space for this transaction. - */ - need_bytes = tic->t_unit_res; - if (tic->t_flags & XFS_LOG_PERM_RESERV) - need_bytes *= tic->t_ocnt; - free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); - if (!list_empty_careful(&log->l_reserveq)) { - spin_lock(&log->l_grant_reserve_lock); - if (!xlog_reserveq_wake(log, &free_bytes) || - free_bytes < need_bytes) - error = xlog_reserveq_wait(log, tic, need_bytes); - spin_unlock(&log->l_grant_reserve_lock); - } else if (free_bytes < need_bytes) { - spin_lock(&log->l_grant_reserve_lock); - error = xlog_reserveq_wait(log, tic, need_bytes); - spin_unlock(&log->l_grant_reserve_lock); - } - if (error) - return error; - - xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes); - xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); - trace_xfs_log_grant_exit(log, tic); - xlog_verify_grant_tail(log); - return 0; -} - -/* - * Replenish the byte reservation required by moving the grant write head. - * - * Similar to xlog_grant_log_space, the function is structured to have a lock - * free fast path. - */ -STATIC int -xlog_regrant_write_log_space( - struct log *log, - struct xlog_ticket *tic) -{ - int free_bytes, need_bytes; - int error = 0; - - tic->t_curr_res = tic->t_unit_res; - xlog_tic_reset_res(tic); - - if (tic->t_cnt > 0) - return 0; - - ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); - - trace_xfs_log_regrant_write_enter(log, tic); - - /* - * If there are other waiters on the queue then give them a chance at - * logspace before us. Wake up the first waiters, if we do not wake - * up all the waiters then go to sleep waiting for more free space, - * otherwise try to get some space for this transaction. - */ - need_bytes = tic->t_unit_res; - free_bytes = xlog_space_left(log, &log->l_grant_write_head); - if (!list_empty_careful(&log->l_writeq)) { - spin_lock(&log->l_grant_write_lock); - if (!xlog_writeq_wake(log, &free_bytes) || - free_bytes < need_bytes) - error = xlog_writeq_wait(log, tic, need_bytes); - spin_unlock(&log->l_grant_write_lock); - } else if (free_bytes < need_bytes) { - spin_lock(&log->l_grant_write_lock); - error = xlog_writeq_wait(log, tic, need_bytes); - spin_unlock(&log->l_grant_write_lock); - } - - if (error) - return error; - - xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); - trace_xfs_log_regrant_write_exit(log, tic); - xlog_verify_grant_tail(log); - return 0; -} - /* The first cnt-1 times through here we don't need to * move the grant write head because the permanent * reservation has reserved cnt times the unit amount. @@ -2729,9 +2639,9 @@ xlog_regrant_reserve_log_space(xlog_t *log, if (ticket->t_cnt > 0) ticket->t_cnt--; - xlog_grant_sub_space(log, &log->l_grant_reserve_head, + xlog_grant_sub_space(log, &log->l_reserve_head.grant, ticket->t_curr_res); - xlog_grant_sub_space(log, &log->l_grant_write_head, + xlog_grant_sub_space(log, &log->l_write_head.grant, ticket->t_curr_res); ticket->t_curr_res = ticket->t_unit_res; xlog_tic_reset_res(ticket); @@ -2742,7 +2652,7 @@ xlog_regrant_reserve_log_space(xlog_t *log, if (ticket->t_cnt > 0) return; - xlog_grant_add_space(log, &log->l_grant_reserve_head, + xlog_grant_add_space(log, &log->l_reserve_head.grant, ticket->t_unit_res); trace_xfs_log_regrant_reserve_exit(log, ticket); @@ -2788,14 +2698,13 @@ xlog_ungrant_log_space(xlog_t *log, bytes += ticket->t_unit_res*ticket->t_cnt; } - xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes); - xlog_grant_sub_space(log, &log->l_grant_write_head, bytes); + xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes); + xlog_grant_sub_space(log, &log->l_write_head.grant, bytes); trace_xfs_log_ungrant_exit(log, ticket); - xfs_log_move_tail(log->l_mp, 1); -} /* xlog_ungrant_log_space */ - + xfs_log_space_wake(log->l_mp); +} /* * Flush iclog to disk if this is the last reference to the given iclog and @@ -2925,14 +2834,13 @@ _xfs_log_force( uint flags, int *log_flushed) { - struct log *log = mp->m_log; + struct xlog *log = mp->m_log; struct xlog_in_core *iclog; xfs_lsn_t lsn; XFS_STATS_INC(xs_log_force); - if (log->l_cilp) - xlog_cil_force(log); + xlog_cil_force(log); spin_lock(&log->l_icloglock); @@ -3046,6 +2954,7 @@ xfs_log_force( { int error; + trace_xfs_log_force(mp, 0); error = _xfs_log_force(mp, flags, NULL); if (error) xfs_warn(mp, "%s: error %d returned.", __func__, error); @@ -3073,7 +2982,7 @@ _xfs_log_force_lsn( uint flags, int *log_flushed) { - struct log *log = mp->m_log; + struct xlog *log = mp->m_log; struct xlog_in_core *iclog; int already_slept = 0; @@ -3081,11 +2990,9 @@ _xfs_log_force_lsn( XFS_STATS_INC(xs_log_force); - if (log->l_cilp) { - lsn = xlog_cil_force_lsn(log, lsn); - if (lsn == NULLCOMMITLSN) - return 0; - } + lsn = xlog_cil_force_lsn(log, lsn); + if (lsn == NULLCOMMITLSN) + return 0; try_again: spin_lock(&log->l_icloglock); @@ -3196,6 +3103,7 @@ xfs_log_force_lsn( { int error; + trace_xfs_log_force(mp, lsn); error = _xfs_log_force_lsn(mp, lsn, flags, NULL); if (error) xfs_warn(mp, "%s: error %d returned.", __func__, error); @@ -3252,12 +3160,12 @@ xfs_log_ticket_get( */ xlog_ticket_t * xlog_ticket_alloc( - struct log *log, + struct xlog *log, int unit_bytes, int cnt, char client, - uint xflags, - int alloc_flags) + bool permanent, + xfs_km_flags_t alloc_flags) { struct xlog_ticket *tic; uint num_headers; @@ -3350,6 +3258,7 @@ xlog_ticket_alloc( } atomic_set(&tic->t_ref, 1); + tic->t_task = current; INIT_LIST_HEAD(&tic->t_queue); tic->t_unit_res = unit_bytes; tic->t_curr_res = unit_bytes; @@ -3359,9 +3268,8 @@ xlog_ticket_alloc( tic->t_clientid = client; tic->t_flags = XLOG_TIC_INITED; tic->t_trans_type = 0; - if (xflags & XFS_LOG_PERM_RESERV) + if (permanent) tic->t_flags |= XLOG_TIC_PERM_RESERV; - init_waitqueue_head(&tic->t_wait); xlog_tic_reset_res(tic); @@ -3383,7 +3291,7 @@ xlog_ticket_alloc( */ void xlog_verify_dest_ptr( - struct log *log, + struct xlog *log, char *ptr) { int i; @@ -3412,12 +3320,12 @@ xlog_verify_dest_ptr( */ STATIC void xlog_verify_grant_tail( - struct log *log) + struct xlog *log) { int tail_cycle, tail_blocks; int cycle, space; - xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space); + xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &space); xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); if (tail_cycle != cycle) { if (cycle - 1 != tail_cycle && @@ -3619,7 +3527,6 @@ xfs_log_force_umount( struct xfs_mount *mp, int logerror) { - xlog_ticket_t *tic; xlog_t *log; int retval; @@ -3653,7 +3560,7 @@ xfs_log_force_umount( * completed transactions are flushed to disk with the xfs_log_force() * call below. */ - if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG)) + if (!logerror) xlog_cil_force(log); /* @@ -3687,15 +3594,8 @@ xfs_log_force_umount( * we don't enqueue anything once the SHUTDOWN flag is set, and this * action is protected by the grant locks. */ - spin_lock(&log->l_grant_reserve_lock); - list_for_each_entry(tic, &log->l_reserveq, t_queue) - wake_up(&tic->t_wait); - spin_unlock(&log->l_grant_reserve_lock); - - spin_lock(&log->l_grant_write_lock); - list_for_each_entry(tic, &log->l_writeq, t_queue) - wake_up(&tic->t_wait); - spin_unlock(&log->l_grant_write_lock); + xlog_grant_head_wake_all(&log->l_reserve_head); + xlog_grant_head_wake_all(&log->l_write_head); if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { ASSERT(!logerror); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 3f7bf451c03..748d312850e 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -53,15 +53,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) #define XFS_LOG_REL_PERM_RESERV 0x1 /* - * Flags to xfs_log_reserve() - * - * XFS_LOG_PERM_RESERV: Permanent reservation. When writes are - * performed against this type of reservation, the reservation - * is not decreased. Long running transactions should use this. - */ -#define XFS_LOG_PERM_RESERV 0x2 - -/* * Flags to xfs_log_force() * * XFS_LOG_SYNC: Synchronous force in-core log to disk @@ -160,8 +151,9 @@ int xfs_log_mount(struct xfs_mount *mp, xfs_daddr_t start_block, int num_bblocks); int xfs_log_mount_finish(struct xfs_mount *mp); -void xfs_log_move_tail(struct xfs_mount *mp, - xfs_lsn_t tail_lsn); +xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); +xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); +void xfs_log_space_wake(struct xfs_mount *mp); int xfs_log_notify(struct xfs_mount *mp, struct xlog_in_core *iclog, xfs_log_callback_t *callback_entry); @@ -172,13 +164,9 @@ int xfs_log_reserve(struct xfs_mount *mp, int count, struct xlog_ticket **ticket, __uint8_t clientid, - uint flags, + bool permanent, uint t_type); -int xfs_log_write(struct xfs_mount *mp, - xfs_log_iovec_t region[], - int nentries, - struct xlog_ticket *ticket, - xfs_lsn_t *start_lsn); +int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); int xfs_log_unmount_write(struct xfs_mount *mp); void xfs_log_unmount(struct xfs_mount *mp); int xfs_log_force_umount(struct xfs_mount *mp, int logerror); @@ -189,8 +177,7 @@ void xlog_iodone(struct xfs_buf *); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); -void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_log_vec *log_vector, +int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, xfs_lsn_t *commit_lsn, int flags); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index c7755d5a5fb..ddc4529d07d 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_log_priv.h" @@ -29,71 +27,10 @@ #include "xfs_mount.h" #include "xfs_error.h" #include "xfs_alloc.h" +#include "xfs_extent_busy.h" #include "xfs_discard.h" /* - * Perform initial CIL structure initialisation. If the CIL is not - * enabled in this filesystem, ensure the log->l_cilp is null so - * we can check this conditional to determine if we are doing delayed - * logging or not. - */ -int -xlog_cil_init( - struct log *log) -{ - struct xfs_cil *cil; - struct xfs_cil_ctx *ctx; - - log->l_cilp = NULL; - if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG)) - return 0; - - cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); - if (!cil) - return ENOMEM; - - ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); - if (!ctx) { - kmem_free(cil); - return ENOMEM; - } - - INIT_LIST_HEAD(&cil->xc_cil); - INIT_LIST_HEAD(&cil->xc_committing); - spin_lock_init(&cil->xc_cil_lock); - init_rwsem(&cil->xc_ctx_lock); - init_waitqueue_head(&cil->xc_commit_wait); - - INIT_LIST_HEAD(&ctx->committing); - INIT_LIST_HEAD(&ctx->busy_extents); - ctx->sequence = 1; - ctx->cil = cil; - cil->xc_ctx = ctx; - cil->xc_current_sequence = ctx->sequence; - - cil->xc_log = log; - log->l_cilp = cil; - return 0; -} - -void -xlog_cil_destroy( - struct log *log) -{ - if (!log->l_cilp) - return; - - if (log->l_cilp->xc_ctx) { - if (log->l_cilp->xc_ctx->ticket) - xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); - kmem_free(log->l_cilp->xc_ctx); - } - - ASSERT(list_empty(&log->l_cilp->xc_cil)); - kmem_free(log->l_cilp); -} - -/* * Allocate a new ticket. Failing to get a new ticket makes it really hard to * recover, so we don't allow failure here. Also, we allocate in a context that * we don't want to be issuing transactions from, so we need to tell the @@ -107,7 +44,7 @@ xlog_cil_destroy( */ static struct xlog_ticket * xlog_cil_ticket_alloc( - struct log *log) + struct xlog *log) { struct xlog_ticket *tic; @@ -135,11 +72,8 @@ xlog_cil_ticket_alloc( */ void xlog_cil_init_post_recovery( - struct log *log) + struct xlog *log) { - if (!log->l_cilp) - return; - log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); log->l_cilp->xc_ctx->sequence = 1; log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle, @@ -172,37 +106,73 @@ xlog_cil_init_post_recovery( * format the regions into the iclog as though they are being formatted * directly out of the objects themselves. */ -static void -xlog_cil_format_items( - struct log *log, - struct xfs_log_vec *log_vector) +static struct xfs_log_vec * +xlog_cil_prepare_log_vecs( + struct xfs_trans *tp) { - struct xfs_log_vec *lv; + struct xfs_log_item_desc *lidp; + struct xfs_log_vec *lv = NULL; + struct xfs_log_vec *ret_lv = NULL; - ASSERT(log_vector); - for (lv = log_vector; lv; lv = lv->lv_next) { + + /* Bail out if we didn't find a log item. */ + if (list_empty(&tp->t_items)) { + ASSERT(0); + return NULL; + } + + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_vec *new_lv; void *ptr; int index; int len = 0; + uint niovecs; + + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + + /* Skip items that do not have any vectors for writing */ + niovecs = IOP_SIZE(lidp->lid_item); + if (!niovecs) + continue; + + new_lv = kmem_zalloc(sizeof(*new_lv) + + niovecs * sizeof(struct xfs_log_iovec), + KM_SLEEP); + + /* The allocated iovec region lies beyond the log vector. */ + new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; + new_lv->lv_niovecs = niovecs; + new_lv->lv_item = lidp->lid_item; /* build the vector array and calculate it's length */ - IOP_FORMAT(lv->lv_item, lv->lv_iovecp); - for (index = 0; index < lv->lv_niovecs; index++) - len += lv->lv_iovecp[index].i_len; + IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp); + for (index = 0; index < new_lv->lv_niovecs; index++) + len += new_lv->lv_iovecp[index].i_len; - lv->lv_buf_len = len; - lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS); - ptr = lv->lv_buf; + new_lv->lv_buf_len = len; + new_lv->lv_buf = kmem_alloc(new_lv->lv_buf_len, + KM_SLEEP|KM_NOFS); + ptr = new_lv->lv_buf; - for (index = 0; index < lv->lv_niovecs; index++) { - struct xfs_log_iovec *vec = &lv->lv_iovecp[index]; + for (index = 0; index < new_lv->lv_niovecs; index++) { + struct xfs_log_iovec *vec = &new_lv->lv_iovecp[index]; memcpy(ptr, vec->i_addr, vec->i_len); vec->i_addr = ptr; ptr += vec->i_len; } - ASSERT(ptr == lv->lv_buf + lv->lv_buf_len); + ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len); + + if (!ret_lv) + ret_lv = new_lv; + else + lv->lv_next = new_lv; + lv = new_lv; } + + return ret_lv; } /* @@ -212,7 +182,7 @@ xlog_cil_format_items( */ STATIC void xfs_cil_prepare_item( - struct log *log, + struct xlog *log, struct xfs_log_vec *lv, int *len, int *diff_iovecs) @@ -256,12 +226,12 @@ xfs_cil_prepare_item( * Insert the log items into the CIL and calculate the difference in space * consumed by the item. Add the space to the checkpoint ticket and calculate * if the change requires additional log metadata. If it does, take that space - * as well. Remove the amount of space we addded to the checkpoint ticket from + * as well. Remove the amount of space we added to the checkpoint ticket from * the current transaction ticket so that the accounting works out correctly. */ static void xlog_cil_insert_items( - struct log *log, + struct xlog *log, struct xfs_log_vec *log_vector, struct xlog_ticket *ticket) { @@ -367,8 +337,8 @@ xlog_cil_committed( xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, ctx->start_lsn, abort); - xfs_alloc_busy_sort(&ctx->busy_extents); - xfs_alloc_busy_clear(mp, &ctx->busy_extents, + xfs_extent_busy_sort(&ctx->busy_extents); + xfs_extent_busy_clear(mp, &ctx->busy_extents, (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); spin_lock(&ctx->cil->xc_cil_lock); @@ -381,7 +351,7 @@ xlog_cil_committed( ASSERT(mp->m_flags & XFS_MOUNT_DISCARD); xfs_discard_extents(mp, &ctx->busy_extents); - xfs_alloc_busy_clear(mp, &ctx->busy_extents, false); + xfs_extent_busy_clear(mp, &ctx->busy_extents, false); } kmem_free(ctx); @@ -403,8 +373,7 @@ xlog_cil_committed( */ STATIC int xlog_cil_push( - struct log *log, - xfs_lsn_t push_seq) + struct xlog *log) { struct xfs_cil *cil = log->l_cilp; struct xfs_log_vec *lv; @@ -420,39 +389,36 @@ xlog_cil_push( struct xfs_log_iovec lhdr; struct xfs_log_vec lvhdr = { NULL }; xfs_lsn_t commit_lsn; + xfs_lsn_t push_seq; if (!cil) return 0; - ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence); - new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); new_ctx->ticket = xlog_cil_ticket_alloc(log); - /* - * Lock out transaction commit, but don't block for background pushes - * unless we are well over the CIL space limit. See the definition of - * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic - * used here. - */ - if (!down_write_trylock(&cil->xc_ctx_lock)) { - if (!push_seq && - cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log)) - goto out_free_ticket; - down_write(&cil->xc_ctx_lock); - } + down_write(&cil->xc_ctx_lock); ctx = cil->xc_ctx; - /* check if we've anything to push */ - if (list_empty(&cil->xc_cil)) - goto out_skip; + spin_lock(&cil->xc_cil_lock); + push_seq = cil->xc_push_seq; + ASSERT(push_seq <= ctx->sequence); - /* check for spurious background flush */ - if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + /* + * Check if we've anything to push. If there is nothing, then we don't + * move on to a new sequence number and so we have to be able to push + * this sequence again later. + */ + if (list_empty(&cil->xc_cil)) { + cil->xc_push_seq = 0; + spin_unlock(&cil->xc_cil_lock); goto out_skip; + } + spin_unlock(&cil->xc_cil_lock); + /* check for a previously pushed seqeunce */ - if (push_seq && push_seq < cil->xc_ctx->sequence) + if (push_seq < cil->xc_ctx->sequence) goto out_skip; /* @@ -606,7 +572,6 @@ restart: out_skip: up_write(&cil->xc_ctx_lock); -out_free_ticket: xfs_log_ticket_put(new_ctx->ticket); kmem_free(new_ctx); return 0; @@ -618,6 +583,82 @@ out_abort: return XFS_ERROR(EIO); } +static void +xlog_cil_push_work( + struct work_struct *work) +{ + struct xfs_cil *cil = container_of(work, struct xfs_cil, + xc_push_work); + xlog_cil_push(cil->xc_log); +} + +/* + * We need to push CIL every so often so we don't cache more than we can fit in + * the log. The limit really is that a checkpoint can't be more than half the + * log (the current checkpoint is not allowed to overwrite the previous + * checkpoint), but commit latency and memory usage limit this to a smaller + * size. + */ +static void +xlog_cil_push_background( + struct xlog *log) +{ + struct xfs_cil *cil = log->l_cilp; + + /* + * The cil won't be empty because we are called while holding the + * context lock so whatever we added to the CIL will still be there + */ + ASSERT(!list_empty(&cil->xc_cil)); + + /* + * don't do a background push if we haven't used up all the + * space available yet. + */ + if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + return; + + spin_lock(&cil->xc_cil_lock); + if (cil->xc_push_seq < cil->xc_current_sequence) { + cil->xc_push_seq = cil->xc_current_sequence; + queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); + } + spin_unlock(&cil->xc_cil_lock); + +} + +static void +xlog_cil_push_foreground( + struct xlog *log, + xfs_lsn_t push_seq) +{ + struct xfs_cil *cil = log->l_cilp; + + if (!cil) + return; + + ASSERT(push_seq && push_seq <= cil->xc_current_sequence); + + /* start on any pending background push to minimise wait time on it */ + flush_work(&cil->xc_push_work); + + /* + * If the CIL is empty or we've already pushed the sequence then + * there's no work we need to do. + */ + spin_lock(&cil->xc_cil_lock); + if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) { + spin_unlock(&cil->xc_cil_lock); + return; + } + + cil->xc_push_seq = push_seq; + spin_unlock(&cil->xc_cil_lock); + + /* do the push now */ + xlog_cil_push(log); +} + /* * Commit a transaction with the given vector to the Committed Item List. * @@ -635,28 +676,29 @@ out_abort: * background commit, returns without it held once background commits are * allowed again. */ -void +int xfs_log_commit_cil( struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_log_vec *log_vector, xfs_lsn_t *commit_lsn, int flags) { - struct log *log = mp->m_log; + struct xlog *log = mp->m_log; int log_flags = 0; - int push = 0; + struct xfs_log_vec *log_vector; if (flags & XFS_TRANS_RELEASE_LOG_RES) log_flags = XFS_LOG_REL_PERM_RESERV; /* - * do all the hard work of formatting items (including memory + * Do all the hard work of formatting items (including memory * allocation) outside the CIL context lock. This prevents stalling CIL * pushes when we are low on memory and a transaction commit spends a * lot of time in memory reclaim. */ - xlog_cil_format_items(log, log_vector); + log_vector = xlog_cil_prepare_log_vecs(tp); + if (!log_vector) + return ENOMEM; /* lock out background commit */ down_read(&log->l_cilp->xc_ctx_lock); @@ -694,21 +736,10 @@ xfs_log_commit_cil( */ xfs_trans_free_items(tp, *commit_lsn, 0); - /* check for background commit before unlock */ - if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) - push = 1; + xlog_cil_push_background(log); up_read(&log->l_cilp->xc_ctx_lock); - - /* - * We need to push CIL every so often so we don't cache more than we - * can fit in the log. The limit really is that a checkpoint can't be - * more than half the log (the current checkpoint is not allowed to - * overwrite the previous checkpoint), but commit latency and memory - * usage limit this to a smaller size in most cases. - */ - if (push) - xlog_cil_push(log, 0); + return 0; } /* @@ -720,13 +751,10 @@ xfs_log_commit_cil( * * We return the current commit lsn to allow the callers to determine if a * iclog flush is necessary following this call. - * - * XXX: Initially, just push the CIL unconditionally and return whatever - * commit lsn is there. It'll be empty, so this is broken for now. */ xfs_lsn_t xlog_cil_force_lsn( - struct log *log, + struct xlog *log, xfs_lsn_t sequence) { struct xfs_cil *cil = log->l_cilp; @@ -740,8 +768,7 @@ xlog_cil_force_lsn( * xlog_cil_push() handles racing pushes for the same sequence, * so no need to deal with it here. */ - if (sequence == cil->xc_current_sequence) - xlog_cil_push(log, sequence); + xlog_cil_push_foreground(log, sequence); /* * See if we can find a previous sequence still committing. @@ -786,8 +813,6 @@ xfs_log_item_in_current_chkpt( { struct xfs_cil_ctx *ctx; - if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG)) - return false; if (list_empty(&lip->li_cil)) return false; @@ -802,3 +827,57 @@ xfs_log_item_in_current_chkpt( return false; return true; } + +/* + * Perform initial CIL structure initialisation. + */ +int +xlog_cil_init( + struct xlog *log) +{ + struct xfs_cil *cil; + struct xfs_cil_ctx *ctx; + + cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); + if (!cil) + return ENOMEM; + + ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); + if (!ctx) { + kmem_free(cil); + return ENOMEM; + } + + INIT_WORK(&cil->xc_push_work, xlog_cil_push_work); + INIT_LIST_HEAD(&cil->xc_cil); + INIT_LIST_HEAD(&cil->xc_committing); + spin_lock_init(&cil->xc_cil_lock); + init_rwsem(&cil->xc_ctx_lock); + init_waitqueue_head(&cil->xc_commit_wait); + + INIT_LIST_HEAD(&ctx->committing); + INIT_LIST_HEAD(&ctx->busy_extents); + ctx->sequence = 1; + ctx->cil = cil; + cil->xc_ctx = ctx; + cil->xc_current_sequence = ctx->sequence; + + cil->xc_log = log; + log->l_cilp = cil; + return 0; +} + +void +xlog_cil_destroy( + struct xlog *log) +{ + if (log->l_cilp->xc_ctx) { + if (log->l_cilp->xc_ctx->ticket) + xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); + kmem_free(log->l_cilp->xc_ctx); + } + + ASSERT(list_empty(&log->l_cilp->xc_cil)); + kmem_free(log->l_cilp); +} + diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 2d3b6a498d6..72eba2201b1 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -19,7 +19,7 @@ #define __XFS_LOG_PRIV_H__ struct xfs_buf; -struct log; +struct xlog; struct xlog_ticket; struct xfs_mount; @@ -239,8 +239,8 @@ typedef struct xlog_res { } xlog_res_t; typedef struct xlog_ticket { - wait_queue_head_t t_wait; /* ticket wait queue */ struct list_head t_queue; /* reserve/write queue */ + struct task_struct *t_task; /* task that owns this ticket */ xlog_tid_t t_tid; /* transaction identifier : 4 */ atomic_t t_ref; /* ticket reference count : 4 */ int t_curr_res; /* current reservation in bytes : 4 */ @@ -352,7 +352,7 @@ typedef struct xlog_in_core { struct xlog_in_core *ic_next; struct xlog_in_core *ic_prev; struct xfs_buf *ic_bp; - struct log *ic_log; + struct xlog *ic_log; int ic_size; int ic_offset; int ic_bwritecnt; @@ -409,7 +409,7 @@ struct xfs_cil_ctx { * operations almost as efficient as the old logging methods. */ struct xfs_cil { - struct log *xc_log; + struct xlog *xc_log; struct list_head xc_cil; spinlock_t xc_cil_lock; struct xfs_cil_ctx *xc_ctx; @@ -417,6 +417,8 @@ struct xfs_cil { struct list_head xc_committing; wait_queue_head_t xc_commit_wait; xfs_lsn_t xc_current_sequence; + struct work_struct xc_push_work; + xfs_lsn_t xc_push_seq; }; /* @@ -470,12 +472,22 @@ struct xfs_cil { #define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4)) /* + * ticket grant locks, queues and accounting have their own cachlines + * as these are quite hot and can be operated on concurrently. + */ +struct xlog_grant_head { + spinlock_t lock ____cacheline_aligned_in_smp; + struct list_head waiters; + atomic64_t grant; +}; + +/* * The reservation head lsn is not made up of a cycle number and block number. * Instead, it uses a cycle number and byte number. Logs don't expect to * overflow 31 bits worth of byte offset, so using a byte number will mean * that round off problems won't occur when releasing partial reservations. */ -typedef struct log { +typedef struct xlog { /* The following fields don't need locking */ struct xfs_mount *l_mp; /* mount point */ struct xfs_ail *l_ailp; /* AIL log is working with */ @@ -520,17 +532,8 @@ typedef struct log { /* lsn of 1st LR with unflushed * buffers */ atomic64_t l_tail_lsn ____cacheline_aligned_in_smp; - /* - * ticket grant locks, queues and accounting have their own cachlines - * as these are quite hot and can be operated on concurrently. - */ - spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp; - struct list_head l_reserveq; - atomic64_t l_grant_reserve_head; - - spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp; - struct list_head l_writeq; - atomic64_t l_grant_write_head; + struct xlog_grant_head l_reserve_head; + struct xlog_grant_head l_write_head; /* The following field are used for debugging; need to hold icloglock */ #ifdef DEBUG @@ -545,15 +548,19 @@ typedef struct log { #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) /* common routines */ -extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); extern int xlog_recover(xlog_t *log); extern int xlog_recover_finish(xlog_t *log); extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); extern kmem_zone_t *xfs_log_ticket_zone; -struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes, - int count, char client, uint xflags, - int alloc_flags); +struct xlog_ticket * +xlog_ticket_alloc( + struct xlog *log, + int unit_bytes, + int count, + char client, + bool permanent, + xfs_km_flags_t alloc_flags); static inline void @@ -565,9 +572,14 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) } void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); -int xlog_write(struct log *log, struct xfs_log_vec *log_vector, - struct xlog_ticket *tic, xfs_lsn_t *start_lsn, - xlog_in_core_t **commit_iclog, uint flags); +int +xlog_write( + struct xlog *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *tic, + xfs_lsn_t *start_lsn, + struct xlog_in_core **commit_iclog, + uint flags); /* * When we crack an atomic LSN, we sample it first so that the value will not @@ -627,17 +639,23 @@ xlog_assign_grant_head(atomic64_t *head, int cycle, int space) /* * Committed Item List interfaces */ -int xlog_cil_init(struct log *log); -void xlog_cil_init_post_recovery(struct log *log); -void xlog_cil_destroy(struct log *log); +int +xlog_cil_init(struct xlog *log); +void +xlog_cil_init_post_recovery(struct xlog *log); +void +xlog_cil_destroy(struct xlog *log); /* * CIL force routines */ -xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence); +xfs_lsn_t +xlog_cil_force_lsn( + struct xlog *log, + xfs_lsn_t sequence); static inline void -xlog_cil_force(struct log *log) +xlog_cil_force(struct xlog *log) { xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence); } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 541a508adea..a7be98abd6a 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -40,7 +40,6 @@ #include "xfs_extfree_item.h" #include "xfs_trans_priv.h" #include "xfs_quota.h" -#include "xfs_rw.h" #include "xfs_utils.h" #include "xfs_trace.h" @@ -120,7 +119,7 @@ xlog_get_bp( nbblks += log->l_sectBBsize; nbblks = round_up(nbblks, log->l_sectBBsize); - bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, BBTOB(nbblks), 0); + bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0); if (bp) xfs_buf_unlock(bp); return bp; @@ -146,7 +145,7 @@ xlog_align( { xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); - ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp)); + ASSERT(offset + nbblks <= bp->b_length); return bp->b_addr + BBTOB(offset); } @@ -174,11 +173,12 @@ xlog_bread_noalign( nbblks = round_up(nbblks, log->l_sectBBsize); ASSERT(nbblks > 0); - ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); + ASSERT(nbblks <= bp->b_length); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_READ(bp); - XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); + bp->b_io_length = nbblks; + bp->b_error = 0; xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); @@ -218,7 +218,7 @@ xlog_bread_offset( xfs_caddr_t offset) { xfs_caddr_t orig_offset = bp->b_addr; - int orig_len = bp->b_buffer_length; + int orig_len = BBTOB(bp->b_length); int error, error2; error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks)); @@ -259,13 +259,14 @@ xlog_bwrite( nbblks = round_up(nbblks, log->l_sectBBsize); ASSERT(nbblks > 0); - ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); + ASSERT(nbblks <= bp->b_length); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_ZEROFLAGS(bp); xfs_buf_hold(bp); xfs_buf_lock(bp); - XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); + bp->b_io_length = nbblks; + bp->b_error = 0; error = xfs_bwrite(bp); if (error) @@ -440,6 +441,8 @@ xlog_find_verify_cycle( * a log sector, or we're out of luck. */ bufblks = 1 << ffs(nbblks); + while (bufblks > log->l_logBBsize) + bufblks >>= 1; while (!(bp = xlog_get_bp(log, bufblks))) { bufblks >>= 1; if (bufblks < log->l_sectBBsize) @@ -965,9 +968,9 @@ xlog_find_tail( log->l_curr_cycle++; atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); - xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle, + xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle, BBTOB(log->l_curr_block)); - xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle, + xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle, BBTOB(log->l_curr_block)); /* @@ -1225,6 +1228,8 @@ xlog_write_log_records( * log sector, or we're out of luck. */ bufblks = 1 << ffs(blocks); + while (bufblks > log->l_logBBsize) + bufblks >>= 1; while (!(bp = xlog_get_bp(log, bufblks))) { bufblks >>= 1; if (bufblks < sectbb) @@ -1466,8 +1471,8 @@ xlog_recover_add_item( STATIC int xlog_recover_add_to_cont_trans( - struct log *log, - xlog_recover_t *trans, + struct xlog *log, + struct xlog_recover *trans, xfs_caddr_t dp, int len) { @@ -1489,7 +1494,7 @@ xlog_recover_add_to_cont_trans( old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; - ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u); + ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP); memcpy(&ptr[old_len], dp, len); /* d, s, l */ item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; @@ -1512,8 +1517,8 @@ xlog_recover_add_to_cont_trans( */ STATIC int xlog_recover_add_to_trans( - struct log *log, - xlog_recover_t *trans, + struct xlog *log, + struct xlog_recover *trans, xfs_caddr_t dp, int len) { @@ -1583,8 +1588,8 @@ xlog_recover_add_to_trans( */ STATIC int xlog_recover_reorder_trans( - struct log *log, - xlog_recover_t *trans, + struct xlog *log, + struct xlog_recover *trans, int pass) { xlog_recover_item_t *item, *n; @@ -1637,8 +1642,8 @@ xlog_recover_reorder_trans( */ STATIC int xlog_recover_buffer_pass1( - struct log *log, - xlog_recover_item_t *item) + struct xlog *log, + struct xlog_recover_item *item) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; struct list_head *bucket; @@ -1691,7 +1696,7 @@ xlog_recover_buffer_pass1( */ STATIC int xlog_check_buffer_cancelled( - struct log *log, + struct xlog *log, xfs_daddr_t blkno, uint len, ushort flags) @@ -1772,7 +1777,7 @@ xlog_recover_do_inode_buffer( trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); - inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; + inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + offsetof(xfs_dinode_t, di_next_unlinked); @@ -1814,7 +1819,8 @@ xlog_recover_do_inode_buffer( ASSERT(item->ri_buf[item_index].i_addr != NULL); ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); - ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); + ASSERT((reg_buf_offset + reg_buf_bytes) <= + BBTOB(bp->b_io_length)); /* * The current logged region contains a copy of the @@ -1873,8 +1879,8 @@ xlog_recover_do_reg_buffer( ASSERT(nbits > 0); ASSERT(item->ri_buf[i].i_addr != NULL); ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); - ASSERT(XFS_BUF_COUNT(bp) >= - ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT)); + ASSERT(BBTOB(bp->b_io_length) >= + ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); /* * Do a sanity check if this is a dquot buffer. Just checking @@ -1981,7 +1987,7 @@ xfs_qm_dqcheck( if (!errs && ddq->d_id) { if (ddq->d_blk_softlimit && - be64_to_cpu(ddq->d_bcount) >= + be64_to_cpu(ddq->d_bcount) > be64_to_cpu(ddq->d_blk_softlimit)) { if (!ddq->d_btimer) { if (flags & XFS_QMOPT_DOWARN) @@ -1992,7 +1998,7 @@ xfs_qm_dqcheck( } } if (ddq->d_ino_softlimit && - be64_to_cpu(ddq->d_icount) >= + be64_to_cpu(ddq->d_icount) > be64_to_cpu(ddq->d_ino_softlimit)) { if (!ddq->d_itimer) { if (flags & XFS_QMOPT_DOWARN) @@ -2003,7 +2009,7 @@ xfs_qm_dqcheck( } } if (ddq->d_rtb_softlimit && - be64_to_cpu(ddq->d_rtbcount) >= + be64_to_cpu(ddq->d_rtbcount) > be64_to_cpu(ddq->d_rtb_softlimit)) { if (!ddq->d_rtbtimer) { if (flags & XFS_QMOPT_DOWARN) @@ -2103,6 +2109,7 @@ xlog_recover_do_dquot_buffer( STATIC int xlog_recover_buffer_pass2( xlog_t *log, + struct list_head *buffer_list, xlog_recover_item_t *item) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; @@ -2123,9 +2130,9 @@ xlog_recover_buffer_pass2( trace_xfs_log_recover_buf_recover(log, buf_f); - buf_flags = XBF_LOCK; - if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF)) - buf_flags |= XBF_MAPPED; + buf_flags = 0; + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) + buf_flags |= XBF_UNMAPPED; bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, buf_flags); @@ -2166,14 +2173,14 @@ xlog_recover_buffer_pass2( */ if (XFS_DINODE_MAGIC == be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && - (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, + (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize, (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { xfs_buf_stale(bp); error = xfs_bwrite(bp); } else { ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp); + xfs_buf_delwri_queue(bp, buffer_list); } xfs_buf_relse(bp); @@ -2183,6 +2190,7 @@ xlog_recover_buffer_pass2( STATIC int xlog_recover_inode_pass2( xlog_t *log, + struct list_head *buffer_list, xlog_recover_item_t *item) { xfs_inode_log_format_t *in_f; @@ -2220,8 +2228,7 @@ xlog_recover_inode_pass2( } trace_xfs_log_recover_inode_recover(log, in_f); - bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, - XBF_LOCK); + bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0); if (!bp) { error = ENOMEM; goto error; @@ -2436,7 +2443,7 @@ xlog_recover_inode_pass2( write_inode_buffer: ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp); + xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); error: if (need_free) @@ -2477,6 +2484,7 @@ xlog_recover_quotaoff_pass1( STATIC int xlog_recover_dquot_pass2( xlog_t *log, + struct list_head *buffer_list, xlog_recover_item_t *item) { xfs_mount_t *mp = log->l_mp; @@ -2530,14 +2538,11 @@ xlog_recover_dquot_pass2( return XFS_ERROR(EIO); ASSERT(dq_f->qlf_len == 1); - error = xfs_read_buf(mp, mp->m_ddev_targp, - dq_f->qlf_blkno, - XFS_FSB_TO_BB(mp, dq_f->qlf_len), - 0, &bp); - if (error) { - xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#3)"); + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp); + if (error) return error; - } + ASSERT(bp); ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); @@ -2558,7 +2563,7 @@ xlog_recover_dquot_pass2( ASSERT(dq_f->qlf_size == 2); ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp); + xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); return (0); @@ -2642,7 +2647,8 @@ xlog_recover_efd_pass2( * xfs_trans_ail_delete() drops the * AIL lock. */ - xfs_trans_ail_delete(ailp, lip); + xfs_trans_ail_delete(ailp, lip, + SHUTDOWN_CORRUPT_INCORE); xfs_efi_item_free(efip); spin_lock(&ailp->xa_lock); break; @@ -2683,9 +2689,9 @@ xlog_recover_free_trans( STATIC int xlog_recover_commit_pass1( - struct log *log, - struct xlog_recover *trans, - xlog_recover_item_t *item) + struct xlog *log, + struct xlog_recover *trans, + struct xlog_recover_item *item) { trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1); @@ -2710,23 +2716,24 @@ xlog_recover_commit_pass1( STATIC int xlog_recover_commit_pass2( - struct log *log, - struct xlog_recover *trans, - xlog_recover_item_t *item) + struct xlog *log, + struct xlog_recover *trans, + struct list_head *buffer_list, + struct xlog_recover_item *item) { trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); switch (ITEM_TYPE(item)) { case XFS_LI_BUF: - return xlog_recover_buffer_pass2(log, item); + return xlog_recover_buffer_pass2(log, buffer_list, item); case XFS_LI_INODE: - return xlog_recover_inode_pass2(log, item); + return xlog_recover_inode_pass2(log, buffer_list, item); case XFS_LI_EFI: return xlog_recover_efi_pass2(log, item, trans->r_lsn); case XFS_LI_EFD: return xlog_recover_efd_pass2(log, item); case XFS_LI_DQUOT: - return xlog_recover_dquot_pass2(log, item); + return xlog_recover_dquot_pass2(log, buffer_list, item); case XFS_LI_QUOTAOFF: /* nothing to do in pass2 */ return 0; @@ -2746,12 +2753,13 @@ xlog_recover_commit_pass2( */ STATIC int xlog_recover_commit_trans( - struct log *log, + struct xlog *log, struct xlog_recover *trans, int pass) { - int error = 0; + int error = 0, error2; xlog_recover_item_t *item; + LIST_HEAD (buffer_list); hlist_del(&trans->r_list); @@ -2760,22 +2768,33 @@ xlog_recover_commit_trans( return error; list_for_each_entry(item, &trans->r_itemq, ri_list) { - if (pass == XLOG_RECOVER_PASS1) + switch (pass) { + case XLOG_RECOVER_PASS1: error = xlog_recover_commit_pass1(log, trans, item); - else - error = xlog_recover_commit_pass2(log, trans, item); + break; + case XLOG_RECOVER_PASS2: + error = xlog_recover_commit_pass2(log, trans, + &buffer_list, item); + break; + default: + ASSERT(0); + } + if (error) - return error; + goto out; } xlog_recover_free_trans(trans); - return 0; + +out: + error2 = xfs_buf_delwri_submit(&buffer_list); + return error ? error : error2; } STATIC int xlog_recover_unmount_trans( - struct log *log, - xlog_recover_t *trans) + struct xlog *log, + struct xlog_recover *trans) { /* Do nothing now */ xfs_warn(log->l_mp, "%s: Unmount LR", __func__); @@ -3079,7 +3098,7 @@ xlog_recover_process_one_iunlink( /* * Get the on disk inode to find the next inode in the bucket. */ - error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(mp, NULL, ip, &dip, &ibp, 0); if (error) goto fail_iput; @@ -3161,37 +3180,26 @@ xlog_recover_process_iunlinks( */ continue; } + /* + * Unlock the buffer so that it can be acquired in the normal + * course of the transaction to truncate and free each inode. + * Because we are not racing with anyone else here for the AGI + * buffer, we don't even need to hold it locked to read the + * initial unlinked bucket entries out of the buffer. We keep + * buffer reference though, so that it stays pinned in memory + * while we need the buffer. + */ agi = XFS_BUF_TO_AGI(agibp); + xfs_buf_unlock(agibp); for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { agino = be32_to_cpu(agi->agi_unlinked[bucket]); while (agino != NULLAGINO) { - /* - * Release the agi buffer so that it can - * be acquired in the normal course of the - * transaction to truncate and free the inode. - */ - xfs_buf_relse(agibp); - agino = xlog_recover_process_one_iunlink(mp, agno, agino, bucket); - - /* - * Reacquire the agibuffer and continue around - * the loop. This should never fail as we know - * the buffer was good earlier on. - */ - error = xfs_read_agi(mp, NULL, agno, &agibp); - ASSERT(error == 0); - agi = XFS_BUF_TO_AGI(agibp); } } - - /* - * Release the buffer for the current agi so we can - * go on to the next one. - */ - xfs_buf_relse(agibp); + xfs_buf_rele(agibp); } mp->m_dmevmask = mp_dmevmask; @@ -3650,11 +3658,8 @@ xlog_do_recover( * First replay the images in the log. */ error = xlog_do_log_recovery(log, head_blk, tail_blk); - if (error) { + if (error) return error; - } - - xfs_flush_buftarg(log->l_mp->m_ddev_targp, 1); /* * If IO errors happened during recovery, bail out. @@ -3681,7 +3686,6 @@ xlog_do_recover( bp = xfs_getsb(log->l_mp, 0); XFS_BUF_UNDONE(bp); ASSERT(!(XFS_BUF_ISWRITE(bp))); - ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); XFS_BUF_READ(bp); XFS_BUF_UNASYNC(bp); xfsbdstrat(log->l_mp, bp); @@ -3695,7 +3699,7 @@ xlog_do_recover( /* Convert superblock from on-disk format */ sbp = &log->l_mp->m_sb; - xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); + xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp)); ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); ASSERT(xfs_sb_good_version(sbp)); xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index bd672def95a..331cd9f83a7 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index d06afbc3540..536021fb3d4 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -22,6 +22,7 @@ #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" +#include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" @@ -37,7 +38,6 @@ #include "xfs_rtalloc.h" #include "xfs_bmap.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_quota.h" #include "xfs_fsops.h" #include "xfs_utils.h" @@ -158,7 +158,7 @@ xfs_uuid_mount( out_duplicate: mutex_unlock(&xfs_uuid_table_mutex); - xfs_warn(mp, "Filesystem has duplicate UUID - can't mount"); + xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid); return XFS_ERROR(EINVAL); } @@ -553,9 +553,11 @@ out_unwind: void xfs_sb_from_disk( - xfs_sb_t *to, + struct xfs_mount *mp, xfs_dsb_t *from) { + struct xfs_sb *to = &mp->m_sb; + to->sb_magicnum = be32_to_cpu(from->sb_magicnum); to->sb_blocksize = be32_to_cpu(from->sb_blocksize); to->sb_dblocks = be64_to_cpu(from->sb_dblocks); @@ -681,8 +683,8 @@ xfs_readsb(xfs_mount_t *mp, int flags) sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); reread: - bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, - XFS_SB_DADDR, sector_size, 0); + bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, + BTOBB(sector_size), 0); if (!bp) { if (loud) xfs_warn(mp, "SB buffer read failed"); @@ -693,7 +695,7 @@ reread: * Initialize the mount structure from the superblock. * But first do some basic consistency checking. */ - xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); + xfs_sb_from_disk(mp, XFS_BUF_TO_SBP(bp)); error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); if (error) { if (loud) @@ -1030,9 +1032,9 @@ xfs_check_sizes(xfs_mount_t *mp) xfs_warn(mp, "filesystem size mismatch detected"); return XFS_ERROR(EFBIG); } - bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, + bp = xfs_buf_read_uncached(mp->m_ddev_targp, d - XFS_FSS_TO_BB(mp, 1), - BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); + XFS_FSS_TO_BB(mp, 1), 0); if (!bp) { xfs_warn(mp, "last sector read failed"); return EIO; @@ -1045,9 +1047,9 @@ xfs_check_sizes(xfs_mount_t *mp) xfs_warn(mp, "log size mismatch detected"); return XFS_ERROR(EFBIG); } - bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp, + bp = xfs_buf_read_uncached(mp->m_logdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_B(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0); if (!bp) { xfs_warn(mp, "log device read failed"); return EIO; @@ -1286,7 +1288,7 @@ xfs_mountfs( XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); if (error) { xfs_warn(mp, "log mount failed"); - goto out_free_perag; + goto out_fail_wait; } /* @@ -1313,7 +1315,7 @@ xfs_mountfs( !mp->m_sb.sb_inprogress) { error = xfs_initialize_perag_data(mp, sbp->sb_agcount); if (error) - goto out_free_perag; + goto out_fail_wait; } /* @@ -1437,6 +1439,10 @@ xfs_mountfs( IRELE(rip); out_log_dealloc: xfs_log_unmount(mp); + out_fail_wait: + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) + xfs_wait_buftarg(mp->m_logdev_targp); + xfs_wait_buftarg(mp->m_ddev_targp); out_free_perag: xfs_free_perag(mp); out_remove_uuid: @@ -1473,15 +1479,15 @@ xfs_unmountfs( xfs_log_force(mp, XFS_LOG_SYNC); /* - * Do a delwri reclaim pass first so that as many dirty inodes are - * queued up for IO as possible. Then flush the buffers before making - * a synchronous path to catch all the remaining inodes are reclaimed. - * This makes the reclaim process as quick as possible by avoiding - * synchronous writeout and blocking on inodes already in the delwri - * state as much as possible. + * Flush all pending changes from the AIL. + */ + xfs_ail_push_all_sync(mp->m_ail); + + /* + * And reclaim all inodes. At this point there should be no dirty + * inode, and none should be pinned or locked, but use synchronous + * reclaim just to be sure. */ - xfs_reclaim_inodes(mp, 0); - xfs_flush_buftarg(mp->m_ddev_targp, 1); xfs_reclaim_inodes(mp, SYNC_WAIT); xfs_qm_unmount(mp); @@ -1517,15 +1523,12 @@ xfs_unmountfs( if (error) xfs_warn(mp, "Unable to update superblock counters. " "Freespace may not be correct on next mount."); - xfs_unmountfs_writesb(mp); /* - * Make sure all buffers have been flushed and completed before - * unmounting the log. + * At this point we might have modified the superblock again and thus + * added an item to the AIL, thus flush it again. */ - error = xfs_flush_buftarg(mp->m_ddev_targp, 1); - if (error) - xfs_warn(mp, "%d busy buffers during unmount.", error); + xfs_ail_push_all_sync(mp->m_ail); xfs_wait_buftarg(mp->m_ddev_targp); xfs_log_unmount_write(mp); @@ -1586,36 +1589,6 @@ xfs_log_sbcount(xfs_mount_t *mp) return error; } -int -xfs_unmountfs_writesb(xfs_mount_t *mp) -{ - xfs_buf_t *sbp; - int error = 0; - - /* - * skip superblock write if fs is read-only, or - * if we are doing a forced umount. - */ - if (!((mp->m_flags & XFS_MOUNT_RDONLY) || - XFS_FORCED_SHUTDOWN(mp))) { - - sbp = xfs_getsb(mp, 0); - - XFS_BUF_UNDONE(sbp); - XFS_BUF_UNREAD(sbp); - xfs_buf_delwri_dequeue(sbp); - XFS_BUF_WRITE(sbp); - XFS_BUF_UNASYNC(sbp); - ASSERT(sbp->b_target == mp->m_ddev_targp); - xfsbdstrat(mp, sbp); - error = xfs_buf_iowait(sbp); - if (error) - xfs_buf_ioerror_alert(sbp, __func__); - xfs_buf_relse(sbp); - } - return error; -} - /* * xfs_mod_sb() can be used to copy arbitrary changes to the * in-core superblock into the superblock buffer to be logged. diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index bb24dac42a2..90c1fc9eaea 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -53,7 +53,7 @@ typedef struct xfs_trans_reservations { #include "xfs_sync.h" -struct log; +struct xlog; struct xfs_mount_args; struct xfs_inode; struct xfs_bmbt_irec; @@ -133,7 +133,7 @@ typedef struct xfs_mount { uint m_readio_blocks; /* min read size blocks */ uint m_writeio_log; /* min write size log bytes */ uint m_writeio_blocks; /* min write size blocks */ - struct log *m_log; /* log specific stuff */ + struct xlog *m_log; /* log specific stuff */ int m_logbufs; /* number of log buffers */ int m_logbsize; /* size of each log buffer */ uint m_rsumlevels; /* rt summary levels */ @@ -211,6 +211,10 @@ typedef struct xfs_mount { struct shrinker m_inode_shrink; /* inode reclaim shrinker */ int64_t m_low_space[XFS_LOWSP_MAX]; /* low free space thresholds */ + + struct workqueue_struct *m_data_workqueue; + struct workqueue_struct *m_unwritten_workqueue; + struct workqueue_struct *m_cil_workqueue; } xfs_mount_t; /* @@ -219,7 +223,6 @@ typedef struct xfs_mount { #define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops must be synchronous except for space allocations */ -#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */ #define XFS_MOUNT_WAS_CLEAN (1ULL << 3) #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem operations, typically for @@ -376,7 +379,6 @@ extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); extern void xfs_unmountfs(xfs_mount_t *); -extern int xfs_unmountfs_writesb(xfs_mount_t *); extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, uint, int); @@ -396,7 +398,7 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *); extern void xfs_mod_sb(struct xfs_trans *, __int64_t); extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, xfs_agnumber_t *); -extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); +extern void xfs_sb_from_disk(struct xfs_mount *, struct xfs_dsb *); extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 0bbb1a41998..249db198776 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -48,227 +47,182 @@ * quota functionality, including maintaining the freelist and hash * tables of dquots. */ -struct mutex xfs_Gqm_lock; -struct xfs_qm *xfs_Gqm; -uint ndquot; - -kmem_zone_t *qm_dqzone; -kmem_zone_t *qm_dqtrxzone; - -STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); -STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); - STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *); -static struct shrinker xfs_qm_shaker = { - .shrink = xfs_qm_shake, - .seeks = DEFAULT_SEEKS, -}; - /* - * Initialize the XQM structure. - * Note that there is not one quota manager per file system. + * We use the batch lookup interface to iterate over the dquots as it + * currently is the only interface into the radix tree code that allows + * fuzzy lookups instead of exact matches. Holding the lock over multiple + * operations is fine as all callers are used either during mount/umount + * or quotaoff. */ -STATIC struct xfs_qm * -xfs_Gqm_init(void) +#define XFS_DQ_LOOKUP_BATCH 32 + +STATIC int +xfs_qm_dquot_walk( + struct xfs_mount *mp, + int type, + int (*execute)(struct xfs_dquot *dqp, void *data), + void *data) { - xfs_dqhash_t *udqhash, *gdqhash; - xfs_qm_t *xqm; - size_t hsize; - uint i; + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); + uint32_t next_index; + int last_error = 0; + int skipped; + int nr_found; + +restart: + skipped = 0; + next_index = 0; + nr_found = 0; + + while (1) { + struct xfs_dquot *batch[XFS_DQ_LOOKUP_BATCH]; + int error = 0; + int i; + + mutex_lock(&qi->qi_tree_lock); + nr_found = radix_tree_gang_lookup(tree, (void **)batch, + next_index, XFS_DQ_LOOKUP_BATCH); + if (!nr_found) { + mutex_unlock(&qi->qi_tree_lock); + break; + } - /* - * Initialize the dquot hash tables. - */ - udqhash = kmem_zalloc_greedy(&hsize, - XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t), - XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t)); - if (!udqhash) - goto out; + for (i = 0; i < nr_found; i++) { + struct xfs_dquot *dqp = batch[i]; - gdqhash = kmem_zalloc_large(hsize); - if (!gdqhash) - goto out_free_udqhash; + next_index = be32_to_cpu(dqp->q_core.d_id) + 1; - hsize /= sizeof(xfs_dqhash_t); - ndquot = hsize << 8; + error = execute(batch[i], data); + if (error == EAGAIN) { + skipped++; + continue; + } + if (error && last_error != EFSCORRUPTED) + last_error = error; + } - xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP); - xqm->qm_dqhashmask = hsize - 1; - xqm->qm_usr_dqhtable = udqhash; - xqm->qm_grp_dqhtable = gdqhash; - ASSERT(xqm->qm_usr_dqhtable != NULL); - ASSERT(xqm->qm_grp_dqhtable != NULL); + mutex_unlock(&qi->qi_tree_lock); - for (i = 0; i < hsize; i++) { - xfs_qm_list_init(&(xqm->qm_usr_dqhtable[i]), "uxdqh", i); - xfs_qm_list_init(&(xqm->qm_grp_dqhtable[i]), "gxdqh", i); + /* bail out if the filesystem is corrupted. */ + if (last_error == EFSCORRUPTED) { + skipped = 0; + break; + } } - /* - * Freelist of all dquots of all file systems - */ - INIT_LIST_HEAD(&xqm->qm_dqfrlist); - xqm->qm_dqfrlist_cnt = 0; - mutex_init(&xqm->qm_dqfrlist_lock); - - /* - * dquot zone. we register our own low-memory callback. - */ - if (!qm_dqzone) { - xqm->qm_dqzone = kmem_zone_init(sizeof(xfs_dquot_t), - "xfs_dquots"); - qm_dqzone = xqm->qm_dqzone; - } else - xqm->qm_dqzone = qm_dqzone; - - register_shrinker(&xfs_qm_shaker); - - /* - * The t_dqinfo portion of transactions. - */ - if (!qm_dqtrxzone) { - xqm->qm_dqtrxzone = kmem_zone_init(sizeof(xfs_dquot_acct_t), - "xfs_dqtrx"); - qm_dqtrxzone = xqm->qm_dqtrxzone; - } else - xqm->qm_dqtrxzone = qm_dqtrxzone; - - atomic_set(&xqm->qm_totaldquots, 0); - xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO; - xqm->qm_nrefs = 0; - return xqm; + if (skipped) { + delay(1); + goto restart; + } - out_free_udqhash: - kmem_free_large(udqhash); - out: - return NULL; + return last_error; } -/* - * Destroy the global quota manager when its reference count goes to zero. - */ -STATIC void -xfs_qm_destroy( - struct xfs_qm *xqm) -{ - struct xfs_dquot *dqp, *n; - int hsize, i; - - ASSERT(xqm != NULL); - ASSERT(xqm->qm_nrefs == 0); - unregister_shrinker(&xfs_qm_shaker); - hsize = xqm->qm_dqhashmask + 1; - for (i = 0; i < hsize; i++) { - xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); - xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i])); - } - kmem_free_large(xqm->qm_usr_dqhtable); - kmem_free_large(xqm->qm_grp_dqhtable); - xqm->qm_usr_dqhtable = NULL; - xqm->qm_grp_dqhtable = NULL; - xqm->qm_dqhashmask = 0; - - /* frlist cleanup */ - mutex_lock(&xqm->qm_dqfrlist_lock); - list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) { - xfs_dqlock(dqp); - list_del_init(&dqp->q_freelist); - xfs_Gqm->qm_dqfrlist_cnt--; - xfs_dqunlock(dqp); - xfs_qm_dqdestroy(dqp); - } - mutex_unlock(&xqm->qm_dqfrlist_lock); - mutex_destroy(&xqm->qm_dqfrlist_lock); - kmem_free(xqm); -} /* - * Called at mount time to let XQM know that another file system is - * starting quotas. This isn't crucial information as the individual mount - * structures are pretty independent, but it helps the XQM keep a - * global view of what's going on. + * Purge a dquot from all tracking data structures and free it. */ -/* ARGSUSED */ STATIC int -xfs_qm_hold_quotafs_ref( - struct xfs_mount *mp) +xfs_qm_dqpurge( + struct xfs_dquot *dqp, + void *data) { - /* - * Need to lock the xfs_Gqm structure for things like this. For example, - * the structure could disappear between the entry to this routine and - * a HOLD operation if not locked. - */ - mutex_lock(&xfs_Gqm_lock); + struct xfs_mount *mp = dqp->q_mount; + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct xfs_dquot *gdqp = NULL; - if (!xfs_Gqm) { - xfs_Gqm = xfs_Gqm_init(); - if (!xfs_Gqm) { - mutex_unlock(&xfs_Gqm_lock); - return ENOMEM; - } + xfs_dqlock(dqp); + if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { + xfs_dqunlock(dqp); + return EAGAIN; } /* - * We can keep a list of all filesystems with quotas mounted for - * debugging and statistical purposes, but ... - * Just take a reference and get out. + * If this quota has a group hint attached, prepare for releasing it + * now. */ - xfs_Gqm->qm_nrefs++; - mutex_unlock(&xfs_Gqm_lock); + gdqp = dqp->q_gdquot; + if (gdqp) { + xfs_dqlock(gdqp); + dqp->q_gdquot = NULL; + } - return 0; -} + dqp->dq_flags |= XFS_DQ_FREEING; - -/* - * Release the reference that a filesystem took at mount time, - * so that we know when we need to destroy the entire quota manager. - */ -/* ARGSUSED */ -STATIC void -xfs_qm_rele_quotafs_ref( - struct xfs_mount *mp) -{ - xfs_dquot_t *dqp, *n; - - ASSERT(xfs_Gqm); - ASSERT(xfs_Gqm->qm_nrefs > 0); + xfs_dqflock(dqp); /* - * Go thru the freelist and destroy all inactive dquots. + * If we are turning this type of quotas off, we don't care + * about the dirty metadata sitting in this dquot. OTOH, if + * we're unmounting, we do care, so we flush it and wait. */ - mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - - list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) { - xfs_dqlock(dqp); - if (dqp->dq_flags & XFS_DQ_INACTIVE) { - ASSERT(dqp->q_mount == NULL); - ASSERT(! XFS_DQ_IS_DIRTY(dqp)); - ASSERT(list_empty(&dqp->q_hashlist)); - ASSERT(list_empty(&dqp->q_mplist)); - list_del_init(&dqp->q_freelist); - xfs_Gqm->qm_dqfrlist_cnt--; - xfs_dqunlock(dqp); - xfs_qm_dqdestroy(dqp); + if (XFS_DQ_IS_DIRTY(dqp)) { + struct xfs_buf *bp = NULL; + int error; + + /* + * We don't care about getting disk errors here. We need + * to purge this dquot anyway, so we go ahead regardless. + */ + error = xfs_qm_dqflush(dqp, &bp); + if (error) { + xfs_warn(mp, "%s: dquot %p flush failed", + __func__, dqp); } else { - xfs_dqunlock(dqp); + error = xfs_bwrite(bp); + xfs_buf_relse(bp); } + xfs_dqflock(dqp); } - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + + ASSERT(atomic_read(&dqp->q_pincount) == 0); + ASSERT(XFS_FORCED_SHUTDOWN(mp) || + !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); + + xfs_dqfunlock(dqp); + xfs_dqunlock(dqp); + + radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags), + be32_to_cpu(dqp->q_core.d_id)); + qi->qi_dquots--; /* - * Destroy the entire XQM. If somebody mounts with quotaon, this'll - * be restarted. + * We move dquots to the freelist as soon as their reference count + * hits zero, so it really should be on the freelist here. */ - mutex_lock(&xfs_Gqm_lock); - if (--xfs_Gqm->qm_nrefs == 0) { - xfs_qm_destroy(xfs_Gqm); - xfs_Gqm = NULL; - } - mutex_unlock(&xfs_Gqm_lock); + mutex_lock(&qi->qi_lru_lock); + ASSERT(!list_empty(&dqp->q_lru)); + list_del_init(&dqp->q_lru); + qi->qi_lru_count--; + XFS_STATS_DEC(xs_qm_dquot_unused); + mutex_unlock(&qi->qi_lru_lock); + + xfs_qm_dqdestroy(dqp); + + if (gdqp) + xfs_qm_dqput(gdqp); + return 0; +} + +/* + * Purge the dquot cache. + */ +void +xfs_qm_dqpurge_all( + struct xfs_mount *mp, + uint flags) +{ + if (flags & XFS_QMOPT_UQUOTA) + xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL); + if (flags & XFS_QMOPT_GQUOTA) + xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL); + if (flags & XFS_QMOPT_PQUOTA) + xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL); } /* @@ -409,200 +363,6 @@ xfs_qm_unmount_quotas( } } -/* - * Flush all dquots of the given file system to disk. The dquots are - * _not_ purged from memory here, just their data written to disk. - */ -STATIC int -xfs_qm_dqflush_all( - struct xfs_mount *mp, - int sync_mode) -{ - struct xfs_quotainfo *q = mp->m_quotainfo; - int recl; - struct xfs_dquot *dqp; - int error; - - if (!q) - return 0; -again: - mutex_lock(&q->qi_dqlist_lock); - list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { - xfs_dqlock(dqp); - if (! XFS_DQ_IS_DIRTY(dqp)) { - xfs_dqunlock(dqp); - continue; - } - - /* XXX a sentinel would be better */ - recl = q->qi_dqreclaims; - if (!xfs_dqflock_nowait(dqp)) { - /* - * If we can't grab the flush lock then check - * to see if the dquot has been flushed delayed - * write. If so, grab its buffer and send it - * out immediately. We'll be able to acquire - * the flush lock when the I/O completes. - */ - xfs_qm_dqflock_pushbuf_wait(dqp); - } - /* - * Let go of the mplist lock. We don't want to hold it - * across a disk write. - */ - mutex_unlock(&q->qi_dqlist_lock); - error = xfs_qm_dqflush(dqp, sync_mode); - xfs_dqunlock(dqp); - if (error) - return error; - - mutex_lock(&q->qi_dqlist_lock); - if (recl != q->qi_dqreclaims) { - mutex_unlock(&q->qi_dqlist_lock); - /* XXX restart limit */ - goto again; - } - } - - mutex_unlock(&q->qi_dqlist_lock); - /* return ! busy */ - return 0; -} -/* - * Release the group dquot pointers the user dquots may be - * carrying around as a hint. mplist is locked on entry and exit. - */ -STATIC void -xfs_qm_detach_gdquots( - struct xfs_mount *mp) -{ - struct xfs_quotainfo *q = mp->m_quotainfo; - struct xfs_dquot *dqp, *gdqp; - int nrecl; - - again: - ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); - list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { - xfs_dqlock(dqp); - if ((gdqp = dqp->q_gdquot)) { - xfs_dqlock(gdqp); - dqp->q_gdquot = NULL; - } - xfs_dqunlock(dqp); - - if (gdqp) { - /* - * Can't hold the mplist lock across a dqput. - * XXXmust convert to marker based iterations here. - */ - nrecl = q->qi_dqreclaims; - mutex_unlock(&q->qi_dqlist_lock); - xfs_qm_dqput(gdqp); - - mutex_lock(&q->qi_dqlist_lock); - if (nrecl != q->qi_dqreclaims) - goto again; - } - } -} - -/* - * Go through all the incore dquots of this file system and take them - * off the mplist and hashlist, if the dquot type matches the dqtype - * parameter. This is used when turning off quota accounting for - * users and/or groups, as well as when the filesystem is unmounting. - */ -STATIC int -xfs_qm_dqpurge_int( - struct xfs_mount *mp, - uint flags) -{ - struct xfs_quotainfo *q = mp->m_quotainfo; - struct xfs_dquot *dqp, *n; - uint dqtype; - int nrecl; - int nmisses; - - if (!q) - return 0; - - dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0; - dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0; - dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0; - - mutex_lock(&q->qi_dqlist_lock); - - /* - * In the first pass through all incore dquots of this filesystem, - * we release the group dquot pointers the user dquots may be - * carrying around as a hint. We need to do this irrespective of - * what's being turned off. - */ - xfs_qm_detach_gdquots(mp); - - again: - nmisses = 0; - ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); - /* - * Try to get rid of all of the unwanted dquots. The idea is to - * get them off mplist and hashlist, but leave them on freelist. - */ - list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) { - /* - * It's OK to look at the type without taking dqlock here. - * We're holding the mplist lock here, and that's needed for - * a dqreclaim. - */ - if ((dqp->dq_flags & dqtype) == 0) - continue; - - if (!mutex_trylock(&dqp->q_hash->qh_lock)) { - nrecl = q->qi_dqreclaims; - mutex_unlock(&q->qi_dqlist_lock); - mutex_lock(&dqp->q_hash->qh_lock); - mutex_lock(&q->qi_dqlist_lock); - - /* - * XXXTheoretically, we can get into a very long - * ping pong game here. - * No one can be adding dquots to the mplist at - * this point, but somebody might be taking things off. - */ - if (nrecl != q->qi_dqreclaims) { - mutex_unlock(&dqp->q_hash->qh_lock); - goto again; - } - } - - /* - * Take the dquot off the mplist and hashlist. It may remain on - * freelist in INACTIVE state. - */ - nmisses += xfs_qm_dqpurge(dqp); - } - mutex_unlock(&q->qi_dqlist_lock); - return nmisses; -} - -int -xfs_qm_dqpurge_all( - xfs_mount_t *mp, - uint flags) -{ - int ndquots; - - /* - * Purge the dquot cache. - * None of the dquots should really be busy at this point. - */ - if (mp->m_quotainfo) { - while ((ndquots = xfs_qm_dqpurge_int(mp, flags))) { - delay(ndquots * 10); - } - } - return 0; -} - STATIC int xfs_qm_dqattach_one( xfs_inode_t *ip, @@ -648,12 +408,9 @@ xfs_qm_dqattach_one( */ dqp = udqhint->q_gdquot; if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) { - xfs_dqlock(dqp); - XFS_DQHOLD(dqp); ASSERT(*IO_idqpp == NULL); - *IO_idqpp = dqp; - xfs_dqunlock(dqp); + *IO_idqpp = xfs_qm_dqhold(dqp); xfs_dqunlock(udqhint); return 0; } @@ -693,11 +450,7 @@ xfs_qm_dqattach_one( /* * Given a udquot and gdquot, attach a ptr to the group dquot in the - * udquot as a hint for future lookups. The idea sounds simple, but the - * execution isn't, because the udquot might have a group dquot attached - * already and getting rid of that gets us into lock ordering constraints. - * The process is complicated more by the fact that the dquots may or may not - * be locked on entry. + * udquot as a hint for future lookups. */ STATIC void xfs_qm_dqattach_grouphint( @@ -708,48 +461,37 @@ xfs_qm_dqattach_grouphint( xfs_dqlock(udq); - if ((tmp = udq->q_gdquot)) { - if (tmp == gdq) { - xfs_dqunlock(udq); - return; - } + tmp = udq->q_gdquot; + if (tmp) { + if (tmp == gdq) + goto done; udq->q_gdquot = NULL; - /* - * We can't keep any dqlocks when calling dqrele, - * because the freelist lock comes before dqlocks. - */ - xfs_dqunlock(udq); - /* - * we took a hard reference once upon a time in dqget, - * so give it back when the udquot no longer points at it - * dqput() does the unlocking of the dquot. - */ xfs_qm_dqrele(tmp); - - xfs_dqlock(udq); - xfs_dqlock(gdq); - - } else { - ASSERT(XFS_DQ_IS_LOCKED(udq)); - xfs_dqlock(gdq); } - ASSERT(XFS_DQ_IS_LOCKED(udq)); - ASSERT(XFS_DQ_IS_LOCKED(gdq)); - /* - * Somebody could have attached a gdquot here, - * when we dropped the uqlock. If so, just do nothing. - */ - if (udq->q_gdquot == NULL) { - XFS_DQHOLD(gdq); - udq->q_gdquot = gdq; - } - - xfs_dqunlock(gdq); + udq->q_gdquot = xfs_qm_dqhold(gdq); +done: xfs_dqunlock(udq); } +static bool +xfs_qm_need_dqattach( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return false; + if (!XFS_IS_QUOTA_ON(mp)) + return false; + if (!XFS_NOT_DQATTACHED(mp, ip)) + return false; + if (ip->i_ino == mp->m_sb.sb_uquotino || + ip->i_ino == mp->m_sb.sb_gquotino) + return false; + return true; +} /* * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON @@ -767,11 +509,7 @@ xfs_qm_dqattach_locked( uint nquotas = 0; int error = 0; - if (!XFS_IS_QUOTA_RUNNING(mp) || - !XFS_IS_QUOTA_ON(mp) || - !XFS_NOT_DQATTACHED(mp, ip) || - ip->i_ino == mp->m_sb.sb_uquotino || - ip->i_ino == mp->m_sb.sb_gquotino) + if (!xfs_qm_need_dqattach(ip)) return 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -813,17 +551,13 @@ xfs_qm_dqattach_locked( ASSERT(ip->i_gdquot); /* - * We may or may not have the i_udquot locked at this point, - * but this check is OK since we don't depend on the i_gdquot to - * be accurate 100% all the time. It is just a hint, and this - * will succeed in general. - */ - if (ip->i_udquot->q_gdquot == ip->i_gdquot) - goto done; - /* - * Attach i_gdquot to the gdquot hint inside the i_udquot. + * We do not have i_udquot locked at this point, but this check + * is OK since we don't depend on the i_gdquot to be accurate + * 100% all the time. It is just a hint, and this will + * succeed in general. */ - xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot); + if (ip->i_udquot->q_gdquot != ip->i_gdquot) + xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot); } done: @@ -846,6 +580,9 @@ xfs_qm_dqattach( { int error; + if (!xfs_qm_need_dqattach(ip)) + return 0; + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_qm_dqattach_locked(ip, flags); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -879,108 +616,6 @@ xfs_qm_dqdetach( } } -int -xfs_qm_sync( - struct xfs_mount *mp, - int flags) -{ - struct xfs_quotainfo *q = mp->m_quotainfo; - int recl, restarts; - struct xfs_dquot *dqp; - int error; - - if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) - return 0; - - restarts = 0; - - again: - mutex_lock(&q->qi_dqlist_lock); - /* - * dqpurge_all() also takes the mplist lock and iterate thru all dquots - * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared - * when we have the mplist lock, we know that dquots will be consistent - * as long as we have it locked. - */ - if (!XFS_IS_QUOTA_ON(mp)) { - mutex_unlock(&q->qi_dqlist_lock); - return 0; - } - ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); - list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { - /* - * If this is vfs_sync calling, then skip the dquots that - * don't 'seem' to be dirty. ie. don't acquire dqlock. - * This is very similar to what xfs_sync does with inodes. - */ - if (flags & SYNC_TRYLOCK) { - if (!XFS_DQ_IS_DIRTY(dqp)) - continue; - if (!xfs_qm_dqlock_nowait(dqp)) - continue; - } else { - xfs_dqlock(dqp); - } - - /* - * Now, find out for sure if this dquot is dirty or not. - */ - if (! XFS_DQ_IS_DIRTY(dqp)) { - xfs_dqunlock(dqp); - continue; - } - - /* XXX a sentinel would be better */ - recl = q->qi_dqreclaims; - if (!xfs_dqflock_nowait(dqp)) { - if (flags & SYNC_TRYLOCK) { - xfs_dqunlock(dqp); - continue; - } - /* - * If we can't grab the flush lock then if the caller - * really wanted us to give this our best shot, so - * see if we can give a push to the buffer before we wait - * on the flush lock. At this point, we know that - * even though the dquot is being flushed, - * it has (new) dirty data. - */ - xfs_qm_dqflock_pushbuf_wait(dqp); - } - /* - * Let go of the mplist lock. We don't want to hold it - * across a disk write - */ - mutex_unlock(&q->qi_dqlist_lock); - error = xfs_qm_dqflush(dqp, flags); - xfs_dqunlock(dqp); - if (error && XFS_FORCED_SHUTDOWN(mp)) - return 0; /* Need to prevent umount failure */ - else if (error) - return error; - - mutex_lock(&q->qi_dqlist_lock); - if (recl != q->qi_dqreclaims) { - if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS) - break; - - mutex_unlock(&q->qi_dqlist_lock); - goto again; - } - } - - mutex_unlock(&q->qi_dqlist_lock); - return 0; -} - -/* - * The hash chains and the mplist use the same xfs_dqhash structure as - * their list head, but we can take the mplist qh_lock and one of the - * hash qh_locks at the same time without any problem as they aren't - * related. - */ -static struct lock_class_key xfs_quota_mplist_class; - /* * This initializes all the quota information that's kept in the * mount structure @@ -995,13 +630,6 @@ xfs_qm_init_quotainfo( ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - /* - * Tell XQM that we exist as soon as possible. - */ - if ((error = xfs_qm_hold_quotafs_ref(mp))) { - return error; - } - qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); /* @@ -1014,11 +642,13 @@ xfs_qm_init_quotainfo( return error; } - INIT_LIST_HEAD(&qinf->qi_dqlist); - mutex_init(&qinf->qi_dqlist_lock); - lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class); + INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); + INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); + mutex_init(&qinf->qi_tree_lock); - qinf->qi_dqreclaims = 0; + INIT_LIST_HEAD(&qinf->qi_lru_list); + qinf->qi_lru_count = 0; + mutex_init(&qinf->qi_lru_lock); /* mutex used to serialize quotaoffs */ mutex_init(&qinf->qi_quotaofflock); @@ -1034,18 +664,21 @@ xfs_qm_init_quotainfo( /* * We try to get the limits from the superuser's limits fields. * This is quite hacky, but it is standard quota practice. + * * We look at the USR dquot with id == 0 first, but if user quotas * are not enabled we goto the GRP dquot with id == 0. * We don't really care to keep separate default limits for user * and group quotas, at least not at this point. + * + * Since we may not have done a quotacheck by this point, just read + * the dquot without attaching it to any hashtables or lists. */ - error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)0, - XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : - (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP : - XFS_DQ_PROJ), - XFS_QMOPT_DQSUSER|XFS_QMOPT_DOWARN, - &dqp); - if (! error) { + error = xfs_qm_dqread(mp, 0, + XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : + (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP : + XFS_DQ_PROJ), + XFS_QMOPT_DOWARN, &dqp); + if (!error) { xfs_disk_dquot_t *ddqp = &dqp->q_core; /* @@ -1072,11 +705,6 @@ xfs_qm_init_quotainfo( qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); - /* - * We sent the XFS_QMOPT_DQSUSER flag to dqget because - * we don't want this dquot cached. We haven't done a - * quotacheck yet, and quotacheck doesn't like incore dquots. - */ xfs_qm_dqdestroy(dqp); } else { qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; @@ -1087,6 +715,9 @@ xfs_qm_init_quotainfo( qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; } + qinf->qi_shrinker.shrink = xfs_qm_shake; + qinf->qi_shrinker.seeks = DEFAULT_SEEKS; + register_shrinker(&qinf->qi_shrinker); return 0; } @@ -1104,17 +735,8 @@ xfs_qm_destroy_quotainfo( qi = mp->m_quotainfo; ASSERT(qi != NULL); - ASSERT(xfs_Gqm != NULL); - - /* - * Release the reference that XQM kept, so that we know - * when the XQM structure should be freed. We cannot assume - * that xfs_Gqm is non-null after this point. - */ - xfs_qm_rele_quotafs_ref(mp); - ASSERT(list_empty(&qi->qi_dqlist)); - mutex_destroy(&qi->qi_dqlist_lock); + unregister_shrinker(&qi->qi_shrinker); if (qi->qi_uquotaip) { IRELE(qi->qi_uquotaip); @@ -1129,30 +751,6 @@ xfs_qm_destroy_quotainfo( mp->m_quotainfo = NULL; } - - -/* ------------------- PRIVATE STATIC FUNCTIONS ----------------------- */ - -/* ARGSUSED */ -STATIC void -xfs_qm_list_init( - xfs_dqlist_t *list, - char *str, - int n) -{ - mutex_init(&list->qh_lock); - INIT_LIST_HEAD(&list->qh_list); - list->qh_version = 0; - list->qh_nelems = 0; -} - -STATIC void -xfs_qm_list_destroy( - xfs_dqlist_t *list) -{ - mutex_destroy(&(list->qh_lock)); -} - /* * Create an inode and return with a reference already taken, but unlocked * This is how we create quota inodes @@ -1265,15 +863,16 @@ xfs_qm_reset_dqcounts( STATIC int xfs_qm_dqiter_bufs( - xfs_mount_t *mp, - xfs_dqid_t firstid, - xfs_fsblock_t bno, - xfs_filblks_t blkcnt, - uint flags) + struct xfs_mount *mp, + xfs_dqid_t firstid, + xfs_fsblock_t bno, + xfs_filblks_t blkcnt, + uint flags, + struct list_head *buffer_list) { - xfs_buf_t *bp; - int error; - int type; + struct xfs_buf *bp; + int error; + int type; ASSERT(blkcnt > 0); type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : @@ -1297,7 +896,7 @@ xfs_qm_dqiter_bufs( break; xfs_qm_reset_dqcounts(mp, bp, firstid, type); - xfs_buf_delwri_queue(bp); + xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); /* * goto the next block. @@ -1305,6 +904,7 @@ xfs_qm_dqiter_bufs( bno++; firstid += mp->m_quotainfo->qi_dqperchunk; } + return error; } @@ -1314,11 +914,12 @@ xfs_qm_dqiter_bufs( */ STATIC int xfs_qm_dqiterate( - xfs_mount_t *mp, - xfs_inode_t *qip, - uint flags) + struct xfs_mount *mp, + struct xfs_inode *qip, + uint flags, + struct list_head *buffer_list) { - xfs_bmbt_irec_t *map; + struct xfs_bmbt_irec *map; int i, nmaps; /* number of map entries */ int error; /* return value */ xfs_fileoff_t lblkno; @@ -1385,21 +986,17 @@ xfs_qm_dqiterate( * Iterate thru all the blks in the extent and * reset the counters of all the dquots inside them. */ - if ((error = xfs_qm_dqiter_bufs(mp, - firstid, - map[i].br_startblock, - map[i].br_blockcount, - flags))) { - break; - } + error = xfs_qm_dqiter_bufs(mp, firstid, + map[i].br_startblock, + map[i].br_blockcount, + flags, buffer_list); + if (error) + goto out; } - - if (error) - break; } while (nmaps > 0); +out: kmem_free(map); - return error; } @@ -1590,6 +1187,33 @@ error0: return error; } +STATIC int +xfs_qm_flush_one( + struct xfs_dquot *dqp, + void *data) +{ + struct list_head *buffer_list = data; + struct xfs_buf *bp = NULL; + int error = 0; + + xfs_dqlock(dqp); + if (dqp->dq_flags & XFS_DQ_FREEING) + goto out_unlock; + if (!XFS_DQ_IS_DIRTY(dqp)) + goto out_unlock; + + xfs_dqflock(dqp); + error = xfs_qm_dqflush(dqp, &bp); + if (error) + goto out_unlock; + + xfs_buf_delwri_queue(bp, buffer_list); + xfs_buf_relse(bp); +out_unlock: + xfs_dqunlock(dqp); + return error; +} + /* * Walk thru all the filesystem inodes and construct a consistent view * of the disk quota world. If the quotacheck fails, disable quotas. @@ -1598,11 +1222,12 @@ int xfs_qm_quotacheck( xfs_mount_t *mp) { - int done, count, error; + int done, count, error, error2; xfs_ino_t lastino; size_t structsz; xfs_inode_t *uip, *gip; uint flags; + LIST_HEAD (buffer_list); count = INT_MAX; structsz = 1; @@ -1612,12 +1237,6 @@ xfs_qm_quotacheck( ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - /* - * There should be no cached dquots. The (simplistic) quotacheck - * algorithm doesn't like that. - */ - ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist)); - xfs_notice(mp, "Quotacheck needed: Please wait."); /* @@ -1627,7 +1246,8 @@ xfs_qm_quotacheck( */ uip = mp->m_quotainfo->qi_uquotaip; if (uip) { - error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA); + error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA, + &buffer_list); if (error) goto error_return; flags |= XFS_UQUOTA_CHKD; @@ -1636,7 +1256,8 @@ xfs_qm_quotacheck( gip = mp->m_quotainfo->qi_gquotaip; if (gip) { error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? - XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA); + XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA, + &buffer_list); if (error) goto error_return; flags |= XFS_OQUOTA_CHKD; @@ -1656,12 +1277,29 @@ xfs_qm_quotacheck( } while (!done); /* - * We've made all the changes that we need to make incore. - * Flush them down to disk buffers if everything was updated - * successfully. + * We've made all the changes that we need to make incore. Flush them + * down to disk buffers if everything was updated successfully. */ + if (XFS_IS_UQUOTA_ON(mp)) { + error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one, + &buffer_list); + } + if (XFS_IS_GQUOTA_ON(mp)) { + error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one, + &buffer_list); + if (!error) + error = error2; + } + if (XFS_IS_PQUOTA_ON(mp)) { + error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one, + &buffer_list); + if (!error) + error = error2; + } + + error2 = xfs_buf_delwri_submit(&buffer_list); if (!error) - error = xfs_qm_dqflush_all(mp, 0); + error = error2; /* * We can get this error if we couldn't do a dquot allocation inside @@ -1676,23 +1314,21 @@ xfs_qm_quotacheck( } /* - * We didn't log anything, because if we crashed, we'll have to - * start the quotacheck from scratch anyway. However, we must make - * sure that our dquot changes are secure before we put the - * quotacheck'd stamp on the superblock. So, here we do a synchronous - * flush. - */ - xfs_flush_buftarg(mp->m_ddev_targp, 1); - - /* * If one type of quotas is off, then it will lose its * quotachecked status, since we won't be doing accounting for * that type anymore. */ - mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD); + mp->m_qflags &= ~XFS_ALL_QUOTA_CHKD; mp->m_qflags |= flags; error_return: + while (!list_empty(&buffer_list)) { + struct xfs_buf *bp = + list_first_entry(&buffer_list, struct xfs_buf, b_list); + list_del_init(&bp->b_list); + xfs_buf_relse(bp); + } + if (error) { xfs_warn(mp, "Quotacheck: Unsuccessful (Error %d): Disabling quotas.", @@ -1701,7 +1337,6 @@ xfs_qm_quotacheck( * We must turn off quotas. */ ASSERT(mp->m_quotainfo != NULL); - ASSERT(xfs_Gqm != NULL); xfs_qm_destroy_quotainfo(mp); if (xfs_mount_reset_sbqflags(mp)) { xfs_warn(mp, @@ -1790,257 +1425,150 @@ xfs_qm_init_quotainos( return 0; } +STATIC void +xfs_qm_dqfree_one( + struct xfs_dquot *dqp) +{ + struct xfs_mount *mp = dqp->q_mount; + struct xfs_quotainfo *qi = mp->m_quotainfo; + mutex_lock(&qi->qi_tree_lock); + radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags), + be32_to_cpu(dqp->q_core.d_id)); -/* - * Just pop the least recently used dquot off the freelist and - * recycle it. The returned dquot is locked. - */ -STATIC xfs_dquot_t * -xfs_qm_dqreclaim_one(void) -{ - xfs_dquot_t *dqpout; - xfs_dquot_t *dqp; - int restarts; - int startagain; + qi->qi_dquots--; + mutex_unlock(&qi->qi_tree_lock); - restarts = 0; - dqpout = NULL; + xfs_qm_dqdestroy(dqp); +} - /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ -again: - startagain = 0; - mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); +STATIC void +xfs_qm_dqreclaim_one( + struct xfs_dquot *dqp, + struct list_head *buffer_list, + struct list_head *dispose_list) +{ + struct xfs_mount *mp = dqp->q_mount; + struct xfs_quotainfo *qi = mp->m_quotainfo; + int error; - list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { - struct xfs_mount *mp = dqp->q_mount; - xfs_dqlock(dqp); + if (!xfs_dqlock_nowait(dqp)) + goto out_busy; - /* - * We are racing with dqlookup here. Naturally we don't - * want to reclaim a dquot that lookup wants. We release the - * freelist lock and start over, so that lookup will grab - * both the dquot and the freelistlock. - */ - if (dqp->dq_flags & XFS_DQ_WANT) { - ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); - - trace_xfs_dqreclaim_want(dqp); - XQM_STATS_INC(xqmstats.xs_qm_dqwants); - restarts++; - startagain = 1; - goto dqunlock; - } + /* + * This dquot has acquired a reference in the meantime remove it from + * the freelist and try again. + */ + if (dqp->q_nrefs) { + xfs_dqunlock(dqp); - /* - * If the dquot is inactive, we are assured that it is - * not on the mplist or the hashlist, and that makes our - * life easier. - */ - if (dqp->dq_flags & XFS_DQ_INACTIVE) { - ASSERT(mp == NULL); - ASSERT(! XFS_DQ_IS_DIRTY(dqp)); - ASSERT(list_empty(&dqp->q_hashlist)); - ASSERT(list_empty(&dqp->q_mplist)); - list_del_init(&dqp->q_freelist); - xfs_Gqm->qm_dqfrlist_cnt--; - dqpout = dqp; - XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); - goto dqunlock; - } + trace_xfs_dqreclaim_want(dqp); + XFS_STATS_INC(xs_qm_dqwants); - ASSERT(dqp->q_hash); - ASSERT(!list_empty(&dqp->q_mplist)); + list_del_init(&dqp->q_lru); + qi->qi_lru_count--; + XFS_STATS_DEC(xs_qm_dquot_unused); + return; + } - /* - * Try to grab the flush lock. If this dquot is in the process - * of getting flushed to disk, we don't want to reclaim it. - */ - if (!xfs_dqflock_nowait(dqp)) - goto dqunlock; + /* + * Try to grab the flush lock. If this dquot is in the process of + * getting flushed to disk, we don't want to reclaim it. + */ + if (!xfs_dqflock_nowait(dqp)) + goto out_busy; - /* - * We have the flush lock so we know that this is not in the - * process of being flushed. So, if this is dirty, flush it - * DELWRI so that we don't get a freelist infested with - * dirty dquots. - */ - if (XFS_DQ_IS_DIRTY(dqp)) { - int error; + if (XFS_DQ_IS_DIRTY(dqp)) { + struct xfs_buf *bp = NULL; - trace_xfs_dqreclaim_dirty(dqp); + trace_xfs_dqreclaim_dirty(dqp); - /* - * We flush it delayed write, so don't bother - * releasing the freelist lock. - */ - error = xfs_qm_dqflush(dqp, 0); - if (error) { - xfs_warn(mp, "%s: dquot %p flush failed", - __func__, dqp); - } - goto dqunlock; + error = xfs_qm_dqflush(dqp, &bp); + if (error) { + xfs_warn(mp, "%s: dquot %p flush failed", + __func__, dqp); + goto out_busy; } + xfs_buf_delwri_queue(bp, buffer_list); + xfs_buf_relse(bp); /* - * We're trying to get the hashlock out of order. This races - * with dqlookup; so, we giveup and goto the next dquot if - * we couldn't get the hashlock. This way, we won't starve - * a dqlookup process that holds the hashlock that is - * waiting for the freelist lock. + * Give the dquot another try on the freelist, as the + * flushing will take some time. */ - if (!mutex_trylock(&dqp->q_hash->qh_lock)) { - restarts++; - goto dqfunlock; - } + goto out_busy; + } + xfs_dqfunlock(dqp); - /* - * This races with dquot allocation code as well as dqflush_all - * and reclaim code. So, if we failed to grab the mplist lock, - * giveup everything and start over. - */ - if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { - restarts++; - startagain = 1; - goto qhunlock; - } + /* + * Prevent lookups now that we are past the point of no return. + */ + dqp->dq_flags |= XFS_DQ_FREEING; + xfs_dqunlock(dqp); - ASSERT(dqp->q_nrefs == 0); - list_del_init(&dqp->q_mplist); - mp->m_quotainfo->qi_dquots--; - mp->m_quotainfo->qi_dqreclaims++; - list_del_init(&dqp->q_hashlist); - dqp->q_hash->qh_version++; - list_del_init(&dqp->q_freelist); - xfs_Gqm->qm_dqfrlist_cnt--; - dqpout = dqp; - mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); -qhunlock: - mutex_unlock(&dqp->q_hash->qh_lock); -dqfunlock: - xfs_dqfunlock(dqp); -dqunlock: - xfs_dqunlock(dqp); - if (dqpout) - break; - if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - break; - if (startagain) { - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - goto again; - } - } - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - return dqpout; -} + ASSERT(dqp->q_nrefs == 0); + list_move_tail(&dqp->q_lru, dispose_list); + qi->qi_lru_count--; + XFS_STATS_DEC(xs_qm_dquot_unused); -/* - * Traverse the freelist of dquots and attempt to reclaim a maximum of - * 'howmany' dquots. This operation races with dqlookup(), and attempts to - * favor the lookup function ... - */ -STATIC int -xfs_qm_shake_freelist( - int howmany) -{ - int nreclaimed = 0; - xfs_dquot_t *dqp; + trace_xfs_dqreclaim_done(dqp); + XFS_STATS_INC(xs_qm_dqreclaims); + return; - if (howmany <= 0) - return 0; +out_busy: + xfs_dqunlock(dqp); - while (nreclaimed < howmany) { - dqp = xfs_qm_dqreclaim_one(); - if (!dqp) - return nreclaimed; - xfs_qm_dqdestroy(dqp); - nreclaimed++; - } - return nreclaimed; + /* + * Move the dquot to the tail of the list so that we don't spin on it. + */ + list_move_tail(&dqp->q_lru, &qi->qi_lru_list); + + trace_xfs_dqreclaim_busy(dqp); + XFS_STATS_INC(xs_qm_dqreclaim_misses); } -/* - * The kmem_shake interface is invoked when memory is running low. - */ -/* ARGSUSED */ STATIC int xfs_qm_shake( - struct shrinker *shrink, - struct shrink_control *sc) + struct shrinker *shrink, + struct shrink_control *sc) { - int ndqused, nfree, n; - gfp_t gfp_mask = sc->gfp_mask; - - if (!kmem_shake_allow(gfp_mask)) - return 0; - if (!xfs_Gqm) - return 0; - - nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */ - /* incore dquots in all f/s's */ - ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree; - - ASSERT(ndqused >= 0); + struct xfs_quotainfo *qi = + container_of(shrink, struct xfs_quotainfo, qi_shrinker); + int nr_to_scan = sc->nr_to_scan; + LIST_HEAD (buffer_list); + LIST_HEAD (dispose_list); + struct xfs_dquot *dqp; + int error; - if (nfree <= ndqused && nfree < ndquot) + if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) return 0; + if (!nr_to_scan) + goto out; - ndqused *= xfs_Gqm->qm_dqfree_ratio; /* target # of free dquots */ - n = nfree - ndqused - ndquot; /* # over target */ - - return xfs_qm_shake_freelist(MAX(nfree, n)); -} - - -/*------------------------------------------------------------------*/ + mutex_lock(&qi->qi_lru_lock); + while (!list_empty(&qi->qi_lru_list)) { + if (nr_to_scan-- <= 0) + break; + dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot, + q_lru); + xfs_qm_dqreclaim_one(dqp, &buffer_list, &dispose_list); + } + mutex_unlock(&qi->qi_lru_lock); -/* - * Return a new incore dquot. Depending on the number of - * dquots in the system, we either allocate a new one on the kernel heap, - * or reclaim a free one. - * Return value is B_TRUE if we allocated a new dquot, B_FALSE if we managed - * to reclaim an existing one from the freelist. - */ -boolean_t -xfs_qm_dqalloc_incore( - xfs_dquot_t **O_dqpp) -{ - xfs_dquot_t *dqp; + error = xfs_buf_delwri_submit(&buffer_list); + if (error) + xfs_warn(NULL, "%s: dquot reclaim failed", __func__); - /* - * Check against high water mark to see if we want to pop - * a nincompoop dquot off the freelist. - */ - if (atomic_read(&xfs_Gqm->qm_totaldquots) >= ndquot) { - /* - * Try to recycle a dquot from the freelist. - */ - if ((dqp = xfs_qm_dqreclaim_one())) { - XQM_STATS_INC(xqmstats.xs_qm_dqreclaims); - /* - * Just zero the core here. The rest will get - * reinitialized by caller. XXX we shouldn't even - * do this zero ... - */ - memset(&dqp->q_core, 0, sizeof(dqp->q_core)); - *O_dqpp = dqp; - return B_FALSE; - } - XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses); + while (!list_empty(&dispose_list)) { + dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru); + list_del_init(&dqp->q_lru); + xfs_qm_dqfree_one(dqp); } - /* - * Allocate a brand new dquot on the kernel heap and return it - * to the caller to initialize. - */ - ASSERT(xfs_Gqm->qm_dqzone != NULL); - *O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP); - atomic_inc(&xfs_Gqm->qm_totaldquots); - - return B_TRUE; +out: + return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure; } - /* * Start a transaction and write the incore superblock changes to * disk. flags parameter indicates which fields have changed. @@ -2151,10 +1679,7 @@ xfs_qm_vop_dqalloc( * this to caller */ ASSERT(ip->i_udquot); - uq = ip->i_udquot; - xfs_dqlock(uq); - XFS_DQHOLD(uq); - xfs_dqunlock(uq); + uq = xfs_qm_dqhold(ip->i_udquot); } } if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { @@ -2175,10 +1700,7 @@ xfs_qm_vop_dqalloc( xfs_ilock(ip, lockflags); } else { ASSERT(ip->i_gdquot); - gq = ip->i_gdquot; - xfs_dqlock(gq); - XFS_DQHOLD(gq); - xfs_dqunlock(gq); + gq = xfs_qm_dqhold(ip->i_gdquot); } } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { if (xfs_get_projid(ip) != prid) { @@ -2198,10 +1720,7 @@ xfs_qm_vop_dqalloc( xfs_ilock(ip, lockflags); } else { ASSERT(ip->i_gdquot); - gq = ip->i_gdquot; - xfs_dqlock(gq); - XFS_DQHOLD(gq); - xfs_dqunlock(gq); + gq = xfs_qm_dqhold(ip->i_gdquot); } } if (uq) @@ -2251,14 +1770,10 @@ xfs_qm_vop_chown( xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1); /* - * Take an extra reference, because the inode - * is going to keep this dquot pointer even - * after the trans_commit. + * Take an extra reference, because the inode is going to keep + * this dquot pointer even after the trans_commit. */ - xfs_dqlock(newdq); - XFS_DQHOLD(newdq); - xfs_dqunlock(newdq); - *IO_olddq = newdq; + *IO_olddq = xfs_qm_dqhold(newdq); return prevdq; } @@ -2390,25 +1905,21 @@ xfs_qm_vop_create_dqattach( ASSERT(XFS_IS_QUOTA_RUNNING(mp)); if (udqp) { - xfs_dqlock(udqp); - XFS_DQHOLD(udqp); - xfs_dqunlock(udqp); ASSERT(ip->i_udquot == NULL); - ip->i_udquot = udqp; ASSERT(XFS_IS_UQUOTA_ON(mp)); ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); + + ip->i_udquot = xfs_qm_dqhold(udqp); xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); } if (gdqp) { - xfs_dqlock(gdqp); - XFS_DQHOLD(gdqp); - xfs_dqunlock(gdqp); ASSERT(ip->i_gdquot == NULL); - ip->i_gdquot = gdqp; ASSERT(XFS_IS_OQUOTA_ON(mp)); ASSERT((XFS_IS_GQUOTA_ON(mp) ? ip->i_d.di_gid : xfs_get_projid(ip)) == be32_to_cpu(gdqp->q_core.d_id)); + + ip->i_gdquot = xfs_qm_dqhold(gdqp); xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 43b9abe1052..44b858b79d7 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -21,39 +21,10 @@ #include "xfs_dquot_item.h" #include "xfs_dquot.h" #include "xfs_quota_priv.h" -#include "xfs_qm_stats.h" -struct xfs_qm; struct xfs_inode; -extern uint ndquot; -extern struct mutex xfs_Gqm_lock; -extern struct xfs_qm *xfs_Gqm; -extern kmem_zone_t *qm_dqzone; -extern kmem_zone_t *qm_dqtrxzone; - -/* - * Used in xfs_qm_sync called by xfs_sync to count the max times that it can - * iterate over the mountpt's dquot list in one call. - */ -#define XFS_QM_SYNC_MAX_RESTARTS 7 - -/* - * Ditto, for xfs_qm_dqreclaim_one. - */ -#define XFS_QM_RECLAIM_MAX_RESTARTS 4 - -/* - * Ideal ratio of free to in use dquots. Quota manager makes an attempt - * to keep this balance. - */ -#define XFS_QM_DQFREE_RATIO 2 - -/* - * Dquot hashtable constants/threshold values. - */ -#define XFS_QM_HASHSIZE_LOW (PAGE_SIZE / sizeof(xfs_dqhash_t)) -#define XFS_QM_HASHSIZE_HIGH ((PAGE_SIZE * 4) / sizeof(xfs_dqhash_t)) +extern struct kmem_zone *xfs_qm_dqtrxzone; /* * This defines the unit of allocation of dquots. @@ -66,37 +37,20 @@ extern kmem_zone_t *qm_dqtrxzone; */ #define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 -typedef xfs_dqhash_t xfs_dqlist_t; - -/* - * Quota Manager (global) structure. Lives only in core. - */ -typedef struct xfs_qm { - xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */ - xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */ - uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */ - struct list_head qm_dqfrlist; /* freelist of dquots */ - struct mutex qm_dqfrlist_lock; - int qm_dqfrlist_cnt; - atomic_t qm_totaldquots; /* total incore dquots */ - uint qm_nrefs; /* file systems with quota on */ - int qm_dqfree_ratio;/* ratio of free to inuse dquots */ - kmem_zone_t *qm_dqzone; /* dquot mem-alloc zone */ - kmem_zone_t *qm_dqtrxzone; /* t_dqinfo of transactions */ -} xfs_qm_t; - /* * Various quota information for individual filesystems. * The mount structure keeps a pointer to this. */ typedef struct xfs_quotainfo { + struct radix_tree_root qi_uquota_tree; + struct radix_tree_root qi_gquota_tree; + struct mutex qi_tree_lock; xfs_inode_t *qi_uquotaip; /* user quota inode */ xfs_inode_t *qi_gquotaip; /* group quota inode */ - struct list_head qi_dqlist; /* all dquots in filesys */ - struct mutex qi_dqlist_lock; + struct list_head qi_lru_list; + struct mutex qi_lru_lock; + int qi_lru_count; int qi_dquots; - int qi_dqreclaims; /* a change here indicates - a removal in the dqlist */ time_t qi_btimelimit; /* limit for blks timer */ time_t qi_itimelimit; /* limit for inodes timer */ time_t qi_rtbtimelimit;/* limit for rt blks timer */ @@ -112,8 +66,14 @@ typedef struct xfs_quotainfo { xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */ xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */ xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */ + struct shrinker qi_shrinker; } xfs_quotainfo_t; +#define XFS_DQUOT_TREE(qi, type) \ + ((type & XFS_DQ_USER) ? \ + &((qi)->qi_uquota_tree) : \ + &((qi)->qi_gquota_tree)) + extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long); extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *, @@ -149,8 +109,7 @@ extern int xfs_qm_quotacheck(xfs_mount_t *); extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); /* dquot stuff */ -extern boolean_t xfs_qm_dqalloc_incore(xfs_dquot_t **); -extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint); +extern void xfs_qm_dqpurge_all(xfs_mount_t *, uint); extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); /* quota ops */ diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index a0a829addca..6b39115bf14 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -17,9 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -40,28 +38,28 @@ STATIC void xfs_fill_statvfs_from_dquot( struct kstatfs *statp, - xfs_disk_dquot_t *dp) + struct xfs_dquot *dqp) { __uint64_t limit; - limit = dp->d_blk_softlimit ? - be64_to_cpu(dp->d_blk_softlimit) : - be64_to_cpu(dp->d_blk_hardlimit); + limit = dqp->q_core.d_blk_softlimit ? + be64_to_cpu(dqp->q_core.d_blk_softlimit) : + be64_to_cpu(dqp->q_core.d_blk_hardlimit); if (limit && statp->f_blocks > limit) { statp->f_blocks = limit; statp->f_bfree = statp->f_bavail = - (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ? - (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0; + (statp->f_blocks > dqp->q_res_bcount) ? + (statp->f_blocks - dqp->q_res_bcount) : 0; } - limit = dp->d_ino_softlimit ? - be64_to_cpu(dp->d_ino_softlimit) : - be64_to_cpu(dp->d_ino_hardlimit); + limit = dqp->q_core.d_ino_softlimit ? + be64_to_cpu(dqp->q_core.d_ino_softlimit) : + be64_to_cpu(dqp->q_core.d_ino_hardlimit); if (limit && statp->f_files > limit) { statp->f_files = limit; statp->f_ffree = - (statp->f_files > be64_to_cpu(dp->d_icount)) ? - (statp->f_ffree - be64_to_cpu(dp->d_icount)) : 0; + (statp->f_files > dqp->q_res_icount) ? + (statp->f_ffree - dqp->q_res_icount) : 0; } } @@ -82,7 +80,7 @@ xfs_qm_statvfs( xfs_dquot_t *dqp; if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) { - xfs_fill_statvfs_from_dquot(statp, &dqp->q_core); + xfs_fill_statvfs_from_dquot(statp, dqp); xfs_qm_dqput(dqp); } } @@ -156,21 +154,3 @@ xfs_qm_newmount( return 0; } - -void __init -xfs_qm_init(void) -{ - printk(KERN_INFO "SGI XFS Quota Management subsystem\n"); - mutex_init(&xfs_Gqm_lock); - xfs_qm_init_procfs(); -} - -void __exit -xfs_qm_exit(void) -{ - xfs_qm_cleanup_procfs(); - if (qm_dqzone) - kmem_zone_destroy(qm_dqzone); - if (qm_dqtrxzone) - kmem_zone_destroy(qm_dqtrxzone); -} diff --git a/fs/xfs/xfs_qm_stats.c b/fs/xfs/xfs_qm_stats.c deleted file mode 100644 index 8671a0b3264..00000000000 --- a/fs/xfs/xfs_qm_stats.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2000-2003 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_alloc.h" -#include "xfs_quota.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_inode.h" -#include "xfs_itable.h" -#include "xfs_bmap.h" -#include "xfs_rtalloc.h" -#include "xfs_error.h" -#include "xfs_attr.h" -#include "xfs_buf_item.h" -#include "xfs_qm.h" - -struct xqmstats xqmstats; - -static int xqm_proc_show(struct seq_file *m, void *v) -{ - /* maximum; incore; ratio free to inuse; freelist */ - seq_printf(m, "%d\t%d\t%d\t%u\n", - ndquot, - xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, - xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, - xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0); - return 0; -} - -static int xqm_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, xqm_proc_show, NULL); -} - -static const struct file_operations xqm_proc_fops = { - .owner = THIS_MODULE, - .open = xqm_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int xqmstat_proc_show(struct seq_file *m, void *v) -{ - /* quota performance statistics */ - seq_printf(m, "qm %u %u %u %u %u %u %u %u\n", - xqmstats.xs_qm_dqreclaims, - xqmstats.xs_qm_dqreclaim_misses, - xqmstats.xs_qm_dquot_dups, - xqmstats.xs_qm_dqcachemisses, - xqmstats.xs_qm_dqcachehits, - xqmstats.xs_qm_dqwants, - xqmstats.xs_qm_dqshake_reclaims, - xqmstats.xs_qm_dqinact_reclaims); - return 0; -} - -static int xqmstat_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, xqmstat_proc_show, NULL); -} - -static const struct file_operations xqmstat_proc_fops = { - .owner = THIS_MODULE, - .open = xqmstat_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -void -xfs_qm_init_procfs(void) -{ - proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops); - proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops); -} - -void -xfs_qm_cleanup_procfs(void) -{ - remove_proc_entry("fs/xfs/xqm", NULL); - remove_proc_entry("fs/xfs/xqmstat", NULL); -} diff --git a/fs/xfs/xfs_qm_stats.h b/fs/xfs/xfs_qm_stats.h deleted file mode 100644 index 5b964fc0dc0..00000000000 --- a/fs/xfs/xfs_qm_stats.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2002 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_QM_STATS_H__ -#define __XFS_QM_STATS_H__ - -#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF) - -/* - * XQM global statistics - */ -struct xqmstats { - __uint32_t xs_qm_dqreclaims; - __uint32_t xs_qm_dqreclaim_misses; - __uint32_t xs_qm_dquot_dups; - __uint32_t xs_qm_dqcachemisses; - __uint32_t xs_qm_dqcachehits; - __uint32_t xs_qm_dqwants; - __uint32_t xs_qm_dqshake_reclaims; - __uint32_t xs_qm_dqinact_reclaims; -}; - -extern struct xqmstats xqmstats; - -# define XQM_STATS_INC(count) ( (count)++ ) - -extern void xfs_qm_init_procfs(void); -extern void xfs_qm_cleanup_procfs(void); - -#else - -# define XQM_STATS_INC(count) do { } while (0) - -static inline void xfs_qm_init_procfs(void) { }; -static inline void xfs_qm_cleanup_procfs(void) { }; - -#endif - -#endif /* __XFS_QM_STATS_H__ */ diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 5cc3dde1bc9..858a3b18611 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -22,7 +22,6 @@ #include "xfs_fs.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -31,6 +30,7 @@ #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_inode.h" +#include "xfs_inode_item.h" #include "xfs_itable.h" #include "xfs_bmap.h" #include "xfs_rtalloc.h" @@ -46,9 +46,6 @@ STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, uint); STATIC uint xfs_qm_export_flags(uint); STATIC uint xfs_qm_export_qtype_flags(uint); -STATIC void xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *, - fs_disk_quota_t *); - /* * Turn off quota accounting and/or enforcement for all udquots and/or @@ -68,7 +65,6 @@ xfs_qm_scall_quotaoff( int error; uint inactivate_flags; xfs_qoff_logitem_t *qoffstart; - int nculprits; /* * No file system can have quotas enabled on disk but not in core. @@ -174,18 +170,13 @@ xfs_qm_scall_quotaoff( * This isn't protected by a particular lock directly, because we * don't want to take a mrlock every time we depend on quotas being on. */ - mp->m_qflags &= ~(flags); + mp->m_qflags &= ~flags; /* * Go through all the dquots of this file system and purge them, - * according to what was turned off. We may not be able to get rid - * of all dquots, because dquots can have temporary references that - * are not attached to inodes. eg. xfs_setattr, xfs_create. - * So, if we couldn't purge all the dquots from the filesystem, - * we can't get rid of the incore data structures. + * according to what was turned off. */ - while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype))) - delay(10 * nculprits); + xfs_qm_dqpurge_all(mp, dqtype); /* * Transactions that had started before ACTIVE state bit was cleared @@ -263,13 +254,18 @@ xfs_qm_scall_trunc_qfile( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_itruncate_data(&tp, ip, 0); + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); if (error) { xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); goto out_unlock; } + ASSERT(ip->i_d.di_nextents == 0); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); @@ -629,42 +625,6 @@ xfs_qm_scall_setqlim( return error; } -int -xfs_qm_scall_getquota( - xfs_mount_t *mp, - xfs_dqid_t id, - uint type, - fs_disk_quota_t *out) -{ - xfs_dquot_t *dqp; - int error; - - /* - * Try to get the dquot. We don't want it allocated on disk, so - * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't - * exist, we'll get ENOENT back. - */ - if ((error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp))) { - return (error); - } - - /* - * If everything's NULL, this dquot doesn't quite exist as far as - * our utility programs are concerned. - */ - if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { - xfs_qm_dqput(dqp); - return XFS_ERROR(ENOENT); - } - /* - * Convert the disk dquot to the exportable format - */ - xfs_qm_export_dquot(mp, &dqp->q_core, out); - xfs_qm_dqput(dqp); - return (error ? XFS_ERROR(EFAULT) : 0); -} - - STATIC int xfs_qm_log_quotaoff_end( xfs_mount_t *mp, @@ -753,50 +713,66 @@ error0: } -/* - * Translate an internal style on-disk-dquot to the exportable format. - * The main differences are that the counters/limits are all in Basic - * Blocks (BBs) instead of the internal FSBs, and all on-disk data has - * to be converted to the native endianness. - */ -STATIC void -xfs_qm_export_dquot( - xfs_mount_t *mp, - xfs_disk_dquot_t *src, +int +xfs_qm_scall_getquota( + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, struct fs_disk_quota *dst) { + struct xfs_dquot *dqp; + int error; + + /* + * Try to get the dquot. We don't want it allocated on disk, so + * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't + * exist, we'll get ENOENT back. + */ + error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp); + if (error) + return error; + + /* + * If everything's NULL, this dquot doesn't quite exist as far as + * our utility programs are concerned. + */ + if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { + error = XFS_ERROR(ENOENT); + goto out_put; + } + memset(dst, 0, sizeof(*dst)); - dst->d_version = FS_DQUOT_VERSION; /* different from src->d_version */ - dst->d_flags = xfs_qm_export_qtype_flags(src->d_flags); - dst->d_id = be32_to_cpu(src->d_id); + dst->d_version = FS_DQUOT_VERSION; + dst->d_flags = xfs_qm_export_qtype_flags(dqp->q_core.d_flags); + dst->d_id = be32_to_cpu(dqp->q_core.d_id); dst->d_blk_hardlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_hardlimit)); + XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit)); dst->d_blk_softlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_softlimit)); - dst->d_ino_hardlimit = be64_to_cpu(src->d_ino_hardlimit); - dst->d_ino_softlimit = be64_to_cpu(src->d_ino_softlimit); - dst->d_bcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_bcount)); - dst->d_icount = be64_to_cpu(src->d_icount); - dst->d_btimer = be32_to_cpu(src->d_btimer); - dst->d_itimer = be32_to_cpu(src->d_itimer); - dst->d_iwarns = be16_to_cpu(src->d_iwarns); - dst->d_bwarns = be16_to_cpu(src->d_bwarns); + XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit)); + dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); + dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); + dst->d_bcount = XFS_FSB_TO_BB(mp, dqp->q_res_bcount); + dst->d_icount = dqp->q_res_icount; + dst->d_btimer = be32_to_cpu(dqp->q_core.d_btimer); + dst->d_itimer = be32_to_cpu(dqp->q_core.d_itimer); + dst->d_iwarns = be16_to_cpu(dqp->q_core.d_iwarns); + dst->d_bwarns = be16_to_cpu(dqp->q_core.d_bwarns); dst->d_rtb_hardlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_hardlimit)); + XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit)); dst->d_rtb_softlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_softlimit)); - dst->d_rtbcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtbcount)); - dst->d_rtbtimer = be32_to_cpu(src->d_rtbtimer); - dst->d_rtbwarns = be16_to_cpu(src->d_rtbwarns); + XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit)); + dst->d_rtbcount = XFS_FSB_TO_BB(mp, dqp->q_res_rtbcount); + dst->d_rtbtimer = be32_to_cpu(dqp->q_core.d_rtbtimer); + dst->d_rtbwarns = be16_to_cpu(dqp->q_core.d_rtbwarns); /* * Internally, we don't reset all the timers when quota enforcement * gets turned off. No need to confuse the user level code, * so return zeroes in that case. */ - if ((!XFS_IS_UQUOTA_ENFORCED(mp) && src->d_flags == XFS_DQ_USER) || + if ((!XFS_IS_UQUOTA_ENFORCED(mp) && dqp->q_core.d_flags == XFS_DQ_USER) || (!XFS_IS_OQUOTA_ENFORCED(mp) && - (src->d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) { + (dqp->q_core.d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) { dst->d_btimer = 0; dst->d_itimer = 0; dst->d_rtbtimer = 0; @@ -807,16 +783,19 @@ xfs_qm_export_dquot( (XFS_IS_OQUOTA_ENFORCED(mp) && (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) && dst->d_id != 0) { - if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) && + if (((int) dst->d_bcount > (int) dst->d_blk_softlimit) && (dst->d_blk_softlimit > 0)) { ASSERT(dst->d_btimer != 0); } - if (((int) dst->d_icount >= (int) dst->d_ino_softlimit) && + if (((int) dst->d_icount > (int) dst->d_ino_softlimit) && (dst->d_ino_softlimit > 0)) { ASSERT(dst->d_itimer != 0); } } #endif +out_put: + xfs_qm_dqput(dqp); + return error; } STATIC uint diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index a595f29567f..b50ec5b95d5 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -87,8 +87,7 @@ typedef struct xfs_dqblk { #define XFS_DQ_PROJ 0x0002 /* project quota */ #define XFS_DQ_GROUP 0x0004 /* a group quota */ #define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ -#define XFS_DQ_WANT 0x0010 /* for lookup/reclaim race */ -#define XFS_DQ_INACTIVE 0x0020 /* dq off mplist & hashlist */ +#define XFS_DQ_FREEING 0x0010 /* dquot is beeing torn down */ #define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) @@ -97,8 +96,7 @@ typedef struct xfs_dqblk { { XFS_DQ_PROJ, "PROJ" }, \ { XFS_DQ_GROUP, "GROUP" }, \ { XFS_DQ_DIRTY, "DIRTY" }, \ - { XFS_DQ_WANT, "WANT" }, \ - { XFS_DQ_INACTIVE, "INACTIVE" } + { XFS_DQ_FREEING, "FREEING" } /* * In the worst case, when both user and group quotas are on, @@ -176,6 +174,8 @@ typedef struct xfs_qoff_logformat { #define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */ #define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */ #define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */ +#define XFS_ALL_QUOTA_ACTIVE \ + (XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE) /* * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees @@ -199,7 +199,6 @@ typedef struct xfs_qoff_logformat { #define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ #define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ #define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ -#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */ #define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ #define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ #define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ @@ -326,7 +325,6 @@ extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint); extern void xfs_qm_dqdetach(struct xfs_inode *); extern void xfs_qm_dqrele(struct xfs_dquot *); extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *); -extern int xfs_qm_sync(struct xfs_mount *, int); extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *); extern void xfs_qm_mount_quotas(struct xfs_mount *); extern void xfs_qm_unmount(struct xfs_mount *); @@ -366,10 +364,6 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, #define xfs_qm_dqdetach(ip) #define xfs_qm_dqrele(d) #define xfs_qm_statvfs(ip, s) -static inline int xfs_qm_sync(struct xfs_mount *mp, int flags) -{ - return 0; -} #define xfs_qm_newmount(mp, a, b) (0) #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp) diff --git a/fs/xfs/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h index 94a3d927d71..6d86219d93d 100644 --- a/fs/xfs/xfs_quota_priv.h +++ b/fs/xfs/xfs_quota_priv.h @@ -24,17 +24,6 @@ */ #define XFS_DQITER_MAP_SIZE 10 -/* - * Hash into a bucket in the dquot hash table, based on <mp, id>. - */ -#define XFS_DQ_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \ - (__psunsigned_t)(id)) & \ - (xfs_Gqm->qm_dqhashmask - 1)) -#define XFS_DQ_HASH(mp, id, type) (type == XFS_DQ_USER ? \ - (xfs_Gqm->qm_usr_dqhtable + \ - XFS_DQ_HASHVAL(mp, id)) : \ - (xfs_Gqm->qm_grp_dqhtable + \ - XFS_DQ_HASHVAL(mp, id))) #define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ !dqp->q_core.d_blk_hardlimit && \ !dqp->q_core.d_blk_softlimit && \ diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 7e76f537abb..fed504fc299 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -17,7 +17,6 @@ */ #include "xfs.h" #include "xfs_sb.h" -#include "xfs_inum.h" #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index 866de277079..30ff5f401d2 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -118,17 +117,6 @@ xfs_rename( new_parent = (src_dp != target_dp); src_is_directory = S_ISDIR(src_ip->i_d.di_mode); - if (src_is_directory) { - /* - * Check for link count overflow on target_dp - */ - if (target_ip == NULL && new_parent && - target_dp->i_d.di_nlink >= XFS_MAXLINK) { - error = XFS_ERROR(EMLINK); - goto std_return; - } - } - xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, inodes, &num_inodes); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 87323f1ded6..92d4331cd4f 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -34,7 +33,6 @@ #include "xfs_rtalloc.h" #include "xfs_fsops.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_inode_item.h" #include "xfs_trans_space.h" #include "xfs_utils.h" @@ -183,6 +181,7 @@ error_cancel: oblocks = map.br_startoff + map.br_blockcount; } return 0; + error: return error; } @@ -1871,9 +1870,9 @@ xfs_growfs_rt( /* * Read in the last block of the device, make sure it exists. */ - bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp, + bp = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, nrblocks - 1), - XFS_FSB_TO_B(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0); if (!bp) return EIO; xfs_buf_relse(bp); @@ -2139,11 +2138,9 @@ xfs_rtfree_extent( xfs_buf_t *sumbp; /* summary file block buffer */ mp = tp->t_mountp; - /* - * Synchronize by locking the bitmap inode. - */ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + + ASSERT(mp->m_rbmip->i_itemp != NULL); + ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); #if defined(__KERNEL__) && defined(DEBUG) /* @@ -2220,9 +2217,9 @@ xfs_rtmount_init( (unsigned long long) mp->m_sb.sb_rblocks); return XFS_ERROR(EFBIG); } - bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp, + bp = xfs_buf_read_uncached(mp->m_rtdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_B(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0); if (!bp) { xfs_warn(mp, "realtime device size check failed"); return EIO; diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c deleted file mode 100644 index 597d044a09a..00000000000 --- a/fs/xfs/xfs_rw.c +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_error.h" -#include "xfs_rw.h" - -/* - * Force a shutdown of the filesystem instantly while keeping - * the filesystem consistent. We don't do an unmount here; just shutdown - * the shop, make sure that absolutely nothing persistent happens to - * this filesystem after this point. - */ -void -xfs_do_force_shutdown( - xfs_mount_t *mp, - int flags, - char *fname, - int lnnum) -{ - int logerror; - - logerror = flags & SHUTDOWN_LOG_IO_ERROR; - - if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - xfs_notice(mp, - "%s(0x%x) called from line %d of file %s. Return address = 0x%p", - __func__, flags, lnnum, fname, __return_address); - } - /* - * No need to duplicate efforts. - */ - if (XFS_FORCED_SHUTDOWN(mp) && !logerror) - return; - - /* - * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't - * queue up anybody new on the log reservations, and wakes up - * everybody who's sleeping on log reservations to tell them - * the bad news. - */ - if (xfs_log_force_umount(mp, logerror)) - return; - - if (flags & SHUTDOWN_CORRUPT_INCORE) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT, - "Corruption of in-memory data detected. Shutting down filesystem"); - if (XFS_ERRLEVEL_HIGH <= xfs_error_level) - xfs_stack_trace(); - } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - if (logerror) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, - "Log I/O Error Detected. Shutting down filesystem"); - } else if (flags & SHUTDOWN_DEVICE_REQ) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, - "All device paths lost. Shutting down filesystem"); - } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, - "I/O Error Detected. Shutting down filesystem"); - } - } - if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - xfs_alert(mp, - "Please umount the filesystem and rectify the problem(s)"); - } -} - -/* - * This isn't an absolute requirement, but it is - * just a good idea to call xfs_read_buf instead of - * directly doing a read_buf call. For one, we shouldn't - * be doing this disk read if we are in SHUTDOWN state anyway, - * so this stops that from happening. Secondly, this does all - * the error checking stuff and the brelse if appropriate for - * the caller, so the code can be a little leaner. - */ - -int -xfs_read_buf( - struct xfs_mount *mp, - xfs_buftarg_t *target, - xfs_daddr_t blkno, - int len, - uint flags, - xfs_buf_t **bpp) -{ - xfs_buf_t *bp; - int error; - - if (!flags) - flags = XBF_LOCK | XBF_MAPPED; - - bp = xfs_buf_read(target, blkno, len, flags); - if (!bp) - return XFS_ERROR(EIO); - error = bp->b_error; - if (!error && !XFS_FORCED_SHUTDOWN(mp)) { - *bpp = bp; - } else { - *bpp = NULL; - if (error) { - xfs_buf_ioerror_alert(bp, __func__); - } else { - error = XFS_ERROR(EIO); - } - if (bp) { - XFS_BUF_UNDONE(bp); - xfs_buf_stale(bp); - /* - * brelse clears B_ERROR and b_error - */ - xfs_buf_relse(bp); - } - } - return (error); -} - -/* - * helper function to extract extent size hint from inode - */ -xfs_extlen_t -xfs_get_extsz_hint( - struct xfs_inode *ip) -{ - if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize) - return ip->i_d.di_extsize; - if (XFS_IS_REALTIME_INODE(ip)) - return ip->i_mount->m_sb.sb_rextsize; - return 0; -} diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h deleted file mode 100644 index bbdb9ad6a4b..00000000000 --- a/fs/xfs/xfs_rw.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_RW_H__ -#define __XFS_RW_H__ - -struct xfs_buf; -struct xfs_inode; -struct xfs_mount; - -/* - * Convert the given file system block to a disk block. - * We have to treat it differently based on whether the - * file is a real time file or not, because the bmap code - * does. - */ -static inline xfs_daddr_t -xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) -{ - return (XFS_IS_REALTIME_INODE(ip) ? \ - (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ - XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); -} - -/* - * Prototypes for functions in xfs_rw.c. - */ -extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, - xfs_daddr_t blkno, int len, uint flags, - struct xfs_buf **bpp); -extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); - -#endif /* __XFS_RW_H__ */ diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index cb6ae715814..f429d9d5d32 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -529,7 +529,6 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp) #define XFS_BB_TO_FSB(mp,bb) \ (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log) #define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log) -#define XFS_BB_FSB_OFFSET(mp,bb) ((bb) & ((mp)->m_bsize - 1)) /* * File system block to byte conversions. diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 76fdc586193..ce372b7d564 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -20,9 +20,18 @@ DEFINE_PER_CPU(struct xfsstats, xfsstats); +static int counter_val(int idx) +{ + int val = 0, cpu; + + for_each_possible_cpu(cpu) + val += *(((__u32 *)&per_cpu(xfsstats, cpu) + idx)); + return val; +} + static int xfs_stat_proc_show(struct seq_file *m, void *v) { - int c, i, j, val; + int i, j; __uint64_t xs_xstrat_bytes = 0; __uint64_t xs_write_bytes = 0; __uint64_t xs_read_bytes = 0; @@ -50,20 +59,16 @@ static int xfs_stat_proc_show(struct seq_file *m, void *v) { "abtc2", XFSSTAT_END_ABTC_V2 }, { "bmbt2", XFSSTAT_END_BMBT_V2 }, { "ibt2", XFSSTAT_END_IBT_V2 }, + /* we print both series of quota information together */ + { "qm", XFSSTAT_END_QM }, }; /* Loop over all stats groups */ - for (i=j = 0; i < ARRAY_SIZE(xstats); i++) { + for (i = j = 0; i < ARRAY_SIZE(xstats); i++) { seq_printf(m, "%s", xstats[i].desc); /* inner loop does each group */ - while (j < xstats[i].endpoint) { - val = 0; - /* sum over all cpus */ - for_each_possible_cpu(c) - val += *(((__u32*)&per_cpu(xfsstats, c) + j)); - seq_printf(m, " %u", val); - j++; - } + for (; j < xstats[i].endpoint; j++) + seq_printf(m, " %u", counter_val(j)); seq_putc(m, '\n'); } /* extra precision counters */ @@ -97,6 +102,58 @@ static const struct file_operations xfs_stat_proc_fops = { .release = single_release, }; +/* legacy quota interfaces */ +#ifdef CONFIG_XFS_QUOTA +static int xqm_proc_show(struct seq_file *m, void *v) +{ + /* maximum; incore; ratio free to inuse; freelist */ + seq_printf(m, "%d\t%d\t%d\t%u\n", + 0, + counter_val(XFSSTAT_END_XQMSTAT), + 0, + counter_val(XFSSTAT_END_XQMSTAT + 1)); + return 0; +} + +static int xqm_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, xqm_proc_show, NULL); +} + +static const struct file_operations xqm_proc_fops = { + .owner = THIS_MODULE, + .open = xqm_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* legacy quota stats interface no 2 */ +static int xqmstat_proc_show(struct seq_file *m, void *v) +{ + int j; + + seq_printf(m, "qm"); + for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++) + seq_printf(m, " %u", counter_val(j)); + seq_putc(m, '\n'); + return 0; +} + +static int xqmstat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, xqmstat_proc_show, NULL); +} + +static const struct file_operations xqmstat_proc_fops = { + .owner = THIS_MODULE, + .open = xqmstat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_XFS_QUOTA */ + int xfs_init_procfs(void) { @@ -105,10 +162,24 @@ xfs_init_procfs(void) if (!proc_create("fs/xfs/stat", 0, NULL, &xfs_stat_proc_fops)) - goto out_remove_entry; + goto out_remove_xfs_dir; +#ifdef CONFIG_XFS_QUOTA + if (!proc_create("fs/xfs/xqmstat", 0, NULL, + &xqmstat_proc_fops)) + goto out_remove_stat_file; + if (!proc_create("fs/xfs/xqm", 0, NULL, + &xqm_proc_fops)) + goto out_remove_xqmstat_file; +#endif return 0; - out_remove_entry: +#ifdef CONFIG_XFS_QUOTA + out_remove_xqmstat_file: + remove_proc_entry("fs/xfs/xqmstat", NULL); + out_remove_stat_file: + remove_proc_entry("fs/xfs/stat", NULL); +#endif + out_remove_xfs_dir: remove_proc_entry("fs/xfs", NULL); out: return -ENOMEM; @@ -117,6 +188,10 @@ xfs_init_procfs(void) void xfs_cleanup_procfs(void) { +#ifdef CONFIG_XFS_QUOTA + remove_proc_entry("fs/xfs/xqm", NULL); + remove_proc_entry("fs/xfs/xqmstat", NULL); +#endif remove_proc_entry("fs/xfs/stat", NULL); remove_proc_entry("fs/xfs", NULL); } diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 736854b1ca1..c03ad38ceae 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -183,6 +183,16 @@ struct xfsstats { __uint32_t xs_ibt_2_alloc; __uint32_t xs_ibt_2_free; __uint32_t xs_ibt_2_moves; +#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_IBT_V2+6) + __uint32_t xs_qm_dqreclaims; + __uint32_t xs_qm_dqreclaim_misses; + __uint32_t xs_qm_dquot_dups; + __uint32_t xs_qm_dqcachemisses; + __uint32_t xs_qm_dqcachehits; + __uint32_t xs_qm_dqwants; +#define XFSSTAT_END_QM (XFSSTAT_END_XQMSTAT+2) + __uint32_t xs_qm_dquot; + __uint32_t xs_qm_dquot_unused; /* Extra precision counters */ __uint64_t xs_xstrat_bytes; __uint64_t xs_write_bytes; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 8a899496fd5..0d9de41a715 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -17,7 +17,6 @@ */ #include "xfs.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" @@ -199,7 +198,6 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_BARRIER; mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; mp->m_flags |= XFS_MOUNT_SMALL_INUMS; - mp->m_flags |= XFS_MOUNT_DELAYLOG; /* * These can be overridden by the mount option parsing. @@ -325,10 +323,9 @@ xfs_parseargs( } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { mp->m_flags |= XFS_MOUNT_FILESTREAMS; } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) { - mp->m_qflags &= ~(XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | - XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | - XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | - XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD); + mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT; + mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD; + mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE; } else if (!strcmp(this_char, MNTOPT_QUOTA) || !strcmp(this_char, MNTOPT_UQUOTA) || !strcmp(this_char, MNTOPT_USRQUOTA)) { @@ -353,11 +350,11 @@ xfs_parseargs( mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); mp->m_qflags &= ~XFS_OQUOTA_ENFD; } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { - mp->m_flags |= XFS_MOUNT_DELAYLOG; + xfs_warn(mp, + "delaylog is the default now, option is deprecated."); } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { - mp->m_flags &= ~XFS_MOUNT_DELAYLOG; xfs_warn(mp, - "nodelaylog is deprecated and will be removed in Linux 3.3"); + "nodelaylog support has been removed, option is deprecated."); } else if (!strcmp(this_char, MNTOPT_DISCARD)) { mp->m_flags |= XFS_MOUNT_DISCARD; } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { @@ -395,13 +392,6 @@ xfs_parseargs( return EINVAL; } - if ((mp->m_flags & XFS_MOUNT_DISCARD) && - !(mp->m_flags & XFS_MOUNT_DELAYLOG)) { - xfs_warn(mp, - "the discard option is incompatible with the nodelaylog option"); - return EINVAL; - } - #ifndef CONFIG_XFS_QUOTA if (XFS_IS_QUOTA_RUNNING(mp)) { xfs_warn(mp, "quota support not available in this kernel."); @@ -501,7 +491,6 @@ xfs_showargs( { XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 }, { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, - { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, { 0, NULL } }; @@ -632,7 +621,7 @@ void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) { - blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL); + blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS, NULL); } STATIC void @@ -769,6 +758,43 @@ xfs_setup_devices( return 0; } +STATIC int +xfs_init_mount_workqueues( + struct xfs_mount *mp) +{ + mp->m_data_workqueue = alloc_workqueue("xfs-data/%s", + WQ_MEM_RECLAIM, 0, mp->m_fsname); + if (!mp->m_data_workqueue) + goto out; + + mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", + WQ_MEM_RECLAIM, 0, mp->m_fsname); + if (!mp->m_unwritten_workqueue) + goto out_destroy_data_iodone_queue; + + mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", + WQ_MEM_RECLAIM, 0, mp->m_fsname); + if (!mp->m_cil_workqueue) + goto out_destroy_unwritten; + return 0; + +out_destroy_unwritten: + destroy_workqueue(mp->m_unwritten_workqueue); +out_destroy_data_iodone_queue: + destroy_workqueue(mp->m_data_workqueue); +out: + return -ENOMEM; +} + +STATIC void +xfs_destroy_mount_workqueues( + struct xfs_mount *mp) +{ + destroy_workqueue(mp->m_cil_workqueue); + destroy_workqueue(mp->m_data_workqueue); + destroy_workqueue(mp->m_unwritten_workqueue); +} + /* Catch misguided souls that try to use this interface on XFS */ STATIC struct inode * xfs_fs_alloc_inode( @@ -837,105 +863,64 @@ xfs_fs_inode_init_once( /* xfs inode */ atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); - init_waitqueue_head(&ip->i_ipin_wait); - /* - * Because we want to use a counting completion, complete - * the flush completion once to allow a single access to - * the flush completion without blocking. - */ - init_completion(&ip->i_flush); - complete(&ip->i_flush); mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, "xfsino", ip->i_ino); } /* - * Dirty the XFS inode when mark_inode_dirty_sync() is called so that - * we catch unlogged VFS level updates to the inode. + * This is called by the VFS when dirtying inode metadata. This can happen + * for a few reasons, but we only care about timestamp updates, given that + * we handled the rest ourselves. In theory no other calls should happen, + * but for example generic_write_end() keeps dirtying the inode after + * updating i_size. Thus we check that the flags are exactly I_DIRTY_SYNC, + * and skip this call otherwise. * - * We need the barrier() to maintain correct ordering between unlogged - * updates and the transaction commit code that clears the i_update_core - * field. This requires all updates to be completed before marking the - * inode dirty. + * We'll hopefull get a different method just for updating timestamps soon, + * at which point this hack can go away, and maybe we'll also get real + * error handling here. */ STATIC void xfs_fs_dirty_inode( - struct inode *inode, - int flags) -{ - barrier(); - XFS_I(inode)->i_update_core = 1; -} - -STATIC int -xfs_fs_write_inode( struct inode *inode, - struct writeback_control *wbc) + int flags) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - int error = EAGAIN; + struct xfs_trans *tp; + int error; - trace_xfs_write_inode(ip); + if (flags != I_DIRTY_SYNC) + return; - if (XFS_FORCED_SHUTDOWN(mp)) - return -XFS_ERROR(EIO); + trace_xfs_dirty_inode(ip); - if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) { - /* - * Make sure the inode has made it it into the log. Instead - * of forcing it all the way to stable storage using a - * synchronous transaction we let the log force inside the - * ->sync_fs call do that for thus, which reduces the number - * of synchronous log forces dramatically. - */ - error = xfs_log_dirty_inode(ip, NULL, 0); - if (error) - goto out; - return 0; - } else { - if (!ip->i_update_core) - return 0; - - /* - * We make this non-blocking if the inode is contended, return - * EAGAIN to indicate to the caller that they did not succeed. - * This prevents the flush path from blocking on inodes inside - * another operation right now, they get caught later by - * xfs_sync. - */ - if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) - goto out; - - if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) - goto out_unlock; - - /* - * Now we have the flush lock and the inode is not pinned, we - * can check if the inode is really clean as we know that - * there are no pending transaction completions, it is not - * waiting on the delayed write queue and there is no IO in - * progress. - */ - if (xfs_inode_clean(ip)) { - xfs_ifunlock(ip); - error = 0; - goto out_unlock; - } - error = xfs_iflush(ip, SYNC_TRYLOCK); + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + goto trouble; } - - out_unlock: - xfs_iunlock(ip, XFS_ILOCK_SHARED); - out: + xfs_ilock(ip, XFS_ILOCK_EXCL); /* - * if we failed to write out the inode then mark - * it dirty again so we'll try again later. + * Grab all the latest timestamps from the Linux inode. */ + ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; + ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; + ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec; + ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec; + + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); + error = xfs_trans_commit(tp, 0); if (error) - xfs_mark_inode_dirty_sync(ip); - return -error; + goto trouble; + return; + +trouble: + xfs_warn(mp, "failed to update timestamps for inode 0x%llx", ip->i_ino); } STATIC void @@ -947,7 +932,7 @@ xfs_fs_evict_inode( trace_xfs_evict_inode(ip); truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); XFS_STATS_INC(vn_rele); XFS_STATS_INC(vn_remove); XFS_STATS_DEC(vn_active); @@ -971,6 +956,22 @@ xfs_fs_evict_inode( xfs_inactive(ip); } +/* + * We do an unlocked check for XFS_IDONTCACHE here because we are already + * serialised against cache hits here via the inode->i_lock and igrab() in + * xfs_iget_cache_hit(). Hence a lookup that might clear this flag will not be + * racing with us, and it avoids needing to grab a spinlock here for every inode + * we drop the final reference on. + */ +STATIC int +xfs_fs_drop_inode( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + + return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE); +} + STATIC void xfs_free_fsname( struct xfs_mount *mp) @@ -986,20 +987,12 @@ xfs_fs_put_super( { struct xfs_mount *mp = XFS_M(sb); - xfs_syncd_stop(mp); - - /* - * Blow away any referenced inode in the filestreams cache. - * This can and will cause log traffic as inodes go inactive - * here. - */ xfs_filestream_unmount(mp); - - xfs_flush_buftarg(mp->m_ddev_targp, 1); - xfs_unmountfs(mp); + xfs_syncd_stop(mp); xfs_freesb(mp); xfs_icsb_destroy_counters(mp); + xfs_destroy_mount_workqueues(mp); xfs_close_devices(mp); xfs_free_fsname(mp); kfree(mp); @@ -1014,17 +1007,10 @@ xfs_fs_sync_fs( int error; /* - * Not much we can do for the first async pass. Writing out the - * superblock would be counter-productive as we are going to redirty - * when writing out other data and metadata (and writing out a single - * block is quite fast anyway). - * - * Try to asynchronously kick off quota syncing at least. + * Doing anything during the async pass would be counterproductive. */ - if (!wait) { - xfs_qm_sync(mp, SYNC_TRYLOCK); + if (!wait) return 0; - } error = xfs_quiesce_data(mp); if (error) @@ -1083,7 +1069,7 @@ xfs_fs_statfs( spin_unlock(&mp->m_sb_lock); - if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) || + if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) == (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD)) xfs_qm_statvfs(ip, statp); @@ -1238,9 +1224,9 @@ xfs_fs_unfreeze( STATIC int xfs_fs_show_options( struct seq_file *m, - struct vfsmount *mnt) + struct dentry *root) { - return -xfs_showargs(XFS_M(mnt->mnt_sb), m); + return -xfs_showargs(XFS_M(root->d_sb), m); } /* @@ -1333,10 +1319,14 @@ xfs_fs_fill_super( if (error) goto out_free_fsname; - error = xfs_icsb_init_counters(mp); + error = xfs_init_mount_workqueues(mp); if (error) goto out_close_devices; + error = xfs_icsb_init_counters(mp); + if (error) + goto out_destroy_workqueues; + error = xfs_readsb(mp, flags); if (error) goto out_destroy_counters; @@ -1365,40 +1355,44 @@ xfs_fs_fill_super( sb->s_blocksize = mp->m_sb.sb_blocksize; sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); + sb->s_max_links = XFS_MAXLINK; sb->s_time_gran = 1; set_posix_acl_flag(sb); - error = xfs_mountfs(mp); + error = xfs_syncd_init(mp); if (error) goto out_filestream_unmount; - error = xfs_syncd_init(mp); + error = xfs_mountfs(mp); if (error) - goto out_unmount; + goto out_syncd_stop; root = igrab(VFS_I(mp->m_rootip)); if (!root) { error = ENOENT; - goto out_syncd_stop; + goto out_unmount; } if (is_bad_inode(root)) { error = EINVAL; - goto out_syncd_stop; + goto out_unmount; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { error = ENOMEM; - goto out_iput; + goto out_unmount; } return 0; - + out_syncd_stop: + xfs_syncd_stop(mp); out_filestream_unmount: xfs_filestream_unmount(mp); out_free_sb: xfs_freesb(mp); out_destroy_counters: xfs_icsb_destroy_counters(mp); +out_destroy_workqueues: + xfs_destroy_mount_workqueues(mp); out_close_devices: xfs_close_devices(mp); out_free_fsname: @@ -1407,21 +1401,10 @@ xfs_fs_fill_super( out: return -error; - out_iput: - iput(root); - out_syncd_stop: - xfs_syncd_stop(mp); out_unmount: - /* - * Blow away any referenced inode in the filestreams cache. - * This can and will cause log traffic as inodes go inactive - * here. - */ xfs_filestream_unmount(mp); - - xfs_flush_buftarg(mp->m_ddev_targp, 1); - xfs_unmountfs(mp); + xfs_syncd_stop(mp); goto out_free_sb; } @@ -1454,8 +1437,8 @@ static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, .dirty_inode = xfs_fs_dirty_inode, - .write_inode = xfs_fs_write_inode, .evict_inode = xfs_fs_evict_inode, + .drop_inode = xfs_fs_drop_inode, .put_super = xfs_fs_put_super, .sync_fs = xfs_fs_sync_fs, .freeze_fs = xfs_fs_freeze, @@ -1621,20 +1604,36 @@ STATIC int __init xfs_init_workqueues(void) { /* - * max_active is set to 8 to give enough concurency to allow - * multiple work operations on each CPU to run. This allows multiple - * filesystems to be running sync work concurrently, and scales with - * the number of CPUs in the system. + * We never want to the same work item to run twice, reclaiming inodes + * or idling the log is not going to get any faster by multiple CPUs + * competing for ressources. Use the default large max_active value + * so that even lots of filesystems can perform these task in parallel. */ - xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8); + xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0); if (!xfs_syncd_wq) return -ENOMEM; + + /* + * The allocation workqueue can be used in memory reclaim situations + * (writepage path), and parallelism is only limited by the number of + * AGs in all the filesystems mounted. Hence use the default large + * max_active value for this workqueue. + */ + xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0); + if (!xfs_alloc_wq) + goto out_destroy_syncd; + return 0; + +out_destroy_syncd: + destroy_workqueue(xfs_syncd_wq); + return -ENOMEM; } STATIC void xfs_destroy_workqueues(void) { + destroy_workqueue(xfs_alloc_wq); destroy_workqueue(xfs_syncd_wq); } @@ -1676,13 +1675,17 @@ init_xfs_fs(void) if (error) goto out_cleanup_procfs; - vfs_initquota(); + error = xfs_qm_init(); + if (error) + goto out_sysctl_unregister; error = register_filesystem(&xfs_fs_type); if (error) - goto out_sysctl_unregister; + goto out_qm_exit; return 0; + out_qm_exit: + xfs_qm_exit(); out_sysctl_unregister: xfs_sysctl_unregister(); out_cleanup_procfs: @@ -1704,7 +1707,7 @@ init_xfs_fs(void) STATIC void __exit exit_xfs_fs(void) { - vfs_exitquota(); + xfs_qm_exit(); unregister_filesystem(&xfs_fs_type); xfs_sysctl_unregister(); xfs_cleanup_procfs(); diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 50a3266c999..09b0c26b224 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -21,13 +21,11 @@ #include <linux/exportfs.h> #ifdef CONFIG_XFS_QUOTA -extern void xfs_qm_init(void); +extern int xfs_qm_init(void); extern void xfs_qm_exit(void); -# define vfs_initquota() xfs_qm_init() -# define vfs_exitquota() xfs_qm_exit() #else -# define vfs_initquota() do { } while (0) -# define vfs_exitquota() do { } while (0) +# define xfs_qm_init() (0) +# define xfs_qm_exit() do { } while (0) #endif #ifdef CONFIG_XFS_POSIX_ACL diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index f0994aedcd1..1e9ee064dbb 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -18,7 +18,6 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" @@ -241,45 +240,6 @@ xfs_sync_inode_data( return error; } -STATIC int -xfs_sync_inode_attr( - struct xfs_inode *ip, - struct xfs_perag *pag, - int flags) -{ - int error = 0; - - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_inode_clean(ip)) - goto out_unlock; - if (!xfs_iflock_nowait(ip)) { - if (!(flags & SYNC_WAIT)) - goto out_unlock; - xfs_iflock(ip); - } - - if (xfs_inode_clean(ip)) { - xfs_ifunlock(ip); - goto out_unlock; - } - - error = xfs_iflush(ip, flags); - - /* - * We don't want to try again on non-blocking flushes that can't run - * again immediately. If an inode really must be written, then that's - * what the SYNC_WAIT flag is for. - */ - if (error == EAGAIN) { - ASSERT(!(flags & SYNC_WAIT)); - error = 0; - } - - out_unlock: - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; -} - /* * Write out pagecache data for the whole filesystem. */ @@ -300,19 +260,6 @@ xfs_sync_data( return 0; } -/* - * Write out inode metadata (attributes) for the whole filesystem. - */ -STATIC int -xfs_sync_attr( - struct xfs_mount *mp, - int flags) -{ - ASSERT((flags & ~SYNC_WAIT) == 0); - - return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags); -} - STATIC int xfs_sync_fsdata( struct xfs_mount *mp) @@ -336,32 +283,6 @@ xfs_sync_fsdata( return error; } -int -xfs_log_dirty_inode( - struct xfs_inode *ip, - struct xfs_perag *pag, - int flags) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; - int error; - - if (!ip->i_update_core) - return 0; - - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return error; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - return xfs_trans_commit(tp, 0); -} - /* * When remounting a filesystem read-only or freezing the filesystem, we have * two phases to execute. This first phase is syncing the data before we @@ -376,7 +297,7 @@ xfs_log_dirty_inode( * First stage of freeze - no writers will make progress now we are here, * so we flush delwri and delalloc buffers here, then wait for all I/O to * complete. Data is frozen at that point. Metadata is not frozen, - * transactions can still occur here so don't bother flushing the buftarg + * transactions can still occur here so don't bother emptying the AIL * because it'll just get dirty again. */ int @@ -385,66 +306,19 @@ xfs_quiesce_data( { int error, error2 = 0; - /* - * Log all pending size and timestamp updates. The vfs writeback - * code is supposed to do this, but due to its overagressive - * livelock detection it will skip inodes where appending writes - * were written out in the first non-blocking sync phase if their - * completion took long enough that it happened after taking the - * timestamp for the cut-off in the blocking phase. - */ - xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0); - - xfs_qm_sync(mp, SYNC_TRYLOCK); - xfs_qm_sync(mp, SYNC_WAIT); - - /* force out the newly dirtied log buffers */ + /* force out the log */ xfs_log_force(mp, XFS_LOG_SYNC); /* write superblock and hoover up shutdown errors */ error = xfs_sync_fsdata(mp); - /* make sure all delwri buffers are written out */ - xfs_flush_buftarg(mp->m_ddev_targp, 1); - /* mark the log as covered if needed */ if (xfs_log_need_covered(mp)) error2 = xfs_fs_log_dummy(mp); - /* flush data-only devices */ - if (mp->m_rtdev_targp) - xfs_flush_buftarg(mp->m_rtdev_targp, 1); - return error ? error : error2; } -STATIC void -xfs_quiesce_fs( - struct xfs_mount *mp) -{ - int count = 0, pincount; - - xfs_reclaim_inodes(mp, 0); - xfs_flush_buftarg(mp->m_ddev_targp, 0); - - /* - * This loop must run at least twice. The first instance of the loop - * will flush most meta data but that will generate more meta data - * (typically directory updates). Which then must be flushed and - * logged before we can write the unmount record. We also so sync - * reclaim of inodes to catch any that the above delwri flush skipped. - */ - do { - xfs_reclaim_inodes(mp, SYNC_WAIT); - xfs_sync_attr(mp, SYNC_WAIT); - pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); - if (!pincount) { - delay(50); - count++; - } - } while (count < 2); -} - /* * Second stage of a quiesce. The data is already synced, now we have to take * care of the metadata. New transactions are already blocked, so we need to @@ -460,8 +334,12 @@ xfs_quiesce_attr( while (atomic_read(&mp->m_active_trans) > 0) delay(100); - /* flush inodes and push all remaining buffers out to disk */ - xfs_quiesce_fs(mp); + /* reclaim inodes to do any IO before the freeze completes */ + xfs_reclaim_inodes(mp, 0); + xfs_reclaim_inodes(mp, SYNC_WAIT); + + /* flush all pending changes from the AIL */ + xfs_ail_push_all_sync(mp->m_ail); /* * Just warn here till VFS can correctly support @@ -475,7 +353,12 @@ xfs_quiesce_attr( xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " "Frozen image may not be consistent."); xfs_log_unmount_write(mp); - xfs_unmountfs_writesb(mp); + + /* + * At this point we might have modified the superblock again and thus + * added an item to the AIL, thus flush it again. + */ + xfs_ail_push_all_sync(mp->m_ail); } static void @@ -499,16 +382,26 @@ xfs_sync_worker( struct xfs_mount, m_sync_work); int error; - if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + /* + * We shouldn't write/force the log if we are in the mount/unmount + * process or on a read only filesystem. The workqueue still needs to be + * active in both cases, however, because it is used for inode reclaim + * during these times. Use the MS_ACTIVE flag to avoid doing anything + * during mount. Doing work during unmount is avoided by calling + * cancel_delayed_work_sync on this work queue before tearing down + * the ail and the log in xfs_log_unmount. + */ + if (!(mp->m_super->s_flags & MS_ACTIVE) && + !(mp->m_flags & XFS_MOUNT_RDONLY)) { /* dgc: errors ignored here */ if (mp->m_super->s_frozen == SB_UNFROZEN && xfs_log_need_covered(mp)) error = xfs_fs_log_dummy(mp); else xfs_log_force(mp, 0); - error = xfs_qm_sync(mp, SYNC_TRYLOCK); - /* start pushing all the metadata that is currently dirty */ + /* start pushing all the metadata that is currently + * dirty */ xfs_ail_push_all(mp->m_ail); } @@ -528,14 +421,6 @@ xfs_syncd_queue_reclaim( struct xfs_mount *mp) { - /* - * We can have inodes enter reclaim after we've shut down the syncd - * workqueue during unmount, so don't allow reclaim work to be queued - * during unmount. - */ - if (!(mp->m_super->s_flags & MS_ACTIVE)) - return; - rcu_read_lock(); if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, @@ -604,7 +489,6 @@ xfs_syncd_init( INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); xfs_syncd_queue_sync(mp); - xfs_syncd_queue_reclaim(mp); return 0; } @@ -711,14 +595,13 @@ xfs_reclaim_inode_grab( return 1; /* - * do some unlocked checks first to avoid unnecessary lock traffic. - * The first is a flush lock check, the second is a already in reclaim - * check. Only do these checks if we are not going to block on locks. + * If we are asked for non-blocking operation, do unlocked checks to + * see if the inode already is being flushed or in reclaim to avoid + * lock traffic. */ if ((flags & SYNC_TRYLOCK) && - (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) { + __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM)) return 1; - } /* * The radix tree lock here protects a thread in xfs_iget from racing @@ -743,11 +626,8 @@ xfs_reclaim_inode_grab( } /* - * Inodes in different states need to be treated differently, and the return - * value of xfs_iflush is not sufficient to get this right. The following table - * lists the inode states and the reclaim actions necessary for non-blocking - * reclaim: - * + * Inodes in different states need to be treated differently. The following + * table lists the inode states and the reclaim actions necessary: * * inode state iflush ret required action * --------------- ---------- --------------- @@ -757,39 +637,31 @@ xfs_reclaim_inode_grab( * stale, unpinned 0 reclaim * clean, pinned(*) 0 requeue * stale, pinned EAGAIN requeue - * dirty, delwri ok 0 requeue - * dirty, delwri blocked EAGAIN requeue - * dirty, sync flush 0 reclaim + * dirty, async - requeue + * dirty, sync 0 reclaim * * (*) dgc: I don't think the clean, pinned state is possible but it gets * handled anyway given the order of checks implemented. * - * As can be seen from the table, the return value of xfs_iflush() is not - * sufficient to correctly decide the reclaim action here. The checks in - * xfs_iflush() might look like duplicates, but they are not. - * * Also, because we get the flush lock first, we know that any inode that has * been flushed delwri has had the flush completed by the time we check that - * the inode is clean. The clean inode check needs to be done before flushing - * the inode delwri otherwise we would loop forever requeuing clean inodes as - * we cannot tell apart a successful delwri flush and a clean inode from the - * return value of xfs_iflush(). + * the inode is clean. * - * Note that because the inode is flushed delayed write by background - * writeback, the flush lock may already be held here and waiting on it can - * result in very long latencies. Hence for sync reclaims, where we wait on the - * flush lock, the caller should push out delayed write inodes first before - * trying to reclaim them to minimise the amount of time spent waiting. For - * background relaim, we just requeue the inode for the next pass. + * Note that because the inode is flushed delayed write by AIL pushing, the + * flush lock may already be held here and waiting on it can result in very + * long latencies. Hence for sync reclaims, where we wait on the flush lock, + * the caller should push the AIL first before trying to reclaim inodes to + * minimise the amount of time spent waiting. For background relaim, we only + * bother to reclaim clean inodes anyway. * * Hence the order of actions after gaining the locks should be: * bad => reclaim * shutdown => unpin and reclaim - * pinned, delwri => requeue + * pinned, async => requeue * pinned, sync => unpin * stale => reclaim * clean => reclaim - * dirty, delwri => flush and requeue + * dirty, async => requeue * dirty, sync => flush, wait and reclaim */ STATIC int @@ -798,7 +670,8 @@ xfs_reclaim_inode( struct xfs_perag *pag, int sync_mode) { - int error; + struct xfs_buf *bp = NULL; + int error; restart: error = 0; @@ -806,17 +679,6 @@ restart: if (!xfs_iflock_nowait(ip)) { if (!(sync_mode & SYNC_WAIT)) goto out; - - /* - * If we only have a single dirty inode in a cluster there is - * a fair chance that the AIL push may have pushed it into - * the buffer, but xfsbufd won't touch it until 30 seconds - * from now, and thus we will lock up here. - * - * Promote the inode buffer to the front of the delwri list - * and wake up xfsbufd now. - */ - xfs_promote_inode(ip); xfs_iflock(ip); } @@ -824,13 +686,12 @@ restart: goto reclaim; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); + xfs_iflush_abort(ip, false); goto reclaim; } if (xfs_ipincount(ip)) { - if (!(sync_mode & SYNC_WAIT)) { - xfs_ifunlock(ip); - goto out; - } + if (!(sync_mode & SYNC_WAIT)) + goto out_ifunlock; xfs_iunpin_wait(ip); } if (xfs_iflags_test(ip, XFS_ISTALE)) @@ -839,60 +700,42 @@ restart: goto reclaim; /* + * Never flush out dirty data during non-blocking reclaim, as it would + * just contend with AIL pushing trying to do the same job. + */ + if (!(sync_mode & SYNC_WAIT)) + goto out_ifunlock; + + /* * Now we have an inode that needs flushing. * - * We do a nonblocking flush here even if we are doing a SYNC_WAIT - * reclaim as we can deadlock with inode cluster removal. + * Note that xfs_iflush will never block on the inode buffer lock, as * xfs_ifree_cluster() can lock the inode buffer before it locks the - * ip->i_lock, and we are doing the exact opposite here. As a result, - * doing a blocking xfs_itobp() to get the cluster buffer will result + * ip->i_lock, and we are doing the exact opposite here. As a result, + * doing a blocking xfs_itobp() to get the cluster buffer would result * in an ABBA deadlock with xfs_ifree_cluster(). * * As xfs_ifree_cluser() must gather all inodes that are active in the * cache to mark them stale, if we hit this case we don't actually want * to do IO here - we want the inode marked stale so we can simply - * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush, - * just unlock the inode, back off and try again. Hopefully the next - * pass through will see the stale flag set on the inode. + * reclaim it. Hence if we get an EAGAIN error here, just unlock the + * inode, back off and try again. Hopefully the next pass through will + * see the stale flag set on the inode. */ - error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode); - if (sync_mode & SYNC_WAIT) { - if (error == EAGAIN) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* backoff longer than in xfs_ifree_cluster */ - delay(2); - goto restart; - } - xfs_iflock(ip); - goto reclaim; + error = xfs_iflush(ip, &bp); + if (error == EAGAIN) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* backoff longer than in xfs_ifree_cluster */ + delay(2); + goto restart; } - /* - * When we have to flush an inode but don't have SYNC_WAIT set, we - * flush the inode out using a delwri buffer and wait for the next - * call into reclaim to find it in a clean state instead of waiting for - * it now. We also don't return errors here - if the error is transient - * then the next reclaim pass will flush the inode, and if the error - * is permanent then the next sync reclaim will reclaim the inode and - * pass on the error. - */ - if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_warn(ip->i_mount, - "inode 0x%llx background reclaim flush failed with %d", - (long long)ip->i_ino, error); + if (!error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); } -out: - xfs_iflags_clear(ip, XFS_IRECLAIM); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* - * We could return EAGAIN here to make reclaim rescan the inode tree in - * a short while. However, this just burns CPU time scanning the tree - * waiting for IO to complete and xfssyncd never goes back to the idle - * state. Instead, return 0 to let the next scheduled background reclaim - * attempt to reclaim the inode again. - */ - return 0; + xfs_iflock(ip); reclaim: xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -918,17 +761,28 @@ reclaim: * can reference the inodes in the cache without taking references. * * We make that OK here by ensuring that we wait until the inode is - * unlocked after the lookup before we go ahead and free it. We get - * both the ilock and the iolock because the code may need to drop the - * ilock one but will still hold the iolock. + * unlocked after the lookup before we go ahead and free it. */ - xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_qm_dqdetach(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_inode_free(ip); return error; +out_ifunlock: + xfs_ifunlock(ip); +out: + xfs_iflags_clear(ip, XFS_IRECLAIM); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* + * We could return EAGAIN here to make reclaim rescan the inode tree in + * a short while. However, this just burns CPU time scanning the tree + * waiting for IO to complete and xfssyncd never goes back to the idle + * state. Instead, return 0 to let the next scheduled background reclaim + * attempt to reclaim the inode again. + */ + return 0; } /* diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h index fa965479d78..941202e7ac6 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_sync.h @@ -34,8 +34,6 @@ void xfs_quiesce_attr(struct xfs_mount *mp); void xfs_flush_inodes(struct xfs_inode *ip); -int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags); - int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); int xfs_reclaim_inodes_count(struct xfs_mount *mp); void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 9010ce885e6..624bedd8135 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 49403579887..caf5dabfd55 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -32,7 +32,7 @@ struct xfs_da_node_entry; struct xfs_dquot; struct xfs_log_item; struct xlog_ticket; -struct log; +struct xlog; struct xlog_recover; struct xlog_recover_item; struct xfs_buf_log_format; @@ -281,7 +281,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_daddr_t, bno) - __field(size_t, buffer_length) + __field(int, nblks) __field(int, hold) __field(int, pincount) __field(unsigned, lockval) @@ -291,18 +291,18 @@ DECLARE_EVENT_CLASS(xfs_buf_class, TP_fast_assign( __entry->dev = bp->b_target->bt_dev; __entry->bno = bp->b_bn; - __entry->buffer_length = bp->b_buffer_length; + __entry->nblks = bp->b_length; __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; __entry->flags = bp->b_flags; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d " "lock %d flags %s caller %pf", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, - __entry->buffer_length, + __entry->nblks, __entry->hold, __entry->pincount, __entry->lockval, @@ -328,7 +328,7 @@ DEFINE_BUF_EVENT(xfs_buf_unlock); DEFINE_BUF_EVENT(xfs_buf_iowait); DEFINE_BUF_EVENT(xfs_buf_iowait_done); DEFINE_BUF_EVENT(xfs_buf_delwri_queue); -DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue); +DEFINE_BUF_EVENT(xfs_buf_delwri_queued); DEFINE_BUF_EVENT(xfs_buf_delwri_split); DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_bdstrat_shut); @@ -362,7 +362,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class, TP_fast_assign( __entry->dev = bp->b_target->bt_dev; __entry->bno = bp->b_bn; - __entry->buffer_length = bp->b_buffer_length; + __entry->buffer_length = BBTOB(bp->b_length); __entry->flags = flags; __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); @@ -406,7 +406,7 @@ TRACE_EVENT(xfs_buf_ioerror, TP_fast_assign( __entry->dev = bp->b_target->bt_dev; __entry->bno = bp->b_bn; - __entry->buffer_length = bp->b_buffer_length; + __entry->buffer_length = BBTOB(bp->b_length); __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; @@ -450,7 +450,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, __entry->bli_recur = bip->bli_recur; __entry->bli_refcount = atomic_read(&bip->bli_refcount); __entry->buf_bno = bip->bli_buf->b_bn; - __entry->buf_len = bip->bli_buf->b_buffer_length; + __entry->buf_len = BBTOB(bip->bli_buf->b_length); __entry->buf_flags = bip->bli_buf->b_flags; __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); @@ -486,12 +486,10 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_trylock); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf); DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur); DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb); @@ -580,7 +578,7 @@ DEFINE_INODE_EVENT(xfs_ioctl_setattr); DEFINE_INODE_EVENT(xfs_dir_fsync); DEFINE_INODE_EVENT(xfs_file_fsync); DEFINE_INODE_EVENT(xfs_destroy_inode); -DEFINE_INODE_EVENT(xfs_write_inode); +DEFINE_INODE_EVENT(xfs_dirty_inode); DEFINE_INODE_EVENT(xfs_evict_inode); DEFINE_INODE_EVENT(xfs_dquot_dqalloc); @@ -627,16 +625,19 @@ DECLARE_EVENT_CLASS(xfs_namespace_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, dp_ino) + __field(int, namelen) __dynamic_array(char, name, name->len) ), TP_fast_assign( __entry->dev = VFS_I(dp)->i_sb->s_dev; __entry->dp_ino = dp->i_ino; + __entry->namelen = name->len; memcpy(__get_str(name), name->name, name->len); ), - TP_printk("dev %d:%d dp ino 0x%llx name %s", + TP_printk("dev %d:%d dp ino 0x%llx name %.*s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->dp_ino, + __entry->namelen, __get_str(name)) ) @@ -658,6 +659,8 @@ TRACE_EVENT(xfs_rename, __field(dev_t, dev) __field(xfs_ino_t, src_dp_ino) __field(xfs_ino_t, target_dp_ino) + __field(int, src_namelen) + __field(int, target_namelen) __dynamic_array(char, src_name, src_name->len) __dynamic_array(char, target_name, target_name->len) ), @@ -665,15 +668,20 @@ TRACE_EVENT(xfs_rename, __entry->dev = VFS_I(src_dp)->i_sb->s_dev; __entry->src_dp_ino = src_dp->i_ino; __entry->target_dp_ino = target_dp->i_ino; + __entry->src_namelen = src_name->len; + __entry->target_namelen = target_name->len; memcpy(__get_str(src_name), src_name->name, src_name->len); - memcpy(__get_str(target_name), target_name->name, target_name->len); + memcpy(__get_str(target_name), target_name->name, + target_name->len); ), TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx" - " src name %s target name %s", + " src name %.*s target name %.*s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->src_dp_ino, __entry->target_dp_ino, + __entry->src_namelen, __get_str(src_name), + __entry->target_namelen, __get_str(target_name)) ) @@ -733,21 +741,18 @@ DEFINE_EVENT(xfs_dquot_class, name, \ DEFINE_DQUOT_EVENT(xfs_dqadjust); DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty); -DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_busy); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_done); DEFINE_DQUOT_EVENT(xfs_dqattach_found); DEFINE_DQUOT_EVENT(xfs_dqattach_get); -DEFINE_DQUOT_EVENT(xfs_dqinit); -DEFINE_DQUOT_EVENT(xfs_dqreuse); DEFINE_DQUOT_EVENT(xfs_dqalloc); DEFINE_DQUOT_EVENT(xfs_dqtobp_read); DEFINE_DQUOT_EVENT(xfs_dqread); DEFINE_DQUOT_EVENT(xfs_dqread_fail); -DEFINE_DQUOT_EVENT(xfs_dqlookup_found); -DEFINE_DQUOT_EVENT(xfs_dqlookup_want); -DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist); -DEFINE_DQUOT_EVENT(xfs_dqlookup_done); DEFINE_DQUOT_EVENT(xfs_dqget_hit); DEFINE_DQUOT_EVENT(xfs_dqget_miss); +DEFINE_DQUOT_EVENT(xfs_dqget_freeing); +DEFINE_DQUOT_EVENT(xfs_dqget_dup); DEFINE_DQUOT_EVENT(xfs_dqput); DEFINE_DQUOT_EVENT(xfs_dqput_wait); DEFINE_DQUOT_EVENT(xfs_dqput_free); @@ -757,7 +762,7 @@ DEFINE_DQUOT_EVENT(xfs_dqflush_force); DEFINE_DQUOT_EVENT(xfs_dqflush_done); DECLARE_EVENT_CLASS(xfs_loggrant_class, - TP_PROTO(struct log *log, struct xlog_ticket *tic), + TP_PROTO(struct xlog *log, struct xlog_ticket *tic), TP_ARGS(log, tic), TP_STRUCT__entry( __field(dev_t, dev) @@ -785,12 +790,12 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, __entry->curr_res = tic->t_curr_res; __entry->unit_res = tic->t_unit_res; __entry->flags = tic->t_flags; - __entry->reserveq = list_empty(&log->l_reserveq); - __entry->writeq = list_empty(&log->l_writeq); - xlog_crack_grant_head(&log->l_grant_reserve_head, + __entry->reserveq = list_empty(&log->l_reserve_head.waiters); + __entry->writeq = list_empty(&log->l_write_head.waiters); + xlog_crack_grant_head(&log->l_reserve_head.grant, &__entry->grant_reserve_cycle, &__entry->grant_reserve_bytes); - xlog_crack_grant_head(&log->l_grant_write_head, + xlog_crack_grant_head(&log->l_write_head.grant, &__entry->grant_write_cycle, &__entry->grant_write_bytes); __entry->curr_cycle = log->l_curr_cycle; @@ -825,24 +830,18 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, #define DEFINE_LOGGRANT_EVENT(name) \ DEFINE_EVENT(xfs_loggrant_class, name, \ - TP_PROTO(struct log *log, struct xlog_ticket *tic), \ + TP_PROTO(struct xlog *log, struct xlog_ticket *tic), \ TP_ARGS(log, tic)) DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm); DEFINE_LOGGRANT_EVENT(xfs_log_done_perm); -DEFINE_LOGGRANT_EVENT(xfs_log_reserve); DEFINE_LOGGRANT_EVENT(xfs_log_umount_write); -DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter); -DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit); -DEFINE_LOGGRANT_EVENT(xfs_log_grant_error); DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up); +DEFINE_LOGGRANT_EVENT(xfs_log_reserve); +DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); @@ -875,15 +874,30 @@ DECLARE_EVENT_CLASS(xfs_log_item_class, __print_flags(__entry->flags, "|", XFS_LI_FLAGS)) ) +TRACE_EVENT(xfs_log_force, + TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn), + TP_ARGS(mp, lsn), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->lsn = lsn; + ), + TP_printk("dev %d:%d lsn 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lsn) +) + #define DEFINE_LOG_ITEM_EVENT(name) \ DEFINE_EVENT(xfs_log_item_class, name, \ TP_PROTO(struct xfs_log_item *lip), \ TP_ARGS(lip)) DEFINE_LOG_ITEM_EVENT(xfs_ail_push); -DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf); -DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf_pinned); DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned); DEFINE_LOG_ITEM_EVENT(xfs_ail_locked); +DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing); DECLARE_EVENT_CLASS(xfs_file_class, @@ -893,7 +907,6 @@ DECLARE_EVENT_CLASS(xfs_file_class, __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_fsize_t, size) - __field(xfs_fsize_t, new_size) __field(loff_t, offset) __field(size_t, count) __field(int, flags) @@ -902,17 +915,15 @@ DECLARE_EVENT_CLASS(xfs_file_class, __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->size = ip->i_d.di_size; - __entry->new_size = ip->i_new_size; __entry->offset = offset; __entry->count = count; __entry->flags = flags; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " + TP_printk("dev %d:%d ino 0x%llx size 0x%llx " "offset 0x%llx count 0x%zx ioflags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, - __entry->new_size, __entry->offset, __entry->count, __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) @@ -980,7 +991,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __field(dev_t, dev) __field(xfs_ino_t, ino) __field(loff_t, size) - __field(loff_t, new_size) __field(loff_t, offset) __field(size_t, count) __field(int, type) @@ -992,7 +1002,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->size = ip->i_d.di_size; - __entry->new_size = ip->i_new_size; __entry->offset = offset; __entry->count = count; __entry->type = type; @@ -1000,13 +1009,11 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->startblock = irec ? irec->br_startblock : 0; __entry->blockcount = irec ? irec->br_blockcount : 0; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " - "offset 0x%llx count %zd type %s " - "startoff 0x%llx startblock %lld blockcount 0x%llx", + TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd " + "type %s startoff 0x%llx startblock %lld blockcount 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, - __entry->new_size, __entry->offset, __entry->count, __print_symbolic(__entry->type, XFS_IO_TYPES), @@ -1033,26 +1040,23 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class, __field(xfs_ino_t, ino) __field(loff_t, isize) __field(loff_t, disize) - __field(loff_t, new_size) __field(loff_t, offset) __field(size_t, count) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->isize = ip->i_size; + __entry->isize = VFS_I(ip)->i_size; __entry->disize = ip->i_d.di_size; - __entry->new_size = ip->i_new_size; __entry->offset = offset; __entry->count = count; ), - TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx new_size 0x%llx " + TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx " "offset 0x%llx count %zd", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->isize, __entry->disize, - __entry->new_size, __entry->offset, __entry->count) ); @@ -1092,8 +1096,8 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class, DEFINE_EVENT(xfs_itrunc_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \ TP_ARGS(ip, new_size)) -DEFINE_ITRUNC_EVENT(xfs_itruncate_data_start); -DEFINE_ITRUNC_EVENT(xfs_itruncate_data_end); +DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start); +DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end); TRACE_EVENT(xfs_pagecache_inval, TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish), @@ -1154,7 +1158,7 @@ TRACE_EVENT(xfs_bunmap, ); -DECLARE_EVENT_CLASS(xfs_busy_class, +DECLARE_EVENT_CLASS(xfs_extent_busy_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t len), TP_ARGS(mp, agno, agbno, len), @@ -1177,17 +1181,17 @@ DECLARE_EVENT_CLASS(xfs_busy_class, __entry->len) ); #define DEFINE_BUSY_EVENT(name) \ -DEFINE_EVENT(xfs_busy_class, name, \ +DEFINE_EVENT(xfs_extent_busy_class, name, \ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ xfs_agblock_t agbno, xfs_extlen_t len), \ TP_ARGS(mp, agno, agbno, len)) -DEFINE_BUSY_EVENT(xfs_alloc_busy); -DEFINE_BUSY_EVENT(xfs_alloc_busy_enomem); -DEFINE_BUSY_EVENT(xfs_alloc_busy_force); -DEFINE_BUSY_EVENT(xfs_alloc_busy_reuse); -DEFINE_BUSY_EVENT(xfs_alloc_busy_clear); +DEFINE_BUSY_EVENT(xfs_extent_busy); +DEFINE_BUSY_EVENT(xfs_extent_busy_enomem); +DEFINE_BUSY_EVENT(xfs_extent_busy_force); +DEFINE_BUSY_EVENT(xfs_extent_busy_reuse); +DEFINE_BUSY_EVENT(xfs_extent_busy_clear); -TRACE_EVENT(xfs_alloc_busy_trim, +TRACE_EVENT(xfs_extent_busy_trim, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t len, xfs_agblock_t tbno, xfs_extlen_t tlen), @@ -1427,7 +1431,7 @@ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed); -DECLARE_EVENT_CLASS(xfs_dir2_class, +DECLARE_EVENT_CLASS(xfs_da_class, TP_PROTO(struct xfs_da_args *args), TP_ARGS(args), TP_STRUCT__entry( @@ -1462,7 +1466,7 @@ DECLARE_EVENT_CLASS(xfs_dir2_class, ) #define DEFINE_DIR2_EVENT(name) \ -DEFINE_EVENT(xfs_dir2_class, name, \ +DEFINE_EVENT(xfs_da_class, name, \ TP_PROTO(struct xfs_da_args *args), \ TP_ARGS(args)) DEFINE_DIR2_EVENT(xfs_dir2_sf_addname); @@ -1491,6 +1495,64 @@ DEFINE_DIR2_EVENT(xfs_dir2_node_replace); DEFINE_DIR2_EVENT(xfs_dir2_node_removename); DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); +#define DEFINE_ATTR_EVENT(name) \ +DEFINE_EVENT(xfs_da_class, name, \ + TP_PROTO(struct xfs_da_args *args), \ + TP_ARGS(args)) +DEFINE_ATTR_EVENT(xfs_attr_sf_add); +DEFINE_ATTR_EVENT(xfs_attr_sf_addname); +DEFINE_ATTR_EVENT(xfs_attr_sf_create); +DEFINE_ATTR_EVENT(xfs_attr_sf_lookup); +DEFINE_ATTR_EVENT(xfs_attr_sf_remove); +DEFINE_ATTR_EVENT(xfs_attr_sf_removename); +DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf); + +DEFINE_ATTR_EVENT(xfs_attr_leaf_add); +DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old); +DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new); +DEFINE_ATTR_EVENT(xfs_attr_leaf_addname); +DEFINE_ATTR_EVENT(xfs_attr_leaf_create); +DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup); +DEFINE_ATTR_EVENT(xfs_attr_leaf_replace); +DEFINE_ATTR_EVENT(xfs_attr_leaf_removename); +DEFINE_ATTR_EVENT(xfs_attr_leaf_split); +DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before); +DEFINE_ATTR_EVENT(xfs_attr_leaf_split_after); +DEFINE_ATTR_EVENT(xfs_attr_leaf_clearflag); +DEFINE_ATTR_EVENT(xfs_attr_leaf_setflag); +DEFINE_ATTR_EVENT(xfs_attr_leaf_flipflags); +DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf); +DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node); +DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance); +DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance); + +DEFINE_ATTR_EVENT(xfs_attr_node_addname); +DEFINE_ATTR_EVENT(xfs_attr_node_lookup); +DEFINE_ATTR_EVENT(xfs_attr_node_replace); +DEFINE_ATTR_EVENT(xfs_attr_node_removename); + +#define DEFINE_DA_EVENT(name) \ +DEFINE_EVENT(xfs_da_class, name, \ + TP_PROTO(struct xfs_da_args *args), \ + TP_ARGS(args)) +DEFINE_DA_EVENT(xfs_da_split); +DEFINE_DA_EVENT(xfs_da_join); +DEFINE_DA_EVENT(xfs_da_link_before); +DEFINE_DA_EVENT(xfs_da_link_after); +DEFINE_DA_EVENT(xfs_da_unlink_back); +DEFINE_DA_EVENT(xfs_da_unlink_forward); +DEFINE_DA_EVENT(xfs_da_root_split); +DEFINE_DA_EVENT(xfs_da_root_join); +DEFINE_DA_EVENT(xfs_da_node_add); +DEFINE_DA_EVENT(xfs_da_node_create); +DEFINE_DA_EVENT(xfs_da_node_split); +DEFINE_DA_EVENT(xfs_da_node_remove); +DEFINE_DA_EVENT(xfs_da_node_rebalance); +DEFINE_DA_EVENT(xfs_da_node_unbalance); +DEFINE_DA_EVENT(xfs_da_swap_lastblock); +DEFINE_DA_EVENT(xfs_da_grow_inode); +DEFINE_DA_EVENT(xfs_da_shrink_inode); + DECLARE_EVENT_CLASS(xfs_dir2_space_class, TP_PROTO(struct xfs_da_args *args, int idx), TP_ARGS(args, idx), @@ -1570,7 +1632,6 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __field(xfs_ino_t, ino) __field(int, format) __field(int, nex) - __field(int, max_nex) __field(int, broot_size) __field(int, fork_off) ), @@ -1580,18 +1641,16 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __entry->ino = ip->i_ino; __entry->format = ip->i_d.di_format; __entry->nex = ip->i_d.di_nextents; - __entry->max_nex = ip->i_df.if_ext_max; __entry->broot_size = ip->i_df.if_broot_bytes; __entry->fork_off = XFS_IFORK_BOFF(ip); ), TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " - "Max in-fork extents %d, broot size %d, fork offset %d", + "broot size %d, fork offset %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_symbolic(__entry->which, XFS_SWAPEXT_INODES), __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR), __entry->nex, - __entry->max_nex, __entry->broot_size, __entry->fork_off) ) @@ -1605,7 +1664,7 @@ DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); DECLARE_EVENT_CLASS(xfs_log_recover_item_class, - TP_PROTO(struct log *log, struct xlog_recover *trans, + TP_PROTO(struct xlog *log, struct xlog_recover *trans, struct xlog_recover_item *item, int pass), TP_ARGS(log, trans, item, pass), TP_STRUCT__entry( @@ -1639,7 +1698,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class, #define DEFINE_LOG_RECOVER_ITEM(name) \ DEFINE_EVENT(xfs_log_recover_item_class, name, \ - TP_PROTO(struct log *log, struct xlog_recover *trans, \ + TP_PROTO(struct xlog *log, struct xlog_recover *trans, \ struct xlog_recover_item *item, int pass), \ TP_ARGS(log, trans, item, pass)) @@ -1650,7 +1709,7 @@ DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail); DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover); DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class, - TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), + TP_PROTO(struct xlog *log, struct xfs_buf_log_format *buf_f), TP_ARGS(log, buf_f), TP_STRUCT__entry( __field(dev_t, dev) @@ -1680,7 +1739,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class, #define DEFINE_LOG_RECOVER_BUF_ITEM(name) \ DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \ - TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), \ + TP_PROTO(struct xlog *log, struct xfs_buf_log_format *buf_f), \ TP_ARGS(log, buf_f)) DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel); @@ -1693,7 +1752,7 @@ DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf); DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class, - TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), + TP_PROTO(struct xlog *log, struct xfs_inode_log_format *in_f), TP_ARGS(log, in_f), TP_STRUCT__entry( __field(dev_t, dev) @@ -1731,7 +1790,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class, ) #define DEFINE_LOG_RECOVER_INO_ITEM(name) \ DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \ - TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), \ + TP_PROTO(struct xlog *log, struct xfs_inode_log_format *in_f), \ TP_ARGS(log, in_f)) DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 1f35b2feca9..fdf324508c5 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -19,9 +19,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -36,6 +34,7 @@ #include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" +#include "xfs_extent_busy.h" #include "xfs_bmap.h" #include "xfs_quota.h" #include "xfs_trans_priv.h" @@ -585,7 +584,7 @@ xfs_trans_t * _xfs_trans_alloc( xfs_mount_t *mp, uint type, - uint memflags) + xfs_km_flags_t memflags) { xfs_trans_t *tp; @@ -608,8 +607,8 @@ STATIC void xfs_trans_free( struct xfs_trans *tp) { - xfs_alloc_busy_sort(&tp->t_busy); - xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy, false); + xfs_extent_busy_sort(&tp->t_busy); + xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); atomic_dec(&tp->t_mountp->m_active_trans); xfs_trans_free_dqinfo(tp); @@ -681,7 +680,6 @@ xfs_trans_reserve( uint flags, uint logcount) { - int log_flags; int error = 0; int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; @@ -707,24 +705,32 @@ xfs_trans_reserve( * Reserve the log space needed for this transaction. */ if (logspace > 0) { - ASSERT((tp->t_log_res == 0) || (tp->t_log_res == logspace)); - ASSERT((tp->t_log_count == 0) || - (tp->t_log_count == logcount)); + bool permanent = false; + + ASSERT(tp->t_log_res == 0 || tp->t_log_res == logspace); + ASSERT(tp->t_log_count == 0 || tp->t_log_count == logcount); + if (flags & XFS_TRANS_PERM_LOG_RES) { - log_flags = XFS_LOG_PERM_RESERV; tp->t_flags |= XFS_TRANS_PERM_LOG_RES; + permanent = true; } else { ASSERT(tp->t_ticket == NULL); ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES)); - log_flags = 0; } - error = xfs_log_reserve(tp->t_mountp, logspace, logcount, - &tp->t_ticket, - XFS_TRANSACTION, log_flags, tp->t_type); - if (error) { - goto undo_blocks; + if (tp->t_ticket != NULL) { + ASSERT(flags & XFS_TRANS_PERM_LOG_RES); + error = xfs_log_regrant(tp->t_mountp, tp->t_ticket); + } else { + error = xfs_log_reserve(tp->t_mountp, logspace, + logcount, &tp->t_ticket, + XFS_TRANSACTION, permanent, + tp->t_type); } + + if (error) + goto undo_blocks; + tp->t_log_res = logspace; tp->t_log_count = logcount; } @@ -752,6 +758,8 @@ xfs_trans_reserve( */ undo_log: if (logspace > 0) { + int log_flags; + if (flags & XFS_TRANS_PERM_LOG_RES) { log_flags = XFS_LOG_REL_PERM_RESERV; } else { @@ -1151,14 +1159,13 @@ xfs_trans_add_item( { struct xfs_log_item_desc *lidp; - ASSERT(lip->li_mountp = tp->t_mountp); - ASSERT(lip->li_ailp = tp->t_mountp->m_ail); + ASSERT(lip->li_mountp == tp->t_mountp); + ASSERT(lip->li_ailp == tp->t_mountp->m_ail); lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS); lidp->lid_item = lip; lidp->lid_flags = 0; - lidp->lid_size = 0; list_add_tail(&lidp->lid_trans, &tp->t_items); lip->li_desc = lidp; @@ -1210,219 +1217,6 @@ xfs_trans_free_items( } } -/* - * Unlock the items associated with a transaction. - * - * Items which were not logged should be freed. Those which were logged must - * still be tracked so they can be unpinned when the transaction commits. - */ -STATIC void -xfs_trans_unlock_items( - struct xfs_trans *tp, - xfs_lsn_t commit_lsn) -{ - struct xfs_log_item_desc *lidp, *next; - - list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { - struct xfs_log_item *lip = lidp->lid_item; - - lip->li_desc = NULL; - - if (commit_lsn != NULLCOMMITLSN) - IOP_COMMITTING(lip, commit_lsn); - IOP_UNLOCK(lip); - - /* - * Free the descriptor if the item is not dirty - * within this transaction. - */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) - xfs_trans_free_item_desc(lidp); - } -} - -/* - * Total up the number of log iovecs needed to commit this - * transaction. The transaction itself needs one for the - * transaction header. Ask each dirty item in turn how many - * it needs to get the total. - */ -static uint -xfs_trans_count_vecs( - struct xfs_trans *tp) -{ - int nvecs; - struct xfs_log_item_desc *lidp; - - nvecs = 1; - - /* In the non-debug case we need to start bailing out if we - * didn't find a log_item here, return zero and let trans_commit - * deal with it. - */ - if (list_empty(&tp->t_items)) { - ASSERT(0); - return 0; - } - - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - /* - * Skip items which aren't dirty in this transaction. - */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) - continue; - lidp->lid_size = IOP_SIZE(lidp->lid_item); - nvecs += lidp->lid_size; - } - - return nvecs; -} - -/* - * Fill in the vector with pointers to data to be logged - * by this transaction. The transaction header takes - * the first vector, and then each dirty item takes the - * number of vectors it indicated it needed in xfs_trans_count_vecs(). - * - * As each item fills in the entries it needs, also pin the item - * so that it cannot be flushed out until the log write completes. - */ -static void -xfs_trans_fill_vecs( - struct xfs_trans *tp, - struct xfs_log_iovec *log_vector) -{ - struct xfs_log_item_desc *lidp; - struct xfs_log_iovec *vecp; - uint nitems; - - /* - * Skip over the entry for the transaction header, we'll - * fill that in at the end. - */ - vecp = log_vector + 1; - - nitems = 0; - ASSERT(!list_empty(&tp->t_items)); - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - /* Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) - continue; - - /* - * The item may be marked dirty but not log anything. This can - * be used to get called when a transaction is committed. - */ - if (lidp->lid_size) - nitems++; - IOP_FORMAT(lidp->lid_item, vecp); - vecp += lidp->lid_size; - IOP_PIN(lidp->lid_item); - } - - /* - * Now that we've counted the number of items in this transaction, fill - * in the transaction header. Note that the transaction header does not - * have a log item. - */ - tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC; - tp->t_header.th_type = tp->t_type; - tp->t_header.th_num_items = nitems; - log_vector->i_addr = (xfs_caddr_t)&tp->t_header; - log_vector->i_len = sizeof(xfs_trans_header_t); - log_vector->i_type = XLOG_REG_TYPE_TRANSHDR; -} - -/* - * The committed item processing consists of calling the committed routine of - * each logged item, updating the item's position in the AIL if necessary, and - * unpinning each item. If the committed routine returns -1, then do nothing - * further with the item because it may have been freed. - * - * Since items are unlocked when they are copied to the incore log, it is - * possible for two transactions to be completing and manipulating the same - * item simultaneously. The AIL lock will protect the lsn field of each item. - * The value of this field can never go backwards. - * - * We unpin the items after repositioning them in the AIL, because otherwise - * they could be immediately flushed and we'd have to race with the flusher - * trying to pull the item from the AIL as we add it. - */ -static void -xfs_trans_item_committed( - struct xfs_log_item *lip, - xfs_lsn_t commit_lsn, - int aborted) -{ - xfs_lsn_t item_lsn; - struct xfs_ail *ailp; - - if (aborted) - lip->li_flags |= XFS_LI_ABORTED; - item_lsn = IOP_COMMITTED(lip, commit_lsn); - - /* item_lsn of -1 means the item needs no further processing */ - if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) - return; - - /* - * If the returned lsn is greater than what it contained before, update - * the location of the item in the AIL. If it is not, then do nothing. - * Items can never move backwards in the AIL. - * - * While the new lsn should usually be greater, it is possible that a - * later transaction completing simultaneously with an earlier one - * using the same item could complete first with a higher lsn. This - * would cause the earlier transaction to fail the test below. - */ - ailp = lip->li_ailp; - spin_lock(&ailp->xa_lock); - if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) { - /* - * This will set the item's lsn to item_lsn and update the - * position of the item in the AIL. - * - * xfs_trans_ail_update() drops the AIL lock. - |