From 8eae1ca0034cce78a24738087a32adb1ddb66aa7 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 15 Oct 2012 10:57:02 +0100 Subject: [PATCH 01/16] GFS2: Review bug traps in glops.c Two of the bug traps here could really be warnings. The others are converted from BUG() to GLOCK_BUG_ON() since we'll most likely need to know the glock state in order to debug any issues which arise. As a result of this, __dump_glock has to be renamed and is no longer static. Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 10 ++++----- fs/gfs2/glock.h | 54 ++++++++++++++++++++++++------------------------- fs/gfs2/glops.c | 10 ++++----- 3 files changed, 36 insertions(+), 38 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index e6c2fd53cab..e543871ec82 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -55,8 +55,6 @@ struct gfs2_glock_iter { typedef void (*glock_examiner) (struct gfs2_glock * gl); -static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); -#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0) static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); static struct dentry *gfs2_root; @@ -1013,7 +1011,7 @@ trap_recursive: printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid)); printk(KERN_ERR "lock type: %d req lock state : %d\n", gh->gh_gl->gl_name.ln_type, gh->gh_state); - __dump_glock(NULL, gl); + gfs2_dump_glock(NULL, gl); BUG(); } @@ -1508,7 +1506,7 @@ static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl) { int ret; spin_lock(&gl->gl_spin); - ret = __dump_glock(seq, gl); + ret = gfs2_dump_glock(seq, gl); spin_unlock(&gl->gl_spin); return ret; } @@ -1655,7 +1653,7 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) } /** - * __dump_glock - print information about a glock + * gfs2_dump_glock - print information about a glock * @seq: The seq_file struct * @gl: the glock * @@ -1672,7 +1670,7 @@ static const char 
*gflags2str(char *buf, const struct gfs2_glock *gl) * Returns: 0 on success, -ENOBUFS when we run out of space */ -static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) +int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) { const struct gfs2_glock_operations *glops = gl->gl_ops; unsigned long long dtime; diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 307ac31df78..fd580b7861d 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -178,33 +178,33 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) return NULL; } -int gfs2_glock_get(struct gfs2_sbd *sdp, - u64 number, const struct gfs2_glock_operations *glops, - int create, struct gfs2_glock **glp); -void gfs2_glock_hold(struct gfs2_glock *gl); -void gfs2_glock_put_nolock(struct gfs2_glock *gl); -void gfs2_glock_put(struct gfs2_glock *gl); -void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, - struct gfs2_holder *gh); -void gfs2_holder_reinit(unsigned int state, unsigned flags, - struct gfs2_holder *gh); -void gfs2_holder_uninit(struct gfs2_holder *gh); -int gfs2_glock_nq(struct gfs2_holder *gh); -int gfs2_glock_poll(struct gfs2_holder *gh); -int gfs2_glock_wait(struct gfs2_holder *gh); -void gfs2_glock_dq(struct gfs2_holder *gh); -void gfs2_glock_dq_wait(struct gfs2_holder *gh); - -void gfs2_glock_dq_uninit(struct gfs2_holder *gh); -int gfs2_glock_nq_num(struct gfs2_sbd *sdp, - u64 number, const struct gfs2_glock_operations *glops, - unsigned int state, int flags, struct gfs2_holder *gh); - -int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); -void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); -void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); - -__printf(2, 3) +extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, + int create, struct gfs2_glock **glp); +extern void gfs2_glock_hold(struct gfs2_glock *gl); 
+extern void gfs2_glock_put_nolock(struct gfs2_glock *gl); +extern void gfs2_glock_put(struct gfs2_glock *gl); +extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, + unsigned flags, struct gfs2_holder *gh); +extern void gfs2_holder_reinit(unsigned int state, unsigned flags, + struct gfs2_holder *gh); +extern void gfs2_holder_uninit(struct gfs2_holder *gh); +extern int gfs2_glock_nq(struct gfs2_holder *gh); +extern int gfs2_glock_poll(struct gfs2_holder *gh); +extern int gfs2_glock_wait(struct gfs2_holder *gh); +extern void gfs2_glock_dq(struct gfs2_holder *gh); +extern void gfs2_glock_dq_wait(struct gfs2_holder *gh); +extern void gfs2_glock_dq_uninit(struct gfs2_holder *gh); +extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, + unsigned int state, int flags, + struct gfs2_holder *gh); +extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); +extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); +extern void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); +extern int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); +#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0) +extern __printf(2, 3) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); /** diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 32cc4fde975..0a3e7c7e26c 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -74,7 +74,7 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) gfs2_trans_add_revoke(sdp, bd); } - BUG_ON(!fsync && atomic_read(&gl->gl_ail_count)); + GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count)); spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); } @@ -96,7 +96,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) tr.tr_ip = (unsigned long)__builtin_return_address(0); sb_start_intwrite(sdp->sd_vfs); gfs2_log_reserve(sdp, tr.tr_reserved); - 
BUG_ON(current->journal_info); + WARN_ON_ONCE(current->journal_info); current->journal_info = &tr; __gfs2_ail_flush(gl, 0); @@ -139,7 +139,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl) if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) return; - BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE); + GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); gfs2_log_flush(gl->gl_sbd, gl); filemap_fdatawrite(metamapping); @@ -168,7 +168,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) { struct address_space *mapping = gfs2_glock2aspace(gl); - BUG_ON(!(flags & DIO_METADATA)); + WARN_ON_ONCE(!(flags & DIO_METADATA)); gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); truncate_inode_pages(mapping, 0); @@ -197,7 +197,7 @@ static void inode_go_sync(struct gfs2_glock *gl) if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) return; - BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE); + GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); gfs2_log_flush(gl->gl_sbd, gl); filemap_fdatawrite(metamapping); From a68a0a352a0209467268dfddffe02db08b97ddb4 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 19 Oct 2012 08:32:51 -0400 Subject: [PATCH 02/16] GFS2: Speed up gfs2_rbm_from_block This patch is a rewrite of function gfs2_rbm_from_block. Rather than looping to find the right bitmap, the code now does a few simple math calculations. I compared the performance of both algorithms side by side and the new algorithm is noticeably faster. 
Sample instrumentation output from a "fast" machine: 5 million calls: millisec spent: Orig: 166 New: 113 5 million calls: millisec spent: Orig: 189 New: 114 In addition, I ran postmark (on a somewhat slower CPU) before and after the new algorithm was put in place and postmark showed a decent improvement: Before the new algorithm: ------------------------- Time: 645 seconds total 584 seconds of transactions (171 per second) Files: 150087 created (232 per second) Creation alone: 100000 files (2083 per second) Mixed with transactions: 50087 files (85 per second) 49995 read (85 per second) 49991 appended (85 per second) 150087 deleted (232 per second) Deletion alone: 100174 files (7705 per second) Mixed with transactions: 49913 files (85 per second) Data: 273.42 megabytes read (434.08 kilobytes per second) 852.13 megabytes written (1.32 megabytes per second) With the new algorithm: ----------------------- Time: 599 seconds total 530 seconds of transactions (188 per second) Files: 150087 created (250 per second) Creation alone: 100000 files (1886 per second) Mixed with transactions: 50087 files (94 per second) 49995 read (94 per second) 49991 appended (94 per second) 150087 deleted (250 per second) Deletion alone: 100174 files (6260 per second) Mixed with transactions: 49913 files (94 per second) Data: 273.42 megabytes read (467.42 kilobytes per second) 852.13 megabytes written (1.42 megabytes per second) Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 1 + fs/gfs2/ops_fstype.c | 3 +++ fs/gfs2/rgrp.c | 21 ++++++++++++--------- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 3d469d37345..24bb0b85786 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -621,6 +621,7 @@ struct gfs2_sbd { u32 sd_hash_bsize_shift; u32 sd_hash_ptrs; /* Number of pointers in a hash block */ u32 sd_qc_per_block; + u32 sd_blocks_per_bitmap; u32 sd_max_dirres; /* Max blocks needed to add a directory
entry */ u32 sd_max_height; /* Max height of a file's metadata tree */ u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1]; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e443966c810..0e3554edb8f 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -278,6 +278,9 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(struct gfs2_quota_change); + sdp->sd_blocks_per_bitmap = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) + * GFS2_NBBY; /* not the rgrp bitmap, subsequent bitmaps only */ /* Compute maximum reservation required to add a entry to a directory */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 38fe18f2f05..669b89b95cc 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -251,22 +251,25 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len, static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block) { u64 rblock = block - rbm->rgd->rd_data0; - u32 goal = (u32)rblock; - int x; + u32 x; if (WARN_ON_ONCE(rblock > UINT_MAX)) return -EINVAL; if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data) return -E2BIG; - for (x = 0; x < rbm->rgd->rd_length; x++) { - rbm->bi = rbm->rgd->rd_bits + x; - if (goal < (rbm->bi->bi_start + rbm->bi->bi_len) * GFS2_NBBY) { - rbm->offset = goal - (rbm->bi->bi_start * GFS2_NBBY); - break; - } - } + rbm->bi = rbm->rgd->rd_bits; + rbm->offset = (u32)(rblock); + /* Check if the block is within the first block */ + if (rbm->offset < (rbm->bi->bi_start + rbm->bi->bi_len) * GFS2_NBBY) + return 0; + /* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */ + rbm->offset += (sizeof(struct gfs2_rgrp) - + sizeof(struct gfs2_meta_header)) * GFS2_NBBY; + x = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap; + rbm->offset -= x * rbm->rgd->rd_sbd->sd_blocks_per_bitmap; + rbm->bi += x; return 0; } From 06dfc30641370094ed522bf5949b2a326fe2741b Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 
24 Oct 2012 14:41:05 -0400 Subject: [PATCH 03/16] GFS2: Rename glops go_xmote_th to go_sync [Editorial: This is a nit, but has been a minor irritation for a long time:] This patch renames glops structure item for go_xmote_th to go_sync. The functionality is unchanged; it's just for readability. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 4 ++-- fs/gfs2/glops.c | 6 +++--- fs/gfs2/incore.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index e543871ec82..6114571a979 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -535,8 +535,8 @@ __acquires(&gl->gl_spin) (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB))) clear_bit(GLF_BLOCKING, &gl->gl_flags); spin_unlock(&gl->gl_spin); - if (glops->go_xmote_th) - glops->go_xmote_th(gl); + if (glops->go_sync) + glops->go_sync(gl); if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA); clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 0a3e7c7e26c..e86fe26c12d 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -536,7 +536,7 @@ const struct gfs2_glock_operations gfs2_meta_glops = { }; const struct gfs2_glock_operations gfs2_inode_glops = { - .go_xmote_th = inode_go_sync, + .go_sync = inode_go_sync, .go_inval = inode_go_inval, .go_demote_ok = inode_go_demote_ok, .go_lock = inode_go_lock, @@ -546,7 +546,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = { }; const struct gfs2_glock_operations gfs2_rgrp_glops = { - .go_xmote_th = rgrp_go_sync, + .go_sync = rgrp_go_sync, .go_inval = rgrp_go_inval, .go_lock = gfs2_rgrp_go_lock, .go_unlock = gfs2_rgrp_go_unlock, @@ -556,7 +556,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { }; const struct gfs2_glock_operations gfs2_trans_glops = { - .go_xmote_th = trans_go_sync, + .go_sync = trans_go_sync, .go_xmote_bh = trans_go_xmote_bh, .go_demote_ok = 
trans_go_demote_ok, .go_type = LM_TYPE_NONDISK, diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 24bb0b85786..a46f0348593 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -205,7 +205,7 @@ struct lm_lockname { struct gfs2_glock_operations { - void (*go_xmote_th) (struct gfs2_glock *gl); + void (*go_sync) (struct gfs2_glock *gl); int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); void (*go_inval) (struct gfs2_glock *gl, int flags); int (*go_demote_ok) (const struct gfs2_glock *gl); From bcd97c06308cbfa8b46e11762ea116300cdce772 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 31 Oct 2012 09:58:42 +0000 Subject: [PATCH 04/16] GFS2: Add test for resource group congestion status This patch uses information gathered by the recent glock statistics patch in order to derive a boolean verdict on the congestion status of a resource group. This is then used when making decisions on which resource group to choose during block allocation. The aim is to avoid resource groups which are heavily contended by other nodes, while still ensuring locality of access wherever possible. Once a reservation has been made in a particular resource group we continue to use that resource group until a new reservation is required. This should help to ensure that we do not change resource groups too often.
Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 90 insertions(+), 4 deletions(-) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 669b89b95cc..bdf3e644baa 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1681,6 +1681,88 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip return; } +/** + * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested + * @rgd: The rgrp in question + * @loops: An indication of how picky we can be (0=very, 1=less so) + * + * This function uses the recently added glock statistics in order to + * figure out whether a particular resource group is suffering from + * contention from multiple nodes. This is done purely on the basis + * of timings, since this is the only data we have to work with and + * our aim here is to reject a resource group which is highly contended + * but (very important) not to do this too often in order to ensure that + * we do not land up introducing fragmentation by changing resource + * groups when not actually required. + * + * The calculation is fairly simple, we want to know whether the SRTTB + * (i.e. smoothed round trip time for blocking operations) to acquire + * the lock for this rgrp's glock is significantly greater than the + * time taken for resource groups on average. We introduce a margin in + * the form of the variable @var which is computed as the sum of the two + * respective variances, and multiplied by a factor depending on @loops + * and whether we have a lot of data to base the decision on. This is + * then tested against the square difference of the means in order to + * decide whether the result is statistically significant or not.
+ * + * Returns: A boolean verdict on the congestion status + */ + +static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops) +{ + const struct gfs2_glock *gl = rgd->rd_gl; + const struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_lkstats *st; + s64 r_dcount, l_dcount; + s64 r_srttb, l_srttb; + s64 srttb_diff; + s64 sqr_diff; + s64 var; + + preempt_disable(); + st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP]; + r_srttb = st->stats[GFS2_LKS_SRTTB]; + r_dcount = st->stats[GFS2_LKS_DCOUNT]; + var = st->stats[GFS2_LKS_SRTTVARB] + + gl->gl_stats.stats[GFS2_LKS_SRTTVARB]; + preempt_enable(); + + l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB]; + l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT]; + + if ((l_dcount < 1) || (r_dcount < 1) || (r_srttb == 0)) + return false; + + srttb_diff = r_srttb - l_srttb; + sqr_diff = srttb_diff * srttb_diff; + + var *= 2; + if (l_dcount < 8 || r_dcount < 8) + var *= 2; + if (loops == 1) + var *= 2; + + return ((srttb_diff < 0) && (sqr_diff > var)); +} + +/** + * gfs2_rgrp_used_recently + * @rs: The block reservation with the rgrp to test + * @msecs: The time limit in milliseconds + * + * Returns: True if the rgrp glock has been used within the time limit + */ +static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs, + u64 msecs) +{ + u64 tdiff; + + tdiff = ktime_to_ns(ktime_sub(ktime_get_real(), + rs->rs_rbm.rgd->rd_gl->gl_dstamp)); + + return tdiff > (msecs * 1000 * 1000); +} + static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin) { struct gfs2_rgrpd *rgd = *pos; @@ -1707,7 +1789,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *begin = NULL; struct gfs2_blkreserv *rs = ip->i_res; - int error = 0, rg_locked, flags = LM_FLAG_TRY; + int error = 0, rg_locked, flags = 0; u64 last_unlinked = NO_BLOCK; int loops = 0; @@ -1731,13 +1813,18 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 
requested) if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) { rg_locked = 0; + if (!gfs2_rs_active(rs) && (loops < 2) && + gfs2_rgrp_used_recently(rs, 1000) && + gfs2_rgrp_congested(rs->rs_rbm.rgd, loops)) + goto next_rgrp; error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl, LM_ST_EXCLUSIVE, flags, &rs->rs_rgd_gh); - if (error == GLR_TRYFAILED) - goto next_rgrp; if (unlikely(error)) return error; + if (!gfs2_rs_active(rs) && (loops < 2) && + gfs2_rgrp_congested(rs->rs_rbm.rgd, loops)) + goto skip_rgrp; if (sdp->sd_args.ar_rgrplvb) { error = update_rgrp_lvb(rs->rs_rbm.rgd); if (unlikely(error)) { @@ -1789,7 +1876,6 @@ next_rgrp: * then this checks for some less likely conditions before * trying again. */ - flags &= ~LM_FLAG_TRY; loops++; /* Check that fs hasn't grown if writing to rindex */ if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) { From c9aecf73717f55e41ac11682a50bef8594547025 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 31 Oct 2012 10:30:22 +0000 Subject: [PATCH 05/16] GFS2: Use proper allocation context for new inodes Rather than using the parent directory's allocation context, this patch allocates the new inode earlier in the process and then uses it to contain all the information required. As a result, we can now use the new inode's own allocation context to allocate it rather than having to use the parent directory's context. This gives us a lot more flexibility in where the inode is placed on disk.
Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 189 ++++++++++++++++++++++++++---------------------- 1 file changed, 101 insertions(+), 88 deletions(-) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 381893ceefa..749b05a960e 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -364,34 +364,34 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, return 0; } -static void munge_mode_uid_gid(struct gfs2_inode *dip, umode_t *mode, - unsigned int *uid, unsigned int *gid) +static void munge_mode_uid_gid(const struct gfs2_inode *dip, + struct inode *inode) { if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir && (dip->i_inode.i_mode & S_ISUID) && dip->i_inode.i_uid) { - if (S_ISDIR(*mode)) - *mode |= S_ISUID; + if (S_ISDIR(inode->i_mode)) + inode->i_mode |= S_ISUID; else if (dip->i_inode.i_uid != current_fsuid()) - *mode &= ~07111; - *uid = dip->i_inode.i_uid; + inode->i_mode &= ~07111; + inode->i_uid = dip->i_inode.i_uid; } else - *uid = current_fsuid(); + inode->i_uid = current_fsuid(); if (dip->i_inode.i_mode & S_ISGID) { - if (S_ISDIR(*mode)) - *mode |= S_ISGID; - *gid = dip->i_inode.i_gid; + if (S_ISDIR(inode->i_mode)) + inode->i_mode |= S_ISGID; + inode->i_gid = dip->i_inode.i_gid; } else - *gid = current_fsgid(); + inode->i_gid = current_fsgid(); } -static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation) +static int alloc_dinode(struct gfs2_inode *ip) { - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); int error; int dblocks = 1; - error = gfs2_inplace_reserve(dip, RES_DINODE); + error = gfs2_inplace_reserve(ip, RES_DINODE); if (error) goto out; @@ -399,12 +399,15 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation) if (error) goto out_ipreserv; - error = gfs2_alloc_blocks(dip, no_addr, &dblocks, 1, generation); + error = gfs2_alloc_blocks(ip, &ip->i_no_addr, &dblocks, 1, &ip->i_generation); + ip->i_no_formal_ino = ip->i_generation; + 
ip->i_inode.i_ino = ip->i_no_addr; + ip->i_goal = ip->i_no_addr; gfs2_trans_end(sdp); out_ipreserv: - gfs2_inplace_release(dip); + gfs2_inplace_release(ip); out: return error; } @@ -429,52 +432,42 @@ static void gfs2_init_dir(struct buffer_head *dibh, /** * init_dinode - Fill in a new dinode structure * @dip: The directory this inode is being created in - * @gl: The glock covering the new inode - * @inum: The inode number - * @mode: The file permissions - * @uid: The uid of the new inode - * @gid: The gid of the new inode - * @generation: The generation number of the new inode - * @dev: The device number (if a device node) + * @ip: The inode * @symname: The symlink destination (if a symlink) - * @size: The inode size (ignored for directories) * @bhp: The buffer head (returned to caller) * */ -static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, - const struct gfs2_inum_host *inum, umode_t mode, - unsigned int uid, unsigned int gid, - const u64 *generation, dev_t dev, const char *symname, - unsigned size, struct buffer_head **bhp) +static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, + const char *symname, struct buffer_head **bhp) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_dinode *di; struct buffer_head *dibh; struct timespec tv = CURRENT_TIME; - dibh = gfs2_meta_new(gl, inum->no_addr); - gfs2_trans_add_bh(gl, dibh, 1); + dibh = gfs2_meta_new(ip->i_gl, ip->i_no_addr); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI); gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); di = (struct gfs2_dinode *)dibh->b_data; - di->di_num.no_formal_ino = cpu_to_be64(inum->no_formal_ino); - di->di_num.no_addr = cpu_to_be64(inum->no_addr); - di->di_mode = cpu_to_be32(mode); - di->di_uid = cpu_to_be32(uid); - di->di_gid = cpu_to_be32(gid); + di->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); + di->di_num.no_addr = cpu_to_be64(ip->i_no_addr); + di->di_mode = 
cpu_to_be32(ip->i_inode.i_mode); + di->di_uid = cpu_to_be32(ip->i_inode.i_uid); + di->di_gid = cpu_to_be32(ip->i_inode.i_gid); di->di_nlink = 0; - di->di_size = cpu_to_be64(size); + di->di_size = cpu_to_be64(ip->i_inode.i_size); di->di_blocks = cpu_to_be64(1); di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec); - di->di_major = cpu_to_be32(MAJOR(dev)); - di->di_minor = cpu_to_be32(MINOR(dev)); - di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr); - di->di_generation = cpu_to_be64(*generation); + di->di_major = cpu_to_be32(MAJOR(ip->i_inode.i_rdev)); + di->di_minor = cpu_to_be32(MINOR(ip->i_inode.i_rdev)); + di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_no_addr); + di->di_generation = cpu_to_be64(ip->i_generation); di->di_flags = 0; di->__pad1 = 0; - di->di_payload_format = cpu_to_be32(S_ISDIR(mode) ? GFS2_FORMAT_DE : 0); + di->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) ? GFS2_FORMAT_DE : 0); di->di_height = 0; di->__pad2 = 0; di->__pad3 = 0; @@ -487,7 +480,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec); memset(&di->di_reserved, 0, sizeof(di->di_reserved)); - switch(mode & S_IFMT) { + switch(ip->i_inode.i_mode & S_IFMT) { case S_IFREG: if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) || gfs2_tune_get(sdp, gt_new_files_jdata)) @@ -502,7 +495,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, gfs2_init_dir(dibh, dip); break; case S_IFLNK: - memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, size); + memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, ip->i_inode.i_size); break; } @@ -511,25 +504,22 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, *bhp = dibh; } -static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, - umode_t mode, const struct gfs2_inum_host *inum, - const u64 *generation, dev_t dev, const char *symname, - unsigned int size, struct buffer_head 
**bhp) +static int make_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, + const char *symname, struct buffer_head **bhp) { + struct inode *inode = &ip->i_inode; struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - unsigned int uid, gid; int error; - munge_mode_uid_gid(dip, &mode, &uid, &gid); error = gfs2_rindex_update(sdp); if (error) return error; - error = gfs2_quota_lock(dip, uid, gid); + error = gfs2_quota_lock(dip, inode->i_uid, inode->i_gid); if (error) return error; - error = gfs2_quota_check(dip, uid, gid); + error = gfs2_quota_check(dip, inode->i_uid, inode->i_gid); if (error) goto out_quota; @@ -537,8 +527,8 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, if (error) goto out_quota; - init_dinode(dip, gl, inum, mode, uid, gid, generation, dev, symname, size, bhp); - gfs2_quota_change(dip, +1, uid, gid); + init_dinode(dip, ip, symname, bhp); + gfs2_quota_change(dip, +1, inode->i_uid, inode->i_gid); gfs2_trans_end(sdp); out_quota: @@ -657,19 +647,13 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; struct gfs2_inode *dip = GFS2_I(dir), *ip; struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 }; + struct gfs2_glock *io_gl; int error; - u64 generation; struct buffer_head *bh = NULL; if (!name->len || name->len > GFS2_FNAMESIZE) return -ENAMETOOLONG; - /* We need a reservation to allocate the new dinode block. The - directory ip temporarily points to the reservation, but this is - being done to get a set of contiguous blocks for the new dinode. - Since this is a create, we don't have a sizehint yet, so it will - have to use the minimum reservation size. 
*/ error = gfs2_rs_alloc(dip); if (error) return error; @@ -688,45 +672,63 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock; - error = alloc_dinode(dip, &inum.no_addr, &generation); - if (error) - goto fail_gunlock; - inum.no_formal_ino = generation; - - error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops, - LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); - if (error) - goto fail_gunlock; - - error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev, symname, size, &bh); - if (error) - goto fail_gunlock2; - - inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr, - inum.no_formal_ino, 0); - if (IS_ERR(inode)) - goto fail_gunlock2; - + inode = new_inode(sdp->sd_vfs); ip = GFS2_I(inode); - error = gfs2_inode_refresh(ip); - if (error) - goto fail_gunlock2; - error = gfs2_rs_alloc(ip); + if (error) + goto fail_free_inode; + + set_bit(GIF_INVALID, &ip->i_flags); + inode->i_mode = mode; + inode->i_rdev = dev; + inode->i_size = size; + munge_mode_uid_gid(dip, inode); + ip->i_goal = dip->i_goal; + + error = alloc_dinode(ip); + if (error) + goto fail_free_inode; + + error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); + if (error) + goto fail_free_inode; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); + if (error) + goto fail_free_inode; + + error = make_dinode(dip, ip, symname, &bh); if (error) goto fail_gunlock2; + error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl); + if (error) + goto fail_gunlock2; + + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); + if (error) + goto fail_gunlock2; + + ip->i_iopen_gh.gh_gl->gl_object = ip; + gfs2_glock_put(io_gl); + gfs2_set_iop(inode); + insert_inode_hash(inode); + + error = gfs2_inode_refresh(ip); + if (error) + goto fail_gunlock3; + error = gfs2_acl_create(dip, inode); if (error) - goto fail_gunlock2; + goto fail_gunlock3; error = 
gfs2_security_init(dip, ip, name); if (error) - goto fail_gunlock2; + goto fail_gunlock3; error = link_dinode(dip, name, ip); if (error) - goto fail_gunlock2; + goto fail_gunlock3; if (bh) brelse(bh); @@ -739,8 +741,20 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, d_instantiate(dentry, inode); return 0; +fail_gunlock3: + gfs2_glock_dq_uninit(ghs + 1); + if (ip->i_gl) + gfs2_glock_put(ip->i_gl); + goto fail_gunlock; + fail_gunlock2: gfs2_glock_dq_uninit(ghs + 1); +fail_free_inode: + if (ip->i_gl) + gfs2_glock_put(ip->i_gl); + gfs2_rs_delete(ip); + free_inode_nonrcu(inode); + inode = NULL; fail_gunlock: gfs2_glock_dq_uninit(ghs); if (inode && !IS_ERR(inode)) { @@ -748,7 +762,6 @@ fail_gunlock: iput(inode); } fail: - gfs2_rs_delete(dip); if (bh) brelse(bh); return error; From 9dbe9610b9df4efe0946299804ed46bb8f91dec2 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 31 Oct 2012 10:37:10 +0000 Subject: [PATCH 06/16] GFS2: Add Orlov allocator Just like ext3, this works on the root directory and any directory with the +T flag set. Also, just like ext3, any subdirectory created in one of the just mentioned cases will be allocated to a random resource group (GFS2 equivalent of a block group). If you are creating a set of directories, each of which will contain a job running on a different node, then by setting +T on the parent directory before creating the subdirectories, each will land up in a different resource group, and thus resource group contention between nodes will be kept to a minimum. 
Signed-off-by: Steven Whitehouse --- fs/gfs2/aops.c | 2 +- fs/gfs2/bmap.c | 2 +- fs/gfs2/file.c | 4 ++-- fs/gfs2/inode.c | 17 +++++++++++------ fs/gfs2/quota.c | 4 ++-- fs/gfs2/rgrp.c | 19 ++++++++++++++++++- fs/gfs2/rgrp.h | 3 ++- fs/gfs2/xattr.c | 2 +- 8 files changed, 38 insertions(+), 15 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 01c4975da4b..30de4f2a2ea 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -643,7 +643,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, goto out_unlock; requested = data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip, requested); + error = gfs2_inplace_reserve(ip, requested, 0); if (error) goto out_qunlock; } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 1fd3ae237bd..de70e52caf3 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1178,7 +1178,7 @@ static int do_grow(struct inode *inode, u64 size) if (error) return error; - error = gfs2_inplace_reserve(ip, 1); + error = gfs2_inplace_reserve(ip, 1, 0); if (error) goto do_grow_qunlock; unstuff = 1; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index e056b4ce487..dfe2d8cb9b2 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -432,7 +432,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret) goto out_unlock; gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); - ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks); + ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0); if (ret) goto out_quota_unlock; @@ -825,7 +825,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, retry: gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); - error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks); + error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0); if (error) { if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { bytes >>= 1; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 749b05a960e..ef3ce00bb52 100644 --- 
a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -385,13 +385,13 @@ static void munge_mode_uid_gid(const struct gfs2_inode *dip, inode->i_gid = current_fsgid(); } -static int alloc_dinode(struct gfs2_inode *ip) +static int alloc_dinode(struct gfs2_inode *ip, u32 flags) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); int error; int dblocks = 1; - error = gfs2_inplace_reserve(ip, RES_DINODE); + error = gfs2_inplace_reserve(ip, RES_DINODE, flags); if (error) goto out; @@ -560,7 +560,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, if (error) goto fail_quota_locks; - error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres); + error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres, 0); if (error) goto fail_quota_locks; @@ -650,6 +650,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, struct gfs2_glock *io_gl; int error; struct buffer_head *bh = NULL; + u32 aflags = 0; if (!name->len || name->len > GFS2_FNAMESIZE) return -ENAMETOOLONG; @@ -685,7 +686,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, munge_mode_uid_gid(dip, inode); ip->i_goal = dip->i_goal; - error = alloc_dinode(ip); + if ((GFS2_I(sdp->sd_root_dir->d_inode) == dip) || + (dip->i_diskflags & GFS2_DIF_TOPDIR)) + aflags |= GFS2_AF_ORLOV; + + error = alloc_dinode(ip, aflags); if (error) goto fail_free_inode; @@ -897,7 +902,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) goto out_gunlock; - error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres); + error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres, 0); if (error) goto out_gunlock_q; @@ -1378,7 +1383,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_gunlock; - error = gfs2_inplace_reserve(ndip, sdp->sd_max_dirres); + error = gfs2_inplace_reserve(ndip, sdp->sd_max_dirres, 0); if (error) goto out_gunlock_q; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index c5af8e18f27..6bbf64f0f5b 100644 --- a/fs/gfs2/quota.c +++ 
b/fs/gfs2/quota.c @@ -816,7 +816,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3; reserved = 1 + (nalloc * (data_blocks + ind_blocks)); - error = gfs2_inplace_reserve(ip, reserved); + error = gfs2_inplace_reserve(ip, reserved, 0); if (error) goto out_alloc; @@ -1605,7 +1605,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), &data_blocks, &ind_blocks); blocks = 1 + data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip, blocks); + error = gfs2_inplace_reserve(ip, blocks, 0); if (error) goto out_i; blocks += gfs2_rg_blocks(ip, blocks); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index bdf3e644baa..99a619788c6 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -1763,6 +1764,15 @@ static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs, return tdiff > (msecs * 1000 * 1000); } +static u32 gfs2_orlov_skip(const struct gfs2_inode *ip) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + u32 skip; + + get_random_bytes(&skip, sizeof(skip)); + return skip % sdp->sd_rgrps; +} + static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin) { struct gfs2_rgrpd *rgd = *pos; @@ -1784,7 +1794,7 @@ static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *b * Returns: errno */ -int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) +int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *begin = NULL; @@ -1792,6 +1802,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) int error = 0, rg_locked, flags = 0; u64 last_unlinked = NO_BLOCK; int loops = 0; + u32 skip = 0; if (sdp->sd_args.ar_rgrplvb) flags |= GL_SKIP; @@ -1805,6 +1816,8 @@ int 
gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) } else { rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); } + if (S_ISDIR(ip->i_inode.i_mode) && (aflags & GFS2_AF_ORLOV)) + skip = gfs2_orlov_skip(ip); if (rs->rs_rbm.rgd == NULL) return -EBADSLT; @@ -1813,6 +1826,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) { rg_locked = 0; + if (skip && skip--) + goto next_rgrp; if (!gfs2_rs_active(rs) && (loops < 2) && gfs2_rgrp_used_recently(rs, 1000) && gfs2_rgrp_congested(rs->rs_rbm.rgd, loops)) @@ -1871,6 +1886,8 @@ next_rgrp: /* Find the next rgrp, and continue looking */ if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin)) continue; + if (skip) + continue; /* If we've scanned all the rgrps, but found no free blocks * then this checks for some less likely conditions before diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 24077958dcf..842185853f6 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -39,7 +39,8 @@ extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh); extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); -extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested); +#define GFS2_AF_ORLOV 1 +extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 flags); extern void gfs2_inplace_release(struct gfs2_inode *ip); extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index db330e5518c..76c144b3c9b 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -734,7 +734,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (error) return error; - error = gfs2_inplace_reserve(ip, blks); + error = gfs2_inplace_reserve(ip, blks, 0); if (error) goto out_gunlock_q; From fa731fc4e045a801814547188a63c2cd49a4cfe6 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 13 Nov 2012 09:50:28 +0000 Subject: [PATCH 07/16] GFS2: Fix truncation of 
journaled data files This patch fixes an issue relating to not having enough revokes available when truncating journaled data files. In order to ensure that we do not run out, the truncation is broken into separate pieces if it is large enough. Tested using fsx on a journaled data file. Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index de70e52caf3..a68e91bcef3 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -991,6 +991,41 @@ unlock: return err; } +/** + * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files + * @inode: The inode being truncated + * @oldsize: The original (larger) size + * @newsize: The new smaller size + * + * With jdata files, we have to journal a revoke for each block which is + * truncated. As a result, we need to split this into separate transactions + * if the number of pages being truncated gets too large. + */ + +#define GFS2_JTRUNC_REVOKES 8192 + +static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize) +{ + struct gfs2_sbd *sdp = GFS2_SB(inode); + u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize; + u64 chunk; + int error; + + while (oldsize != newsize) { + chunk = oldsize - newsize; + if (chunk > max_chunk) + chunk = max_chunk; + truncate_pagecache(inode, oldsize, oldsize - chunk); + oldsize -= chunk; + gfs2_trans_end(sdp); + error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES); + if (error) + return error; + } + + return 0; +} + static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) { struct gfs2_inode *ip = GFS2_I(inode); @@ -1000,8 +1035,10 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) int journaled = gfs2_is_jdata(ip); int error; - error = gfs2_trans_begin(sdp, - RES_DINODE + (journaled ?
RES_JDATA : 0), 0); + if (journaled) + error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES); + else + error = gfs2_trans_begin(sdp, RES_DINODE, 0); if (error) return error; @@ -1026,7 +1063,16 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_dinode_out(ip, dibh->b_data); - truncate_pagecache(inode, oldsize, newsize); + if (journaled) + error = gfs2_journaled_truncate(inode, oldsize, newsize); + else + truncate_pagecache(inode, oldsize, newsize); + + if (error) { + brelse(dibh); + return error; + } + out_brelse: brelse(dibh); out: From 343cd8f0d78515da38e41e9351f5ba306cdec84a Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Mon, 12 Nov 2012 13:04:54 -0500 Subject: [PATCH 08/16] GFS2: Use dirty_inode in gfs2_dir_add This patch changes the gfs2_dir_add function so that it uses the dirty_inode function (via mark_inode_dirty) rather than manually updating the dinode. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/dir.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 259b088cfc4..9a35670fdc3 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1676,16 +1676,11 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, be16_add_cpu(&leaf->lf_entries, 1); } brelse(bh); - error = gfs2_meta_inode_buffer(ip, &bh); - if (error) - break; - gfs2_trans_add_bh(ip->i_gl, bh, 1); ip->i_entries++; ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; if (S_ISDIR(nip->i_inode.i_mode)) inc_nlink(&ip->i_inode); - gfs2_dinode_out(ip, bh->b_data); - brelse(bh); + mark_inode_dirty(inode); error = 0; break; } From 4327a9bf71f4b021b675e01f24fefc647cff7513 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Mon, 12 Nov 2012 13:03:29 -0500 Subject: [PATCH 09/16] GFS2: Eliminate redundant buffer_head manipulation in gfs2_unlink_inode Since we now have a dirty_inode that takes care of 
manipulating the inode buffer and writing from the inode to the buffer, we can eliminate some unnecessary buffer manipulations in gfs2_unlink_inode that are now redundant. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index ef3ce00bb52..e321333f0b4 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -995,7 +995,6 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, * gfs2_unlink_inode - Removes an inode from its parent dir and unlinks it * @dip: The parent directory * @name: The name of the entry in the parent directory - * @bh: The inode buffer for the inode to be removed * @inode: The inode to be removed * * Called with all the locks and in a transaction. This will only be @@ -1005,8 +1004,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, */ static int gfs2_unlink_inode(struct gfs2_inode *dip, - const struct dentry *dentry, - struct buffer_head *bh) + const struct dentry *dentry) { struct inode *inode = dentry->d_inode; struct gfs2_inode *ip = GFS2_I(inode); @@ -1046,7 +1044,6 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) struct gfs2_sbd *sdp = GFS2_SB(dir); struct inode *inode = dentry->d_inode; struct gfs2_inode *ip = GFS2_I(inode); - struct buffer_head *bh; struct gfs2_holder ghs[3]; struct gfs2_rgrpd *rgd; int error; @@ -1094,15 +1091,10 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) goto out_gunlock; error = gfs2_trans_begin(sdp, 2*RES_DINODE + 3*RES_LEAF + RES_RG_BIT, 0); - if (error) - goto out_gunlock; - - error = gfs2_meta_inode_buffer(ip, &bh); if (error) goto out_end_trans; - error = gfs2_unlink_inode(dip, dentry, bh); - brelse(bh); + error = gfs2_unlink_inode(dip, dentry); out_end_trans: gfs2_trans_end(sdp); @@ -1402,14 +1394,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, 
/* Remove the target file, if it exists */ - if (nip) { - struct buffer_head *bh; - error = gfs2_meta_inode_buffer(nip, &bh); - if (error) - goto out_end_trans; - error = gfs2_unlink_inode(ndip, ndentry, bh); - brelse(bh); - } + if (nip) + error = gfs2_unlink_inode(ndip, ndentry); if (dir_rename) { error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); From aa8920c96897dd82f0520f9e7db7311b42547ce6 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 13 Nov 2012 14:50:35 +0000 Subject: [PATCH 10/16] GFS2: Fix one RG corner case For filesystems with only a single resource group, we need to be careful that the allocation loop will not land up with a NULL resource group. This fixes a bug in a previous patch where the gfs2_rgrpd_get_next() function was being used instead of gfs2_rgrpd_get_first() Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 99a619788c6..5625e93bf61 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1776,10 +1776,11 @@ static u32 gfs2_orlov_skip(const struct gfs2_inode *ip) static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin) { struct gfs2_rgrpd *rgd = *pos; + struct gfs2_sbd *sdp = rgd->rd_sbd; rgd = gfs2_rgrpd_get_next(rgd); if (rgd == NULL) - rgd = gfs2_rgrpd_get_next(NULL); + rgd = gfs2_rgrpd_get_first(sdp); *pos = rgd; if (rgd != begin) /* If we didn't wrap */ return true; From fb6791d100d1bba20b5cdbc4912e1f7086ec60f8 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Tue, 13 Nov 2012 10:58:56 -0500 Subject: [PATCH 11/16] GFS2: skip dlm_unlock calls in unmount When unmounting, gfs2 does a full dlm_unlock operation on every cached lock. This can create a very large amount of work and can take a long time to complete. 
However, the vast majority of these dlm unlock operations are unnecessary because after all the unlocks are done, gfs2 leaves the dlm lockspace, which automatically clears the locks of the leaving node, without unlocking each one individually. So, gfs2 can skip explicit dlm unlocks, and use dlm_release_lockspace to remove the locks implicitly. The one exception is when the lock's lvb is being used. In this case, dlm_unlock is called because it may update the lvb of the resource. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 1 + fs/gfs2/incore.h | 1 + fs/gfs2/lock_dlm.c | 8 ++++++++ 3 files changed, 10 insertions(+) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 6114571a979..9d29a5167d3 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1526,6 +1526,7 @@ static void dump_glock_func(struct gfs2_glock *gl) void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) { + set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags); glock_hash_walk(clear_glock, sdp); flush_workqueue(glock_workqueue); wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index a46f0348593..a35ef5cd148 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -539,6 +539,7 @@ enum { SDF_DEMOTE = 5, SDF_NOJOURNALID = 6, SDF_RORECOVERY = 7, /* read only recovery */ + SDF_SKIP_DLM_UNLOCK = 8, }; #define GFS2_FSNAME_LEN 256 diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 0fb6539b0c8..f6504d3fadb 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -289,6 +289,14 @@ static void gdlm_put_lock(struct gfs2_glock *gl) gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_update_request_times(gl); + + /* don't want to skip dlm_unlock writing the lvb when lock is ex */ + if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && + gl->gl_state != LM_ST_EXCLUSIVE) { + gfs2_glock_free(gl); + return; + } + error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, 
NULL, gl); if (error) { From dba2d70c5dc520fdb569d1fd8dbd45c0e330253e Mon Sep 17 00:00:00 2001 From: David Teigland Date: Wed, 14 Nov 2012 13:46:53 -0500 Subject: [PATCH 12/16] GFS2: only use lvb on glocks that need it Save the effort of allocating, reading and writing the lvb for most glocks that do not use it. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 27 +++++++++++++++++++++------ fs/gfs2/glops.c | 3 ++- fs/gfs2/incore.h | 3 ++- fs/gfs2/lock_dlm.c | 12 +++++++----- 4 files changed, 32 insertions(+), 13 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 9d29a5167d3..2284de4d05c 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -105,10 +105,12 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu) { struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); - if (gl->gl_ops->go_flags & GLOF_ASPACE) + if (gl->gl_ops->go_flags & GLOF_ASPACE) { kmem_cache_free(gfs2_glock_aspace_cachep, gl); - else + } else { + kfree(gl->gl_lvb); kmem_cache_free(gfs2_glock_cachep, gl); + } } void gfs2_glock_free(struct gfs2_glock *gl) @@ -545,7 +547,10 @@ __acquires(&gl->gl_spin) if (sdp->sd_lockstruct.ls_ops->lm_lock) { /* lock_dlm */ ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); - GLOCK_BUG_ON(gl, ret); + if (ret) { + printk(KERN_ERR "GFS2: lm_lock ret %d\n", ret); + GLOCK_BUG_ON(gl, 1); + } } else { /* lock_nolock */ finish_xmote(gl, target); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) @@ -734,6 +739,18 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, if (!gl) return -ENOMEM; + memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); + gl->gl_lvb = NULL; + + if (glops->go_flags & GLOF_LVB) { + gl->gl_lvb = kzalloc(GFS2_MIN_LVB_SIZE, GFP_KERNEL); + if (!gl->gl_lvb) { + kmem_cache_free(cachep, gl); + return -ENOMEM; + } + gl->gl_lksb.sb_lvbptr = gl->gl_lvb; + } + atomic_inc(&sdp->sd_glock_disposal); gl->gl_sbd = sdp; gl->gl_flags = 0; @@ -751,9 +768,6 @@ int 
gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, preempt_enable(); gl->gl_stats.stats[GFS2_LKS_DCOUNT] = 0; gl->gl_stats.stats[GFS2_LKS_QCOUNT] = 0; - memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); - memset(gl->gl_lvb, 0, 32 * sizeof(char)); - gl->gl_lksb.sb_lvbptr = gl->gl_lvb; gl->gl_tchange = jiffies; gl->gl_object = NULL; gl->gl_hold_time = GL_GLOCK_DFT_HOLD; @@ -775,6 +789,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, tmp = search_bucket(hash, sdp, &name); if (tmp) { spin_unlock_bucket(hash); + kfree(gl->gl_lvb); kmem_cache_free(cachep, gl); atomic_dec(&sdp->sd_glock_disposal); gl = tmp; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index e86fe26c12d..78d4184ffc7 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -552,7 +552,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_unlock = gfs2_rgrp_go_unlock, .go_dump = gfs2_rgrp_dump, .go_type = LM_TYPE_RGRP, - .go_flags = GLOF_ASPACE, + .go_flags = GLOF_ASPACE | GLOF_LVB, }; const struct gfs2_glock_operations gfs2_trans_glops = { @@ -577,6 +577,7 @@ const struct gfs2_glock_operations gfs2_nondisk_glops = { const struct gfs2_glock_operations gfs2_quota_glops = { .go_type = LM_TYPE_QUOTA, + .go_flags = GLOF_LVB, }; const struct gfs2_glock_operations gfs2_journal_glops = { diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index a35ef5cd148..bd577fc59e0 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -216,6 +216,7 @@ struct gfs2_glock_operations { const int go_type; const unsigned long go_flags; #define GLOF_ASPACE 1 +#define GLOF_LVB 2 }; enum { @@ -321,7 +322,7 @@ struct gfs2_glock { ktime_t gl_dstamp; struct gfs2_lkstats gl_stats; struct dlm_lksb gl_lksb; - char gl_lvb[32]; + char *gl_lvb; unsigned long gl_tchange; void *gl_object; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index f6504d3fadb..d28ae37ceb3 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -120,7 +120,7 @@ static void gdlm_ast(void *arg) gfs2_update_reply_times(gl); 
BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); - if (gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) + if (gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID && gl->gl_lvb) memset(gl->gl_lvb, 0, GDLM_LVB_SIZE); switch (gl->gl_lksb.sb_status) { @@ -203,8 +203,10 @@ static int make_mode(const unsigned int lmstate) static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, const int req) { - u32 lkf = DLM_LKF_VALBLK; - u32 lkid = gl->gl_lksb.sb_lkid; + u32 lkf = 0; + + if (gl->gl_lvb) + lkf |= DLM_LKF_VALBLK; if (gfs_flags & LM_FLAG_TRY) lkf |= DLM_LKF_NOQUEUE; @@ -228,7 +230,7 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, BUG(); } - if (lkid != 0) { + if (gl->gl_lksb.sb_lkid != 0) { lkf |= DLM_LKF_CONVERT; if (test_bit(GLF_BLOCKING, &gl->gl_flags)) lkf |= DLM_LKF_QUECVT; @@ -292,7 +294,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl) /* don't want to skip dlm_unlock writing the lvb when lock is ex */ if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && - gl->gl_state != LM_ST_EXCLUSIVE) { + gl->gl_lvb && gl->gl_state != LM_ST_EXCLUSIVE) { gfs2_glock_free(gl); return; } From 4e2f8849def738092ad6c0fc2b34737381bc9d26 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Wed, 14 Nov 2012 13:47:37 -0500 Subject: [PATCH 13/16] GFS2: remove redundant lvb pointer The lksb struct already contains a pointer to the lvb, so another directly from the glock struct is not needed. 
Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 10 ++++------ fs/gfs2/incore.h | 1 - fs/gfs2/lock_dlm.c | 8 ++++---- fs/gfs2/quota.c | 6 +++--- fs/gfs2/rgrp.c | 2 +- 5 files changed, 12 insertions(+), 15 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 2284de4d05c..274b6bed5d6 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -108,7 +108,7 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu) if (gl->gl_ops->go_flags & GLOF_ASPACE) { kmem_cache_free(gfs2_glock_aspace_cachep, gl); } else { - kfree(gl->gl_lvb); + kfree(gl->gl_lksb.sb_lvbptr); kmem_cache_free(gfs2_glock_cachep, gl); } } @@ -740,15 +740,13 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, return -ENOMEM; memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); - gl->gl_lvb = NULL; if (glops->go_flags & GLOF_LVB) { - gl->gl_lvb = kzalloc(GFS2_MIN_LVB_SIZE, GFP_KERNEL); - if (!gl->gl_lvb) { + gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_KERNEL); + if (!gl->gl_lksb.sb_lvbptr) { kmem_cache_free(cachep, gl); return -ENOMEM; } - gl->gl_lksb.sb_lvbptr = gl->gl_lvb; } atomic_inc(&sdp->sd_glock_disposal); @@ -789,7 +787,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, tmp = search_bucket(hash, sdp, &name); if (tmp) { spin_unlock_bucket(hash); - kfree(gl->gl_lvb); + kfree(gl->gl_lksb.sb_lvbptr); kmem_cache_free(cachep, gl); atomic_dec(&sdp->sd_glock_disposal); gl = tmp; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index bd577fc59e0..c373a24fedd 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -322,7 +322,6 @@ struct gfs2_glock { ktime_t gl_dstamp; struct gfs2_lkstats gl_stats; struct dlm_lksb gl_lksb; - char *gl_lvb; unsigned long gl_tchange; void *gl_object; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index d28ae37ceb3..8dad6b09371 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -120,8 +120,8 @@ static void gdlm_ast(void *arg) gfs2_update_reply_times(gl); BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); 
- if (gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID && gl->gl_lvb) - memset(gl->gl_lvb, 0, GDLM_LVB_SIZE); + if ((gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) && gl->gl_lksb.sb_lvbptr) + memset(gl->gl_lksb.sb_lvbptr, 0, GDLM_LVB_SIZE); switch (gl->gl_lksb.sb_status) { case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ @@ -205,7 +205,7 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, { u32 lkf = 0; - if (gl->gl_lvb) + if (gl->gl_lksb.sb_lvbptr) lkf |= DLM_LKF_VALBLK; if (gfs_flags & LM_FLAG_TRY) @@ -294,7 +294,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl) /* don't want to skip dlm_unlock writing the lvb when lock is ex */ if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && - gl->gl_lvb && gl->gl_state != LM_ST_EXCLUSIVE) { + gl->gl_lksb.sb_lvbptr && (gl->gl_state != LM_ST_EXCLUSIVE)) { gfs2_glock_free(gl); return; } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 6bbf64f0f5b..ae55e248c3b 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -869,7 +869,7 @@ static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd) if (error < 0) return error; - qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; + qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC); qlvb->__pad = 0; qlvb->qb_limit = q.qu_limit; @@ -893,7 +893,7 @@ restart: if (error) return error; - qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; + qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) { gfs2_glock_dq_uninit(q_gh); @@ -1506,7 +1506,7 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, if (error) goto out; - qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; + qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; fdq->d_version = FS_DQUOT_VERSION; fdq->d_flags = (type == QUOTA_USER) ? 
FS_USER_QUOTA : FS_GROUP_QUOTA; fdq->d_id = from_kqid(&init_user_ns, qid); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 5625e93bf61..37ee061d899 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -879,7 +879,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) goto fail; rgd->rd_gl->gl_object = rgd; - rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lvb; + rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; rgd->rd_flags &= ~GFS2_RDF_UPTODATE; if (rgd->rd_data > sdp->sd_max_rg_data) sdp->sd_max_rg_data = rgd->rd_data; From b7804161a3a3077c568078dfaa4ee4ffc8817f65 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 16 Nov 2012 09:04:16 -0500 Subject: [PATCH 14/16] GFS2: don't reference inode's glock during block allocation trace This patch changes the block allocation trace so that it references the rgd's glock rather than the inode's glock. Now that the order of inode creation is switched, this prevents a reference to the glock which may not be set yet. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/trace_gfs2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index bbdc78af60c..2ee13e841e9 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -486,7 +486,7 @@ TRACE_EVENT(gfs2_block_alloc, ), TP_fast_assign( - __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev; + __entry->dev = rgd->rd_gl->gl_sbd->sd_vfs->s_dev; __entry->start = block; __entry->inum = ip->i_no_addr; __entry->len = len; From be4f245dbbbc1f37370ab463cd4892acf4a1222b Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 16 Nov 2012 09:11:39 -0500 Subject: [PATCH 15/16] GFS2: add error check while allocating new inodes This patch adds a return code check after attempting to allocate a new inode during dinode creation. 
Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index e321333f0b4..2405695febe 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -674,6 +674,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, goto fail_gunlock; inode = new_inode(sdp->sd_vfs); + if (!inode) { + gfs2_glock_dq_uninit(ghs); + return -ENOMEM; + } ip = GFS2_I(inode); error = gfs2_rs_alloc(ip); if (error) From 1e2d9d44f3ceb7dac7cb14d2476d0a8128c8e169 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 21 Nov 2012 09:56:00 -0500 Subject: [PATCH 16/16] GFS2: Set gl_object during inode create This patch fixes a cluster coherency problem that occurs when one node creates a file, does several writes, then a different node tries to write to the same file. When the inode's glock is demoted, the inode wasn't synced to the media properly because the gl_object wasn't set. Later, the flush daemon noticed the uncommitted data and tried to flush it, only to discover the glock was no longer locked properly in exclusive mode. That caused an assert withdraw. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 2405695febe..2b6f5698ef1 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -702,6 +702,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_free_inode; + ip->i_gl->gl_object = ip; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); if (error) goto fail_free_inode;