From de171cb9a52598cc023adceafc6c166112401386 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 31 Jan 2011 11:57:42 +1100 Subject: md: revert change to raid_disks on failure. If we try to update_raid_disks and it fails, we should put 'delta_disks' back to zero. This is important because some code, such as slot_store, assumes that delta_disks has been validated. Signed-off-by: NeilBrown --- drivers/md/md.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index b76cfc89e1b..e636e404e9a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5578,6 +5578,8 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks) mddev->delta_disks = raid_disks - mddev->raid_disks; rv = mddev->pers->check_reshape(mddev); + if (rv < 0) + mddev->delta_disks = 0; return rv; } -- cgit v1.2.3 From 87a8dec91e15954f0cf86be6c21741d991d83621 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 31 Jan 2011 11:57:43 +1100 Subject: md: simplify some 'if' conditionals in raid5_start_reshape. There are two consecutive 'if' statements. if (mddev->delta_disks >= 0) .... if (mddev->delta_disks > 0) The code in the second is equally valid if delta_disks == 0, and these two statements are the only place that 'added_devices' is used. So make them a single if statement, make added_devices a local variable, and re-indent it all. No functional change. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 55 +++++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5044babfcda..fa5519389a1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5517,7 +5517,6 @@ static int raid5_start_reshape(mddev_t *mddev) raid5_conf_t *conf = mddev->private; mdk_rdev_t *rdev; int spares = 0; - int added_devices = 0; unsigned long flags; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) @@ -5571,34 +5570,36 @@ static int raid5_start_reshape(mddev_t *mddev) * to correctly record the "partially reconstructed" state of * such devices during the reshape and confusion could result. */ - if (mddev->delta_disks >= 0) - list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk < 0 && - !test_bit(Faulty, &rdev->flags)) { - if (raid5_add_disk(mddev, rdev) == 0) { - char nm[20]; - if (rdev->raid_disk >= conf->previous_raid_disks) { - set_bit(In_sync, &rdev->flags); - added_devices++; + if (mddev->delta_disks >= 0) { + int added_devices = 0; + list_for_each_entry(rdev, &mddev->disks, same_set) + if (rdev->raid_disk < 0 && + !test_bit(Faulty, &rdev->flags)) { + if (raid5_add_disk(mddev, rdev) == 0) { + char nm[20]; + if (rdev->raid_disk + >= conf->previous_raid_disks) { + set_bit(In_sync, &rdev->flags); + added_devices++; + } else + rdev->recovery_offset = 0; + sprintf(nm, "rd%d", rdev->raid_disk); + if (sysfs_create_link(&mddev->kobj, + &rdev->kobj, nm)) + /* Failure here is OK */; } else - rdev->recovery_offset = 0; - sprintf(nm, "rd%d", rdev->raid_disk); - if (sysfs_create_link(&mddev->kobj, - &rdev->kobj, nm)) - /* Failure here is OK */; - } else - break; - } else if (rdev->raid_disk >= conf->previous_raid_disks - && !test_bit(Faulty, &rdev->flags)) { - /* This is a spare that was manually added */ - set_bit(In_sync, &rdev->flags); - added_devices++; - } + break; + } else if (rdev->raid_disk >= conf->previous_raid_disks + && !test_bit(Faulty, &rdev->flags)) { + /* This is a spare that was manually added */ + set_bit(In_sync, &rdev->flags); + added_devices++; + } - /* When a reshape changes the number of devices, ->degraded - * is measured against the larger of the pre and post number of - * devices.*/ - if (mddev->delta_disks > 0) { + /* When a reshape changes the number of devices, + * ->degraded is measured against the larger of the + * pre and post number of devices. + */ spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded += (conf->raid_disks - conf->previous_raid_disks) - added_devices; -- cgit v1.2.3 From 469518a3455c79619e9231aeffeffa2e2989f738 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 31 Jan 2011 11:57:43 +1100 Subject: md: fix the test for finding spares in raid5_start_reshape. As spares can be added to the array before the reshape is started, we need to find and count them when checking there are enough. The array could have been degraded, so we need to check all devices, no just those out side of the range of devices in the array before the reshape. So instead of checking the index, check the In_sync flag as that reliably tells if the device is a spare or this purpose. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index fa5519389a1..a6d2c3ddeee 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5526,8 +5526,8 @@ static int raid5_start_reshape(mddev_t *mddev) return -ENOSPC; list_for_each_entry(rdev, &mddev->disks, same_set) - if ((rdev->raid_disk < 0 || rdev->raid_disk >= conf->raid_disks) - && !test_bit(Faulty, &rdev->flags)) + if (!test_bit(In_sync, &rdev->flags) + && !test_bit(Faulty, &rdev->flags)) spares++; if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) -- cgit v1.2.3 From 50da08409654e036c4c964a473567a61a654cb83 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 31 Jan 2011 11:57:43 +1100 Subject: md: don't abort checking spares as soon as one cannot be added. As spares can be added manually before a reshape starts, we need to find them all to mark some of them as in_sync. Previously we would abort looking for spares when we found an unallocated spare what could not be added to the array (implying there was no room for new spares). However already-added spares could be later in the list, so we need to keep searching. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a6d2c3ddeee..70281282419 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5587,8 +5587,7 @@ static int raid5_start_reshape(mddev_t *mddev) if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) /* Failure here is OK */; - } else - break; + } } else if (rdev->raid_disk >= conf->previous_raid_disks && !test_bit(Faulty, &rdev->flags)) { /* This is a spare that was manually added */ -- cgit v1.2.3 From f21e9ff7f77d41ceca4e1e5ee5a4efa5ad7a5e40 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 31 Jan 2011 12:10:09 +1100 Subject: md: Remove the AllReserved flag for component devices. This flag is not needed and is used badly. Devices that are included in a native-metadata array are reserved exclusively for that array - and currently have AllReserved set. They all are bd_claimed for the rdev and so cannot be shared. Devices that are included in external-metadata arrays can be shared among multiple arrays - providing there is no overlap. These are bd_claimed for md in general - not for a particular rdev. When changing the amount of a device that is used in an array we need to check for overlap. This currently includes a check on AllReserved So even without overlap, sharing with an AllReserved device is not allowed. However the bd_claim usage already precludes sharing with these devices, so the test on AllReserved is not needed. And in fact it is wrong. As this is the only use of AllReserved, simply remove all usage and definition of AllReserved. Signed-off-by: NeilBrown --- drivers/md/md.c | 13 +++++-------- drivers/md/md.h | 2 -- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index e636e404e9a..f539b587ca7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1947,8 +1947,6 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) __bdevname(dev, b)); return PTR_ERR(bdev); } - if (!shared) - set_bit(AllReserved, &rdev->flags); rdev->bdev = bdev; return err; } @@ -2610,12 +2608,11 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) mddev_lock(mddev); list_for_each_entry(rdev2, &mddev->disks, same_set) - if (test_bit(AllReserved, &rdev2->flags) || - (rdev->bdev == rdev2->bdev && - rdev != rdev2 && - overlaps(rdev->data_offset, rdev->sectors, - rdev2->data_offset, - rdev2->sectors))) { + if (rdev->bdev == rdev2->bdev && + rdev != rdev2 && + overlaps(rdev->data_offset, rdev->sectors, + rdev2->data_offset, + rdev2->sectors)) { overlap = 1; break; } diff --git a/drivers/md/md.h b/drivers/md/md.h index eec517ced31..7e90b8593b2 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -93,8 +93,6 @@ struct mdk_rdev_s #define Faulty 1 /* device is known to have a fault */ #define In_sync 2 /* device is in_sync with rest of array */ #define WriteMostly 4 /* Avoid reading if at all possible */ -#define AllReserved 6 /* If whole device is reserved for - * one array */ #define AutoDetected 7 /* added by auto-detect */ #define Blocked 8 /* An error occured on an externally * managed array, don't allow writes -- cgit v1.2.3 From fc3a08b85b7a4f6c1069e5f71f6ad40d925ff55b Mon Sep 17 00:00:00 2001 From: Krzysztof Wojcik Date: Mon, 31 Jan 2011 13:47:13 +1100 Subject: Add raid1->raid0 takeover support This patch introduces raid 1 to raid0 takeover operation in kernel space. Signed-off-by: Krzysztof Wojcik Signed-off-by: Neil Brown --- drivers/md/raid0.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'drivers') diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index a39f4c355e5..637a96855ed 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -179,6 +179,14 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) rdev1->new_raid_disk = j; } + if (mddev->level == 1) { + /* taiking over a raid1 array- + * we have only one active disk + */ + j = 0; + rdev1->new_raid_disk = j; + } + if (j < 0 || j >= mddev->raid_disks) { printk(KERN_ERR "md/raid0:%s: bad disk number %d - " "aborting!\n", mdname(mddev), j); @@ -644,12 +652,38 @@ static void *raid0_takeover_raid10(mddev_t *mddev) return priv_conf; } +static void *raid0_takeover_raid1(mddev_t *mddev) +{ + raid0_conf_t *priv_conf; + + /* Check layout: + * - (N - 1) mirror drives must be already faulty + */ + if ((mddev->raid_disks - 1) != mddev->degraded) { + printk(KERN_ERR "md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + + /* Set new parameters */ + mddev->new_level = 0; + mddev->new_layout = 0; + mddev->new_chunk_sectors = 128; /* by default set chunk size to 64k */ + mddev->delta_disks = 1 - mddev->raid_disks; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + + create_strip_zones(mddev, &priv_conf); + return priv_conf; +} + static void *raid0_takeover(mddev_t *mddev) { /* raid0 can take over: * raid4 - if all data disks are active. * raid5 - providing it is Raid4 layout and one disk is faulty * raid10 - assuming we have all necessary active disks + * raid1 - with (N -1) mirror drives faulty */ if (mddev->level == 4) return raid0_takeover_raid45(mddev); @@ -665,6 +699,12 @@ static void *raid0_takeover(mddev_t *mddev) if (mddev->level == 10) return raid0_takeover_raid10(mddev); + if (mddev->level == 1) + return raid0_takeover_raid1(mddev); + + printk(KERN_ERR "Takeover from raid%i to raid0 not supported\n", + mddev->level); + return ERR_PTR(-EINVAL); } -- cgit v1.2.3 From a8c42c7f476b5bb39bb3a5b32d5473b9a46cadb9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 31 Jan 2011 13:47:13 +1100 Subject: md: Don't use remove_and_add_spares to remove failed devices from a read-only array remove_and_add_spares is called in two places where the needs really are very different. remove_and_add_spares should not be called on an array which is about to be reshaped as some extra devices might have been manually added and that would remove them. However if the array is 'read-auto', that will currently happen, which is bad. So in the 'ro != 0' case don't call remove_and_add_spares but simply remove the failed devices as the comment suggests is needed. Signed-off-by: NeilBrown --- drivers/md/md.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index f539b587ca7..5b93829f3d4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7027,7 +7027,7 @@ static int remove_and_add_spares(mddev_t *mddev) } } - if (mddev->degraded && ! mddev->ro && !mddev->recovery_disabled) { + if (mddev->degraded && !mddev->recovery_disabled) { list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && @@ -7150,7 +7150,20 @@ void md_check_recovery(mddev_t *mddev) /* Only thing we do on a ro array is remove * failed devices. */ - remove_and_add_spares(mddev); + mdk_rdev_t *rdev; + list_for_each_entry(rdev, &mddev->disks, same_set) + if (rdev->raid_disk >= 0 && + !test_bit(Blocked, &rdev->flags) && + test_bit(Faulty, &rdev->flags) && + atomic_read(&rdev->nr_pending)==0) { + if (mddev->pers->hot_remove_disk( + mddev, rdev->raid_disk)==0) { + char nm[20]; + sprintf(nm,"rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + rdev->raid_disk = -1; + } + } clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); goto unlock; } -- cgit v1.2.3 From 7281f8129c362436237b82c8c026494dd36479dc Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 31 Jan 2011 14:30:27 +1100 Subject: md: don't clear curr_resync_completed at end of resync. There is no need to set this to zero at this point. It will be set to zero by remove_and_add_spares or at the start of md_do_sync at the latest. And setting it to zero before MD_RECOVERY_RUNNING is cleared can make a 'zero' appear briefly in the 'sync_completed' sysfs attribute just as resync is finishing. So simply remove this setting to zero. Signed-off-by: NeilBrown --- drivers/md/md.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 5b93829f3d4..f2d5628d51c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6984,9 +6984,6 @@ void md_do_sync(mddev_t *mddev) } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) mddev->resync_min = mddev->curr_resync_completed; mddev->curr_resync = 0; - if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) - mddev->curr_resync_completed = 0; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); wake_up(&resync_wait); set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); -- cgit v1.2.3 From c6751b2bde477f56ceef67aa1d298ce44e8e2e23 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 2 Feb 2011 11:57:13 +1100 Subject: md: Don't allow slot_store while resync/recovery is happening. Activating a spare in an array while resync/recovery is already happening can lead the that spare being marked in-sync when it isn't really. So don't allow the 'slot' to be set (this activating the device) while resync/recovery is happening. Signed-off-by: NeilBrown --- drivers/md/md.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index f2d5628d51c..1138d1053e9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2463,6 +2463,9 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (rdev->raid_disk != -1) return -EBUSY; + if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) + return -EBUSY; + if (rdev->mddev->pers->hot_add_disk == NULL) return -EINVAL; -- cgit v1.2.3 From e91ece5590b3c728624ab57043fc7a05069c604a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 7 Feb 2011 19:21:48 -0500 Subject: md_make_request: don't touch the bio after calling make_request md_make_request was calling bio_sectors() for part_stat_add after it was calling the make_request function. This is bad because the make_request function can free the bio and because the bi_size field can change around. The fix here was suggested by Jens Axboe. It saves the sector count before the make_request call. I hit this with CONFIG_DEBUG_PAGEALLOC turned on while trying to break his pretty fusionio card. Cc: Signed-off-by: Chris Mason Signed-off-by: NeilBrown --- drivers/md/md.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 1138d1053e9..0cc30ecda4c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -287,6 +287,7 @@ static int md_make_request(struct request_queue *q, struct bio *bio) mddev_t *mddev = q->queuedata; int rv; int cpu; + unsigned int sectors; if (mddev == NULL || mddev->pers == NULL || !mddev->ready) { @@ -311,12 +312,16 @@ static int md_make_request(struct request_queue *q, struct bio *bio) atomic_inc(&mddev->active_io); rcu_read_unlock(); + /* + * save the sectors now since our bio can + * go away inside make_request + */ + sectors = bio_sectors(bio); rv = mddev->pers->make_request(mddev, bio); cpu = part_stat_lock(); part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], - bio_sectors(bio)); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); part_stat_unlock(); if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) -- cgit v1.2.3 From 02214dc5461c36da26a34014cab4e1bb484edba2 Mon Sep 17 00:00:00 2001 From: Krzysztof Wojcik Date: Fri, 4 Feb 2011 14:18:26 +0100 Subject: FIX: md: process hangs at wait_barrier after 0->10 takeover Following symptoms were observed: 1. After raid0->raid10 takeover operation we have array with 2 missing disks. When we add disk for rebuild, recovery process starts as expected but it does not finish- it stops at about 90%, md126_resync process hangs in "D" state. 2. Similar behavior is when we have mounted raid0 array and we execute takeover to raid10. After this when we try to unmount array- it causes process umount hangs in "D" In scenarios above processes hang at the same function- wait_barrier in raid10.c. Process waits in macro "wait_event_lock_irq" until the "!conf->barrier" condition will be true. In scenarios above it never happens. Reason was that at the end of level_store, after calling pers->run, we call mddev_resume. This calls pers->quiesce(mddev, 0) with RAID10, that calls lower_barrier. However raise_barrier hadn't been called on that 'conf' yet, so conf->barrier becomes negative, which is bad. This patch introduces setting conf->barrier=1 after takeover operation. It prevents to become barrier negative after call lower_barrier(). Signed-off-by: Krzysztof Wojcik Signed-off-by: NeilBrown --- drivers/md/raid10.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 69b65954439..3b607b28741 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2463,11 +2463,13 @@ static void *raid10_takeover_raid0(mddev_t *mddev) mddev->recovery_cp = MaxSector; conf = setup_conf(mddev); - if (!IS_ERR(conf)) + if (!IS_ERR(conf)) { list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0) rdev->new_raid_disk = rdev->raid_disk * 2; - + conf->barrier = 1; + } + return conf; } -- cgit v1.2.3