From 96e65306b81351b656835c15931d1d237b252f27 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Sun, 2 Sep 2012 00:28:19 +0800
Subject: [PATCH 1/5] workqueue: UNBOUND -> REBIND morphing in rebind_workers()
 should be atomic

The compiler may compile the following code into TWO write/modify
instructions.

	worker->flags &= ~WORKER_UNBOUND;
	worker->flags |= WORKER_REBIND;

so the other CPU may temporarily see worker->flags which doesn't have
either WORKER_UNBOUND or WORKER_REBIND set and perform local wakeup
prematurely.

Fix it by using single explicit assignment via ACCESS_ONCE().

Because idle workers have another WORKER_NOT_RUNNING flag, this bug
doesn't exist for them; however, update it to use the same pattern for
consistency.

tj: Applied the change to idle workers too and updated comments and
    patch description a bit.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org
---
 kernel/workqueue.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 692d97628a1..c462cd60c37 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1396,12 +1396,15 @@ retry:
 	/* set REBIND and kick idle ones, we'll wait for these later */
 	for_each_worker_pool(pool, gcwq) {
 		list_for_each_entry(worker, &pool->idle_list, entry) {
+			unsigned long worker_flags = worker->flags;
+
 			if (worker->flags & WORKER_REBIND)
 				continue;
 
-			/* morph UNBOUND to REBIND */
-			worker->flags &= ~WORKER_UNBOUND;
-			worker->flags |= WORKER_REBIND;
+			/* morph UNBOUND to REBIND atomically */
+			worker_flags &= ~WORKER_UNBOUND;
+			worker_flags |= WORKER_REBIND;
+			ACCESS_ONCE(worker->flags) = worker_flags;
 
 			idle_rebind.cnt++;
 			worker->idle_rebind = &idle_rebind;
@@ -1434,10 +1437,12 @@ retry:
 	/* rebind busy workers */
 	for_each_busy_worker(worker, i, pos, gcwq) {
 		struct work_struct *rebind_work = &worker->rebind_work;
+		unsigned long worker_flags = worker->flags;
 
-		/* morph UNBOUND to REBIND */
-		worker->flags &= ~WORKER_UNBOUND;
-		worker->flags |= WORKER_REBIND;
+		/* morph UNBOUND to REBIND atomically */
+		worker_flags &= ~WORKER_UNBOUND;
+		worker_flags |= WORKER_REBIND;
+		ACCESS_ONCE(worker->flags) = worker_flags;
 
 		if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
 				     work_data_bits(rebind_work)))

From 90beca5de591e12482a812f23a7f10690962ed4a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 4 Sep 2012 23:12:33 -0700
Subject: [PATCH 2/5] workqueue: move WORKER_REBIND clearing in
 rebind_workers() to the end of the function

This doesn't make any functional difference and is purely to help the
next patch to be simpler.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
---
 kernel/workqueue.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c462cd60c37..d79a18d0c42 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1422,19 +1422,7 @@ retry:
 		goto retry;
 	}
 
-	/*
-	 * All idle workers are rebound and waiting for %WORKER_REBIND to
-	 * be cleared inside idle_worker_rebind().  Clear and release.
-	 * Clearing %WORKER_REBIND from this foreign context is safe
-	 * because these workers are still guaranteed to be idle.
-	 */
-	for_each_worker_pool(pool, gcwq)
-		list_for_each_entry(worker, &pool->idle_list, entry)
-			worker->flags &= ~WORKER_REBIND;
-
-	wake_up_all(&gcwq->rebind_hold);
-
-	/* rebind busy workers */
+	/* all idle workers are rebound, rebind busy workers */
 	for_each_busy_worker(worker, i, pos, gcwq) {
 		struct work_struct *rebind_work = &worker->rebind_work;
 		unsigned long worker_flags = worker->flags;
@@ -1454,6 +1442,18 @@ retry:
 			    worker->scheduled.next,
 			    work_color_to_flags(WORK_NO_COLOR));
 	}
+
+	/*
+	 * All idle workers are rebound and waiting for %WORKER_REBIND to
+	 * be cleared inside idle_worker_rebind().  Clear and release.
+	 * Clearing %WORKER_REBIND from this foreign context is safe
+	 * because these workers are still guaranteed to be idle.
+	 */
+	for_each_worker_pool(pool, gcwq)
+		list_for_each_entry(worker, &pool->idle_list, entry)
+			worker->flags &= ~WORKER_REBIND;
+
+	wake_up_all(&gcwq->rebind_hold);
 }
 
 static struct worker *alloc_worker(void)

From ec58815ab0409a921a7c9744eb4ca44866b14d71 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 4 Sep 2012 23:16:32 -0700
Subject: [PATCH 3/5] workqueue: fix possible deadlock in idle worker rebinding

Currently, rebind_workers() and idle_worker_rebind() are two-way
interlocked.  rebind_workers() waits for idle workers to finish
rebinding and rebound idle workers wait for rebind_workers() to finish
rebinding busy workers before proceeding.

Unfortunately, this isn't enough.  The second wait from idle workers
is implemented as follows.

	wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND));

rebind_workers() clears WORKER_REBIND, wakes up the idle workers and
then returns.  If CPU hotplug cycle happens again before one of the
idle workers finishes the above wait_event(), rebind_workers() will
repeat the first part of the handshake - set WORKER_REBIND again and
wait for the idle worker to finish rebinding - and this leads to
deadlock because the idle worker would be waiting for WORKER_REBIND to
clear.

This is fixed by adding another interlocking step at the end -
rebind_workers() now waits for all the idle workers to finish the
above WORKER_REBIND wait before returning.  This ensures that all
rebinding steps are complete on all idle workers before the next
hotplug cycle can happen.

This problem was diagnosed by Lai Jiangshan who also posted a patch to
fix the issue, upon which this patch is based.

This is the minimal fix and further patches are scheduled for the next
merge window to simplify the CPU hotplug path.

Signed-off-by: Tejun Heo <tj@kernel.org>
Original-patch-by: Lai Jiangshan <laijs@cn.fujitsu.com>
LKML-Reference: <1346516916-1991-3-git-send-email-laijs@cn.fujitsu.com>
---
 kernel/workqueue.c | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d79a18d0c42..dc7b8458e27 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1326,6 +1326,15 @@ static void idle_worker_rebind(struct worker *worker)
 
 	/* we did our part, wait for rebind_workers() to finish up */
 	wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND));
+
+	/*
+	 * rebind_workers() shouldn't finish until all workers passed the
+	 * above WORKER_REBIND wait.  Tell it when done.
+	 */
+	spin_lock_irq(&worker->pool->gcwq->lock);
+	if (!--worker->idle_rebind->cnt)
+		complete(&worker->idle_rebind->done);
+	spin_unlock_irq(&worker->pool->gcwq->lock);
 }
 
 /*
@@ -1448,12 +1457,28 @@ retry:
 	 * be cleared inside idle_worker_rebind().  Clear and release.
 	 * Clearing %WORKER_REBIND from this foreign context is safe
 	 * because these workers are still guaranteed to be idle.
+	 *
+	 * We need to make sure all idle workers passed WORKER_REBIND wait
+	 * in idle_worker_rebind() before returning; otherwise, workers can
+	 * get stuck at the wait if hotplug cycle repeats.
 	 */
-	for_each_worker_pool(pool, gcwq)
-		list_for_each_entry(worker, &pool->idle_list, entry)
+	idle_rebind.cnt = 1;
+	INIT_COMPLETION(idle_rebind.done);
+
+	for_each_worker_pool(pool, gcwq) {
+		list_for_each_entry(worker, &pool->idle_list, entry) {
 			worker->flags &= ~WORKER_REBIND;
+			idle_rebind.cnt++;
+		}
+	}
 
 	wake_up_all(&gcwq->rebind_hold);
+
+	if (--idle_rebind.cnt) {
+		spin_unlock_irq(&gcwq->lock);
+		wait_for_completion(&idle_rebind.done);
+		spin_lock_irq(&gcwq->lock);
+	}
 }
 
 static struct worker *alloc_worker(void)

From 552a37e9360a293cd20e7f8ff1fb326a244c5f1e Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Mon, 10 Sep 2012 10:03:33 -0700
Subject: [PATCH 4/5] workqueue: restore POOL_MANAGING_WORKERS

This patch restores POOL_MANAGING_WORKERS which was replaced by
pool->manager_mutex by 6037315269 "workqueue: use mutex for global_cwq
manager exclusion".

There's a subtle idle worker depletion bug across CPU hotplug events
and we need to distinguish an actual manager and CPU hotplug
preventing management.  POOL_MANAGING_WORKERS will be used for the
former and manager_mutex the later.

This patch just lays POOL_MANAGING_WORKERS on top of the existing
manager_mutex and doesn't introduce any synchronization changes.  The
next patch will update it.

Note that this patch fixes a non-critical anomaly where
too_many_workers() may return %true spuriously while CPU hotplug is in
progress.  While the issue could schedule idle timer spuriously, it
didn't trigger any actual misbehavior.

tj: Rewrote patch description.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index dc7b8458e27..383548ed0b5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -66,6 +66,7 @@ enum {
 
 	/* pool flags */
 	POOL_MANAGE_WORKERS	= 1 << 0,	/* need to manage workers */
+	POOL_MANAGING_WORKERS   = 1 << 1,       /* managing workers */
 
 	/* worker flags */
 	WORKER_STARTED		= 1 << 0,	/* started */
@@ -652,7 +653,7 @@ static bool need_to_manage_workers(struct worker_pool *pool)
 /* Do we have too many workers and should some go away? */
 static bool too_many_workers(struct worker_pool *pool)
 {
-	bool managing = mutex_is_locked(&pool->manager_mutex);
+	bool managing = pool->flags & POOL_MANAGING_WORKERS;
 	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
 	int nr_busy = pool->nr_workers - nr_idle;
 
@@ -1827,6 +1828,7 @@ static bool manage_workers(struct worker *worker)
 	if (!mutex_trylock(&pool->manager_mutex))
 		return ret;
 
+	pool->flags |= POOL_MANAGING_WORKERS;
 	pool->flags &= ~POOL_MANAGE_WORKERS;
 
 	/*
@@ -1836,6 +1838,7 @@ static bool manage_workers(struct worker *worker)
 	ret |= maybe_destroy_workers(pool);
 	ret |= maybe_create_worker(pool);
 
+	pool->flags &= ~POOL_MANAGING_WORKERS;
 	mutex_unlock(&pool->manager_mutex);
 	return ret;
 }

From ee378aa49b594da9bda6a2c768cc5b2ad585f911 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Mon, 10 Sep 2012 10:03:44 -0700
Subject: [PATCH 5/5] workqueue: fix possible idle worker depletion across CPU
 hotplug

To simplify both normal and CPU hotplug paths, worker management is
prevented while CPU hoplug is in progress.  This is achieved by CPU
hotplug holding the same exclusion mechanism used by workers to ensure
there's only one manager per pool.

If someone else seems to be performing the manager role, workers
proceed to execute work items.  CPU hotplug using the same mechanism
can lead to idle worker depletion because all workers could proceed to
execute work items while CPU hotplug is in progress and CPU hotplug
itself wouldn't actually perform the worker management duty - it
doesn't guarantee that there's an idle worker left when it releases
management.

This idle worker depletion, under extreme circumstances, can break
forward-progress guarantee and thus lead to deadlock.

This patch fixes the bug by using separate mechanisms for manager
exclusion among workers and hotplug exclusion.  For manager exclusion,
POOL_MANAGING_WORKERS which was restored by the previous patch is
used.  pool->manager_mutex is now only used for exclusion between the
elected manager and CPU hotplug.  The elected manager won't proceed
without holding pool->manager_mutex.

This ensures that the worker which won the manager position can't skip
managing while CPU hotplug is in progress.  It will block on
manager_mutex and perform management after CPU hotplug is complete.

Note that hotplug may happen while waiting for manager_mutex.  A
manager isn't either on idle or busy list and thus the hoplug code
can't unbind/rebind it.  Make the manager handle its own un/rebinding.

tj: Updated comment and description.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 383548ed0b5..1e1373bcb3e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1825,10 +1825,45 @@ static bool manage_workers(struct worker *worker)
 	struct worker_pool *pool = worker->pool;
 	bool ret = false;
 
-	if (!mutex_trylock(&pool->manager_mutex))
+	if (pool->flags & POOL_MANAGING_WORKERS)
 		return ret;
 
 	pool->flags |= POOL_MANAGING_WORKERS;
+
+	/*
+	 * To simplify both worker management and CPU hotplug, hold off
+	 * management while hotplug is in progress.  CPU hotplug path can't
+	 * grab %POOL_MANAGING_WORKERS to achieve this because that can
+	 * lead to idle worker depletion (all become busy thinking someone
+	 * else is managing) which in turn can result in deadlock under
+	 * extreme circumstances.  Use @pool->manager_mutex to synchronize
+	 * manager against CPU hotplug.
+	 *
+	 * manager_mutex would always be free unless CPU hotplug is in
+	 * progress.  trylock first without dropping @gcwq->lock.
+	 */
+	if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
+		spin_unlock_irq(&pool->gcwq->lock);
+		mutex_lock(&pool->manager_mutex);
+		/*
+		 * CPU hotplug could have happened while we were waiting
+		 * for manager_mutex.  Hotplug itself can't handle us
+		 * because manager isn't either on idle or busy list, and
+		 * @gcwq's state and ours could have deviated.
+		 *
+		 * As hotplug is now excluded via manager_mutex, we can
+		 * simply try to bind.  It will succeed or fail depending
+		 * on @gcwq's current state.  Try it and adjust
+		 * %WORKER_UNBOUND accordingly.
+		 */
+		if (worker_maybe_bind_and_lock(worker))
+			worker->flags &= ~WORKER_UNBOUND;
+		else
+			worker->flags |= WORKER_UNBOUND;
+
+		ret = true;
+	}
+
 	pool->flags &= ~POOL_MANAGE_WORKERS;
 
 	/*