[patch] sched: fix sched-domains partitioning by cpusets

* Nick Piggin <[email protected]> wrote:

> BTW. as far as the sched.c changes in your patch go, I much prefer the 
> partition_sched_domains API: http://lkml.org/lkml/2006/10/19/85
> 
> The caller should manage everything itself, rather than 
> partition_sched_domains doing half of the memory allocation.

i've merged your patch into my scheduler queue - see the patch below. (And 
could you send me your SoB line too?) Paul, if we went with the patch 
below, what else would be needed for your purposes?

	Ingo

--------------------------------->
Subject: sched: fix sched-domains partitioning by cpusets
From: Nick Piggin <[email protected]>

Fix sched-domains partitioning by cpusets. Instead of patching up the
partitions incrementally, walk the whole cpuset tree whenever something
relevant changes (a cpuset's cpus_allowed mask, its cpu_exclusive flag,
removal of an exclusive cpuset, or a cpu hotplug event) and recreate all
partitions from scratch.
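
To illustrate the walk on a hypothetical hierarchy (the cpuset names and
cpu ranges below are made up for this example, they are not from the patch):

	top_cpuset (cpus 0-7)
	  A: cpu_exclusive,  cpus 0-1  -> its own partition_sched_domains() call
	  B: cpu_exclusive,  cpus 2-3  -> its own partition_sched_domains() call
	  C: !cpu_exclusive, cpus 4-5  -> recurse into C's children

Cpus never claimed by an exclusive cpuset (4-7 here, minus whatever
exclusive children of C grab) remain in non_partitioned and get one final
partition_sched_domains() call.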

Signed-off-by: Ingo Molnar <[email protected]>
---
 include/linux/cpuset.h |    2 
 include/linux/sched.h  |    3 -
 kernel/cpuset.c        |  109 ++++++++++++++++++++++---------------------------
 kernel/sched.c         |   31 +++++++------
 4 files changed, 70 insertions(+), 75 deletions(-)

Index: linux/include/linux/cpuset.h
===================================================================
--- linux.orig/include/linux/cpuset.h
+++ linux/include/linux/cpuset.h
@@ -14,6 +14,8 @@
 
 #ifdef CONFIG_CPUSETS
 
+extern int cpuset_hotplug_update_sched_domains(void);
+
 extern int number_of_cpusets;	/* How many cpusets are defined in system? */
 
 extern int cpuset_init_early(void);
Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -798,8 +798,7 @@ struct sched_domain {
 #endif
 };
 
-extern int partition_sched_domains(cpumask_t *partition1,
-				    cpumask_t *partition2);
+extern int partition_sched_domains(cpumask_t *partition);
 
 #endif	/* CONFIG_SMP */
 
Index: linux/kernel/cpuset.c
===================================================================
--- linux.orig/kernel/cpuset.c
+++ linux/kernel/cpuset.c
@@ -752,6 +752,24 @@ static int validate_change(const struct 
 	return 0;
 }
 
+static void update_cpu_domains_children(struct cpuset *par,
+					cpumask_t *non_partitioned)
+{
+	struct cpuset *c;
+
+	list_for_each_entry(c, &par->children, sibling) {
+		if (cpus_empty(c->cpus_allowed))
+			continue;
+		if (is_cpu_exclusive(c)) {
+			if (!partition_sched_domains(&c->cpus_allowed)) {
+				cpus_andnot(*non_partitioned,
+					*non_partitioned, c->cpus_allowed);
+			}
+		} else
+			update_cpu_domains_children(c, non_partitioned);
+	}
+}
+
 /*
  * For a given cpuset cur, partition the system as follows
  * a. All cpus in the parent cpuset's cpus_allowed that are not part of any
@@ -761,53 +779,38 @@ static int validate_change(const struct 
  * Build these two partitions by calling partition_sched_domains
  *
  * Call with manage_mutex held.  May nest a call to the
- * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
- * Must not be called holding callback_mutex, because we must
- * not call lock_cpu_hotplug() while holding callback_mutex.
+ * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.  Must not be called holding
+ * callback_mutex, because we must not call lock_cpu_hotplug() while holding
+ * callback_mutex.
  */
 
-static void update_cpu_domains(struct cpuset *cur)
+static void update_cpu_domains(void)
 {
-	struct cpuset *c, *par = cur->parent;
-	cpumask_t pspan, cspan;
-
-	if (par == NULL || cpus_empty(cur->cpus_allowed))
-		return;
+	cpumask_t non_partitioned;
 
-	/*
-	 * Get all cpus from parent's cpus_allowed not part of exclusive
-	 * children
-	 */
-	pspan = par->cpus_allowed;
-	list_for_each_entry(c, &par->children, sibling) {
-		if (is_cpu_exclusive(c))
-			cpus_andnot(pspan, pspan, c->cpus_allowed);
-	}
-	if (!is_cpu_exclusive(cur)) {
-		cpus_or(pspan, pspan, cur->cpus_allowed);
-		if (cpus_equal(pspan, cur->cpus_allowed))
-			return;
-		cspan = CPU_MASK_NONE;
-	} else {
-		if (cpus_empty(pspan))
-			return;
-		cspan = cur->cpus_allowed;
-		/*
-		 * Get all cpus from current cpuset's cpus_allowed not part
-		 * of exclusive children
-		 */
-		list_for_each_entry(c, &cur->children, sibling) {
-			if (is_cpu_exclusive(c))
-				cpus_andnot(cspan, cspan, c->cpus_allowed);
-		}
-	}
+	BUG_ON(!mutex_is_locked(&manage_mutex));
 
 	lock_cpu_hotplug();
-	partition_sched_domains(&pspan, &cspan);
+	non_partitioned = top_cpuset.cpus_allowed;
+	update_cpu_domains_children(&top_cpuset, &non_partitioned);
+	partition_sched_domains(&non_partitioned);
 	unlock_cpu_hotplug();
 }
 
 /*
+ * Same as above except called with lock_cpu_hotplug and without manage_mutex.
+ */
+
+int cpuset_hotplug_update_sched_domains(void)
+{
+	cpumask_t non_partitioned;
+
+	non_partitioned = top_cpuset.cpus_allowed;
+	update_cpu_domains_children(&top_cpuset, &non_partitioned);
+	return partition_sched_domains(&non_partitioned);
+}
+
+/*
  * Call with manage_mutex held.  May take callback_mutex during call.
  */
 
@@ -845,8 +848,8 @@ static int update_cpumask(struct cpuset 
 	mutex_lock(&callback_mutex);
 	cs->cpus_allowed = trialcs.cpus_allowed;
 	mutex_unlock(&callback_mutex);
-	if (is_cpu_exclusive(cs) && !cpus_unchanged)
-		update_cpu_domains(cs);
+	if (!cpus_unchanged)
+		update_cpu_domains();
 	return 0;
 }
 
@@ -1087,7 +1090,7 @@ static int update_flag(cpuset_flagbits_t
 	mutex_unlock(&callback_mutex);
 
 	if (cpu_exclusive_changed)
-                update_cpu_domains(cs);
+                update_cpu_domains();
 	return 0;
 }
 
@@ -1947,19 +1950,9 @@ static int cpuset_mkdir(struct inode *di
 	return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
 }
 
-/*
- * Locking note on the strange update_flag() call below:
- *
- * If the cpuset being removed is marked cpu_exclusive, then simulate
- * turning cpu_exclusive off, which will call update_cpu_domains().
- * The lock_cpu_hotplug() call in update_cpu_domains() must not be
- * made while holding callback_mutex.  Elsewhere the kernel nests
- * callback_mutex inside lock_cpu_hotplug() calls.  So the reverse
- * nesting would risk an ABBA deadlock.
- */
-
 static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 {
+	int is_exclusive;
 	struct cpuset *cs = dentry->d_fsdata;
 	struct dentry *d;
 	struct cpuset *parent;
@@ -1977,13 +1970,8 @@ static int cpuset_rmdir(struct inode *un
 		mutex_unlock(&manage_mutex);
 		return -EBUSY;
 	}
-	if (is_cpu_exclusive(cs)) {
-		int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0");
-		if (retval < 0) {
-			mutex_unlock(&manage_mutex);
-			return retval;
-		}
-	}
+	is_exclusive = is_cpu_exclusive(cs);
+
 	parent = cs->parent;
 	mutex_lock(&callback_mutex);
 	set_bit(CS_REMOVED, &cs->flags);
@@ -1998,8 +1986,13 @@ static int cpuset_rmdir(struct inode *un
 	mutex_unlock(&callback_mutex);
 	if (list_empty(&parent->children))
 		check_for_release(parent, &pathbuf);
+
+	if (is_exclusive)
+		update_cpu_domains();
+
 	mutex_unlock(&manage_mutex);
 	cpuset_release_agent(pathbuf);
+
 	return 0;
 }
 
Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c
+++ linux/kernel/sched.c
@@ -6274,6 +6274,9 @@ error:
  */
 static int arch_init_sched_domains(const cpumask_t *cpu_map)
 {
+#ifdef CONFIG_CPUSETS
+	return cpuset_hotplug_update_sched_domains();
+#else
 	cpumask_t cpu_default_map;
 	int err;
 
@@ -6287,6 +6290,7 @@ static int arch_init_sched_domains(const
 	err = build_sched_domains(&cpu_default_map);
 
 	return err;
+#endif
 }
 
 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
@@ -6310,29 +6314,26 @@ static void detach_destroy_domains(const
 
 /*
  * Partition sched domains as specified by the cpumasks below.
- * This attaches all cpus from the cpumasks to the NULL domain,
+ * This attaches all cpus from the partition to the NULL domain,
  * waits for a RCU quiescent period, recalculates sched
- * domain information and then attaches them back to the
- * correct sched domains
- * Call with hotplug lock held
+ * domain information and then attaches them back to their own
+ * isolated partition.
+ *
+ * Called with hotplug lock held
+ *
+ * Returns 0 on success.
  */
-int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+int partition_sched_domains(cpumask_t *partition)
 {
+	cpumask_t non_isolated_cpus;
 	cpumask_t change_map;
-	int err = 0;
 
-	cpus_and(*partition1, *partition1, cpu_online_map);
-	cpus_and(*partition2, *partition2, cpu_online_map);
-	cpus_or(change_map, *partition1, *partition2);
+	cpus_andnot(non_isolated_cpus, cpu_online_map, cpu_isolated_map);
+	cpus_and(change_map, *partition, non_isolated_cpus);
 
 	/* Detach sched domains from all of the affected cpus */
 	detach_destroy_domains(&change_map);
-	if (!cpus_empty(*partition1))
-		err = build_sched_domains(partition1);
-	if (!err && !cpus_empty(*partition2))
-		err = build_sched_domains(partition2);
-
-	return err;
+	return build_sched_domains(&change_map);
 }
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
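
To summarize the new calling convention, a minimal sketch - note that
example_repartition() is an invented name for illustration only; the real
callers are update_cpu_domains() and cpuset_hotplug_update_sched_domains()
in the patch above, and the lock_cpu_hotplug() pairing is elided here:

	/*
	 * Old API: partition_sched_domains() took two cpumasks and built
	 * sched domains for both of them itself:
	 *
	 *	partition_sched_domains(&pspan, &cspan);
	 *
	 * New API: the caller owns the iteration - one call per exclusive
	 * partition (done inside the tree walk), plus one final call for
	 * the cpus nobody claimed:
	 */
	static void example_repartition(void)
	{
		cpumask_t remainder = top_cpuset.cpus_allowed;

		/* partitions each non-empty cpu_exclusive cpuset it finds,
		 * clearing those cpus out of 'remainder' as it goes */
		update_cpu_domains_children(&top_cpuset, &remainder);

		/* everything left over becomes the default partition */
		partition_sched_domains(&remainder);
	}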