[PATCH] sched: fix smpnice anomalies with abnormal nice values

Suresh B. Siddha has reported:

"On a lightly loaded system, this can result in HT scheduler optimizations being disabled in presence of low priority tasks... in this case, they (low priority ones) can end up running on the same package, even in the presence of other idle packages."

Analysis has shown that this is a manifestation of a more general problem that occurs whenever the average of the nice values assigned to runnable tasks is skewed in either direction. The cause is that find_busiest_group() assumes that the average weighted load per task is SCHED_LOAD_SCALE, which is not true when the distribution of nice values is skewed one way or the other. This in turn causes the load balancing code to under-balance when the skew is towards low priority tasks (i.e. positive nice) and to over-balance when it is skewed in the opposite direction.
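
To make the problem concrete, here is a small user-space sketch of the out_balanced test in find_busiest_group(). The biased load figures are made up purely for illustration (two nice +19 tasks on one package, the other package idle); only the shape of the test matches the kernel code.

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL	/* value used by 2.6 kernels of this era */

int main(void)
{
	/* Invented biased loads: two nice +19 tasks on the busiest package,
	 * each contributing 10 (far less than SCHED_LOAD_SCALE), while the
	 * other package sits idle. */
	unsigned long busiest_nr_running = 2;
	unsigned long max_load = 2 * 10;
	unsigned long this_load = 0;

	/* Current behaviour: one task's load is assumed to be
	 * SCHED_LOAD_SCALE, so this group always looks balanced. */
	int old_balanced = this_load >= max_load ||
			   max_load <= SCHED_LOAD_SCALE;

	/* Patched behaviour: compare against the measured average load per
	 * task in the busiest group (guarding against an empty group). */
	unsigned long avg_load_per_task = busiest_nr_running ?
		max_load / busiest_nr_running : max_load;
	int new_balanced = this_load >= max_load ||
			   max_load <= avg_load_per_task;

	printf("old test: balanced=%d (nothing gets moved)\n", old_balanced);
	printf("new test: balanced=%d (balancing proceeds)\n", new_balanced);
	return 0;
}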

The attached patch fixes this problem. It replaces SCHED_LOAD_SCALE with the average load per task (calculated during the search for the busiest group) in those places where SCHED_LOAD_SCALE was being used to represent the load generated by a single task.
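
As a sanity check on the threshold change itself, this second toy compares the old "*imbalance < SCHED_LOAD_SCALE" test with the new "*imbalance < avg_load_per_task" test for skew in both directions. Again, the per-task load figures (10 and 300) are invented for illustration only.

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL

static void check(const char *label, unsigned long imbalance,
		  unsigned long load_per_task)
{
	printf("%s: imbalance=%lu, per-task load=%lu: old test fires=%d, new test fires=%d\n",
	       label, imbalance, load_per_task,
	       imbalance < SCHED_LOAD_SCALE,
	       imbalance < load_per_task);
}

int main(void)
{
	/* Low priority skew: each task only weighs 10, so an imbalance of
	 * 40 already represents several tasks, yet the old test still takes
	 * the "imbalance looks too small" path. */
	check("nice +19", 40, 10);

	/* High priority skew: each task weighs 300, so an imbalance of 200
	 * may not be enough to move even one task, yet the old test never
	 * considers bumping it. */
	check("nice -20", 200, 300);
	return 0;
}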

Signed-off-by: Peter Williams <[email protected]>

--

Andrew,
This patch, in conjunction with the "Fix smpnice high priority task hopping problem" patch posted earlier, should address all of the outstanding smpnice issues. Could you please add them to -mm so that they can get wider testing?

Thanks,
Peter
--
Peter Williams                                   [email protected]

"Learning, n. The kind of ignorance distinguishing the studious."
 -- Ambrose Bierce
Index: MM-2.6.X/kernel/sched.c
===================================================================
--- MM-2.6.X.orig/kernel/sched.c	2006-02-16 11:02:45.000000000 +1100
+++ MM-2.6.X/kernel/sched.c	2006-02-16 12:39:30.000000000 +1100
@@ -2025,9 +2025,11 @@ find_busiest_group(struct sched_domain *
 	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
 	unsigned long max_pull;
+	unsigned long avg_load_per_task, busiest_nr_running;
 	int load_idx;
 
 	max_load = this_load = total_load = total_pwr = 0;
+	busiest_nr_running = 0;
 	if (idle == NOT_IDLE)
 		load_idx = sd->busy_idx;
 	else if (idle == NEWLY_IDLE)
@@ -2039,11 +2041,12 @@ find_busiest_group(struct sched_domain *
 		unsigned long load;
 		int local_group;
 		int i;
+		unsigned long sum_nr_running;
 
 		local_group = cpu_isset(this_cpu, group->cpumask);
 
 		/* Tally up the load of all CPUs in the group */
-		avg_load = 0;
+		sum_nr_running = avg_load = 0;
 
 		for_each_cpu_mask(i, group->cpumask) {
 			if (*sd_idle && !idle_cpu(i))
@@ -2056,6 +2059,7 @@ find_busiest_group(struct sched_domain *
 				load = source_load(i, load_idx);
 
 			avg_load += load;
+			sum_nr_running += cpu_rq(i)->nr_running;
 		}
 
 		total_load += avg_load;
@@ -2070,11 +2074,15 @@ find_busiest_group(struct sched_domain *
 		} else if (avg_load > max_load) {
 			max_load = avg_load;
 			busiest = group;
+			busiest_nr_running = sum_nr_running;
 		}
 		group = group->next;
 	} while (group != sd->groups);
 
-	if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
+	/* Don't assume that busiest_nr_running > 0 */
+	avg_load_per_task = busiest_nr_running ? max_load / busiest_nr_running : max_load;
+
+	if (!busiest || this_load >= max_load || max_load <= avg_load_per_task)
 		goto out_balanced;
 
 	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
@@ -2096,19 +2104,25 @@ find_busiest_group(struct sched_domain *
 	 */
 
 	/* Don't want to pull so many tasks that a group would go idle */
-	max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
+	max_pull = min(max_load - avg_load, max_load - avg_load_per_task);
 
 	/* How much load to actually move to equalise the imbalance */
 	*imbalance = min(max_pull * busiest->cpu_power,
 				(avg_load - this_load) * this->cpu_power)
 			/ SCHED_LOAD_SCALE;
 
-	if (*imbalance < SCHED_LOAD_SCALE) {
+	/*
+	 * if *imbalance is less than the average load per runnable task
+	 * there is no guarantee that any tasks will be moved so we'll have
+	 * a think about bumping its value to force at least one task to be
+	 * moved
+	 */
+	if (*imbalance < avg_load_per_task) {
 		unsigned long pwr_now = 0, pwr_move = 0;
 		unsigned long tmp;
 
-		if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
-			*imbalance = NICE_TO_BIAS_PRIO(0);
+		if (max_load - this_load >= avg_load_per_task*2) {
+			*imbalance = biased_load(avg_load_per_task);
 			return busiest;
 		}
 
@@ -2118,31 +2132,32 @@ find_busiest_group(struct sched_domain *
 		 * moving them.
 		 */
 
-		pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
-		pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
+		pwr_now += busiest->cpu_power*min(avg_load_per_task, max_load);
+		pwr_now += this->cpu_power*min(avg_load_per_task, this_load);
 		pwr_now /= SCHED_LOAD_SCALE;
 
 		/* Amount of load we'd subtract */
-		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
+		tmp = avg_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
 		if (max_load > tmp)
-			pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
+			pwr_move += busiest->cpu_power*min(avg_load_per_task,
 							max_load - tmp);
 
 		/* Amount of load we'd add */
 		if (max_load*busiest->cpu_power <
-				SCHED_LOAD_SCALE*SCHED_LOAD_SCALE)
+				avg_load_per_task*SCHED_LOAD_SCALE)
 			tmp = max_load*busiest->cpu_power/this->cpu_power;
 		else
-			tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
-		pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
+			tmp = avg_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
+		pwr_move += this->cpu_power*min(avg_load_per_task, this_load + tmp);
 		pwr_move /= SCHED_LOAD_SCALE;
 
-		/* Move if we gain throughput */
-		if (pwr_move <= pwr_now)
+		/* Move if we gain throughput
+		 * or if *imbalance has a reasonable chance of causing a move
+		 */
+		if (pwr_move > pwr_now)
+			*imbalance = avg_load_per_task;
+		else if (*imbalance <= avg_load_per_task / 2)
 			goto out_balanced;
-
-		*imbalance = NICE_TO_BIAS_PRIO(0);
-		return busiest;
 	}
 
 	/*
