[PATCH] sched: implement staircase deadline scheduler ymf accounting fixes

This causes significant improvements on SMP hardware. I don't think the kernel
should be -nicing X by itself; that should be a sysadmin choice so I won't
be including that change in the SD patches. The following change will be in
the next release of SD (v0.45).

Andrew, please apply on top of yaf-fix.

---
SMP balancing broke on converting time_slice to usecs.

update_cpu_clock is unnecessarily complex and doesn't allow sub-usec values.

Thanks to Willy Tarreau <[email protected]> for picking up SMP idle anomalies.

Signed-off-by: Con Kolivas <[email protected]>

---
 kernel/sched.c |   42 +++++++++++++++++-------------------------
 1 file changed, 17 insertions(+), 25 deletions(-)

Index: linux-2.6.21-rc7-sd/kernel/sched.c
===================================================================
--- linux-2.6.21-rc7-sd.orig/kernel/sched.c	2007-04-21 22:50:31.000000000 +1000
+++ linux-2.6.21-rc7-sd/kernel/sched.c	2007-04-22 13:29:29.000000000 +1000
@@ -88,12 +88,10 @@ unsigned long long __attribute__((weak))
 #define SCHED_PRIO(p)		((p)+MAX_RT_PRIO)
 
 /* Some helpers for converting to/from various scales.*/
-#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
 #define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
+#define MS_TO_NS(TIME)		((TIME) * 1000000)
 #define MS_TO_US(TIME)		((TIME) * 1000)
-/* Can return 0 */
-#define MS_TO_JIFFIES(TIME)	((TIME) * HZ / 1000)
-#define JIFFIES_TO_MS(TIME)	((TIME) * 1000 / HZ)
+#define US_TO_MS(TIME)		((TIME) / 1000)
 
 #define TASK_PREEMPTS_CURR(p, curr)	((p)->prio < (curr)->prio)
 
@@ -876,29 +874,28 @@ static void requeue_task(struct task_str
 
 /*
  * task_timeslice - the total duration a task can run during one major
- * rotation. Returns value in jiffies.
+ * rotation. Returns value in milliseconds as the smallest value can be 1.
  */
-static inline int task_timeslice(struct task_struct *p)
+static int task_timeslice(struct task_struct *p)
 {
-	int slice;
+	int slice = p->quota;	/* quota is in us */
 
-	slice = NS_TO_JIFFIES(p->quota);
 	if (!rt_task(p))
 		slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * slice;
-	return slice;
+	return US_TO_MS(slice);
 }
 
 /*
  * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
  * If static_prio_timeslice() is ever changed to break this assumption then
- * this code will need modification
+ * this code will need modification. Scaled as multiples of milliseconds.
  */
 #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
 #define LOAD_WEIGHT(lp) \
 	(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
 #define TASK_LOAD_WEIGHT(p)	LOAD_WEIGHT(task_timeslice(p))
 #define RTPRIO_TO_LOAD_WEIGHT(rp)	\
-	(LOAD_WEIGHT((MS_TO_JIFFIES(rr_interval) + 20 + (rp))))
+	(LOAD_WEIGHT((rr_interval + 20 + (rp))))
 
 static void set_load_weight(struct task_struct *p)
 {
@@ -3035,32 +3032,27 @@ static void
 update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now,
 		 int tick)
 {
-	cputime64_t time_diff = now - p->last_ran;
-	const unsigned int min_diff = 1000;
-	int us_time_diff;
+	long time_diff = now - p->last_ran;
 
 	if (tick) {
 		/*
 		 * Called from scheduler_tick() there should be less than two
 		 * jiffies worth, and not negative/overflow.
 		 */
-		if (time_diff > JIFFIES_TO_NS(2) || time_diff < min_diff)
+		if (time_diff > JIFFIES_TO_NS(2) || time_diff < 0)
 			time_diff = JIFFIES_TO_NS(1);
 	} else {
 		/*
 		 * Called from context_switch there should be less than one
-		 * jiffy worth, and not negative/overflowed. In the case when
-		 * sched_clock fails to return high resolution values this
-		 * also ensures at least 1 min_diff gets banked.
+		 * jiffy worth, and not negative/overflow. There should be
+		 * some time banked here so use a nominal 1us.
 		 */
-		if (time_diff > JIFFIES_TO_NS(1) || time_diff < min_diff)
-			time_diff = min_diff;
+		if (time_diff > JIFFIES_TO_NS(1) || time_diff < 1)
+			time_diff = 1000;
 	}
 	/* time_slice accounting is done in usecs to avoid overflow on 32bit */
-	us_time_diff = time_diff;
-	us_time_diff /= 1000;
 	if (p != rq->idle && p->policy != SCHED_FIFO)
-		p->time_slice -= us_time_diff;
+		p->time_slice -= time_diff / 1000;
 	p->sched_time += time_diff;
 	p->last_ran = rq->most_recent_timestamp = now;
 }
@@ -4636,8 +4628,8 @@ long sys_sched_rr_get_interval(pid_t pid
 	if (retval)
 		goto out_unlock;
 
-	jiffies_to_timespec(p->policy == SCHED_FIFO ?
-				0 : task_timeslice(p), &t);
+	t = ns_to_timespec(p->policy == SCHED_FIFO ? 0 :
+			   MS_TO_NS(task_timeslice(p)));
 	read_unlock(&tasklist_lock);
 	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
 out_nounlock:

-- 
-ck
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Follow-Ups:
- [PATCH] sched: ymf typo
  - From: Con Kolivas <[email protected]>

Prev by Date: Re: Wrong free clusters count on FAT32
Next by Date: [PATCH] sched: ymf typo
Previous by thread: [PATCH] v9fs: don't use primary fid when removing file
Next by thread: [PATCH] sched: ymf typo
Index(es):
- Date
- Thread

[Index of Archives] [Kernel Newbies] [Netfilter] [Bugtraq] [Photo] [Stuff] [Gimp] [Yosemite News] [MIPS Linux] [ARM Linux] [Linux Security] [Linux RAID] [Video 4 Linux] [Linux for the blind] [Linux Resources]