Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy

Con Kolivas wrote:
some quick comments within code below.

On Wednesday 05 July 2006 09:35, Peter Williams wrote:
---
 include/linux/init_task.h |    6 -
 include/linux/sched.h     |   11 ++
 kernel/fork.c             |    1
 kernel/mutex.c            |   28 ++++++-
 kernel/sched.c            |  183 ++++++++++++++++++++++++++++++++++++++--------
 5 files changed, 192 insertions(+), 37 deletions(-)

Index: MM-2.6.17-mm6/include/linux/init_task.h
===================================================================
--- MM-2.6.17-mm6.orig/include/linux/init_task.h	2006-07-04 14:37:42.000000000 +1000
+++ MM-2.6.17-mm6/include/linux/init_task.h	2006-07-04 14:38:12.000000000 +1000
@@ -99,9 +99,9 @@ extern struct group_info init_groups;
 	.usage		= ATOMIC_INIT(2),				\
 	.flags		= 0,						\
 	.lock_depth	= -1,						\
-	.prio		= MAX_PRIO-20,					\
-	.static_prio	= MAX_PRIO-20,					\
-	.normal_prio	= MAX_PRIO-20,					\
+	.prio		= MAX_RT_PRIO+20,				\
+	.static_prio	= MAX_RT_PRIO+20,				\
+	.normal_prio	= MAX_RT_PRIO+20,				\
 	.policy		= SCHED_NORMAL,					\
 	.cpus_allowed	= CPU_MASK_ALL,					\
 	.mm		= NULL,						\
Index: MM-2.6.17-mm6/include/linux/sched.h
===================================================================
--- MM-2.6.17-mm6.orig/include/linux/sched.h	2006-07-04 14:37:43.000000000 +1000
+++ MM-2.6.17-mm6/include/linux/sched.h	2006-07-04 14:38:12.000000000 +1000
@@ -34,6 +34,8 @@
 #define SCHED_FIFO		1
 #define SCHED_RR		2
 #define SCHED_BATCH		3
+/* Scheduler class for background tasks */
+#define SCHED_BGND		4

 #ifdef __KERNEL__

@@ -503,13 +505,16 @@ struct signal_struct {
 #define MAX_USER_RT_PRIO	100
 #define MAX_RT_PRIO		MAX_USER_RT_PRIO

-#define MAX_PRIO		(MAX_RT_PRIO + 40)
+#define BGND_PRIO		(MAX_RT_PRIO + 40)
+/* add another slot for SCHED_BGND tasks */
+#define MAX_PRIO		(BGND_PRIO + 1)

 #define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
 #define rt_task(p)		rt_prio((p)->prio)
 #define batch_task(p)		(unlikely((p)->policy == SCHED_BATCH))
 #define has_rt_policy(p) \
-	unlikely((p)->policy != SCHED_NORMAL && (p)->policy != SCHED_BATCH)
+	unlikely((p)->policy != SCHED_NORMAL && (p)->policy < SCHED_BATCH)

idleprio tasks should be able to get rt_policy as well

I don't understand what you mean here. A task can only have one scheduling policy. The simple (direct) definition of has_rt_policy() is (p->policy == SCHED_FIFO || p->policy == SCHED_RR) and the one defined is just a rearrangement of that with a view to minimizing overhead in the majority of invocations.
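To spell that out, here is a rough sketch of the two equivalent forms (the macro names below are made up for illustration; the policy values are the ones from sched.h above, i.e. SCHED_NORMAL 0, SCHED_FIFO 1, SCHED_RR 2, SCHED_BATCH 3, SCHED_BGND 4):

	/* Direct form: true only for the two realtime policies. */
	#define has_rt_policy_direct(p) \
		((p)->policy == SCHED_FIFO || (p)->policy == SCHED_RR)

	/*
	 * Rearranged form used in the patch: a policy that is not
	 * SCHED_NORMAL (0) and is below SCHED_BATCH (3) can only be
	 * SCHED_FIFO (1) or SCHED_RR (2).  SCHED_BATCH and SCHED_BGND (4)
	 * both fail the "< SCHED_BATCH" test, so the two forms agree for
	 * every policy, but this one needs only a single comparison in
	 * the common SCHED_NORMAL case.
	 */
	#define has_rt_policy_rearranged(p) \
		unlikely((p)->policy != SCHED_NORMAL && (p)->policy < SCHED_BATCH)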


+#define bgnd_task(p)		(unlikely((p)->policy == SCHED_BGND))

 /*
  * Some day this will be a full-fledged user tracking system..
@@ -810,6 +815,7 @@ struct task_struct {
 	unsigned long sleep_avg;
 	unsigned long long timestamp, last_ran;
 	unsigned long long sched_time; /* sched_clock time spent running */
+	unsigned int mutexes_held; /* for knowing when it's safe to repress SCHED_BGND tasks */
 	enum sleep_type sleep_type;

 	unsigned long policy;
@@ -1090,6 +1096,7 @@ static inline void put_task_struct(struc
 #define PF_SWAPWRITE	0x00800000	/* Allowed to write to swap */
 #define PF_SPREAD_PAGE	0x01000000	/* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
+#define PF_UIWAKE	0x08000000	/* Just woken from uninterruptible sleep */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */

Index: MM-2.6.17-mm6/kernel/sched.c
===================================================================
--- MM-2.6.17-mm6.orig/kernel/sched.c	2006-07-04 14:37:43.000000000 +1000
+++ MM-2.6.17-mm6/kernel/sched.c	2006-07-04 14:38:12.000000000 +1000
@@ -59,7 +59,7 @@

 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
- * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
+ * to static priority [ MAX_RT_PRIO..BGND_PRIO-1 ],
  * and back.
  */
 #define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
@@ -73,7 +73,7 @@
  */
 #define USER_PRIO(p)		((p)-MAX_RT_PRIO)
 #define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
-#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
+#define MAX_USER_PRIO		(USER_PRIO(BGND_PRIO))

 /*
  * Some helpers for converting nanosecond timing to jiffy resolution
@@ -171,7 +171,7 @@
  */

 #define SCALE_PRIO(x, prio) \
-	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
+	max(x * (BGND_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)

 static unsigned int static_prio_timeslice(int static_prio)
 {
@@ -186,6 +186,11 @@ static inline unsigned int task_timeslic
 	return static_prio_timeslice(p->static_prio);
 }

+#define task_in_background(p) unlikely((p)->prio == BGND_PRIO)
+#define safe_to_background(p) \
+	(!((p)->mutexes_held || \
+	   (p)->flags & (PF_FREEZE | PF_UIWAKE | PF_EXITING)))
+
 /*
  * These are the runqueue data structures:
  */
@@ -715,13 +720,17 @@ static inline int __normal_prio(struct t
 {
 	int bonus, prio;

+	/* Ensure that background tasks stay at BGND_PRIO */
+	if (bgnd_task(p) && safe_to_background(p))
+		return BGND_PRIO;
+
 	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;

 	prio = p->static_prio - bonus;
 	if (prio < MAX_RT_PRIO)
 		prio = MAX_RT_PRIO;
-	if (prio > MAX_PRIO-1)
-		prio = MAX_PRIO-1;
+	if (prio > BGND_PRIO-1)
+		prio = BGND_PRIO-1;
 	return prio;
 }

@@ -761,8 +770,18 @@ static void set_load_weight(struct task_
 		else
 #endif
 			p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
-	} else
-		p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+	} else {
+		/*
+		 * Reduce the probability of a task escaping the background
+		 * due to load balancing leaving it on a lightly used CPU.
+		 * Can't use zero as that would kill load balancing when only
+		 * background tasks are running.
+		 */
+		if (bgnd_task(p))
+			p->load_weight = LOAD_WEIGHT(MIN_TIMESLICE / 2 ? : 1);

Why not just set it to 1 for all idleprio tasks? The granularity will be lost at anything lower anyway and it avoids a more complex calculation.

+		else
+			p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+	}
 }

 static inline void
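Re the "why not just 1" question above, a minimal sketch of that simpler variant (reusing the bgnd_task() and PRIO_TO_LOAD_WEIGHT() helpers already in this patch) would be:

	/*
	 * Hypothetical simplification: give every SCHED_BGND task the
	 * smallest non-zero weight.  It still registers with the load
	 * balancer (zero would break balancing when only background
	 * tasks are running) but contributes almost nothing, and the
	 * MIN_TIMESLICE arithmetic goes away.
	 */
	if (bgnd_task(p))
		p->load_weight = 1;
	else
		p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);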
@@ -834,7 +853,10 @@ static void __activate_task(struct task_
 {
 	struct prio_array *target = rq->active;

-	if (batch_task(p))
+	/* Don't punish batch tasks just tasks actually in the background

An extra line here for multiline comments such as:
+	/*
+	 * Don't punish batch tasks just tasks actually in the background


+	 * as anything else is counter productive from a system wide aspect
+	 */
+	if (task_in_background(p))
 		target = rq->expired;
 	enqueue_task(p, target);
 	inc_nr_running(p, rq);
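For completeness, the whole comment in that hunk reflowed into the multiline style suggested above would presumably read:

	/*
	 * Don't punish batch tasks just tasks actually in the background
	 * as anything else is counter productive from a system wide aspect
	 */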
@@ -942,6 +964,8 @@ static void activate_task(struct task_st
 	if (!rt_task(p))
 		p->prio = recalc_task_prio(p, now);

+	p->flags &= ~PF_UIWAKE;
+
 	/*
 	 * This checks to make sure it's not an uninterruptible task
 	 * that is now waking up.
@@ -1484,6 +1508,7 @@ out_activate:
 		 * sleep_avg beyond just interactive state.
 		 */
 		p->sleep_type = SLEEP_NONINTERACTIVE;
+		p->flags |= PF_UIWAKE;
 	} else

 	/*
@@ -3024,6 +3049,48 @@ void scheduler_tick(void)
 		}
 		goto out_unlock;
 	}
+
+	if (bgnd_task(p)) {
+		/*
+		 * Do this even if there's only one task on the queue as
+		 * we want to set the priority low so that any waking tasks
+		 * can preempt.
+		 */
+		if (task_in_background(p)) {
+			/*
+			 * Tasks currently in the background will be
+			 * at BGND_PRIO priority and preemption
+			 * should be enough to keep them in check provided we
+			 * don't let them adversely effect tasks on the expired

ok I'm going to risk a lart and say "affect" ?

I have to refer you to the Oxford English Dictionary. According to it (when used as a verb):

affect: 1. like, love 2. like to use, practice or wear 3. aim at, seek 4. use or display ostentatiously 5. assume a false appearance 6. attack as a disease 7. move or touch.

effect: 1. bring about (an event or result) 2. produce (a state or condition) 3. make, construct or build


+			 * array.
+			 */
+			if (!safe_to_background(p)) {
+				dequeue_task(p, rq->active);
+				p->prio = effective_prio(p);
+				enqueue_task(p, rq->active);
+			} else if (rq->expired->nr_active &&
+				   rq->best_expired_prio < p->prio) {
+				dequeue_task(p, rq->active);
+				enqueue_task(p, rq->expired);
+				set_tsk_need_resched(p);
+				goto out_unlock;
+			}
+		}
+		else if (safe_to_background(p)) {
+			dequeue_task(p, rq->active);
+			p->normal_prio = BGND_PRIO;
+			/* this should be safe for PI purposes */
+			p->prio = p->normal_prio;
+			enqueue_task(p, rq->expired);
+			/*
+			 * think about making this conditional to reduce
+			 * context switch rate
+			 */
+			set_tsk_need_resched(p);
+			goto out_unlock;
+		}
+	}
+
 	if (!--p->time_slice) {
 		dequeue_task(p, rq->active);
 		set_tsk_need_resched(p);
@@ -3033,6 +3100,11 @@ void scheduler_tick(void)

 		if (!rq->expired_timestamp)
 			rq->expired_timestamp = jiffies;
+		/*
+		 * No need to do anything special for background tasks here
+		 * as TASK_INTERACTIVE() should fail when they're in the
+		 * background.
+		 */
 		if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
 			enqueue_task(p, rq->expired);
 			if (p->static_prio < rq->best_expired_prio)
@@ -3122,6 +3194,33 @@ smt_slice(struct task_struct *p, struct
 }

 /*
+ * task time slice for SMT dependent idle purposes
+ */
+static unsigned int smt_timeslice(struct task_struct *p)
+{
+	if (task_in_background(p))
+		return 0;
+
+	return task_timeslice(p);
+}
+
+/*
+ * Is thisp a higher priority task than thatp for SMT dependent idle
+ * purposes?
+ */
+static int task_priority_gt(const struct task_struct *thisp,
+			    const struct task_struct *thatp)
+{
+	if (task_in_background(thisp))
+	    return !task_in_background(thatp);
+
+	if (task_in_background(thatp))
+	    return 1;
+
+	return thisp->static_prio < thatp->static_prio;
+}
+
+/*
  * To minimise lock contention and not have to drop this_rq's runlock we only
  * trylock the sibling runqueues and bypass those runqueues if we fail to
  * acquire their lock. As we only trylock the normal locking order does not
@@ -3180,9 +3279,9 @@ dependent_sleeper(int this_cpu, struct r
 				(sd->per_cpu_gain * DEF_TIMESLICE / 100))
 					ret = 1;
 		} else {
-			if (smt_curr->static_prio < p->static_prio &&
+			if (task_priority_gt(smt_curr, p) &&
 				!TASK_PREEMPTS_CURR(p, smt_rq) &&
-				smt_slice(smt_curr, sd) > task_timeslice(p))
+				smt_slice(smt_curr, sd) > smt_timeslice(p))
 					ret = 1;
 		}
 unlock:
@@ -3245,6 +3344,22 @@ static inline int interactive_sleep(enum
 }

 /*
+ * Switch the active and expired arrays.
+ */
+static struct prio_array *switch_arrays(struct rq *rq, int best_active_prio)
+{

In the fast path this should be inlined even if it is large.

OK.


+	struct prio_array *array = rq->active;
+
+	schedstat_inc(rq, sched_switch);
+	rq->active = rq->expired;
+	rq->expired = array;
+	rq->expired_timestamp = 0;
+	rq->best_expired_prio = best_active_prio;
+
+	return rq->active;
+}
+
+/*
  * schedule() is the main scheduler function.
  */
 asmlinkage void __sched schedule(void)
@@ -3332,23 +3447,25 @@ need_resched_nonpreemptible:
 	}

 	array = rq->active;
-	if (unlikely(!array->nr_active)) {
-		/*
-		 * Switch the active and expired arrays.
-		 */
-		schedstat_inc(rq, sched_switch);
-		rq->active = rq->expired;
-		rq->expired = array;
-		array = rq->active;
-		rq->expired_timestamp = 0;
-		rq->best_expired_prio = MAX_PRIO;
-	}
+	if (unlikely(!array->nr_active))
+		array = switch_arrays(rq, MAX_PRIO);

 	idx = sched_find_first_bit(array->bitmap);
+get_next:
 	queue = array->queue + idx;
 	next = list_entry(queue->next, struct task_struct, run_list);
+	/* very strict backgrounding */
+	if (unlikely(task_in_background(next) && rq->expired->nr_active)) {
+		int tmp = sched_find_first_bit(rq->expired->bitmap);
+
+		if (likely(tmp < idx)) {
+			array = switch_arrays(rq, idx);
+			idx = tmp;
+			goto get_next;
+		}
+	}

-	if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
+	if (!rt_task(next) && interactive_sleep(next->sleep_type) && !bgnd_task(next)) {
 		unsigned long long delta = now - next->timestamp;
 		if (unlikely((long long)(now - next->timestamp) < 0))
 			delta = 0;
@@ -4052,7 +4169,8 @@ recheck:
 	if (policy < 0)
 		policy = oldpolicy = p->policy;
 	else if (policy != SCHED_FIFO && policy != SCHED_RR &&
-			policy != SCHED_NORMAL && policy != SCHED_BATCH)
+			policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+			policy != SCHED_BGND)

how about a wrapper for all these policies? (see my sched_range patch)

I must admit that when I was doing this I wished it was less messy. But I figured that it was best to do it in a way that was easy to show as being correct and then do simplification later, especially as simplification would also effect the other policies.


 		return -EINVAL;
 	/*
 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
@@ -4063,8 +4181,8 @@ recheck:
 	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
 	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
 		return -EINVAL;
-	if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
-					!= (param->sched_priority == 0))
+	if ((policy == SCHED_NORMAL || policy == SCHED_BATCH ||
+	     policy == SCHED_BGND) != (param->sched_priority == 0))
 		return -EINVAL;

same

 	/*
@@ -4072,15 +4190,20 @@ recheck:
 	 */
 	if (!capable(CAP_SYS_NICE)) {
 		/*
-		 * can't change policy, except between SCHED_NORMAL
-		 * and SCHED_BATCH:
+		 * can't change policy, except between SCHED_NORMAL,
+		 * SCHED_BATCH or SCHED_BGND:
 		 */
-		if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
-			(policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
+		if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH &&
+			    p->policy != SCHED_BGND) &&
+		     (policy != SCHED_BATCH && p->policy != SCHED_NORMAL &&
+			    p->policy != SCHED_BGND) &&
+		     (policy != SCHED_BGND && p->policy != SCHED_NORMAL &&
+			    p->policy != SCHED_BATCH)) &&

same

 				!p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
 			return -EPERM;
 		/* can't increase priority */
-		if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
+		if ((policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+			    policy != SCHED_BGND) &&
 		    param->sched_priority > p->rt_priority &&
 		    param->sched_priority >
 				p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
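For what it's worth, the kind of wrapper being suggested could look something like the sketch below (illustrative names only, not taken from the sched_range patch, using the policy values defined in sched.h above):

	/* Policies that take a static priority rather than an rt_priority. */
	#define is_timesharing_policy(policy) \
		((policy) == SCHED_NORMAL || (policy) == SCHED_BATCH || \
		 (policy) == SCHED_BGND)

	/* Any policy value this kernel understands. */
	#define is_valid_policy(policy) \
		((policy) == SCHED_FIFO || (policy) == SCHED_RR || \
		 is_timesharing_policy(policy))

With those, the -EINVAL test above becomes if (!is_valid_policy(policy)) and the priority consistency test becomes if (is_timesharing_policy(policy) != (param->sched_priority == 0)), which would also shorten the CAP_SYS_NICE checks.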
Index: MM-2.6.17-mm6/kernel/mutex.c
===================================================================
--- MM-2.6.17-mm6.orig/kernel/mutex.c	2006-07-04 14:37:43.000000000 +1000
+++ MM-2.6.17-mm6/kernel/mutex.c	2006-07-04 14:38:12.000000000 +1000
@@ -51,6 +51,16 @@ __mutex_init(struct mutex *lock, const c

 EXPORT_SYMBOL(__mutex_init);

+static inline void inc_mutex_count(void)
+{
+	current->mutexes_held++;
+}
+
+static inline void dec_mutex_count(void)
+{
+	current->mutexes_held--;
+}
+
 /*
  * We split the mutex lock/unlock logic into separate fastpath and
  * slowpath functions, to reduce the register pressure on the fastpath.
@@ -89,6 +99,7 @@ void inline fastcall __sched mutex_lock(
 	 * 'unlocked' into 'locked' state.
 	 */
 	__mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
+	inc_mutex_count();
 }

 EXPORT_SYMBOL(mutex_lock);
@@ -114,6 +125,7 @@ void fastcall __sched mutex_unlock(struc
 	 * into 'unlocked' state:
 	 */
 	__mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
+	dec_mutex_count();
 }

 EXPORT_SYMBOL(mutex_unlock);
@@ -274,9 +286,16 @@ __mutex_lock_interruptible_slowpath(atom
  */
 int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
 {
+	int ret;
+
 	might_sleep();
-	return __mutex_fastpath_lock_retval
+	ret = __mutex_fastpath_lock_retval
 			(&lock->count, __mutex_lock_interruptible_slowpath);
+
+	if (likely(!ret))
+		inc_mutex_count();
+
+	return ret;
 }

 EXPORT_SYMBOL(mutex_lock_interruptible);
@@ -331,8 +350,13 @@ static inline int __mutex_trylock_slowpa
  */
 int fastcall __sched mutex_trylock(struct mutex *lock)
 {
-	return __mutex_fastpath_trylock(&lock->count,
+	int ret = __mutex_fastpath_trylock(&lock->count,
 					__mutex_trylock_slowpath);
+
+	if (likely(ret))
+		inc_mutex_count();
+
+	return ret;
 }

 EXPORT_SYMBOL(mutex_trylock);
Index: MM-2.6.17-mm6/kernel/fork.c
===================================================================
--- MM-2.6.17-mm6.orig/kernel/fork.c	2006-07-04 14:37:43.000000000 +1000
+++ MM-2.6.17-mm6/kernel/fork.c	2006-07-04 14:38:12.000000000 +1000
@@ -1029,6 +1029,7 @@ static struct task_struct *copy_process(
 	p->wchar = 0;		/* I/O counter: bytes written */
 	p->syscr = 0;		/* I/O counter: read syscalls */
 	p->syscw = 0;		/* I/O counter: write syscalls */
+	p->mutexes_held = 0;
 	acct_clear_integrals(p);

  	p->it_virt_expires = cputime_zero;
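For anyone who wants to try it, a hypothetical userspace test that puts the calling task into the new class would look something like this (SCHED_BGND isn't in the glibc headers, so it is defined locally with the value 4 from the sched.h hunk; sched_priority must be 0 to satisfy the sched_setscheduler() checks above):

	#include <sched.h>
	#include <stdio.h>

	#ifndef SCHED_BGND
	#define SCHED_BGND 4	/* value from this patch; illustrative only */
	#endif

	int main(void)
	{
		struct sched_param sp = { .sched_priority = 0 };

		/* pid 0 means the calling task */
		if (sched_setscheduler(0, SCHED_BGND, &sp) == -1) {
			perror("sched_setscheduler(SCHED_BGND)");
			return 1;
		}
		/* From here on this task should only run when the CPU is otherwise idle. */
		return 0;
	}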



--
Peter Williams                                   [email protected]

"Learning, n. The kind of ignorance distinguishing the studious."
 -- Ambrose Bierce
