Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy

some quick comments within code below.

On Wednesday 05 July 2006 09:35, Peter Williams wrote:
> ---
>  include/linux/init_task.h |    6 -
>  include/linux/sched.h     |   11 ++
>  kernel/fork.c             |    1
>  kernel/mutex.c            |   28 ++++++-
>  kernel/sched.c            |  183
> ++++++++++++++++++++++++++++++++++++++-------- 5 files changed, 192
> insertions(+), 37 deletions(-)
>
> Index: MM-2.6.17-mm6/include/linux/init_task.h
> ===================================================================
> --- MM-2.6.17-mm6.orig/include/linux/init_task.h	2006-07-04
> 14:37:42.000000000 +1000 +++
> MM-2.6.17-mm6/include/linux/init_task.h	2006-07-04 14:38:12.000000000 +1000
> @@ -99,9 +99,9 @@ extern struct group_info init_groups;
>  	.usage		= ATOMIC_INIT(2),				\
>  	.flags		= 0,						\
>  	.lock_depth	= -1,						\
> -	.prio		= MAX_PRIO-20,					\
> -	.static_prio	= MAX_PRIO-20,					\
> -	.normal_prio	= MAX_PRIO-20,					\
> +	.prio		= MAX_RT_PRIO+20,				\
> +	.static_prio	= MAX_RT_PRIO+20,				\
> +	.normal_prio	= MAX_RT_PRIO+20,				\
>  	.policy		= SCHED_NORMAL,					\
>  	.cpus_allowed	= CPU_MASK_ALL,					\
>  	.mm		= NULL,						\
> Index: MM-2.6.17-mm6/include/linux/sched.h
> ===================================================================
> --- MM-2.6.17-mm6.orig/include/linux/sched.h	2006-07-04 14:37:43.000000000
> +1000 +++ MM-2.6.17-mm6/include/linux/sched.h	2006-07-04 14:38:12.000000000
> +1000 @@ -34,6 +34,8 @@
>  #define SCHED_FIFO		1
>  #define SCHED_RR		2
>  #define SCHED_BATCH		3
> +/* Scheduler class for background tasks */
> +#define SCHED_BGND		4
>
>  #ifdef __KERNEL__
>
> @@ -503,13 +505,16 @@ struct signal_struct {
>  #define MAX_USER_RT_PRIO	100
>  #define MAX_RT_PRIO		MAX_USER_RT_PRIO
>
> -#define MAX_PRIO		(MAX_RT_PRIO + 40)
> +#define BGND_PRIO		(MAX_RT_PRIO + 40)
> +/* add another slot for SCHED_BGND tasks */
> +#define MAX_PRIO		(BGND_PRIO + 1)
>
>  #define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
>  #define rt_task(p)		rt_prio((p)->prio)
>  #define batch_task(p)		(unlikely((p)->policy == SCHED_BATCH))
>  #define has_rt_policy(p) \
> -	unlikely((p)->policy != SCHED_NORMAL && (p)->policy != SCHED_BATCH)
> +	unlikely((p)->policy != SCHED_NORMAL && (p)->policy < SCHED_BATCH)

idleprio tasks should be able to get rt_policy as well

> +#define bgnd_task(p)		(unlikely((p)->policy == SCHED_BGND))
>
>  /*
>   * Some day this will be a full-fledged user tracking system..
> @@ -810,6 +815,7 @@ struct task_struct {
>  	unsigned long sleep_avg;
>  	unsigned long long timestamp, last_ran;
>  	unsigned long long sched_time; /* sched_clock time spent running */
> +	unsigned int mutexes_held; /* for knowing when it's safe to repress
> SCHED_BGND tasks */ enum sleep_type sleep_type;
>
>  	unsigned long policy;
> @@ -1090,6 +1096,7 @@ static inline void put_task_struct(struc
>  #define PF_SWAPWRITE	0x00800000	/* Allowed to write to swap */
>  #define PF_SPREAD_PAGE	0x01000000	/* Spread page cache over cpuset */
>  #define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset
> */ +#define PF_UIWAKE	0x08000000	/* Just woken from uninterrptible sleep */
> #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
>  #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex
> tester */
>
> Index: MM-2.6.17-mm6/kernel/sched.c
> ===================================================================
> --- MM-2.6.17-mm6.orig/kernel/sched.c	2006-07-04 14:37:43.000000000 +1000
> +++ MM-2.6.17-mm6/kernel/sched.c	2006-07-04 14:38:12.000000000 +1000
> @@ -59,7 +59,7 @@
>
>  /*
>   * Convert user-nice values [ -20 ... 0 ... 19 ]
> - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
> + * to static priority [ MAX_RT_PRIO..BGND_PRIO-1 ],
>   * and back.
>   */
>  #define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
> @@ -73,7 +73,7 @@
>   */
>  #define USER_PRIO(p)		((p)-MAX_RT_PRIO)
>  #define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
> -#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
> +#define MAX_USER_PRIO		(USER_PRIO(BGND_PRIO))
>
>  /*
>   * Some helpers for converting nanosecond timing to jiffy resolution
> @@ -171,7 +171,7 @@
>   */
>
>  #define SCALE_PRIO(x, prio) \
> -	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
> +	max(x * (BGND_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
>
>  static unsigned int static_prio_timeslice(int static_prio)
>  {
> @@ -186,6 +186,11 @@ static inline unsigned int task_timeslic
>  	return static_prio_timeslice(p->static_prio);
>  }
>
> +#define task_in_background(p) unlikely((p)->prio == BGND_PRIO)
> +#define safe_to_background(p) \
> +	(!((p)->mutexes_held || \
> +	   (p)->flags & (PF_FREEZE | PF_UIWAKE | PF_EXITING)))
> +
>  /*
>   * These are the runqueue data structures:
>   */
> @@ -715,13 +720,17 @@ static inline int __normal_prio(struct t
>  {
>  	int bonus, prio;
>
> +	/* Ensure that background tasks stay at BGND_PRIO */
> +	if (bgnd_task(p) && safe_to_background(p))
> +		return BGND_PRIO;
> +
>  	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
>
>  	prio = p->static_prio - bonus;
>  	if (prio < MAX_RT_PRIO)
>  		prio = MAX_RT_PRIO;
> -	if (prio > MAX_PRIO-1)
> -		prio = MAX_PRIO-1;
> +	if (prio > BGND_PRIO-1)
> +		prio = BGND_PRIO-1;
>  	return prio;
>  }
>
> @@ -761,8 +770,18 @@ static void set_load_weight(struct task_
>  		else
>  #endif
>  			p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
> -	} else
> -		p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
> +	} else {
> +		/*
> +		 * Reduce the probability of a task escaping the background
> +		 * due to load balancing leaving it on a lighly used CPU
> +		 * Can't use zero as that would kill load balancing when only
> +		 * background tasks are running.
> +		 */
> +		if (bgnd_task(p))
> +			p->load_weight = LOAD_WEIGHT(MIN_TIMESLICE / 2 ? : 1);

Why not just set it to 1 for all idleprio tasks? The granularity will be lost 
at anything lower anyway and it avoids a more complex calculation.

> +		else
> +			p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
> +	}
>  }
>
>  static inline void
> @@ -834,7 +853,10 @@ static void __activate_task(struct task_
>  {
>  	struct prio_array *target = rq->active;
>
> -	if (batch_task(p))
> +	/* Don't punish batch tasks just tasks actually in the background

An extra line here for multiline comments such as:
+	/*
+	 * Don't punish batch tasks just tasks actually in the background


> +	 * as anything else is counter productive from a system wide aspect
> +	 */
> +	if (task_in_background(p))
>  		target = rq->expired;
>  	enqueue_task(p, target);
>  	inc_nr_running(p, rq);
> @@ -942,6 +964,8 @@ static void activate_task(struct task_st
>  	if (!rt_task(p))
>  		p->prio = recalc_task_prio(p, now);
>
> +	p->flags &= ~PF_UIWAKE;
> +
>  	/*
>  	 * This checks to make sure it's not an uninterruptible task
>  	 * that is now waking up.
> @@ -1484,6 +1508,7 @@ out_activate:
>  		 * sleep_avg beyond just interactive state.
>  		 */
>  		p->sleep_type = SLEEP_NONINTERACTIVE;
> +		p->flags |= PF_UIWAKE;
>  	} else
>
>  	/*
> @@ -3024,6 +3049,48 @@ void scheduler_tick(void)
>  		}
>  		goto out_unlock;
>  	}
> +
> +	if (bgnd_task(p)) {
> +		/*
> +		 * Do this even if there's only one task on the queue as
> +		 * we want to set the priority low so that any waking tasks
> +		 * can preempt.
> +		 */
> +		if (task_in_background(p)) {
> +			/*
> +			 * Tasks currently in the background will be
> +			 * at BGND_PRIO priority and preemption
> +			 * should be enough to keep them in check provided we
> +			 * don't let them adversely effect tasks on the expired

ok I'm going to risk a lart and say "affect" ?

> +			 * array.
> +			 */
> +			if (!safe_to_background(p)) {
> +				dequeue_task(p, rq->active);
> +				p->prio = effective_prio(p);
> +				enqueue_task(p, rq->active);
> +			} else if (rq->expired->nr_active &&
> +				   rq->best_expired_prio < p->prio) {
> +				dequeue_task(p, rq->active);
> +				enqueue_task(p, rq->expired);
> +				set_tsk_need_resched(p);
> +				goto out_unlock;
> +			}
> +		}
> +		else if (safe_to_background(p)) {
> +			dequeue_task(p, rq->active);
> +			p->normal_prio = BGND_PRIO;
> +			/* this should be safe for PI purposes */
> +			p->prio = p->normal_prio;
> +			enqueue_task(p, rq->expired);
> +			/*
> +			 * think about making this conditional to reduce
> +			 * context switch rate
> +			 */
> +			set_tsk_need_resched(p);
> +			goto out_unlock;
> +		}
> +	}
> +
>  	if (!--p->time_slice) {
>  		dequeue_task(p, rq->active);
>  		set_tsk_need_resched(p);
> @@ -3033,6 +3100,11 @@ void scheduler_tick(void)
>
>  		if (!rq->expired_timestamp)
>  			rq->expired_timestamp = jiffies;
> +		/*
> +		 * No need to do anything special for background tasks here
> +		 * as TASK_INTERACTIVE() should fail when they're in the
> +		 * background.
> +		 */
>  		if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
>  			enqueue_task(p, rq->expired);
>  			if (p->static_prio < rq->best_expired_prio)
> @@ -3122,6 +3194,33 @@ smt_slice(struct task_struct *p, struct
>  }
>
>  /*
> + * task time slice for SMT dependent idle purposes
> + */
> +static unsigned int smt_timeslice(struct task_struct *p)
> +{
> +	if (task_in_background(p))
> +		return 0;
> +
> +	return task_timeslice(p);
> +}
> +
> +/*
> + * Is the thisp a higher priority task than thatp for SMT dependent idle
> + * purposes?
> + */
> +static int task_priority_gt(const struct task_struct *thisp,
> +			    const struct task_struct *thatp)
> +{
> +	if (task_in_background(thisp))
> +	    return !task_in_background(thatp);
> +
> +	if (task_in_background(thatp))
> +	    return 1;
> +
> +	return thisp->static_prio < thatp->static_prio;
> +}
> +
> +/*
>   * To minimise lock contention and not have to drop this_rq's runlock we
> only * trylock the sibling runqueues and bypass those runqueues if we fail
> to * acquire their lock. As we only trylock the normal locking order does
> not @@ -3180,9 +3279,9 @@ dependent_sleeper(int this_cpu, struct r
>  				(sd->per_cpu_gain * DEF_TIMESLICE / 100))
>  					ret = 1;
>  		} else {
> -			if (smt_curr->static_prio < p->static_prio &&
> +			if (task_priority_gt(smt_curr, p) &&
>  				!TASK_PREEMPTS_CURR(p, smt_rq) &&
> -				smt_slice(smt_curr, sd) > task_timeslice(p))
> +				smt_slice(smt_curr, sd) > smt_timeslice(p))
>  					ret = 1;
>  		}
>  unlock:
> @@ -3245,6 +3344,22 @@ static inline int interactive_sleep(enum
>  }
>
>  /*
> + * Switch the active and expired arrays.
> + */
> +static struct prio_array *switch_arrays(struct rq *rq, int
> best_active_prio) +{

In the fast path this should be inlined even if it is large.

> +	struct prio_array *array = rq->active;
> +
> +	schedstat_inc(rq, sched_switch);
> +	rq->active = rq->expired;
> +	rq->expired = array;
> +	rq->expired_timestamp = 0;
> +	rq->best_expired_prio = best_active_prio;
> +
> +	return rq->active;
> +}
> +
> +/*
>   * schedule() is the main scheduler function.
>   */
>  asmlinkage void __sched schedule(void)
> @@ -3332,23 +3447,25 @@ need_resched_nonpreemptible:
>  	}
>
>  	array = rq->active;
> -	if (unlikely(!array->nr_active)) {
> -		/*
> -		 * Switch the active and expired arrays.
> -		 */
> -		schedstat_inc(rq, sched_switch);
> -		rq->active = rq->expired;
> -		rq->expired = array;
> -		array = rq->active;
> -		rq->expired_timestamp = 0;
> -		rq->best_expired_prio = MAX_PRIO;
> -	}
> +	if (unlikely(!array->nr_active))
> +		array = switch_arrays(rq, MAX_PRIO);
>
>  	idx = sched_find_first_bit(array->bitmap);
> +get_next:
>  	queue = array->queue + idx;
>  	next = list_entry(queue->next, struct task_struct, run_list);
> +	/* very strict backgrounding */
> +	if (unlikely(task_in_background(next) && rq->expired->nr_active)) {
> +		int tmp = sched_find_first_bit(rq->expired->bitmap);
> +
> +		if (likely(tmp < idx)) {
> +			array = switch_arrays(rq, idx);
> +			idx = tmp;
> +			goto get_next;
> +		}
> +	}
>
> -	if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
> +	if (!rt_task(next) && interactive_sleep(next->sleep_type) &&
> !bgnd_task(next)) { unsigned long long delta = now - next->timestamp;
>  		if (unlikely((long long)(now - next->timestamp) < 0))
>  			delta = 0;
> @@ -4052,7 +4169,8 @@ recheck:
>  	if (policy < 0)
>  		policy = oldpolicy = p->policy;
>  	else if (policy != SCHED_FIFO && policy != SCHED_RR &&
> -			policy != SCHED_NORMAL && policy != SCHED_BATCH)
> +			policy != SCHED_NORMAL && policy != SCHED_BATCH &&
> +			policy != SCHED_BGND)

how about a wrapper for all these policies? (see my sched_range patch)

>  		return -EINVAL;
>  	/*
>  	 * Valid priorities for SCHED_FIFO and SCHED_RR are
> @@ -4063,8 +4181,8 @@ recheck:
>  	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
>  	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
>  		return -EINVAL;
> -	if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
> -					!= (param->sched_priority == 0))
> +	if ((policy == SCHED_NORMAL || policy == SCHED_BATCH ||
> +	     policy == SCHED_BGND) != (param->sched_priority == 0))
>  		return -EINVAL;

same

>  	/*
> @@ -4072,15 +4190,20 @@ recheck:
>  	 */
>  	if (!capable(CAP_SYS_NICE)) {
>  		/*
> -		 * can't change policy, except between SCHED_NORMAL
> -		 * and SCHED_BATCH:
> +		 * can't change policy, except between SCHED_NORMAL,
> +		 * SCHED_BATCH or SCHED_BGND:
>  		 */
> -		if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
> -			(policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
> +		if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH &&
> +			    p->policy != SCHED_BGND) &&
> +		     (policy != SCHED_BATCH && p->policy != SCHED_NORMAL &&
> +			    p->policy != SCHED_BGND) &&
> +		     (policy != SCHED_BGND && p->policy != SCHED_NORMAL &&
> +			    p->policy != SCHED_BATCH)) &&

same

>  				!p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
>  			return -EPERM;
>  		/* can't increase priority */
> -		if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
> +		if ((policy != SCHED_NORMAL && policy != SCHED_BATCH &&
> +			    policy != SCHED_BGND) &&
>  		    param->sched_priority > p->rt_priority &&
>  		    param->sched_priority >
>  				p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
> Index: MM-2.6.17-mm6/kernel/mutex.c
> ===================================================================
> --- MM-2.6.17-mm6.orig/kernel/mutex.c	2006-07-04 14:37:43.000000000 +1000
> +++ MM-2.6.17-mm6/kernel/mutex.c	2006-07-04 14:38:12.000000000 +1000
> @@ -51,6 +51,16 @@ __mutex_init(struct mutex *lock, const c
>
>  EXPORT_SYMBOL(__mutex_init);
>
> +static inline void inc_mutex_count(void)
> +{
> +	current->mutexes_held++;
> +}
> +
> +static inline void dec_mutex_count(void)
> +{
> +	current->mutexes_held--;
> +}
> +
>  /*
>   * We split the mutex lock/unlock logic into separate fastpath and
>   * slowpath functions, to reduce the register pressure on the fastpath.
> @@ -89,6 +99,7 @@ void inline fastcall __sched mutex_lock(
>  	 * 'unlocked' into 'locked' state.
>  	 */
>  	__mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
> +	inc_mutex_count();
>  }
>
>  EXPORT_SYMBOL(mutex_lock);
> @@ -114,6 +125,7 @@ void fastcall __sched mutex_unlock(struc
>  	 * into 'unlocked' state:
>  	 */
>  	__mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
> +	dec_mutex_count();
>  }
>
>  EXPORT_SYMBOL(mutex_unlock);
> @@ -274,9 +286,16 @@ __mutex_lock_interruptible_slowpath(atom
>   */
>  int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
>  {
> +	int ret;
> +
>  	might_sleep();
> -	return __mutex_fastpath_lock_retval
> +	ret = __mutex_fastpath_lock_retval
>  			(&lock->count, __mutex_lock_interruptible_slowpath);
> +
> +	if (likely(!ret))
> +		inc_mutex_count();
> +
> +	return ret;
>  }
>
>  EXPORT_SYMBOL(mutex_lock_interruptible);
> @@ -331,8 +350,13 @@ static inline int __mutex_trylock_slowpa
>   */
>  int fastcall __sched mutex_trylock(struct mutex *lock)
>  {
> -	return __mutex_fastpath_trylock(&lock->count,
> +	int ret = __mutex_fastpath_trylock(&lock->count,
>  					__mutex_trylock_slowpath);
> +
> +	if (likely(ret))
> +		inc_mutex_count();
> +
> +	return ret;
>  }
>
>  EXPORT_SYMBOL(mutex_trylock);
> Index: MM-2.6.17-mm6/kernel/fork.c
> ===================================================================
> --- MM-2.6.17-mm6.orig/kernel/fork.c	2006-07-04 14:37:43.000000000 +1000
> +++ MM-2.6.17-mm6/kernel/fork.c	2006-07-04 14:38:12.000000000 +1000
> @@ -1029,6 +1029,7 @@ static struct task_struct *copy_process(
>  	p->wchar = 0;		/* I/O counter: bytes written */
>  	p->syscr = 0;		/* I/O counter: read syscalls */
>  	p->syscw = 0;		/* I/O counter: write syscalls */
> +	p->mutexes_held = 0;
>  	acct_clear_integrals(p);
>
>   	p->it_virt_expires = cputime_zero;

-- 
-ck
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Follow-Ups:
- Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
  - From: Peter Williams <[email protected]>
- Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
  - From: Peter Williams <[email protected]>
References:
- [PATCH] sched: Add SCHED_BGND (background) scheduling policy
  - From: Peter Williams <[email protected]>
Prev by Date: Re: Where can I get glibc with PI futex support (for -RT tests) ?
Next by Date: Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
Previous by thread: Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
Next by thread: Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy
Index(es):
- Date
- Thread
[Index of Archives] [Kernel Newbies] [Netfilter] [Bugtraq] [Photo] [Stuff] [Gimp] [Yosemite News] [MIPS Linux] [ARM Linux] [Linux Security] [Linux RAID] [Video 4 Linux] [Linux for the blind] [Linux Resources]