Add the SCHED_ISO policy (isochronous), which is a starvation-free soft
realtime policy available to unprivileged users. The amount of cpu that
SCHED_ISO tasks will run as realtime is configurable by the tunable in
/proc/sys/kernel/iso_cpu
and is set to 80% (over 5 seconds) by default.
Signed-off-by: Con Kolivas <[email protected]>
Documentation/sysctl/kernel.txt | 9 ++++
include/linux/sched.h | 10 +++--
include/linux/sysctl.h | 1
kernel/sched.c | 77 ++++++++++++++++++++++++++++++++++++----
kernel/sysctl.c | 22 ++++++++---
5 files changed, 104 insertions(+), 15 deletions(-)
Index: linux-ck-dev/include/linux/sched.h
===================================================================
--- linux-ck-dev.orig/include/linux/sched.h 2006-06-18 15:23:35.000000000 +1000
+++ linux-ck-dev/include/linux/sched.h 2006-06-18 15:23:38.000000000 +1000
@@ -164,9 +164,10 @@ extern unsigned long weighted_cpuload(co
#define SCHED_FIFO 1
#define SCHED_RR 2
#define SCHED_BATCH 3
+#define SCHED_ISO 4
#define SCHED_MIN 0
-#define SCHED_MAX 3
+#define SCHED_MAX 4
#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX)
#define SCHED_RT(policy) ((policy) == SCHED_FIFO || \
@@ -209,7 +210,7 @@ extern void show_stack(struct task_struc
void io_schedule(void);
long io_schedule_timeout(long timeout);
-extern int sched_interactive, sched_compute;
+extern int sched_interactive, sched_compute, sched_iso_cpu;
extern void cpu_init (void);
extern void trap_init(void);
@@ -489,12 +490,14 @@ struct signal_struct {
#define MAX_USER_RT_PRIO 100
#define MAX_RT_PRIO MAX_USER_RT_PRIO
+#define ISO_PRIO (MAX_RT_PRIO - 1)
#define MAX_PRIO (MAX_RT_PRIO + 40)
#define MIN_USER_PRIO (MAX_PRIO - 1)
-#define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO))
+#define rt_task(p) (unlikely(SCHED_RT((p)->policy)))
#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
+#define iso_task(p) (unlikely((p)->policy == SCHED_ISO))
/*
* Some day this will be a full-fledged user tracking system..
@@ -954,6 +957,7 @@ static inline void put_task_struct(struc
#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
#define PF_NONSLEEP 0x20000000 /* Waiting on in kernel activity */
#define PF_FORKED 0x40000000 /* Task just forked another process */
+#define PF_ISOREF 0x80000000 /* SCHED_ISO task has used up quota */
/*
* Only the _current_ task can read/write to tsk->flags, but other
Index: linux-ck-dev/include/linux/sysctl.h
===================================================================
--- linux-ck-dev.orig/include/linux/sysctl.h 2006-06-18 15:23:21.000000000 +1000
+++ linux-ck-dev/include/linux/sysctl.h 2006-06-18 15:23:38.000000000 +1000
@@ -150,6 +150,7 @@ enum
KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */
KERN_INTERACTIVE=73, /* interactive tasks can have cpu bursts */
KERN_COMPUTE=74, /* adjust timeslices for a compute server */
+ KERN_ISO_CPU=75, /* percent cpu SCHED_ISO tasks run SCHED_RR */
};
Index: linux-ck-dev/kernel/sched.c
===================================================================
--- linux-ck-dev.orig/kernel/sched.c 2006-06-18 15:23:35.000000000 +1000
+++ linux-ck-dev/kernel/sched.c 2006-06-18 15:23:38.000000000 +1000
@@ -62,10 +62,14 @@
* raise its priority.
* sched_compute - sysctl which enables long timeslices and delayed preemption
* for compute server usage.
+ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks
+ * are allowed to run (over ISO_PERIOD seconds) as real time tasks.
*/
int sched_interactive __read_mostly = 1;
int sched_compute __read_mostly;
+int sched_iso_cpu __read_mostly = 80;
+#define ISO_PERIOD (5 * HZ)
/*
* CACHE_DELAY is the time preemption is delayed in sched_compute mode
* and is set to a nominal 10ms.
@@ -146,6 +150,9 @@ struct runqueue {
unsigned long long timestamp_last_tick;
unsigned short cache_ticks, preempted;
+ unsigned long iso_ticks;
+ unsigned short iso_refractory;
+
task_t *curr, *idle;
struct mm_struct *prev_mm;
unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO + 1)];
@@ -742,6 +749,17 @@ static int effective_prio(const task_t *
if (rt_task(p))
return p->prio;
+ if (iso_task(p)) {
+ if (likely(!(p->flags & PF_ISOREF)))
+ /*
+ * If SCHED_ISO tasks have not used up their real time
+ * quota they run just better than the highest
+ * SCHED_NORMAL priority. Otherwise they run as
+ * SCHED_NORMAL.
+ */
+ return ISO_PRIO;
+ }
+
full_slice = slice(p);
if (full_slice > p->slice)
used_slice = full_slice - p->slice;
@@ -2632,6 +2650,22 @@ static void time_slice_expired(task_t *p
}
/*
+ * Test if SCHED_ISO tasks have run longer than their allotted period as RT
+ * tasks and set the refractory flag if necessary. There is 10% hysteresis
+ * for unsetting the flag.
+ *
+ * rq->iso_ticks / ISO_PERIOD is compared against the sched_iso_cpu
+ * percentage: the flag is set once the quotient exceeds sched_iso_cpu and
+ * is only cleared again when it falls below sched_iso_cpu * 90 / 100.
+ * Returns the resulting value of rq->iso_refractory (non-zero means
+ * SCHED_ISO tasks are over quota).
+ */
+static inline unsigned int test_ret_isorefractory(runqueue_t *rq)
+{
+ if (likely(!rq->iso_refractory)) {
+ if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu)
+ rq->iso_refractory = 1;
+ } else
+ if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100))
+ rq->iso_refractory = 0;
+ return rq->iso_refractory;
+}
+
+/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*/
@@ -2659,11 +2693,29 @@ void scheduler_tick(void)
set_tsk_need_resched(p);
goto out;
}
- /* SCHED_FIFO tasks never run out of timeslice. */
- if (unlikely(p->policy == SCHED_FIFO))
- goto out;
spin_lock(&rq->lock);
+ if (unlikely((rt_task(p) || (iso_task(p) && !rq->iso_refractory)) &&
+ p->mm)) {
+ if (rq->iso_ticks <= (ISO_PERIOD * 100) - 100)
+ rq->iso_ticks += 100;
+ } else
+ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD;
+
+ if (iso_task(p)) {
+ if (unlikely(test_ret_isorefractory(rq))) {
+ if (!(p->flags & PF_ISOREF)) {
+ set_tsk_need_resched(p);
+ p->flags |= PF_ISOREF;
+ }
+ } else
+ p->flags &= ~PF_ISOREF;
+ } else
+ /* SCHED_FIFO tasks never run out of timeslice. */
+ if (unlikely(p->policy == SCHED_FIFO))
+ goto out_unlock;
+
+
debit = ns_diff(rq->timestamp_last_tick, p->timestamp);
p->ns_debit += debit;
if (p->ns_debit < NSJIFFY)
@@ -2758,7 +2810,7 @@ static int dependent_sleeper(int this_cp
int ret = 0, i;
/* kernel/rt threads do not participate in dependent sleeping */
- if (!p->mm || rt_task(p))
+ if (!p->mm || rt_task(p) || iso_task(p))
return 0;
for_each_domain(this_cpu, tmp) {
@@ -2795,7 +2847,7 @@ static int dependent_sleeper(int this_cp
* task from using an unfair proportion of the
* physical cpu's resources. -ck
*/
- if (rt_task(smt_curr)) {
+ if (rt_task(smt_curr) || iso_task(smt_curr)) {
/*
* With real time tasks we run non-rt tasks only
* per_cpu_gain% of the time.
@@ -3567,9 +3619,19 @@ int sched_setscheduler(struct task_struc
{
int retval;
int queued, oldprio, oldpolicy = -1;
+ struct sched_param zero_param = { .sched_priority = 0 };
unsigned long flags;
runqueue_t *rq;
+ if (SCHED_RT(policy) && !capable(CAP_SYS_NICE)) {
+ /*
+ * If the caller requested an RT policy without having the
+ * necessary rights, we downgrade the policy to SCHED_ISO.
+ * We also set the parameter to zero to pass the checks.
+ */
+ policy = SCHED_ISO;
+ param = &zero_param;
+ }
recheck:
/* double check policy once rq lock held */
if (policy < 0)
@@ -4063,6 +4125,7 @@ asmlinkage long sys_sched_get_priority_m
break;
case SCHED_NORMAL:
case SCHED_BATCH:
+ case SCHED_ISO:
ret = 0;
break;
}
@@ -4087,6 +4150,7 @@ asmlinkage long sys_sched_get_priority_m
break;
case SCHED_NORMAL:
case SCHED_BATCH:
+ case SCHED_ISO:
ret = 0;
}
return ret;
@@ -5992,7 +6056,8 @@ void __init sched_init(void)
rq = cpu_rq(i);
spin_lock_init(&rq->lock);
- rq->nr_running = rq->cache_ticks = rq->preempted = 0;
+ rq->nr_running = rq->cache_ticks = rq->preempted =
+ rq->iso_ticks = 0;
#ifdef CONFIG_SMP
rq->sd = NULL;
Index: linux-ck-dev/kernel/sysctl.c
===================================================================
--- linux-ck-dev.orig/kernel/sysctl.c 2006-06-18 15:23:21.000000000 +1000
+++ linux-ck-dev/kernel/sysctl.c 2006-06-18 15:23:38.000000000 +1000
@@ -229,6 +229,11 @@ static ctl_table root_table[] = {
{ .ctl_name = 0 }
};
+/* Constants for minimum and maximum testing.
+ We use these as one-element integer vectors. */
+static int zero;
+static int one_hundred = 100;
+
static ctl_table kern_table[] = {
{
.ctl_name = KERN_OSTYPE,
@@ -639,6 +644,17 @@ static ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+ {
+ .ctl_name = KERN_ISO_CPU,
+ .procname = "iso_cpu",
+ .data = &sched_iso_cpu,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
{
.ctl_name = KERN_UNKNOWN_NMI_PANIC,
@@ -702,12 +718,6 @@ static ctl_table kern_table[] = {
{ .ctl_name = 0 }
};
-/* Constants for minimum and maximum testing in vm_table.
- We use these as one-element integer vectors. */
-static int zero;
-static int one_hundred = 100;
-
-
static ctl_table vm_table[] = {
{
.ctl_name = VM_OVERCOMMIT_MEMORY,
Index: linux-ck-dev/Documentation/sysctl/kernel.txt
===================================================================
--- linux-ck-dev.orig/Documentation/sysctl/kernel.txt 2006-06-18 15:23:21.000000000 +1000
+++ linux-ck-dev/Documentation/sysctl/kernel.txt 2006-06-18 15:23:38.000000000 +1000
@@ -27,6 +27,7 @@ show up in /proc/sys/kernel:
- hostname
- hotplug
- interactive
+- iso_cpu
- java-appletviewer [ binfmt_java, obsolete ]
- java-interpreter [ binfmt_java, obsolete ]
- l2cr [ PPC only ]
@@ -182,6 +183,14 @@ are obeyed if this tunable is disabled.
==============================================================
+iso_cpu:
+
+This sets the percentage cpu that the unprivileged SCHED_ISO tasks can
+run effectively at realtime priority, averaged over a rolling 5 seconds.
+Set to 80% by default.
+
+==============================================================
+
l2cr: (PPC only)
This flag controls the L2 cache of G3 processor boards. If
--
-ck
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
[Index of Archives]
[Kernel Newbies]
[Netfilter]
[Bugtraq]
[Photo]
[Stuff]
[Gimp]
[Yosemite News]
[MIPS Linux]
[ARM Linux]
[Linux Security]
[Linux RAID]
[Video 4 Linux]
[Linux for the blind]
[Linux Resources]