Re: [RFC 2/5] sched: Add CPU rate soft caps

On 5/26/06, Peter Williams <[email protected]> wrote:
<snip>

Notes:

1. To minimize the overhead incurred when testing whether to skip caps
processing for uncapped tasks, a new flag PF_HAS_CAP has been added to flags.

2. The implementation involves the addition of two priority slots to the
run queue priority arrays, which means that MAX_PRIO no longer
represents the scheduling priority of the idle process and can't be used to
test whether priority values are in the valid range.  To alleviate this
problem, a new function sched_idle_prio() has been provided.

I am a little confused by this. Why link tasks that have exceeded their
CPU bandwidth (their caps) to a priority slot? Is this a hack to continue
using the prio_array? Why not move such tasks to the expired array?
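
For reference, my reading of the later hunks is that the priority space
grows by two slots when CONFIG_CPU_RATE_CAPS is set, i.e. (names from
the patch, layout my inference):

	/*
	 *   0 .. MAX_RT_PRIO-1          real-time priorities
	 *   MAX_RT_PRIO .. MAX_PRIO-1   normal priorities
	 *   CAPPED_PRIO (MAX_PRIO)      tasks currently over their cap
	 *   BGND_PRIO   (MAX_PRIO + 1)  background (zero cap) tasks
	 *   IDLE_PRIO   (MAX_PRIO + 2)  the idle task
	 */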

<snip>
 /*
  * Some day this will be a full-fledged user tracking system..
  */
@@ -787,6 +793,10 @@ struct task_struct {
        unsigned long sleep_avg;
        unsigned long long timestamp, last_ran;
        unsigned long long sched_time; /* sched_clock time spent running */
+#ifdef CONFIG_CPU_RATE_CAPS
+       unsigned long long avg_cpu_per_cycle, avg_cycle_length;
+       unsigned int cpu_rate_cap;
+#endif

How is a cycle defined? What are the units of a cycle? Could we please
document the units for the declarations above?
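
If I read inc_cap_stats_cycle()/inc_cap_stats_both() below correctly,
both fields accumulate sched_clock() deltas in nanoseconds and are
decayed geometrically by decay_cap_stats().  Something along these lines
would help (the comment text is my guess and needs confirming):

	#ifdef CONFIG_CPU_RATE_CAPS
		/*
		 * Decayed sums of sched_clock() nanoseconds: CPU time used
		 * per scheduling cycle, and total cycle length.  The units
		 * cancel when the two are compared against cpu_rate_cap,
		 * which is in parts per thousand of one CPU.
		 */
		unsigned long long avg_cpu_per_cycle, avg_cycle_length;
		unsigned int cpu_rate_cap;
	#endif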

        enum sleep_type sleep_type;

        unsigned long policy;
@@ -981,6 +991,11 @@ struct task_struct {
 #endif
 };

+#ifdef CONFIG_CPU_RATE_CAPS
+unsigned int get_cpu_rate_cap(const struct task_struct *);
+int set_cpu_rate_cap(struct task_struct *, unsigned int);
+#endif
+
 static inline pid_t process_group(struct task_struct *tsk)
 {
        return tsk->signal->pgrp;
@@ -1040,6 +1055,7 @@ static inline void put_task_struct(struc
 #define PF_SPREAD_SLAB 0x08000000      /* Spread some slab caches over cpuset */
 #define PF_MEMPOLICY   0x10000000      /* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER        0x02000000      /* Thread belongs to the rt mutex tester */
+#define PF_HAS_CAP     0x20000000      /* Has a CPU rate cap */

 /*
  * Only the _current_ task can read/write to tsk->flags, but other
Index: MM-2.6.17-rc4-mm3/init/Kconfig
===================================================================
--- MM-2.6.17-rc4-mm3.orig/init/Kconfig 2006-05-26 10:39:59.000000000 +1000
+++ MM-2.6.17-rc4-mm3/init/Kconfig      2006-05-26 10:45:26.000000000 +1000
@@ -286,6 +286,8 @@ config RELAY

          If unsure, say N.

+source "kernel/Kconfig.caps"
+
 source "usr/Kconfig"

 config UID16
Index: MM-2.6.17-rc4-mm3/kernel/Kconfig.caps
===================================================================
--- /dev/null   1970-01-01 00:00:00.000000000 +0000
+++ MM-2.6.17-rc4-mm3/kernel/Kconfig.caps       2006-05-26 10:45:26.000000000 +1000
@@ -0,0 +1,13 @@
+#
+# CPU Rate Caps Configuration
+#
+
+config CPU_RATE_CAPS
+       bool "Support (soft) CPU rate caps"
+       default n
+       ---help---
+         Say y here if you wish to be able to put a (soft) upper limit on
+         the rate of CPU usage by individual tasks.  A task which has been
+         allocated a soft CPU rate cap will be limited to that rate of CPU
+         usage unless there are spare CPU resources available after the needs
+         of uncapped tasks are met.
Index: MM-2.6.17-rc4-mm3/kernel/sched.c
===================================================================
--- MM-2.6.17-rc4-mm3.orig/kernel/sched.c       2006-05-26 10:44:51.000000000 +1000
+++ MM-2.6.17-rc4-mm3/kernel/sched.c    2006-05-26 11:00:02.000000000 +1000
@@ -57,6 +57,19 @@

 #include <asm/unistd.h>

+#ifdef CONFIG_CPU_RATE_CAPS
+#define IDLE_PRIO      (MAX_PRIO + 2)
+#else
+#define IDLE_PRIO      MAX_PRIO
+#endif
+#define BGND_PRIO      (IDLE_PRIO - 1)
+#define CAPPED_PRIO    (IDLE_PRIO - 2)
+
+int sched_idle_prio(void)
+{
+       return IDLE_PRIO;
+}
+
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -186,6 +199,149 @@ static inline unsigned int task_timeslic
        return static_prio_timeslice(p->static_prio);
 }

+#ifdef CONFIG_CPU_RATE_CAPS
+#define CAP_STATS_OFFSET 8
+#define task_has_cap(p) unlikely((p)->flags & PF_HAS_CAP)
+/* this assumes that p is not a real time task */
+#define task_is_background(p) unlikely((p)->cpu_rate_cap == 0)
+#define task_being_capped(p) unlikely((p)->prio >= CAPPED_PRIO)
+#define cap_load_weight(p) (((p)->cpu_rate_cap * SCHED_LOAD_SCALE) / 1000)

Could we please use a const or #define'd name instead of 1000? How
about TOTAL_CAP_IN_PARTS? It would make the code easier to read and
maintain.
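
That is, something along these lines (TOTAL_CAP_IN_PARTS being my
suggested name for the parts-per-thousand scale):

	#define TOTAL_CAP_IN_PARTS 1000	/* caps are in 1/1000ths of a CPU */
	#define cap_load_weight(p) \
		(((p)->cpu_rate_cap * SCHED_LOAD_SCALE) / TOTAL_CAP_IN_PARTS)

init_cpu_rate_caps(), set_cap_flag() and task_exceeding_cap() could then
use it too, instead of repeating the bare constant.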

+
+static void init_cpu_rate_caps(task_t *p)
+{
+       p->cpu_rate_cap = 1000;
+       p->flags &= ~PF_HAS_CAP;
+}
+
+static inline void set_cap_flag(task_t *p)
+{
+       if (p->cpu_rate_cap < 1000 && !has_rt_policy(p))
+               p->flags |= PF_HAS_CAP;
+       else
+               p->flags &= ~PF_HAS_CAP;
+}

Why don't you re-use RLIM_INFINITY?

+
+static inline int task_exceeding_cap(const task_t *p)
+{
+       return (p->avg_cpu_per_cycle * 1000) > (p->avg_cycle_length * p->cpu_rate_cap);
+}
+
+#ifdef CONFIG_SCHED_SMT
+static unsigned int smt_timeslice(task_t *p)
+{
+       if (task_has_cap(p) && task_being_capped(p))
+               return 0;
+
+       return task_timeslice(p);
+}
+
+static int task_priority_gt(const task_t *thisp, const task_t *thatp)
+{
+       if (task_has_cap(thisp) && (task_being_capped(thisp)))
+           return 0;
+
+       if (task_has_cap(thatp) && (task_being_capped(thatp)))
+           return 1;
+
+       return thisp->static_prio < thatp->static_prio;
+}

This function needs some comments, at least with respect to what thisp
and thatp are.
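
Something like this (my wording, from reading the code) would do:

	/*
	 * task_priority_gt - return true if @thisp should be preferred
	 * over @thatp when arbitrating between SMT siblings.  A task
	 * that is currently being capped always loses; otherwise the
	 * task with the better (numerically lower) static_prio wins.
	 */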

+#endif
+
+/*
+ * Update usage stats to "now" before making comparison
+ * Assume: task is actually on a CPU
+ */
+static int task_exceeding_cap_now(const task_t *p, unsigned long long now)
+{
+       unsigned long long delta, lhs, rhs;
+
+       delta = (now > p->timestamp) ? (now - p->timestamp) : 0;
+       lhs = (p->avg_cpu_per_cycle + delta) * 1000;
+       rhs = (p->avg_cycle_length + delta) * p->cpu_rate_cap;
+
+       return lhs > rhs;
+}
+
+static inline void init_cap_stats(task_t *p)
+{
+       p->avg_cpu_per_cycle = 0;
+       p->avg_cycle_length = 0;
+}
+
+static inline void inc_cap_stats_cycle(task_t *p, unsigned long long now)
+{
+       unsigned long long delta;
+
+       delta = (now > p->timestamp) ? (now - p->timestamp) : 0;
+       p->avg_cycle_length += delta;
+}
+
+static inline void inc_cap_stats_both(task_t *p, unsigned long long now)
+{
+       unsigned long long delta;
+
+       delta = (now > p->timestamp) ? (now - p->timestamp) : 0;
+       p->avg_cycle_length += delta;
+       p->avg_cpu_per_cycle += delta;
+}
+
+static inline void decay_cap_stats(task_t *p)
+{
+       p->avg_cycle_length *= ((1 << CAP_STATS_OFFSET) - 1);
+       p->avg_cycle_length >>= CAP_STATS_OFFSET;
+       p->avg_cpu_per_cycle *= ((1 << CAP_STATS_OFFSET) - 1);
+       p->avg_cpu_per_cycle >>= CAP_STATS_OFFSET;
+}
+#else
+#define task_has_cap(p) 0
+#define task_is_background(p) 0
+#define task_being_capped(p) 0
+#define cap_load_weight(p) SCHED_LOAD_SCALE
+
+static inline void init_cpu_rate_caps(task_t *p)
+{
+}
+
+static inline void set_cap_flag(task_t *p)
+{
+}
+
+static inline int task_exceeding_cap(const task_t *p)
+{
+       return 0;
+}
+
+#ifdef CONFIG_SCHED_SMT
+#define smt_timeslice(p) task_timeslice(p)
+
+static inline int task_priority_gt(const task_t *thisp, const task_t *thatp)
+{
+       return thisp->static_prio < thatp->static_prio;
+}
+#endif
+
+static inline int task_exceeding_cap_now(const task_t *p, unsigned long long now)
+{
+       return 0;
+}
+
+static inline void init_cap_stats(task_t *p)
+{
+}
+
+static inline void inc_cap_stats_cycle(task_t *p, unsigned long long now)
+{
+}
+
+static inline void inc_cap_stats_both(task_t *p, unsigned long long now)
+{
+}
+
+static inline void decay_cap_stats(task_t *p)
+{
+}
+#endif
+
 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)      \
                                < (long long) (sd)->cache_hot_time)

@@ -197,8 +353,8 @@ typedef struct runqueue runqueue_t;

 struct prio_array {
        unsigned int nr_active;
-       DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
-       struct list_head queue[MAX_PRIO];
+       DECLARE_BITMAP(bitmap, IDLE_PRIO+1); /* include 1 bit for delimiter */
+       struct list_head queue[IDLE_PRIO];
 };

 /*
@@ -710,6 +866,10 @@ static inline int __normal_prio(task_t *
 {
        int bonus, prio;

+       /* Ensure that background tasks stay at BGND_PRIO */
+       if (task_is_background(p))
+               return BGND_PRIO;
+
        bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;

        prio = p->static_prio - bonus;
@@ -786,6 +946,8 @@ static inline int expired_starving(runqu

 static void set_load_weight(task_t *p)
 {
+       set_cap_flag(p);
+
        if (has_rt_policy(p)) {
 #ifdef CONFIG_SMP
                if (p == task_rq(p)->migration_thread)
@@ -798,8 +960,22 @@ static void set_load_weight(task_t *p)
                else
 #endif
                        p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
-       } else
+       } else {
                p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+
+               /*
+                * Reduce the probability of a task escaping its CPU rate cap
+                * due to load balancing leaving it on a lightly used CPU
+                * This will be optimized away if rate caps aren't configured
+                */
+               if (task_has_cap(p)) {
+                       unsigned int clw; /* load weight based on cap */
+
+                       clw = cap_load_weight(p);
+                       if (clw < p->load_weight)
+                               p->load_weight = clw;
+               }

You could use

	p->load_weight = min(cap_load_weight(p), p->load_weight);
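
Note that the kernel's min() is type-strict, so if ->load_weight and
cap_load_weight() end up with different types this would need min_t(),
e.g. (unsigned int should be wide enough, since a cap of at most 1000
parts keeps cap_load_weight() at or below SCHED_LOAD_SCALE):

	p->load_weight = min_t(unsigned int,
			       cap_load_weight(p), p->load_weight);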


+       }
 }

 static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p)
@@ -869,7 +1045,8 @@ static void __activate_task(task_t *p, r
 {
        prio_array_t *target = rq->active;

-       if (unlikely(batch_task(p) || (expired_starving(rq) && !rt_task(p))))
+       if (unlikely(batch_task(p) || (expired_starving(rq) && !rt_task(p)) ||
+                       task_being_capped(p)))
                target = rq->expired;
        enqueue_task(p, target);
        inc_nr_running(p, rq);
@@ -975,8 +1152,30 @@ static void activate_task(task_t *p, run
 #endif

        if (!rt_task(p))
+               /*
+                * We want to do the recalculation even if we're exceeding
+                * a cap so that everything still works when we stop
+                * exceeding our cap.
+                */
                p->prio = recalc_task_prio(p, now);

+       if (task_has_cap(p)) {
+               inc_cap_stats_cycle(p, now);
+               /* Background tasks are handled in effective_prio()
+                * in order to ensure that they stay at BGND_PRIO
+                * but we need to be careful that we don't override
+                * it here
+                */
+               if (task_exceeding_cap(p) && !task_is_background(p)) {
+                       p->normal_prio = CAPPED_PRIO;
+                       /*
+                        * Don't undo any priority inheritance
+                        */
+                       if (!rt_task(p))
+                               p->prio = CAPPED_PRIO;
+               }
+       }

Within all tasks at CAPPED_PRIO, is the priority of the task used for scheduling?

<snip>

Cheers,
Balbir
Linux Technology Center
IBM India Software Labs
