CONFIG_FAIR_USER_SCHED is great and I'm happy to see it enabled by default,
but IMHO it currently suffers from some limitations:
- on a single-user system, it's useful to have root processes given twice
as much CPU as user processes, but I don't want nice 19 cron jobs like updatedb
or rpmq to get twice as much CPU as my nice -20 tasks.
- on a multi-user system, a user should be able to give part of their CPU share
back to other users. This is currently not possible with CONFIG_FAIR_USER_SCHED.
As a consequence, returning EPERM on nice(<0) becomes pointless: within a user's
group, lowering the nice value of one task is equivalent to raising the nice value
of every other task of that user, ignoring the limits of the nice range.
To address these problems, this patch changes the weight of each user's cfs_rq
to the maximum weight of the tasks on that cfs_rq, scaled by
/sys/kernel/uids/UID/cpu_share (illustrated below). More elaborate mathematics
than taking the max may turn out to be needed, but basic testing showed the
expected fairness.
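
To make the effect concrete, here is a small userspace model of the computation
this patch performs; it mirrors scale_tg_weight()/update_group_share() below.
The prio_to_weight values are the kernel's, and the scenario in main() is just
an assumed example, not a measurement:

#include <stdio.h>

#define NICE_0_LOAD 1024UL

/*
 * Model of scale_tg_weight(): the user's cfs_rq gets the weight of its
 * heaviest runnable task, scaled by the group's cpu_share, floored at 2.
 */
static unsigned long scale_tg_weight(unsigned long shares, unsigned long weight)
{
	unsigned long scaled = weight * shares / NICE_0_LOAD;

	return scaled > 2 ? scaled : 2;
}

int main(void)
{
	/* task weights from the kernel's prio_to_weight[] table */
	unsigned long nice_0 = 1024;	/* prio_to_weight[20] */
	unsigned long nice_19 = 15;	/* prio_to_weight[39] */

	/* cpu_share of 1024 (NICE_0_LOAD, identity scaling), only nice 19 jobs */
	printf("nice 19 only   : %lu\n", scale_tg_weight(1024, nice_19));
	/* same cpu_share, at least one nice 0 task runnable */
	printf("nice  0 present: %lu\n", scale_tg_weight(1024, nice_0));
	/* a user who halved their cpu_share to give CPU back to others */
	printf("halved share   : %lu\n", scale_tg_weight(512, nice_0));
	return 0;
}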
Signed-off-by: Guillaume Chazarain <[email protected]>
---
include/linux/sched.h | 4 ++
kernel/sched.c | 50 +++++++++++++++++++----
kernel/sched_fair.c | 108 +++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 154 insertions(+), 8 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 155d743..d6d2db9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -908,6 +908,10 @@ struct sched_entity {
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
#endif
+#ifdef CONFIG_FAIR_USER_SCHED
+ /* used to track the max load.weight */
+ struct rb_node max_load;
+#endif
};
struct task_struct {
diff --git a/kernel/sched.c b/kernel/sched.c
index 3f6bd11..df8114b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -260,6 +260,10 @@ struct cfs_rq {
struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
struct task_group *tg; /* group that "owns" this runqueue */
#endif
+#ifdef CONFIG_FAIR_USER_SCHED
+ /* used to track the sched_entity with the max load in this cfs_rq */
+ struct rb_root max_load_se;
+#endif
};
/* Real-Time classes' related field in a runqueue: */
@@ -7094,14 +7098,12 @@ done:
task_rq_unlock(rq, &flags);
}
+/* cfs_rq->rq->lock must be taken */
static void set_se_shares(struct sched_entity *se, unsigned long shares)
{
struct cfs_rq *cfs_rq = se->cfs_rq;
- struct rq *rq = cfs_rq->rq;
int on_rq;
- spin_lock_irq(&rq->lock);
-
on_rq = se->on_rq;
if (on_rq)
dequeue_entity(cfs_rq, se, 0);
@@ -7111,22 +7113,54 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
if (on_rq)
enqueue_entity(cfs_rq, se, 0);
+}
- spin_unlock_irq(&rq->lock);
+#ifdef CONFIG_FAIR_USER_SCHED
+static void update_group_share(struct task_group *tg, int cpu)
+{
+ struct rb_node *max_load_node = rb_last(&tg->cfs_rq[cpu]->max_load_se);
+ struct sched_entity *max_load_entry;
+ unsigned long shares;
+
+ if (!max_load_node)
+ /* empty cfs_rq */
+ return;
+
+ max_load_entry = rb_entry(max_load_node, struct sched_entity, max_load);
+ shares = scale_tg_weight(tg, max_load_entry->load.weight);
+ set_se_shares(tg->se[cpu], shares);
+}
+#else
+static void update_group_share(struct task_group *tg, int cpu)
+{
+ set_se_shares(tg->se[cpu], tg->shares);
}
+#endif
int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
- int i;
+ int cpu;
+ unsigned long flags;
+
+ if (shares <= 1)
+ return -EINVAL;
+
+#ifdef CONFIG_FAIR_USER_SCHED
+ if ((shares * prio_to_weight[0]) / prio_to_weight[0] != shares)
+ /* The provided value would overflow in scale_tg_weight() */
+ return -EINVAL;
+#endif
spin_lock(&tg->lock);
if (tg->shares == shares)
goto done;
tg->shares = shares;
- for_each_possible_cpu(i)
- set_se_shares(tg->se[i], shares);
-
+ for_each_possible_cpu(cpu) {
+ spin_lock_irqsave(&tg->cfs_rq[cpu]->rq->lock, flags);
+ update_group_share(tg, cpu);
+ spin_unlock_irqrestore(&tg->cfs_rq[cpu]->rq->lock, flags);
+ }
done:
spin_unlock(&tg->lock);
return 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 01859f6..70ed34e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -135,6 +135,112 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
return se->vruntime - cfs_rq->min_vruntime;
}
+#ifdef CONFIG_FAIR_USER_SCHED
+static void set_se_shares(struct sched_entity *se, unsigned long shares);
+
+static unsigned long scale_tg_weight(struct task_group *tg, unsigned long weight)
+{
+ unsigned long scaled_weight = (weight * tg->shares) / NICE_0_LOAD;
+ return max(scaled_weight, 2UL);
+}
+
+static void enqueue_max_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct rb_node **link;
+ struct rb_node *parent = NULL;
+ struct sched_entity *entry;
+ unsigned long scaled_weight;
+
+ if (!cfs_rq->tg)
+ /* this is the main cfs_rq, not a user specific one */
+ return;
+
+ /*
+ * We cannot take cfs_rq->tg->lock. As we already lock the rq, we
+ * could deadlock in sched_group_set_shares()
+ */
+
+ /*
+ * Insert the sched_entity at its place in cfs_rq->max_load_se, given
+ * its load.weight
+ */
+ link = &cfs_rq->max_load_se.rb_node;
+ while (*link) {
+ parent = *link;
+ entry = rb_entry(parent, struct sched_entity, max_load);
+ if (se->load.weight < entry->load.weight)
+ link = &parent->rb_left;
+ else
+ link = &parent->rb_right;
+ }
+ rb_link_node(&se->max_load, parent, link);
+ rb_insert_color(&se->max_load, &cfs_rq->max_load_se);
+
+ /*
+ * Update the weight of the cfs_rq if we inserted a higher weight
+ * sched_entity
+ */
+ scaled_weight = scale_tg_weight(cfs_rq->tg, se->load.weight);
+ if (scaled_weight > cfs_rq->load.weight)
+ set_se_shares(se->parent, scaled_weight);
+}
+
+static void dequeue_max_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct rb_node *prev_node;
+ struct rb_node *next_node;
+ struct sched_entity *prev_entry;
+ unsigned long scaled_weight;
+
+ if (!cfs_rq->tg)
+ /* this is the main cfs_rq, not a user specific one */
+ return;
+
+ /*
+ * We cannot take cfs_rq->tg->lock. As we already lock the rq, we
+ * could deadlock in sched_group_set_shares()
+ */
+
+ next_node = rb_next(&se->max_load);
+ if (!next_node)
+ prev_node = rb_prev(&se->max_load);
+ rb_erase(&se->max_load, &cfs_rq->max_load_se);
+
+ if (next_node)
+ /* The removed sched_entity was not the highest weight */
+ return;
+
+ /*
+ * Here we know we removed the highest weight sched_entity, so
+ * prev_node now contains the highest weight sched_entity
+ */
+
+ if (!prev_node)
+ /* The cfs_rq is now empty */
+ return;
+
+ prev_entry = rb_entry(prev_node, struct sched_entity, max_load);
+ if (prev_entry->load.weight == se->load.weight)
+ /*
+ * A sched_entity in the cfs_rq had the same weight, so the
+ * max weight did not change
+ */
+ return;
+
+ scaled_weight = scale_tg_weight(cfs_rq->tg, prev_entry->load.weight);
+ if (scaled_weight != cfs_rq->load.weight)
+ set_se_shares(prev_entry->parent, scaled_weight);
+}
+#else
+static inline void enqueue_max_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void dequeue_max_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+#endif
+
/*
* Enqueue an entity into the rb-tree:
*/
@@ -173,6 +279,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
rb_link_node(&se->run_node, parent, link);
rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
+ enqueue_max_load(cfs_rq, se);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -181,6 +288,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->rb_leftmost = rb_next(&se->run_node);
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+ dequeue_max_load(cfs_rq, se);
}
static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
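
For completeness, a hypothetical userspace helper (not part of this patch) showing
how a user could give CPU back to others by lowering the cpu_share of their own UID
through the sysfs file mentioned in the changelog; note that
sched_group_set_shares() above rejects values <= 1:

#include <stdio.h>
#include <stdlib.h>

/* Write a new cpu_share for a UID via /sys/kernel/uids/UID/cpu_share */
static int set_cpu_share(unsigned int uid, unsigned long share)
{
	char path[64];
	FILE *f;
	int ret;

	snprintf(path, sizeof(path), "/sys/kernel/uids/%u/cpu_share", uid);
	f = fopen(path, "w");
	if (!f)
		return -1;
	ret = fprintf(f, "%lu\n", share) < 0 ? -1 : 0;
	if (fclose(f))
		ret = -1;
	return ret;
}

int main(int argc, char **argv)
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <uid> <cpu_share>\n", argv[0]);
		return 1;
	}
	return set_cpu_share(strtoul(argv[1], NULL, 10),
			     strtoul(argv[2], NULL, 10)) ? 1 : 0;
}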