[PATCH RFC 6/9] RCU priority boosting for preemptible RCU

Work in progress, not for inclusion.

RCU priority boosting is needed when preemptible RCU is used with a
workload that might include CPU-bound user tasks running at realtime
priorities.  Such tasks can starve preempted RCU readers indefinitely,
stalling grace periods, so priority boosting is needed to avoid OOM.

Please note that because Classic RCU does not permit RCU read-side
critical sections to be preempted, there is no need to boost the priority
of Classic RCU readers.  Boosting the priority of a running process
does not make it run any faster, at least not on any hardware that I am
aware of.  ;-)
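
To make the new state machine easier to review, below is a rough,
stand-alone user-space sketch of the per-task RCU-boost states this
patch adds (RCU_BOOST_IDLE -> RCU_BOOST_BLOCKED when a reader is
preempted, RCU_BOOST_BLOCKED -> RCU_BOOSTED when the booster task
raises its priority, and back to RCU_BOOST_IDLE at the outermost
rcu_read_unlock()).  The model_*() helpers are made up purely for
illustration and omit the locking, the per-CPU lists, and the
MAX_USER_RT_PRIO clamping; the real transitions live in
__rcu_preempt_boost(), rcu_boost_one_reader_list(), and
__rcu_read_unlock_unboost() in the patch below.

/* Stand-alone model of the RCU-boost state machine; not kernel code. */
#include <stdio.h>

enum rcu_boost_state {
	RCU_BOOST_IDLE = 0,	/* Reader not blocked; nothing to do. */
	RCU_BOOST_BLOCKED = 1,	/* Reader preempted/blocked in a critical section. */
	RCU_BOOSTED = 2,	/* Booster has raised the reader's priority. */
};

struct task_model {
	enum rcu_boost_state rcub_state;
	int prio;		/* Smaller number == more favored, as in the kernel. */
	int normal_prio;
};

/* Reader side: reader was preempted inside rcu_read_lock()/rcu_read_unlock(). */
static void model_preempt_boost(struct task_model *t)
{
	if (t->rcub_state == RCU_BOOST_IDLE)
		t->rcub_state = RCU_BOOST_BLOCKED;	/* Enqueue for the booster. */
}

/* Booster side: boost any reader that is still blocked. */
static void model_boost(struct task_model *t, int booster_prio)
{
	if (t->rcub_state == RCU_BOOST_BLOCKED) {
		if (booster_prio + 1 < t->prio)
			t->prio = booster_prio + 1;	/* Just below the booster. */
		t->rcub_state = RCU_BOOSTED;
	}
}

/* Reader side: outermost rcu_read_unlock() undoes any boost. */
static void model_unlock_unboost(struct task_model *t)
{
	if (t->rcub_state == RCU_BOOSTED)
		t->prio = t->normal_prio;
	t->rcub_state = RCU_BOOST_IDLE;
}

int main(void)
{
	struct task_model t = { RCU_BOOST_IDLE, 120, 120 };

	model_preempt_boost(&t);	/* Reader preempted mid-critical-section. */
	model_boost(&t, 49);		/* Booster running at RT priority 49. */
	printf("boosted prio = %d\n", t.prio);
	model_unlock_unboost(&t);	/* Outermost rcu_read_unlock(). */
	printf("restored prio = %d, state = %d\n", t.prio, t.rcub_state);
	return 0;
}

As in the patch, smaller numbers denote more-favored priorities, and
unboosting simply falls back to whatever priority rt_mutex_getprio()
would otherwise compute.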

Signed-off-by: Paul E. McKenney <[email protected]>
---

 include/linux/init_task.h  |   13 
 include/linux/rcupdate.h   |   17 +
 include/linux/rcupreempt.h |   20 +
 include/linux/sched.h      |   24 +
 init/main.c                |    1 
 kernel/Kconfig.preempt     |   14 -
 kernel/fork.c              |    6 
 kernel/rcupreempt.c        |  608 ++++++++++++++++++++++++++++++++++++++++++---
 kernel/rtmutex.c           |    7 
 kernel/sched.c             |    5 
 lib/Kconfig.debug          |   34 ++
 11 files changed, 703 insertions(+), 46 deletions(-)
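
A note on the per-CPU data layout that may help when reading
rcu_rbd_new() and rcu_rbd_boosting() below: each CPU has an array of
RCU_BOOST_ELEMENTS (currently 4) rcu_boost_dat structures.  A task
that blocks in an RCU read-side critical section enqueues itself on
the element selected by the global rcu_boost_idx; the booster task
advances that index once per pass and then boosts the element one
slot ahead of it, which is the one least recently used for enqueuing.
A reader is therefore boosted only after remaining blocked for
roughly RCU_BOOST_ELEMENTS-1 booster intervals, so the common case
does no boosting at all.  The stand-alone fragment below
(illustration only, not part of the patch) shows just the index
arithmetic:

#include <stdio.h>

#define RCU_BOOST_ELEMENTS 4	/* Power of two, as in the patch. */

int main(void)
{
	int rcu_boost_idx = 0;	/* Index that newly blocked readers enqueue on. */
	int boost_idx;		/* Index that the booster task scans. */
	int pass;

	for (pass = 0; pass < 6; pass++) {
		/* Booster pass: advance the enqueue index... */
		rcu_boost_idx = (rcu_boost_idx + 1) % RCU_BOOST_ELEMENTS;
		/* ...and boost the element least recently used for enqueuing. */
		boost_idx = (rcu_boost_idx + 1) & (RCU_BOOST_ELEMENTS - 1);
		printf("pass %d: enqueue idx %d, boost idx %d\n",
		       pass, rcu_boost_idx, boost_idx);
	}
	return 0;
}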

diff -urpNa -X dontdiff linux-2.6.22-E-hotplug/include/linux/init_task.h linux-2.6.22-F-boostrcu/include/linux/init_task.h
--- linux-2.6.22-E-hotplug/include/linux/init_task.h	2007-07-08 16:32:17.000000000 -0700
+++ linux-2.6.22-F-boostrcu/include/linux/init_task.h	2007-08-31 14:09:02.000000000 -0700
@@ -87,6 +87,17 @@ extern struct nsproxy init_nsproxy;
 	.signalfd_list	= LIST_HEAD_INIT(sighand.signalfd_list),	\
 }
 
+#ifdef CONFIG_PREEMPT_RCU_BOOST
+#define INIT_RCU_BOOST_PRIO .rcu_prio	= MAX_PRIO,
+#define INIT_PREEMPT_RCU_BOOST(tsk)					\
+	.rcub_rbdp	= NULL,						\
+	.rcub_state	= RCU_BOOST_IDLE,				\
+	.rcub_entry	= LIST_HEAD_INIT(tsk.rcub_entry),
+#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */
+#define INIT_RCU_BOOST_PRIO
+#define INIT_PREEMPT_RCU_BOOST(tsk)
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */
+
 extern struct group_info init_groups;
 
 #define INIT_STRUCT_PID {						\
@@ -125,6 +136,7 @@ extern struct group_info init_groups;
 	.prio		= MAX_PRIO-20,					\
 	.static_prio	= MAX_PRIO-20,					\
 	.normal_prio	= MAX_PRIO-20,					\
+	INIT_RCU_BOOST_PRIO						\
 	.policy		= SCHED_NORMAL,					\
 	.cpus_allowed	= CPU_MASK_ALL,					\
 	.mm		= NULL,						\
@@ -169,6 +181,7 @@ extern struct group_info init_groups;
 	},								\
 	INIT_TRACE_IRQFLAGS						\
 	INIT_LOCKDEP							\
+	INIT_PREEMPT_RCU_BOOST(tsk)					\
 }
 
 
diff -urpNa -X dontdiff linux-2.6.22-E-hotplug/include/linux/rcupdate.h linux-2.6.22-F-boostrcu/include/linux/rcupdate.h
--- linux-2.6.22-E-hotplug/include/linux/rcupdate.h	2007-08-24 11:03:22.000000000 -0700
+++ linux-2.6.22-F-boostrcu/include/linux/rcupdate.h	2007-08-24 17:04:14.000000000 -0700
@@ -252,5 +252,22 @@ static inline void rcu_qsctr_inc(int cpu
 	per_cpu(rcu_data_passed_quiesc, cpu) = 1;
 }
 
+#ifdef CONFIG_PREEMPT_RCU_BOOST
+extern void init_rcu_boost_late(void);
+extern void __rcu_preempt_boost(void);
+#define rcu_preempt_boost() /* cpp to avoid #include hell. */ \
+	do { \
+		if (unlikely(current->rcu_read_lock_nesting > 0)) \
+			__rcu_preempt_boost(); \
+	} while (0)
+#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */
+static inline void init_rcu_boost_late(void)
+{
+}
+static inline void rcu_preempt_boost(void)
+{
+}
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */
+
 #endif /* __KERNEL__ */
 #endif /* __LINUX_RCUPDATE_H */
diff -urpNa -X dontdiff linux-2.6.22-E-hotplug/include/linux/rcupreempt.h linux-2.6.22-F-boostrcu/include/linux/rcupreempt.h
--- linux-2.6.22-E-hotplug/include/linux/rcupreempt.h	2007-08-24 11:20:32.000000000 -0700
+++ linux-2.6.22-F-boostrcu/include/linux/rcupreempt.h	2007-08-24 11:24:59.000000000 -0700
@@ -42,6 +42,26 @@
 #include <linux/cpumask.h>
 #include <linux/seqlock.h>
 
+#ifdef CONFIG_PREEMPT_RCU_BOOST
+/*
+ * Task state with respect to being RCU-boosted.  This state is changed
+ * by the task itself in response to the following two events:
+ * 1. Preemption (or blocking on a lock) in an RCU read-side critical section.
+ * 2. Outermost rcu_read_unlock() for blocked RCU read-side critical section.
+ *
+ * The RCU-boost task also updates the state when boosting priority.
+ */
+enum rcu_boost_state {
+	RCU_BOOST_IDLE = 0,	   /* Not yet blocked if in RCU read-side. */
+	RCU_BOOST_BLOCKED = 1,	   /* Blocked from RCU read-side. */
+	RCU_BOOSTED = 2,	   /* Boosting complete. */
+	RCU_BOOST_INVALID = 3,	   /* For bogus state sightings. */
+};
+
+#define N_RCU_BOOST_STATE (RCU_BOOST_INVALID + 1)
+
+#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST */
+
 #define call_rcu_bh(head, rcu) call_rcu(head, rcu)
 #define rcu_bh_qsctr_inc(cpu)	do { } while (0)
 #define __rcu_read_lock_bh()	{ rcu_read_lock(); local_bh_disable(); }
diff -urpNa -X dontdiff linux-2.6.22-E-hotplug/include/linux/sched.h linux-2.6.22-F-boostrcu/include/linux/sched.h
--- linux-2.6.22-E-hotplug/include/linux/sched.h	2007-08-24 11:00:39.000000000 -0700
+++ linux-2.6.22-F-boostrcu/include/linux/sched.h	2007-08-24 17:07:01.000000000 -0700
@@ -546,6 +546,22 @@ struct signal_struct {
 #define is_rt_policy(p)		((p) != SCHED_NORMAL && (p) != SCHED_BATCH)
 #define has_rt_policy(p)	unlikely(is_rt_policy((p)->policy))
 
+#ifdef CONFIG_PREEMPT_RCU_BOOST
+#define set_rcu_prio(p, prio) /* cpp to avoid #include hell */ \
+	do { \
+		(p)->rcu_prio = (prio); \
+	} while (0)
+#define get_rcu_prio(p) (p)->rcu_prio  /* cpp to avoid #include hell */
+#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */
+static inline void set_rcu_prio(struct task_struct *p, int prio)
+{
+}
+static inline int get_rcu_prio(struct task_struct *p)
+{
+	return MAX_PRIO;
+}
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */
+
 /*
  * Some day this will be a full-fledged user tracking system..
  */
@@ -834,6 +850,9 @@ struct task_struct {
 #endif
 	int load_weight;	/* for niceness load balancing purposes */
 	int prio, static_prio, normal_prio;
+#ifdef CONFIG_PREEMPT_RCU_BOOST
+	int rcu_prio;
+#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST */
 	struct list_head run_list;
 	struct prio_array *array;
 
@@ -858,6 +877,11 @@ struct task_struct {
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	struct sched_info sched_info;
 #endif
+#ifdef CONFIG_PREEMPT_RCU_BOOST
+	struct rcu_boost_dat *rcub_rbdp;
+	enum rcu_boost_state rcub_state;
+	struct list_head rcub_entry;
+#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST */
 
 	struct list_head tasks;
 	/*
diff -urpNa -X dontdiff linux-2.6.22-E-hotplug/init/main.c linux-2.6.22-F-boostrcu/init/main.c
--- linux-2.6.22-E-hotplug/init/main.c	2007-07-08 16:32:17.000000000 -0700
+++ linux-2.6.22-F-boostrcu/init/main.c	2007-08-24 11:24:59.000000000 -0700
@@ -722,6 +722,7 @@ static void __init do_basic_setup(void)
 	driver_init();
 	init_irq_proc();
 	do_initcalls();
+	init_rcu_boost_late();
 }
 
 static void __init do_pre_smp_initcalls(void)
diff -urpNa -X dontdiff linux-2.6.22-E-hotplug/kernel/fork.c linux-2.6.22-F-boostrcu/kernel/fork.c
--- linux-2.6.22-E-hotplug/kernel/fork.c	2007-08-24 11:00:39.000000000 -0700
+++ linux-2.6.22-F-boostrcu/kernel/fork.c	2007-08-24 11:24:59.000000000 -0700
@@ -1036,6 +1036,12 @@ static struct task_struct *copy_process(
 	p->rcu_read_lock_nesting = 0;
 	p->rcu_flipctr_idx = 0;
 #endif /* #ifdef CONFIG_PREEMPT_RCU */
+#ifdef CONFIG_PREEMPT_RCU_BOOST
+	p->rcu_prio = MAX_PRIO;
+	p->rcub_rbdp = NULL;
+	p->rcub_state = RCU_BOOST_IDLE;
+	INIT_LIST_HEAD(&p->rcub_entry);
+#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST */
 	p->vfork_done = NULL;
 	spin_lock_init(&p->alloc_lock);
 
diff -urpNa -X dontdiff linux-2.6.22-E-hotplug/kernel/Kconfig.preempt linux-2.6.22-F-boostrcu/kernel/Kconfig.preempt
--- linux-2.6.22-E-hotplug/kernel/Kconfig.preempt	2007-08-24 11:00:39.000000000 -0700
+++ linux-2.6.22-F-boostrcu/kernel/Kconfig.preempt	2007-08-24 11:24:59.000000000 -0700
@@ -91,13 +91,13 @@ config PREEMPT_RCU
 
 endchoice
 
-config RCU_TRACE
-	bool "Enable tracing for RCU - currently stats in debugfs"
-	select DEBUG_FS
-	default y
+config PREEMPT_RCU_BOOST
+	bool "Enable priority boosting of RCU read-side critical sections"
+	depends on PREEMPT_RCU
+	default n
 	help
-	  This option provides tracing in RCU which presents stats
-	  in debugfs for debugging RCU implementation.
+	  This option permits priority boosting of RCU read-side critical
+	  sections that have been preempted, in order to prevent indefinite
+	  delay of grace periods in the face of runaway realtime processes.
 
-	  Say Y here if you want to enable RCU tracing
 	  Say N if you are unsure.
diff -urpNa -X dontdiff linux-2.6.22-E-hotplug/kernel/rcupreempt.c linux-2.6.22-F-boostrcu/kernel/rcupreempt.c
--- linux-2.6.22-E-hotplug/kernel/rcupreempt.c	2007-08-24 11:20:32.000000000 -0700
+++ linux-2.6.22-F-boostrcu/kernel/rcupreempt.c	2007-08-31 14:06:49.000000000 -0700
@@ -51,6 +51,15 @@
 #include <linux/byteorder/swabb.h>
 #include <linux/cpumask.h>
 #include <linux/rcupreempt_trace.h>
+#include <linux/kthread.h>
+
+/*
+ * Macro that prevents the compiler from reordering accesses, but does
+ * absolutely -nothing- to prevent CPUs from reordering.  This is used
+ * only to mediate communication between mainline code and hardware
+ * interrupt and NMI handlers.
+ */
+#define ORDERED_WRT_IRQ(x) (*(volatile typeof(x) *)&(x))
 
 /*
  * PREEMPT_RCU data structures.
@@ -82,6 +91,531 @@ static struct rcu_ctrlblk rcu_ctrlblk = 
 };
 static DEFINE_PER_CPU(int [2], rcu_flipctr) = { 0, 0 };
 
+#ifndef CONFIG_PREEMPT_RCU_BOOST
+static inline void init_rcu_boost_early(void) { }
+static inline void rcu_read_unlock_unboost(void) { }
+#else /* #ifndef CONFIG_PREEMPT_RCU_BOOST */
+
+/* Defines possible event indices for ->rbs_stats[] (first index). */
+
+#define RCU_BOOST_DAT_BLOCK	0
+#define RCU_BOOST_DAT_BOOST	1
+#define RCU_BOOST_DAT_UNLOCK	2
+#define N_RCU_BOOST_DAT_EVENTS	3
+
+/* RCU-boost per-CPU array element. */
+
+struct rcu_boost_dat {
+	spinlock_t rbs_lock;	/* Protects state/CPU slice of structures. */
+	struct list_head rbs_toboost;
+	struct list_head rbs_boosted;
+	unsigned long rbs_blocked;
+	unsigned long rbs_boost_attempt;
+	unsigned long rbs_boost;
+	unsigned long rbs_unlock;
+	unsigned long rbs_unboosted;
+#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS
+	unsigned long rbs_stats[N_RCU_BOOST_DAT_EVENTS][N_RCU_BOOST_STATE];
+#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */
+};
+#define RCU_BOOST_ELEMENTS 4
+
+static int rcu_boost_idx = -1; /* invalid value for early RCU use. */
+static DEFINE_PER_CPU(struct rcu_boost_dat, rcu_boost_dat[RCU_BOOST_ELEMENTS]);
+static struct task_struct *rcu_boost_task;
+
+#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS
+
+/*
+ * Function to increment indicated ->rbs_stats[] element.
+ */
+static inline void rcu_boost_dat_stat(struct rcu_boost_dat *rbdp,
+				      int event,
+				      enum rcu_boost_state oldstate)
+{
+	if (oldstate >= RCU_BOOST_IDLE && oldstate <= RCU_BOOSTED) {
+		rbdp->rbs_stats[event][oldstate]++;
+	} else {
+		rbdp->rbs_stats[event][RCU_BOOST_INVALID]++;
+	}
+}
+
+static inline void rcu_boost_dat_stat_block(struct rcu_boost_dat *rbdp,
+					    enum rcu_boost_state oldstate)
+{
+	rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_BLOCK, oldstate);
+}
+
+static inline void rcu_boost_dat_stat_boost(struct rcu_boost_dat *rbdp,
+					    enum rcu_boost_state oldstate)
+{
+	rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_BOOST, oldstate);
+}
+
+static inline void rcu_boost_dat_stat_unlock(struct rcu_boost_dat *rbdp,
+					     enum rcu_boost_state oldstate)
+{
+	rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_UNLOCK, oldstate);
+}
+
+/*
+ * Prefixes for printk() strings for periodic statistics messages.
+ */
+static char *rcu_boost_state_event[] = {
+	"block:  ",
+	"boost:  ",
+	"unlock: ",
+};
+
+/*
+ * Indicators for numbers in printk() strings.  "!" indicates a state-event
+ * pair that should not happen, while "?" indicates a state that should
+ * not happen.
+ */
+static char *rcu_boost_state_error[] = {
+	/*ibBe*/
+	 "   ?",  /* block */
+	 "!  ?",  /* boost */
+	 "?  ?",  /* unlock */
+};
+
+/*
+ * Print out RCU booster task statistics at the specified interval.
+ */
+static void rcu_boost_dat_stat_print(void)
+{
+	/* Three decimal digits per byte plus spacing per number and line. */
+	char buf[N_RCU_BOOST_STATE * (sizeof(long) * 3 + 2) + 2];
+	int cpu;
+	int event;
+	int i;
+	static time_t lastprint;	/* static implies 0 initial value. */
+	struct rcu_boost_dat *rbdp;
+	int state;
+	struct rcu_boost_dat sum;
+
+	/* Wait a graceful interval between printk spamming. */
+	/* Note: time_after() dislikes time_t. */
+
+	if (xtime.tv_sec - lastprint < CONFIG_PREEMPT_RCU_BOOST_STATS_INTERVAL)
+		return;
+
+	/* Sum up the state/event-independent counters. */
+
+	sum.rbs_blocked = 0;
+	sum.rbs_boost_attempt = 0;
+	sum.rbs_boost = 0;
+	sum.rbs_unlock = 0;
+	sum.rbs_unboosted = 0;
+	for_each_possible_cpu(cpu)
+		for (i = 0; i < RCU_BOOST_ELEMENTS; i++) {
+			rbdp = per_cpu(rcu_boost_dat, cpu);
+			sum.rbs_blocked += rbdp[i].rbs_blocked;
+			sum.rbs_boost_attempt += rbdp[i].rbs_boost_attempt;
+			sum.rbs_boost += rbdp[i].rbs_boost;
+			sum.rbs_unlock += rbdp[i].rbs_unlock;
+			sum.rbs_unboosted += rbdp[i].rbs_unboosted;
+		}
+
+	/* Sum up the state/event-dependent counters. */
+
+	for (event = 0; event < N_RCU_BOOST_DAT_EVENTS; event++)
+		for (state = 0; state < N_RCU_BOOST_STATE; state++) {
+			sum.rbs_stats[event][state] = 0;
+			for_each_possible_cpu(cpu) {
+				for (i = 0; i < RCU_BOOST_ELEMENTS; i++)
+					sum.rbs_stats[event][state]
+					    += per_cpu(rcu_boost_dat,
+						       cpu)[i].rbs_stats[event][state];
+			}
+		}
+
+	/* Print them out! */
+
+	printk(KERN_INFO
+	       "rcu_boost_dat: idx=%d "
+	       "b=%lu ul=%lu ub=%lu boost: a=%lu b=%lu\n",
+	       rcu_boost_idx,
+	       sum.rbs_blocked, sum.rbs_unlock, sum.rbs_unboosted,
+	       sum.rbs_boost_attempt, sum.rbs_boost);
+	for (event = 0; event < N_RCU_BOOST_DAT_EVENTS; event++) {
+		i = 0;
+		for (state = 0; state < N_RCU_BOOST_STATE; state++)
+			i += sprintf(&buf[i], " %ld%c",
+				     sum.rbs_stats[event][state],
+				     rcu_boost_state_error[event][state]);
+		printk(KERN_INFO "rcu_boost_dat %s %s\n",
+		       rcu_boost_state_event[event], buf);
+	}
+
+	/* Go away and don't come back for a while. */
+
+	lastprint = xtime.tv_sec;
+}
+
+#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */
+
+static inline void rcu_boost_dat_stat_block(struct rcu_boost_dat *rbdp,
+					    enum rcu_boost_state oldstate)
+{
+}
+static inline void rcu_boost_dat_stat_boost(struct rcu_boost_dat *rbdp,
+					    enum rcu_boost_state oldstate)
+{
+}
+static inline void rcu_boost_dat_stat_unlock(struct rcu_boost_dat *rbdp,
+					     enum rcu_boost_state oldstate)
+{
+}
+static void rcu_boost_dat_stat_print(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */
+
+/*
+ * Initialize RCU-boost state.  This happens early in the boot process,
+ * when the scheduler does not yet exist.  So don't try to use it.
+ */
+static void init_rcu_boost_early(void)
+{
+	struct rcu_boost_dat *rbdp;
+	int cpu;
+	int i;
+
+	for_each_possible_cpu(cpu) {
+		rbdp = per_cpu(rcu_boost_dat, cpu);
+		for (i = 0; i < RCU_BOOST_ELEMENTS; i++) {
+			spin_lock_init(&rbdp[i].rbs_lock);
+			INIT_LIST_HEAD(&rbdp[i].rbs_toboost);
+			INIT_LIST_HEAD(&rbdp[i].rbs_boosted);
+			rbdp[i].rbs_blocked = 0;
+			rbdp[i].rbs_boost_attempt = 0;
+			rbdp[i].rbs_boost = 0;
+			rbdp[i].rbs_unlock = 0;
+			rbdp[i].rbs_unboosted = 0;
+#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS
+			{
+				int j, k;
+
+				for (j = 0; j < N_RCU_BOOST_DAT_EVENTS; j++)
+					for (k = 0; k < N_RCU_BOOST_STATE; k++)
+						rbdp[i].rbs_stats[j][k] = 0;
+			}
+#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */
+		}
+	}
+	smp_wmb();  /* Make sure readers see above initialization. */
+	rcu_boost_idx = 0;  /* Allow readers to access data. */
+}
+
+/*
+ * Return the list on which the calling task should add itself, or
+ * NULL if too early during initialization.
+ */
+static inline struct rcu_boost_dat *rcu_rbd_new(void)
+{
+	int cpu = smp_processor_id();  /* locks used, so preemption OK. */
+	int idx = ORDERED_WRT_IRQ(rcu_boost_idx);
+
+	if (unlikely(idx < 0))
+		return NULL;
+	return &per_cpu(rcu_boost_dat, cpu)[idx];
+}
+
+/*
+ * Return the list from which to boost target tasks.
+ * May only be invoked by the booster task, so guaranteed to
+ * already be initialized.  Use the rcu_boost_dat element least
+ * recently used to enqueue tasks blocking in RCU read-side critical sections.
+ */
+static inline struct rcu_boost_dat *rcu_rbd_boosting(int cpu)
+{
+	int idx = (rcu_boost_idx + 1) & (RCU_BOOST_ELEMENTS - 1);
+
+	return &per_cpu(rcu_boost_dat, cpu)[idx];
+}
+
+#define PREEMPT_RCU_BOOSTER_PRIO 49  /* Match curr_irq_prio manually. */
+				     /*  Administrators can always adjust */
+				     /*  via the /proc interface. */
+
+/*
+ * Boost the specified task from an RCU viewpoint.
+ * Boost the target task to a priority just a bit less-favored than
+ * that of the RCU-boost task, but boost to a realtime priority even
+ * if the RCU-boost task is running at a non-realtime priority.
+ * We check the priority of the RCU-boost task each time we boost
+ * in case the sysadm manually changes the priority.
+ */
+static void rcu_boost_prio(struct task_struct *taskp)
+{
+	unsigned long flags;
+	int rcuprio;
+
+	spin_lock_irqsave(&current->pi_lock, flags);
+	rcuprio = rt_mutex_getprio(current) + 1;
+	if (rcuprio >= MAX_USER_RT_PRIO)
+		rcuprio = MAX_USER_RT_PRIO - 1;
+	spin_unlock_irqrestore(&current->pi_lock, flags);
+	spin_lock_irqsave(&taskp->pi_lock, flags);
+	if (taskp->rcu_prio != rcuprio) {
+		taskp->rcu_prio = rcuprio;
+		if (taskp->rcu_prio < taskp->prio)
+			rt_mutex_setprio(taskp, taskp->rcu_prio);
+	}
+	spin_unlock_irqrestore(&taskp->pi_lock, flags);
+}
+
+/*
+ * Unboost the specified task from an RCU viewpoint.
+ */
+static void rcu_unboost_prio(struct task_struct *taskp)
+{
+	int nprio;
+	unsigned long flags;
+
+	spin_lock_irqsave(&taskp->pi_lock, flags);
+	taskp->rcu_prio = MAX_PRIO;
+	nprio = rt_mutex_getprio(taskp);
+	if (nprio > taskp->prio)
+		rt_mutex_setprio(taskp, nprio);
+	spin_unlock_irqrestore(&taskp->pi_lock, flags);
+}
+
+/*
+ * Boost all of the RCU-reader tasks on the specified list.
+ */
+static void rcu_boost_one_reader_list(struct rcu_boost_dat *rbdp)
+{
+	LIST_HEAD(list);
+	unsigned long flags;
+	struct task_struct *taskp;
+
+	/*
+	 * Splice both lists onto a local list.  We will still
+	 * need to hold the lock when manipulating the local list
+	 * because tasks can remove themselves at any time.
+	 * The reason for splicing the rbs_boosted list is that
+	 * our priority may have changed, so reboosting may be
+	 * required.
+	 */
+
+	spin_lock_irqsave(&rbdp->rbs_lock, flags);
+	list_splice_init(&rbdp->rbs_toboost, &list);
+	list_splice_init(&rbdp->rbs_boosted, &list);
+	while (!list_empty(&list)) {
+
+		/*
+		 * Pause for a bit before boosting each task.
+		 * @@@FIXME: reduce/eliminate pausing in case of OOM.
+		 */
+
+		spin_unlock_irqrestore(&rbdp->rbs_lock, flags);
+		schedule_timeout_uninterruptible(1);
+		spin_lock_irqsave(&rbdp->rbs_lock, flags);
+
+		/*
+		 * All tasks might have removed themselves while
+		 * we were waiting.  Recheck list emptiness.
+		 */
+
+		if (list_empty(&list))
+			break;
+
+		/* Remove first task in local list, count the attempt. */
+
+		taskp = list_entry(list.next, typeof(*taskp), rcub_entry);
+		list_del_init(&taskp->rcub_entry);
+		rbdp->rbs_boost_attempt++;
+
+		/* Ignore tasks in unexpected states. */
+
+		if (taskp->rcub_state == RCU_BOOST_IDLE) {
+			list_add_tail(&taskp->rcub_entry, &rbdp->rbs_toboost);
+			rcu_boost_dat_stat_boost(rbdp, taskp->rcub_state);
+			continue;
+		}
+
+		/* Boost the task's priority. */
+
+		rcu_boost_prio(taskp);
+		rbdp->rbs_boost++;
+		rcu_boost_dat_stat_boost(rbdp, taskp->rcub_state);
+		taskp->rcub_state = RCU_BOOSTED;
+		list_add_tail(&taskp->rcub_entry, &rbdp->rbs_boosted);
+	}
+	spin_unlock_irqrestore(&rbdp->rbs_lock, flags);
+}
+
+/*
+ * Priority-boost tasks stuck in RCU read-side critical sections as
+ * needed (presumably rarely).
+ */
+static int rcu_booster(void *arg)
+{
+	int cpu;
+	struct sched_param sp = { .sched_priority = PREEMPT_RCU_BOOSTER_PRIO, };
+
+	sched_setscheduler(current, SCHED_RR, &sp);
+	current->flags |= PF_NOFREEZE;
+
+	do {
+
+		/* Advance the lists of tasks. */
+
+		rcu_boost_idx = (rcu_boost_idx + 1) % RCU_BOOST_ELEMENTS;
+		for_each_possible_cpu(cpu) {
+
+			/*
+			 * Boost all sufficiently aged readers.
+			 * Readers must first be preempted or block
+			 * on a mutex in an RCU read-side critical section,
+			 * then remain in that critical section for
+			 * RCU_BOOST_ELEMENTS-1 time intervals.
+			 * So most of the time we should end up doing
+			 * nothing.
+			 */
+
+			rcu_boost_one_reader_list(rcu_rbd_boosting(cpu));
+
+			/*
+			 * Large SMP systems may need to sleep sometimes
+			 * in this loop.  Or have multiple RCU-boost tasks.
+			 */
+		}
+
+		/*
+		 * Sleep to allow any unstalled RCU read-side critical
+		 * sections to age out of the list.  @@@ FIXME: reduce,
+		 * adjust, or eliminate in case of OOM.
+		 */
+
+		schedule_timeout_uninterruptible(HZ);
+
+		/* Print stats if enough time has passed. */
+
+		rcu_boost_dat_stat_print();
+
+	} while (!kthread_should_stop());
+
+	return 0;
+}
+
+/*
+ * Perform the portions of RCU-boost initialization that require the
+ * scheduler to be up and running.
+ */
+void init_rcu_boost_late(void)
+{
+
+	/* Spawn RCU-boost task. */
+
+	printk(KERN_INFO "Starting RCU priority booster\n");
+	rcu_boost_task = kthread_run(rcu_booster, NULL, "RCU Prio Booster");
+	if (IS_ERR(rcu_boost_task))
+		panic("Unable to create RCU Priority Booster, errno %ld\n",
+		      -PTR_ERR(rcu_boost_task));
+}
+
+/*
+ * Update task's RCU-boost state to reflect blocking in RCU read-side
+ * critical section, so that the RCU-boost task can find it in case it
+ * later needs its priority boosted.
+ */
+void __rcu_preempt_boost(void)
+{
+	struct rcu_boost_dat *rbdp;
+	unsigned long flags;
+
+	/* Identify list to place task on for possible later boosting. */
+
+	local_irq_save(flags);
+	rbdp = rcu_rbd_new();
+	if (rbdp == NULL) {
+		local_irq_restore(flags);
+		printk(KERN_INFO
+		       "Preempted RCU read-side critical section too early.\n");
+		return;
+	}
+	spin_lock(&rbdp->rbs_lock);
+	rbdp->rbs_blocked++;
+
+	/*
+	 * Update state.  We hold the lock and aren't yet on the list,
+	 * so the booster cannot mess with us yet.
+	 */
+
+	rcu_boost_dat_stat_block(rbdp, current->rcub_state);
+	if (current->rcub_state != RCU_BOOST_IDLE) {
+
+		/*
+		 * We have been here before, so just update stats.
+		 * It may seem strange to do all this work just to
+		 * accumulate statistics, but this is such a
+		 * low-probability code path that we shouldn't care.
+		 * If it becomes a problem, it can be fixed.
+		 */
+
+		spin_unlock_irqrestore(&rbdp->rbs_lock, flags);
+		return;
+	}
+	current->rcub_state = RCU_BOOST_BLOCKED;
+
+	/* Now add ourselves to the list so that the booster can find us. */
+
+	list_add_tail(&current->rcub_entry, &rbdp->rbs_toboost);
+	current->rcub_rbdp = rbdp;
+	spin_unlock_irqrestore(&rbdp->rbs_lock, flags);
+}
+
+/*
+ * Do the list-removal and priority-unboosting "heavy lifting" when
+ * required.
+ */
+static void __rcu_read_unlock_unboost(void)
+{
+	unsigned long flags;
+	struct rcu_boost_dat *rbdp;
+
+	/* Identify the list structure and acquire the corresponding lock. */
+
+	rbdp = current->rcub_rbdp;
+	spin_lock_irqsave(&rbdp->rbs_lock, flags);
+
+	/* Remove task from the list it was on. */
+
+	list_del_init(&current->rcub_entry);
+	rbdp->rbs_unlock++;
+	current->rcub_rbdp = NULL;
+
+	/* Record stats, unboost if needed, and update state. */
+
+	rcu_boost_dat_stat_unlock(rbdp, current->rcub_state);
+	if (current->rcub_state == RCU_BOOSTED) {
+		rcu_unboost_prio(current);
+		rbdp->rbs_unboosted++;
+	}
+	current->rcub_state = RCU_BOOST_IDLE;
+	spin_unlock_irqrestore(&rbdp->rbs_lock, flags);
+}
+
+/*
+ * Do any state changes and unboosting needed for rcu_read_unlock().
+ * Pass any complex work on to __rcu_read_unlock_unboost().
+ * The vast majority of the time, no work will be needed, as preemption
+ * and blocking within RCU read-side critical sections is comparatively
+ * rare.
+ */
+static inline void rcu_read_unlock_unboost(void)
+{
+
+	if (unlikely(current->rcub_state != RCU_BOOST_IDLE))
+		__rcu_read_unlock_unboost();
+}
+
+#endif /* #else #ifndef CONFIG_PREEMPT_RCU_BOOST */
+
 /*
  * States for rcu_try_flip() and friends.
  */
@@ -128,14 +662,6 @@ static DEFINE_PER_CPU(enum rcu_mb_flag_v
 static cpumask_t rcu_cpu_online_map = CPU_MASK_NONE;
 
 /*
- * Macro that prevents the compiler from reordering accesses, but does
- * absolutely -nothing- to prevent CPUs from reordering.  This is used
- * only to mediate communication between mainline code and hardware
- * interrupt and NMI handlers.
- */
-#define ORDERED_WRT_IRQ(x) (*(volatile typeof(x) *)&(x))
-
-/*
  * RCU_DATA_ME: find the current CPU's rcu_data structure.
  * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
  */
@@ -194,7 +720,7 @@ void __rcu_read_lock(void)
 		me->rcu_read_lock_nesting = nesting + 1;
 
 	} else {
-		unsigned long oldirq;
+		unsigned long flags;
 
 		/*
 		 * Disable local interrupts to prevent the grace-period
@@ -203,7 +729,7 @@ void __rcu_read_lock(void)
 		 * contain rcu_read_lock().
 		 */
 
-		local_irq_save(oldirq);
+		local_irq_save(flags);
 
 		/*
 		 * Outermost nesting of rcu_read_lock(), so increment
@@ -233,7 +759,7 @@ void __rcu_read_lock(void)
 		 */
 
 		ORDERED_WRT_IRQ(me->rcu_flipctr_idx) = idx;
-		local_irq_restore(oldirq);
+		local_irq_restore(flags);
 	}
 }
 EXPORT_SYMBOL_GPL(__rcu_read_lock);
@@ -255,7 +781,7 @@ void __rcu_read_unlock(void)
 		me->rcu_read_lock_nesting = nesting - 1;
 
 	} else {
-		unsigned long oldirq;
+		unsigned long flags;
 
 		/*
 		 * Disable local interrupts to prevent the grace-period
@@ -264,7 +790,7 @@ void __rcu_read_unlock(void)
 		 * contain rcu_read_lock() and rcu_read_unlock().
 		 */
 
-		local_irq_save(oldirq);
+		local_irq_save(flags);
 
 		/*
 		 * Outermost nesting of rcu_read_unlock(), so we must
@@ -305,7 +831,10 @@ void __rcu_read_unlock(void)
 		 */
 
 		ORDERED_WRT_IRQ(__get_cpu_var(rcu_flipctr)[idx])--;
-		local_irq_restore(oldirq);
+
+		rcu_read_unlock_unboost();
+
+		local_irq_restore(flags);
 	}
 }
 EXPORT_SYMBOL_GPL(__rcu_read_unlock);
@@ -504,10 +1033,10 @@ rcu_try_flip_waitmb(void)
  */
 static void rcu_try_flip(void)
 {
-	unsigned long oldirq;
+	unsigned long flags;
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
-	if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, oldirq))) {
+	if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
 		RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
 		return;
 	}
@@ -534,7 +1063,7 @@ static void rcu_try_flip(void)
 		if (rcu_try_flip_waitmb())
 			rcu_try_flip_state = rcu_try_flip_idle_state;
 	}
-	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq);
+	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
 }
 
 /*
@@ -553,16 +1082,16 @@ static void rcu_check_mb(int cpu)
 
 void rcu_check_callbacks_rt(int cpu, int user)
 {
-	unsigned long oldirq;
+	unsigned long flags;
 	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
 
 	rcu_check_mb(cpu);
 	if (rcu_ctrlblk.completed == rdp->completed)
 		rcu_try_flip();
-	spin_lock_irqsave(&rdp->lock, oldirq);
+	spin_lock_irqsave(&rdp->lock, flags);
 	RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
 	__rcu_advance_callbacks(rdp);
-	spin_unlock_irqrestore(&rdp->lock, oldirq);
+	spin_unlock_irqrestore(&rdp->lock, flags);
 }
 
 /*
@@ -571,18 +1100,19 @@ void rcu_check_callbacks_rt(int cpu, int
  */
 void rcu_advance_callbacks_rt(int cpu, int user)
 {
-	unsigned long oldirq;
+	unsigned long flags;
 	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
 
 	if (rcu_ctrlblk.completed == rdp->completed) {
 		rcu_try_flip();
 		if (rcu_ctrlblk.completed == rdp->completed)
 			return;
+		rcu_read_unlock_unboost();
 	}
-	spin_lock_irqsave(&rdp->lock, oldirq);
+	spin_lock_irqsave(&rdp->lock, flags);
 	RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
 	__rcu_advance_callbacks(rdp);
-	spin_unlock_irqrestore(&rdp->lock, oldirq);
+	spin_unlock_irqrestore(&rdp->lock, flags);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -601,24 +1131,24 @@ void rcu_offline_cpu_rt(int cpu)
 {
 	int i;
 	struct rcu_head *list = NULL;
-	unsigned long oldirq;
+	unsigned long flags;
 	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
 	struct rcu_head **tail = &list;
 
 	/* Remove all callbacks from the newly dead CPU, retaining order. */
 
-	spin_lock_irqsave(&rdp->lock, oldirq);
+	spin_lock_irqsave(&rdp->lock, flags);
 	rcu_offline_cpu_rt_enqueue(rdp->donelist, rdp->donetail, list, tail);
 	for (i = GP_STAGES - 1; i >= 0; i--)
 		rcu_offline_cpu_rt_enqueue(rdp->waitlist[i], rdp->waittail[i],
 					   list, tail);
 	rcu_offline_cpu_rt_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
-	spin_unlock_irqrestore(&rdp->lock, oldirq);
+	spin_unlock_irqrestore(&rdp->lock, flags);
 	rdp->waitlistcount = 0;
 
 	/* Disengage the newly dead CPU from grace-period computation. */
 
-	spin_lock_irqsave(&rcu_ctrlblk.fliplock, oldirq);
+	spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
 	rcu_check_mb(cpu);
 	if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
 		smp_mb();  /* Subsequent counter accesses must see new value */
@@ -627,7 +1157,7 @@ void rcu_offline_cpu_rt(int cpu)
 			   /*  seen -after- acknowledgement. */
 	}
 	cpu_clear(cpu, rcu_cpu_online_map);
-	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq);
+	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
 
 	/*
 	 * Place the removed callbacks on the current CPU's queue.
@@ -640,20 +1170,20 @@ void rcu_offline_cpu_rt(int cpu)
 	 */
 
 	rdp = RCU_DATA_ME();
-	spin_lock_irqsave(&rdp->lock, oldirq);
+	spin_lock_irqsave(&rdp->lock, flags);
 	*rdp->nexttail = list;
 	if (list)
 		rdp->nexttail = tail;
-	spin_unlock_irqrestore(&rdp->lock, oldirq);
+	spin_unlock_irqrestore(&rdp->lock, flags);
 }
 
 void __devinit rcu_online_cpu_rt(int cpu)
 {
-	unsigned long oldirq;
+	unsigned long flags;
 
-	spin_lock_irqsave(&rcu_ctrlblk.fliplock, oldirq);
+	spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
 	cpu_set(cpu, rcu_cpu_online_map);
-	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq);
+	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
 }
 
 #else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -695,12 +1225,12 @@ void rcu_process_callbacks_rt(struct sof
 void fastcall call_rcu_preempt(struct rcu_head *head,
 			       void (*func)(struct rcu_head *rcu))
 {
-	unsigned long oldirq;
+	unsigned long flags;
 	struct rcu_data *rdp;
 
 	head->func = func;
 	head->next = NULL;
-	local_irq_save(oldirq);
+	local_irq_save(flags);
 	rdp = RCU_DATA_ME();
 	spin_lock(&rdp->lock);
 	__rcu_advance_callbacks(rdp);
@@ -708,7 +1238,7 @@ void fastcall call_rcu_preempt(struct rc
 	rdp->nexttail = &head->next;
 	RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
 	spin_unlock(&rdp->lock);
-	local_irq_restore(oldirq);
+	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(call_rcu_preempt);
 
@@ -757,6 +1287,11 @@ int rcu_pending_rt(int cpu)
 	return 0;
 }
 
+/*
+ * Initialize RCU.  This is called very early in boot, so is restricted
+ * to very simple operations.  Don't even think about messing with anything
+ * that involves the scheduler, as it doesn't exist yet.
+ */
 void __init rcu_init_rt(void)
 {
 	int cpu;
@@ -778,6 +1313,7 @@ void __init rcu_init_rt(void)
 		rdp->donelist = NULL;
 		rdp->donetail = &rdp->donelist;
 	}
+	init_rcu_boost_early();
 }
 
 /*
diff -urpNa -X dontdiff linux-2.6.22-E-hotplug/kernel/rtmutex.c linux-2.6.22-F-boostrcu/kernel/rtmutex.c
--- linux-2.6.22-E-hotplug/kernel/rtmutex.c	2007-07-08 16:32:17.000000000 -0700
+++ linux-2.6.22-F-boostrcu/kernel/rtmutex.c	2007-08-24 11:24:59.000000000 -0700
@@ -111,11 +111,12 @@ static inline void mark_rt_mutex_waiters
  */
 int rt_mutex_getprio(struct task_struct *task)
 {
+	int prio = min(task->normal_prio, get_rcu_prio(task));
+
 	if (likely(!task_has_pi_waiters(task)))
-		return task->normal_prio;
+		return prio;
 
-	return min(task_top_pi_waiter(task)->pi_list_entry.prio,
-		   task->normal_prio);
+	return min(task_top_pi_waiter(task)->pi_list_entry.prio, prio);
 }
 
 /*
diff -urpNa -X dontdiff linux-2.6.22-E-hotplug/kernel/sched.c linux-2.6.22-F-boostrcu/kernel/sched.c
--- linux-2.6.22-E-hotplug/kernel/sched.c	2007-07-08 16:32:17.000000000 -0700
+++ linux-2.6.22-F-boostrcu/kernel/sched.c	2007-08-24 11:24:59.000000000 -0700
@@ -1702,6 +1702,7 @@ void fastcall sched_fork(struct task_str
 	 * Make sure we do not leak PI boosting priority to the child:
 	 */
 	p->prio = current->normal_prio;
+	set_rcu_prio(p, MAX_PRIO);
 
 	INIT_LIST_HEAD(&p->run_list);
 	p->array = NULL;
@@ -1784,6 +1785,7 @@ void fastcall wake_up_new_task(struct ta
 			else {
 				p->prio = current->prio;
 				p->normal_prio = current->normal_prio;
+				set_rcu_prio(p, MAX_PRIO);
 				list_add_tail(&p->run_list, &current->run_list);
 				p->array = current->array;
 				p->array->nr_active++;
@@ -3590,6 +3592,8 @@ asmlinkage void __sched schedule(void)
 	}
 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
 
+	rcu_preempt_boost();
+
 need_resched:
 	preempt_disable();
 	prev = current;
@@ -5060,6 +5064,7 @@ void __cpuinit init_idle(struct task_str
 	idle->sleep_avg = 0;
 	idle->array = NULL;
 	idle->prio = idle->normal_prio = MAX_PRIO;
+	set_rcu_prio(idle, MAX_PRIO);
 	idle->state = TASK_RUNNING;
 	idle->cpus_allowed = cpumask_of_cpu(cpu);
 	set_task_cpu(idle, cpu);
diff -urpNa -X dontdiff linux-2.6.22-E-hotplug/lib/Kconfig.debug linux-2.6.22-F-boostrcu/lib/Kconfig.debug
--- linux-2.6.22-E-hotplug/lib/Kconfig.debug	2007-07-08 16:32:17.000000000 -0700
+++ linux-2.6.22-F-boostrcu/lib/Kconfig.debug	2007-08-24 11:24:59.000000000 -0700
@@ -391,6 +391,40 @@ config RCU_TORTURE_TEST
 	  Say M if you want the RCU torture tests to build as a module.
 	  Say N if you are unsure.
 
+config RCU_TRACE
+	bool "Enable tracing for RCU - currently stats in debugfs"
+	select DEBUG_FS
+	depends on DEBUG_KERNEL
+	default y
+	help
+	  This option provides tracing for RCU, which presents stats
+	  in debugfs for debugging the RCU implementation.
+
+	  Say Y here if you want to enable RCU tracing.
+	  Say N if you are unsure.
+
+config PREEMPT_RCU_BOOST_STATS
+	bool "Enable RCU priority-boosting statistic printing"
+	depends on PREEMPT_RCU_BOOST
+	depends on DEBUG_KERNEL
+	default n
+	help
+	  This option enables debug printk()s of RCU boost statistics,
+	  which are normally only used to debug RCU priority boost
+	  implementations.
+
+	  Say N if you are unsure.
+
+config PREEMPT_RCU_BOOST_STATS_INTERVAL
+	int "RCU priority-boosting statistic printing interval (seconds)"
+	depends on PREEMPT_RCU_BOOST_STATS
+	default 100
+	range 10 86400
+	help
+	  This option controls the timing of debug printk()s of RCU boost
+	  statistics, which are normally only used to debug RCU priority
+	  boost implementations.
+
 config LKDTM
 	tristate "Linux Kernel Dump Test Tool Module"
 	depends on DEBUG_KERNEL