Re: [RFC PATCH -rt 1/2] RCU priority boosting that survives semi-vicious testing

Overall, this code looks sensible to me.  Some comments on the patch below.

Paul E. McKenney wrote:
> --- linux-2.6.20-rc4-rt1/include/linux/rcupreempt.h	2007-01-09 10:59:54.000000000 -0800
> +++ linux-2.6.20-rc4-rt1-rcub/include/linux/rcupreempt.h	2007-01-09 11:01:12.000000000 -0800
[...]
> +enum rcu_boost_state {
> +	RCU_BOOST_IDLE = 0,	   /* Not yet blocked if in RCU read-side. */
> +	RCU_BOOST_BLOCKED = 1,	   /* Blocked from RCU read-side. */
> +	RCU_BOOSTED = 2,	   /* Boosting complete. */
> +};
> +
> +#define N_RCU_BOOST_STATE (RCU_BOOSTED + 1)

Here, you define the three possible states, the number of states.  But
later...

> diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/rcupreempt.c linux-2.6.20-rc4-rt1-rcub/kernel/rcupreempt.c
> --- linux-2.6.20-rc4-rt1/kernel/rcupreempt.c	2007-01-09 10:59:54.000000000 -0800
> +++ linux-2.6.20-rc4-rt1-rcub/kernel/rcupreempt.c	2007-01-23 15:41:33.000000000 -0800
> @@ -49,6 +49,7 @@
[...]
> +#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS
> +	long rbs_stats[N_RCU_BOOST_DAT_EVENTS][N_RCU_BOOST_STATE + 1];
> +#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */

...you declare this array to have space for one more than that, and then...

> +#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS
> +
> +/*
> + * Function to increment indicated ->rbs_stats[] element.
> + */
> +static inline void rcu_boost_dat_stat(struct rcu_boost_dat *rbdp,
> +				      int event,
> +				      enum rcu_boost_state oldstate)
> +{
> +	if (oldstate >= RCU_BOOST_IDLE &&
> +	    oldstate <= RCU_BOOSTED) {
> +		rbdp->rbs_stats[event][oldstate]++;
> +	} else {
> +		rbdp->rbs_stats[event][N_RCU_BOOST_STATE]++;

...you use the last element as what looks like an "unknown" counter.  How
about instead defining the enum to have an "unknown" state, defining
N_RCU_BOOST_STATE accordingly, and using N_RCU_BOOST_STATE without the +1 in
rbs_stats?  I think that would make the code much more self-documenting, and
less error-prone.

> +#define rcu_boost_dat_stat_block(rbdp, oldstate) \
> +	rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_BLOCK, oldstate)
> +#define rcu_boost_dat_stat_boost(rbdp, oldstate) \
> +	rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_BOOST, oldstate)
> +#define rcu_boost_dat_stat_unlock(rbdp, oldstate) \
> +	rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_UNLOCK, oldstate)
> +
> +/*
> + * Prefix for kprint() strings for periodic statistics messages.
> + */
> +static char *rcu_boost_state_event[] = {
> +	"block:  ",
> +	"boost:  ",
> +	"unlock: ",
> +};

I don't know for sure, but I *think* that GCC may sometimes generate more
efficient code if you use a char [][], rather than a char *[].

> +
> +/*
> + * Indicators for numbers in kprint() strings.  "!" indicates a state-event
> + * pair that should not happen, while "?" indicates a state that should
> + * not happen.
> + */
> +static char *rcu_boost_state_error[] = {
> +       /*ibBe*/
> +	"   ?",  /* block */
> +	"!  ?",  /* boost */
> +	"?  ?",  /* unlock */
> +};

Likewise.  Also, the columns here don't seem to line up, probably because the
comment uses spaces and the other lines use a tab.

> +static void rcu_boost_dat_stat_print(void)
> +{
> +	char buf[N_RCU_BOOST_STATE * (sizeof(long) * 3 + 2) + 2];

This expression really begs for some constants in place of the magic numbers,
or failing that, a comment.

> +	int cpu;
> +	int event;
> +	int i;
> +	static time_t lastprint = 0;
> +	struct rcu_boost_dat *rbdp;
> +	int state;
> +	struct rcu_boost_dat sum;
> +
> +	/* Wait a graceful interval between printk spamming. */
> +
> +	if (xtime.tv_sec - lastprint <
> +	    CONFIG_PREEMPT_RCU_BOOST_STATS_INTERVAL)
> +		return;

How about printk_ratelimit?  It takes a parameter for the rate.  (On the other
hand, it also takes a spinlock, which you might not want here, even in the
stats code.)

> +	/* Print them out! */
> +
> +	printk(KERN_ALERT
> +	       "rcu_boost_dat: idx=%d "
> +	       "b=%ld ul=%ld ub=%ld boost: a=%ld b=%ld\n",
> +	       rcu_boost_idx,
> +	       sum.rbs_blocked, sum.rbs_unlock, sum.rbs_unboosted,
> +	       sum.rbs_boost_attempt, sum.rbs_boost);
> +	for (event = 0; event < N_RCU_BOOST_DAT_EVENTS; event++) {
> +		i = 0;
> +		for (state = 0; state <= N_RCU_BOOST_STATE; state++) {
> +			i += sprintf(&buf[i], " %ld%c",
> +				     sum.rbs_stats[event][state],
> +				     rcu_boost_state_error[event][state]);
> +		}
> +		printk(KERN_ALERT "rcu_boost_dat %s %s\n",
> +		       rcu_boost_state_event[event], buf);
> +	}

Most or all of these counters could likely become unsigned, since negative
values don't make sense for them.

> +	/* Go away and don't come back for awhile. */
> +
> +	lastprint = xtime.tv_sec;
> +}
> +

> +/*
> + * Return the current boost index for boosting target tasks.
> + * May only be invoked by the booster task, so guaranteed to
> + * already be initialized.
> + */
> +static inline int rcu_boost_idx_boosting(void)
> +{
> +	return (rcu_boost_idx + 1) & (RCU_BOOST_ELEMENTS - 1);
> +}

Quoted for later reference.

> +/*
> + * Initialize RCU-boost state.  This happens early in the boot process,
> + * when the scheduler does not yet exist.  So don't try to use it.
> + */
> +static void init_rcu_boost_early(void)
> +{
> +	struct rcu_boost_dat *rbdp;
> +	int cpu;
> +	int i;
> +
> +	for_each_possible_cpu(cpu) {
> +		rbdp = per_cpu(rcu_boost_dat, cpu);
> +		for (i = 0; i < RCU_BOOST_ELEMENTS; i++) {
> +			rbdp[i].rbs_mutex =
> +				RAW_SPIN_LOCK_UNLOCKED(rbdp[i].rbs_mutex);
> +			INIT_LIST_HEAD(&rbdp[i].rbs_toboost);
> +			INIT_LIST_HEAD(&rbdp[i].rbs_boosted);
> +			rbdp[i].rbs_blocked = 0;
> +			rbdp[i].rbs_boost_attempt = 0;
> +			rbdp[i].rbs_boost = 0;
> +			rbdp[i].rbs_unlock = 0;
> +			rbdp[i].rbs_unboosted = 0;
> +#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS
> +			{
> +				int j, k;
> +
> +				for (j = 0; j < N_RCU_BOOST_DAT_EVENTS; j++)
> +					for (k = 0; k <= N_RCU_BOOST_STATE; k++)
> +						rbdp[i].rbs_stats[j][k] = 0;
> +			}
> +#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */
> +		}
> +		smp_wmb();

Barrier without an accompanying comment.  You want to make sure the structure
gets initialized before setting rcu_boost_idx and thus allowing boosting,
right?

> +		rcu_boost_idx = 0;
> +	}
> +}

> +/*
> + * Return the list on which the calling task should add itself, or
> + * NULL if too early during initialization.
> + */
> +static inline struct rcu_boost_dat *rcu_rbd_new(void)
> +{
> +	int cpu = raw_smp_processor_id();  /* locks used, so preemption OK. */
> +	int idx = rcu_boost_idx;
> +
> +	smp_read_barrier_depends(); barrier();

Barriers without accompanying comments, though in this case they seem fairly
obvious.

> +	if (unlikely(idx < 0))
> +		return (NULL);
> +	return &per_cpu(rcu_boost_dat, cpu)[idx];
> +}
> +
> +/*
> + * Return the list from which to boost target tasks.
> + * May only be invoked by the booster task, so guaranteed to
> + * already be initialized.
> + */
> +static inline struct rcu_boost_dat *rcu_rbd_boosting(int cpu)
> +{
> +	int idx = (rcu_boost_idx + 1) & (RCU_BOOST_ELEMENTS - 1);

You defined rcu_boost_idx_boosting for exactly this expression.

> +	return &per_cpu(rcu_boost_dat, cpu)[idx];

For that matter, since you wrapped it in the function, you could just call it
directly in this array index operation.

> +}

> +/*
> + * Priority-boost tasks stuck in RCU read-side critical sections as
> + * needed (presumably rarely).
> + */
> +static int rcu_booster(void *arg)
> +{
> +	int cpu;
> +	struct sched_param sp;
> +
> +	sp.sched_priority = PREEMPT_RCU_BOOSTER_PRIO;
> +	sched_setscheduler(current, SCHED_RR, &sp);
> +	current->flags |= PF_NOFREEZE;
> +
> +	do {
> +
> +		/* Advance the lists of tasks. */
> +
> +		rcu_boost_idx = (rcu_boost_idx + 1) % RCU_BOOST_ELEMENTS;
> +		for_each_possible_cpu(cpu) {
> +
> +			/*
> +			 * Boost all sufficiently aged readers.
> +			 * Readers must first be preempted or block
> +			 * on a mutex in an RCU read-side critical section,
> +			 * then remain in that critical section for
> +			 * RCU_BOOST_ELEMENTS-1 time intervals.
> +			 * So most of the time we should end up doing
> +			 * nothing.
> +			 */
> +
> +			rcu_boost_one_reader_list(rcu_rbd_boosting(cpu));
> +
> +			/*
> +			 * Large SMP systems may need to sleep sometimes
> +			 * in this loop.  Or have multiple RCU-boost tasks.
> +			 */

The idea of multiple RCU-boost tasks gives me an idea: what about having
per-CPU boost queues, and using that to eliminate the need to lock the queues?
It might not do quite as good a job of scheduling, but it would eliminate
locking overhead *and* solve the problem you describe here.

> +		}
> +
> +		/*
> +		 * Sleep to allow any unstalled RCU read-side critical
> +		 * sections to age out of the list.  @@@ FIXME: reduce,
> +		 * adjust, or eliminate in case of OOM.
> +		 */
> +
> +		schedule_timeout_uninterruptible(HZ / 100);
> +
> +		/* Print stats if enough time has passed. */
> +
> +		rcu_boost_dat_stat_print();
> +
> +	} while (!kthread_should_stop());
> +
> +	return 0;
> +}
> +
> +/*
> + * Perform the portions of RCU-boost initialization that require the
> + * scheduler to be up and running.
> + */
> +void init_rcu_boost_late(void)
> +{
> +	int i;
> +
> +	/* Spawn RCU-boost task. */
> +
> +	printk(KERN_ALERT "Starting RCU priority booster\n");

Judging by other initialization code, I think this belongs at KERN_INFO or
similar.

> +	rcu_boost_task = kthread_run(rcu_booster, NULL, "RCU Prio Booster");
> +	if (IS_ERR(rcu_boost_task)) {
> +		i = PTR_ERR(rcu_boost_task);
> +		printk(KERN_ALERT
> +		       "Unable to create RCU Priority Booster, errno %d\n", -i);

No need for a temporary here.

> +
> +		/*
> +		 * Continue running, but tasks permanently blocked
> +		 * in RCU read-side critical sections will be able
> +		 * to stall grace-period processing, potentially
> +		 * OOMing the machine.
> +		 */
> +
> +		rcu_boost_task = NULL;
> +	}
> +}
> +
> +/*
> + * Update task's RCU-boost state to reflect blocking in RCU read-side
> + * critical section, so that the RCU-boost task can find it in case it
> + * later needs its priority boosted.
> + */
> +void __rcu_preempt_boost(void)
> +{
> +	struct rcu_boost_dat *rbdp;
> +	unsigned long oldirq;
> +
> +	/* Identify list to place task on for possible later boosting. */
> +
> +	local_irq_save(oldirq);
> +	rbdp = rcu_rbd_new();
> +	if (rbdp == NULL) {
> +		local_irq_restore(oldirq);
> +		printk("Preempted RCU read-side critical section too early.\n");

This printk lacks a priority.

> +		return;
> +	}
> +	spin_lock(&rbdp->rbs_mutex);
> +	rbdp->rbs_blocked++;
> +
> +	/*
> +	 * Update state.  We hold the lock and aren't yet on the list,
> +	 * so the booster cannot mess with us yet.
> +	 */
> +
> +	rcu_boost_dat_stat_block(rbdp, current->rcub_state);
> +	if (current->rcub_state != RCU_BOOST_IDLE) {
> +
> +		/*
> +		 * We have been here before, so just update stats.
> +		 * It may seem strange to do all this work just to
> +		 * accumulate statistics, but this is such a
> +		 * low-probability code path that we shouldn't care.
> +		 * If it becomes a problem, it can be fixed.
> +		 */
> +
> +		spin_unlock_irqrestore(&rbdp->rbs_mutex, oldirq);
> +		return;
> +	}
> +	current->rcub_state = RCU_BOOST_BLOCKED;
> +
> +	/* Now add ourselves to the list so that the booster can find use. */

Typo: s/use/us/

> +
> +	list_add_tail(&current->rcub_entry, &rbdp->rbs_toboost);
> +	current->rcub_rbdp = rbdp;
> +	spin_unlock_irqrestore(&rbdp->rbs_mutex, oldirq);
> +}

> diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/rtmutex.c linux-2.6.20-rc4-rt1-rcub/kernel/rtmutex.c
> --- linux-2.6.20-rc4-rt1/kernel/rtmutex.c	2007-01-09 10:59:54.000000000 -0800
> +++ linux-2.6.20-rc4-rt1-rcub/kernel/rtmutex.c	2007-01-09 11:01:12.000000000 -0800
> @@ -105,11 +105,14 @@ static inline void init_lists(struct rt_
>   */
>  int rt_mutex_getprio(struct task_struct *task)
>  {
> +	int prio = task->normal_prio;
> +
> +	if (get_rcu_prio(task) < prio)
> +		prio = get_rcu_prio(task);

int prio = min(task->normal_prio, get_rcu_prio(task)); ?

>  	if (likely(!task_has_pi_waiters(task)))
> -		return task->normal_prio;
> +		return prio;
>  
> -	return min(task_top_pi_waiter(task)->pi_list_entry.prio,
> -		   task->normal_prio);
> +	return min(task_top_pi_waiter(task)->pi_list_entry.prio, prio);
>  }

- Josh Triplett
Attachment: signature.asc
Description: OpenPGP digital signature
Follow-Ups:
- Re: [RFC PATCH -rt 1/2] RCU priority boosting that survives semi-vicious testing
  - From: "Paul E. McKenney" <[email protected]>
References:
- [RFC PATCH -rt 0/2] RCU priority boosting that survives semi-vicious testing
  - From: "Paul E. McKenney" <[email protected]>
- [RFC PATCH -rt 1/2] RCU priority boosting that survives semi-vicious testing
  - From: "Paul E. McKenney" <[email protected]>
Prev by Date: [PATCH] Add a rounddown_pow_of_two() macro to log2.h.
Next by Date: Re: what kernel version embeds r1000 (RTL8168 Eth NICs)
Previous by thread: Re: [RFC PATCH -rt 2/2] RCU priority boosting additions to rcutorture
Next by thread: Re: [RFC PATCH -rt 1/2] RCU priority boosting that survives semi-vicious testing
Index(es):
- Date
- Thread
[Index of Archives] [Kernel Newbies] [Netfilter] [Bugtraq] [Photo] [Stuff] [Gimp] [Yosemite News] [MIPS Linux] [ARM Linux] [Linux Security] [Linux RAID] [Video 4 Linux] [Linux for the blind] [Linux Resources]