Overall, this code looks sensible to me. Some comments on the patch below. Paul E. McKenney wrote: > --- linux-2.6.20-rc4-rt1/include/linux/rcupreempt.h 2007-01-09 10:59:54.000000000 -0800 > +++ linux-2.6.20-rc4-rt1-rcub/include/linux/rcupreempt.h 2007-01-09 11:01:12.000000000 -0800 [...] > +enum rcu_boost_state { > + RCU_BOOST_IDLE = 0, /* Not yet blocked if in RCU read-side. */ > + RCU_BOOST_BLOCKED = 1, /* Blocked from RCU read-side. */ > + RCU_BOOSTED = 2, /* Boosting complete. */ > +}; > + > +#define N_RCU_BOOST_STATE (RCU_BOOSTED + 1) Here, you define the three possible states, the number of states. But later... > diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/rcupreempt.c linux-2.6.20-rc4-rt1-rcub/kernel/rcupreempt.c > --- linux-2.6.20-rc4-rt1/kernel/rcupreempt.c 2007-01-09 10:59:54.000000000 -0800 > +++ linux-2.6.20-rc4-rt1-rcub/kernel/rcupreempt.c 2007-01-23 15:41:33.000000000 -0800 > @@ -49,6 +49,7 @@ [...] > +#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS > + long rbs_stats[N_RCU_BOOST_DAT_EVENTS][N_RCU_BOOST_STATE + 1]; > +#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */ ...you declare this array to have space for one more than that, and then... > +#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS > + > +/* > + * Function to increment indicated ->rbs_stats[] element. > + */ > +static inline void rcu_boost_dat_stat(struct rcu_boost_dat *rbdp, > + int event, > + enum rcu_boost_state oldstate) > +{ > + if (oldstate >= RCU_BOOST_IDLE && > + oldstate <= RCU_BOOSTED) { > + rbdp->rbs_stats[event][oldstate]++; > + } else { > + rbdp->rbs_stats[event][N_RCU_BOOST_STATE]++; ...you use the last element as what looks like an "unknown" counter. How about instead defining the enum to have an "unknown" state, defining N_RCU_BOOST_STATE accordingly, and using N_RCU_BOOST_STATE without the +1 in rbs_stats? I think that would make the code much more self-documenting, and less error-prone. > +#define rcu_boost_dat_stat_block(rbdp, oldstate) \ > + rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_BLOCK, oldstate) > +#define rcu_boost_dat_stat_boost(rbdp, oldstate) \ > + rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_BOOST, oldstate) > +#define rcu_boost_dat_stat_unlock(rbdp, oldstate) \ > + rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_UNLOCK, oldstate) > + > +/* > + * Prefix for kprint() strings for periodic statistics messages. > + */ > +static char *rcu_boost_state_event[] = { > + "block: ", > + "boost: ", > + "unlock: ", > +}; I don't know for sure, but I *think* that GCC may sometimes generate more efficient code if you use a char [][], rather than a char *[]. > + > +/* > + * Indicators for numbers in kprint() strings. "!" indicates a state-event > + * pair that should not happen, while "?" indicates a state that should > + * not happen. > + */ > +static char *rcu_boost_state_error[] = { > + /*ibBe*/ > + " ?", /* block */ > + "! ?", /* boost */ > + "? ?", /* unlock */ > +}; Likewise. Also, the columns here don't seem to line up, probably because the comment uses spaces and the other lines use a tab. > +static void rcu_boost_dat_stat_print(void) > +{ > + char buf[N_RCU_BOOST_STATE * (sizeof(long) * 3 + 2) + 2]; This expression really begs for some constants in place of the magic numbers, or failing that, a comment. > + int cpu; > + int event; > + int i; > + static time_t lastprint = 0; > + struct rcu_boost_dat *rbdp; > + int state; > + struct rcu_boost_dat sum; > + > + /* Wait a graceful interval between printk spamming. */ > + > + if (xtime.tv_sec - lastprint < > + CONFIG_PREEMPT_RCU_BOOST_STATS_INTERVAL) > + return; How about printk_ratelimit? It takes a parameter for the rate. (On the other hand, it also takes a spinlock, which you might not want here, even in the stats code.) > + /* Print them out! */ > + > + printk(KERN_ALERT > + "rcu_boost_dat: idx=%d " > + "b=%ld ul=%ld ub=%ld boost: a=%ld b=%ld\n", > + rcu_boost_idx, > + sum.rbs_blocked, sum.rbs_unlock, sum.rbs_unboosted, > + sum.rbs_boost_attempt, sum.rbs_boost); > + for (event = 0; event < N_RCU_BOOST_DAT_EVENTS; event++) { > + i = 0; > + for (state = 0; state <= N_RCU_BOOST_STATE; state++) { > + i += sprintf(&buf[i], " %ld%c", > + sum.rbs_stats[event][state], > + rcu_boost_state_error[event][state]); > + } > + printk(KERN_ALERT "rcu_boost_dat %s %s\n", > + rcu_boost_state_event[event], buf); > + } Most or all of these counters could likely become unsigned, since negative values don't make sense for them. > + /* Go away and don't come back for awhile. */ > + > + lastprint = xtime.tv_sec; > +} > + > +/* > + * Return the current boost index for boosting target tasks. > + * May only be invoked by the booster task, so guaranteed to > + * already be initialized. > + */ > +static inline int rcu_boost_idx_boosting(void) > +{ > + return (rcu_boost_idx + 1) & (RCU_BOOST_ELEMENTS - 1); > +} Quoted for later reference. > +/* > + * Initialize RCU-boost state. This happens early in the boot process, > + * when the scheduler does not yet exist. So don't try to use it. > + */ > +static void init_rcu_boost_early(void) > +{ > + struct rcu_boost_dat *rbdp; > + int cpu; > + int i; > + > + for_each_possible_cpu(cpu) { > + rbdp = per_cpu(rcu_boost_dat, cpu); > + for (i = 0; i < RCU_BOOST_ELEMENTS; i++) { > + rbdp[i].rbs_mutex = > + RAW_SPIN_LOCK_UNLOCKED(rbdp[i].rbs_mutex); > + INIT_LIST_HEAD(&rbdp[i].rbs_toboost); > + INIT_LIST_HEAD(&rbdp[i].rbs_boosted); > + rbdp[i].rbs_blocked = 0; > + rbdp[i].rbs_boost_attempt = 0; > + rbdp[i].rbs_boost = 0; > + rbdp[i].rbs_unlock = 0; > + rbdp[i].rbs_unboosted = 0; > +#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS > + { > + int j, k; > + > + for (j = 0; j < N_RCU_BOOST_DAT_EVENTS; j++) > + for (k = 0; k <= N_RCU_BOOST_STATE; k++) > + rbdp[i].rbs_stats[j][k] = 0; > + } > +#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */ > + } > + smp_wmb(); Barrier without an accompanying comment. You want to make sure the structure gets initialized before setting rcu_boost_idx and thus allowing boosting, right? > + rcu_boost_idx = 0; > + } > +} > +/* > + * Return the list on which the calling task should add itself, or > + * NULL if too early during initialization. > + */ > +static inline struct rcu_boost_dat *rcu_rbd_new(void) > +{ > + int cpu = raw_smp_processor_id(); /* locks used, so preemption OK. */ > + int idx = rcu_boost_idx; > + > + smp_read_barrier_depends(); barrier(); Barriers without accompanying comments, though in this case they seem fairly obvious. > + if (unlikely(idx < 0)) > + return (NULL); > + return &per_cpu(rcu_boost_dat, cpu)[idx]; > +} > + > +/* > + * Return the list from which to boost target tasks. > + * May only be invoked by the booster task, so guaranteed to > + * already be initialized. > + */ > +static inline struct rcu_boost_dat *rcu_rbd_boosting(int cpu) > +{ > + int idx = (rcu_boost_idx + 1) & (RCU_BOOST_ELEMENTS - 1); You defined rcu_boost_idx_boosting for exactly this expression. > + return &per_cpu(rcu_boost_dat, cpu)[idx]; For that matter, since you wrapped it in the function, you could just call it directly in this array index operation. > +} > +/* > + * Priority-boost tasks stuck in RCU read-side critical sections as > + * needed (presumably rarely). > + */ > +static int rcu_booster(void *arg) > +{ > + int cpu; > + struct sched_param sp; > + > + sp.sched_priority = PREEMPT_RCU_BOOSTER_PRIO; > + sched_setscheduler(current, SCHED_RR, &sp); > + current->flags |= PF_NOFREEZE; > + > + do { > + > + /* Advance the lists of tasks. */ > + > + rcu_boost_idx = (rcu_boost_idx + 1) % RCU_BOOST_ELEMENTS; > + for_each_possible_cpu(cpu) { > + > + /* > + * Boost all sufficiently aged readers. > + * Readers must first be preempted or block > + * on a mutex in an RCU read-side critical section, > + * then remain in that critical section for > + * RCU_BOOST_ELEMENTS-1 time intervals. > + * So most of the time we should end up doing > + * nothing. > + */ > + > + rcu_boost_one_reader_list(rcu_rbd_boosting(cpu)); > + > + /* > + * Large SMP systems may need to sleep sometimes > + * in this loop. Or have multiple RCU-boost tasks. > + */ The idea of multiple RCU-boost tasks gives me an idea: what about having per-CPU boost queues, and using that to eliminate the need to lock the queues? It might not do quite as good a job of scheduling, but it would eliminate locking overhead *and* solve the problem you describe here. > + } > + > + /* > + * Sleep to allow any unstalled RCU read-side critical > + * sections to age out of the list. @@@ FIXME: reduce, > + * adjust, or eliminate in case of OOM. > + */ > + > + schedule_timeout_uninterruptible(HZ / 100); > + > + /* Print stats if enough time has passed. */ > + > + rcu_boost_dat_stat_print(); > + > + } while (!kthread_should_stop()); > + > + return 0; > +} > + > +/* > + * Perform the portions of RCU-boost initialization that require the > + * scheduler to be up and running. > + */ > +void init_rcu_boost_late(void) > +{ > + int i; > + > + /* Spawn RCU-boost task. */ > + > + printk(KERN_ALERT "Starting RCU priority booster\n"); Judging by other initialization code, I think this belongs at KERN_INFO or similar. > + rcu_boost_task = kthread_run(rcu_booster, NULL, "RCU Prio Booster"); > + if (IS_ERR(rcu_boost_task)) { > + i = PTR_ERR(rcu_boost_task); > + printk(KERN_ALERT > + "Unable to create RCU Priority Booster, errno %d\n", -i); No need for a temporary here. > + > + /* > + * Continue running, but tasks permanently blocked > + * in RCU read-side critical sections will be able > + * to stall grace-period processing, potentially > + * OOMing the machine. > + */ > + > + rcu_boost_task = NULL; > + } > +} > + > +/* > + * Update task's RCU-boost state to reflect blocking in RCU read-side > + * critical section, so that the RCU-boost task can find it in case it > + * later needs its priority boosted. > + */ > +void __rcu_preempt_boost(void) > +{ > + struct rcu_boost_dat *rbdp; > + unsigned long oldirq; > + > + /* Identify list to place task on for possible later boosting. */ > + > + local_irq_save(oldirq); > + rbdp = rcu_rbd_new(); > + if (rbdp == NULL) { > + local_irq_restore(oldirq); > + printk("Preempted RCU read-side critical section too early.\n"); This printk lacks a priority. > + return; > + } > + spin_lock(&rbdp->rbs_mutex); > + rbdp->rbs_blocked++; > + > + /* > + * Update state. We hold the lock and aren't yet on the list, > + * so the booster cannot mess with us yet. > + */ > + > + rcu_boost_dat_stat_block(rbdp, current->rcub_state); > + if (current->rcub_state != RCU_BOOST_IDLE) { > + > + /* > + * We have been here before, so just update stats. > + * It may seem strange to do all this work just to > + * accumulate statistics, but this is such a > + * low-probability code path that we shouldn't care. > + * If it becomes a problem, it can be fixed. > + */ > + > + spin_unlock_irqrestore(&rbdp->rbs_mutex, oldirq); > + return; > + } > + current->rcub_state = RCU_BOOST_BLOCKED; > + > + /* Now add ourselves to the list so that the booster can find use. */ Typo: s/use/us/ > + > + list_add_tail(¤t->rcub_entry, &rbdp->rbs_toboost); > + current->rcub_rbdp = rbdp; > + spin_unlock_irqrestore(&rbdp->rbs_mutex, oldirq); > +} > diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/rtmutex.c linux-2.6.20-rc4-rt1-rcub/kernel/rtmutex.c > --- linux-2.6.20-rc4-rt1/kernel/rtmutex.c 2007-01-09 10:59:54.000000000 -0800 > +++ linux-2.6.20-rc4-rt1-rcub/kernel/rtmutex.c 2007-01-09 11:01:12.000000000 -0800 > @@ -105,11 +105,14 @@ static inline void init_lists(struct rt_ > */ > int rt_mutex_getprio(struct task_struct *task) > { > + int prio = task->normal_prio; > + > + if (get_rcu_prio(task) < prio) > + prio = get_rcu_prio(task); int prio = min(task->normal_prio, get_rcu_prio(task)); ? > if (likely(!task_has_pi_waiters(task))) > - return task->normal_prio; > + return prio; > > - return min(task_top_pi_waiter(task)->pi_list_entry.prio, > - task->normal_prio); > + return min(task_top_pi_waiter(task)->pi_list_entry.prio, prio); > } - Josh Triplett
Attachment:
signature.asc
Description: OpenPGP digital signature
- Follow-Ups:
- Re: [RFC PATCH -rt 1/2] RCU priority boosting that survives semi-vicious testing
- From: "Paul E. McKenney" <[email protected]>
- Re: [RFC PATCH -rt 1/2] RCU priority boosting that survives semi-vicious testing
- References:
- [RFC PATCH -rt 0/2] RCU priority boosting that survives semi-vicious testing
- From: "Paul E. McKenney" <[email protected]>
- [RFC PATCH -rt 1/2] RCU priority boosting that survives semi-vicious testing
- From: "Paul E. McKenney" <[email protected]>
- [RFC PATCH -rt 0/2] RCU priority boosting that survives semi-vicious testing
- Prev by Date: [PATCH] Add a rounddown_pow_of_two() macro to log2.h.
- Next by Date: Re: what kernel version embeds r1000 (RTL8168 Eth NICs)
- Previous by thread: Re: [RFC PATCH -rt 2/2] RCU priority boosting additions to rcutorture
- Next by thread: Re: [RFC PATCH -rt 1/2] RCU priority boosting that survives semi-vicious testing
- Index(es):