From 042d4c70a203998697b34eaad1a99f6f09d09e4d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Oct 2018 07:43:22 -0700 Subject: [PATCH 001/112] rcu: Eliminate BUG_ON() for sync.c The sync.c file has a number of calls to BUG_ON(), which panics the kernel, which is not a good strategy for devices (like embedded) that don't have a way to capture console output. This commit therefore changes these BUG_ON() calls to WARN_ON_ONCE(), but does so quite naively. Reported-by: Linus Torvalds Signed-off-by: Paul E. McKenney Acked-by: Oleg Nesterov Cc: Peter Zijlstra --- kernel/rcu/sync.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 3f943efcf61c1e..a6ba446a9693bb 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -125,8 +125,7 @@ void rcu_sync_enter(struct rcu_sync *rsp) rsp->gp_state = GP_PENDING; spin_unlock_irq(&rsp->rss_lock); - BUG_ON(need_wait && need_sync); - + WARN_ON_ONCE(need_wait && need_sync); if (need_sync) { gp_ops[rsp->gp_type].sync(); rsp->gp_state = GP_PASSED; @@ -139,7 +138,7 @@ void rcu_sync_enter(struct rcu_sync *rsp) * Nobody has yet been allowed the 'fast' path and thus we can * avoid doing any sync(). The callback will get 'dropped'. */ - BUG_ON(rsp->gp_state != GP_PASSED); + WARN_ON_ONCE(rsp->gp_state != GP_PASSED); } } @@ -166,8 +165,8 @@ static void rcu_sync_func(struct rcu_head *rhp) struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head); unsigned long flags; - BUG_ON(rsp->gp_state != GP_PASSED); - BUG_ON(rsp->cb_state == CB_IDLE); + WARN_ON_ONCE(rsp->gp_state != GP_PASSED); + WARN_ON_ONCE(rsp->cb_state == CB_IDLE); spin_lock_irqsave(&rsp->rss_lock, flags); if (rsp->gp_count) { @@ -225,7 +224,7 @@ void rcu_sync_dtor(struct rcu_sync *rsp) { int cb_state; - BUG_ON(rsp->gp_count); + WARN_ON_ONCE(rsp->gp_count); spin_lock_irq(&rsp->rss_lock); if (rsp->cb_state == CB_REPLAY) @@ -235,6 +234,6 @@ void rcu_sync_dtor(struct rcu_sync *rsp) if (cb_state != CB_IDLE) { gp_ops[rsp->gp_type].wait(); - BUG_ON(rsp->cb_state != CB_IDLE); + WARN_ON_ONCE(rsp->cb_state != CB_IDLE); } } From 08543bda42ef06c5ee4cd74501c894aa7cc13ea8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Oct 2018 08:04:03 -0700 Subject: [PATCH 002/112] rcu: Eliminate BUG_ON() for kernel/rcu/tree.c The tree.c file has a number of calls to BUG_ON(), which panics the kernel, which is not a good strategy for devices (like embedded) that don't have a way to capture console output. This commit therefore converts these BUG_ON() calls to WARN_ON_ONCE() and WARN_ONCE(). Reported-by: Linus Torvalds Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 121f833acd046c..bdb6659a8dbc0b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2826,7 +2826,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) * Very early boot, before rcu_init(). Initialize if needed * and then drop through to queue the callback. */ - BUG_ON(cpu != -1); + WARN_ON_ONCE(cpu != -1); WARN_ON_ONCE(!rcu_is_watching()); if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); @@ -3485,7 +3485,8 @@ static int __init rcu_spawn_gp_kthread(void) rcu_scheduler_fully_active = 1; t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name); - BUG_ON(IS_ERR(t)); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__)) + return 0; rnp = rcu_get_root(); raw_spin_lock_irqsave_rcu_node(rnp, flags); rcu_state.gp_kthread = t; From 75a8f72245223cce1571b0405b0881ca3c046df7 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 22 Sep 2018 19:41:25 -0400 Subject: [PATCH 003/112] rcu: Remove unused rcu_state externs The rcu_bh_state and rcu_sched_state variables were removed during the RCU flavor consolidations, but external declarations remain in tree.h. This commit therefore removes these obsolete declarations. Signed-off-by: Joel Fernandes (Google) Cc: Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 703e19ff532d4d..57a937ac51c275 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -398,17 +398,6 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name; #define RCU_NAME rcu_name #endif /* #else #ifdef CONFIG_TRACING */ -/* - * RCU implementation internal declarations: - */ -extern struct rcu_state rcu_sched_state; - -extern struct rcu_state rcu_bh_state; - -#ifdef CONFIG_PREEMPT_RCU -extern struct rcu_state rcu_preempt_state; -#endif /* #ifdef CONFIG_PREEMPT_RCU */ - int rcu_dynticks_snap(struct rcu_data *rdp); #ifdef CONFIG_RCU_BOOST From adbccddb4a16b1dbf047d330ae1e78fd1ec80352 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 22 Sep 2018 19:41:26 -0400 Subject: [PATCH 004/112] rcu: Fix rcu_{node,data} comments about gp_seq_needed Recent changes have removed the old ->gp_seq_needed field from the rcu_state structure, which in turn obsoleted a couple of comments in the rcu_node and rcu_data structures. This commit therefore updates these comments accordingly. Signed-off-by: Joel Fernandes (Google) Cc: Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 57a937ac51c275..c3e2807a834ae4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -57,7 +57,7 @@ struct rcu_node { /* some rcu_state fields as well as */ /* following. */ unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */ - unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed. */ + unsigned long gp_seq_needed; /* Track furthest future GP request. */ unsigned long completedqs; /* All QSes done for this node. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ /* order for current grace period to proceed.*/ @@ -163,7 +163,7 @@ union rcu_noqs { struct rcu_data { /* 1) quiescent-state and grace-period handling : */ unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */ - unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed ctr. */ + unsigned long gp_seq_needed; /* Track furthest future GP request. */ union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ From 309ba859b95085f61f4f2a154df6be9cb9713a12 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Jul 2018 14:36:49 -0700 Subject: [PATCH 005/112] rcu: Eliminate synchronize_rcu_mult() Now that synchronize_rcu() waits for both RCU read-side critical sections and preempt-disabled regions of code, the sole caller of synchronize_rcu_mult() can be replaced by synchronize_rcu(). This patch makes this change and removes synchronize_rcu_mult(). Note that _wait_rcu_gp() still supports synchronize_rcu_mult(), and thus might be simplified in the future to take only take a single call_rcu() function rather than the current list of them. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate_wait.h | 17 ----------------- kernel/rcu/update.c | 6 ++---- kernel/sched/core.c | 2 +- 3 files changed, 3 insertions(+), 22 deletions(-) diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h index 8a16c3eb3dd0a6..c0578ba23c1a9f 100644 --- a/include/linux/rcupdate_wait.h +++ b/include/linux/rcupdate_wait.h @@ -31,21 +31,4 @@ do { \ #define wait_rcu_gp(...) _wait_rcu_gp(false, __VA_ARGS__) -/** - * synchronize_rcu_mult - Wait concurrently for multiple grace periods - * @...: List of call_rcu() functions for different grace periods to wait on - * - * This macro waits concurrently for multiple types of RCU grace periods. - * For example, synchronize_rcu_mult(call_rcu, call_rcu_tasks) would wait - * on concurrent RCU and RCU-tasks grace periods. Waiting on a give SRCU - * domain requires you to write a wrapper function for that SRCU domain's - * call_srcu() function, supplying the corresponding srcu_struct. - * - * If Tiny RCU, tell _wait_rcu_gp() does not bother waiting for RCU, - * given that anywhere synchronize_rcu_mult() can be called is automatically - * a grace period. - */ -#define synchronize_rcu_mult(...) \ - _wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), __VA_ARGS__) - #endif /* _LINUX_SCHED_RCUPDATE_WAIT_H */ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f203b94f6b5be1..c729ca5e6ee24c 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -335,8 +335,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, /* Initialize and register callbacks for each crcu_array element. */ for (i = 0; i < n; i++) { if (checktiny && - (crcu_array[i] == call_rcu || - crcu_array[i] == call_rcu_bh)) { + (crcu_array[i] == call_rcu)) { might_sleep(); continue; } @@ -352,8 +351,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, /* Wait for all callbacks to be invoked. */ for (i = 0; i < n; i++) { if (checktiny && - (crcu_array[i] == call_rcu || - crcu_array[i] == call_rcu_bh)) + (crcu_array[i] == call_rcu)) continue; for (j = 0; j < i; j++) if (crcu_array[j] == crcu_array[i]) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f12225f26b70a6..ea12ebc57840ae 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5788,7 +5788,7 @@ int sched_cpu_deactivate(unsigned int cpu) * * Do sync before park smpboot threads to take care the rcu boost case. */ - synchronize_rcu_mult(call_rcu, call_rcu_sched); + synchronize_rcu(); if (!sched_smp_initialized) return 0; From d3ff3891b2edba63a7dee9023306bb66878fc3d8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Jul 2018 14:42:53 -0700 Subject: [PATCH 006/112] rcu: Consolidate the RCU update functions invoked by sync.c This commit retains all the various gp_ops[] entries, but makes their update functions all be synchronize_rcu(), call_rcu() and rcu_barrier(). The read-side checks remain consistent with the various RCU flavors, which still exist on the read side. Signed-off-by: Paul E. McKenney Cc: Oleg Nesterov Cc: Peter Zijlstra --- kernel/rcu/sync.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 3f943efcf61c1e..9d570b1892b0df 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -44,15 +44,15 @@ static const struct { __INIT_HELD(rcu_read_lock_held) }, [RCU_SCHED_SYNC] = { - .sync = synchronize_sched, - .call = call_rcu_sched, - .wait = rcu_barrier_sched, + .sync = synchronize_rcu, + .call = call_rcu, + .wait = rcu_barrier, __INIT_HELD(rcu_read_lock_sched_held) }, [RCU_BH_SYNC] = { - .sync = synchronize_rcu_bh, - .call = call_rcu_bh, - .wait = rcu_barrier_bh, + .sync = synchronize_rcu, + .call = call_rcu, + .wait = rcu_barrier, __INIT_HELD(rcu_read_lock_bh_held) }, }; From 78d125d33858d00756baf0f40e75e77dcbfbea55 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Jul 2018 15:36:43 -0700 Subject: [PATCH 007/112] sched/membarrier: Replace synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, the synchronize_sched() in sys_membarrier() can be replaced by synchronize_rcu(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: Mathieu Desnoyers --- kernel/sched/membarrier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 76e0eaf4654e07..388a7a6c1aa2cc 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -298,7 +298,7 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) if (tick_nohz_full_enabled()) return -EINVAL; if (num_online_cpus() > 1) - synchronize_sched(); + synchronize_rcu(); return 0; case MEMBARRIER_CMD_GLOBAL_EXPEDITED: return membarrier_global_expedited(); From 8fa9eb388eb00f7a63caf0fd370f8c134015fc1f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Oct 2018 15:45:04 -0700 Subject: [PATCH 008/112] sparc/oprofile: Convert timer_stop() to use synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code in addition to explicitly marked RCU read-side critical sections, synchronize_rcu() can be used in place of synchronize_sched(). This commit therefore makes that change. Signed-off-by: Paul E. McKenney Cc: Robert Richter Cc: "David S. Miller" Cc: Cc: --- arch/sparc/oprofile/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sparc/oprofile/init.c b/arch/sparc/oprofile/init.c index f9024bccff163f..43730c9b1c8671 100644 --- a/arch/sparc/oprofile/init.c +++ b/arch/sparc/oprofile/init.c @@ -53,7 +53,7 @@ static void timer_stop(void) { nmi_adjust_hz(1); unregister_die_notifier(&profile_timer_exceptions_nb); - synchronize_sched(); /* Allow already-started NMIs to complete. */ + synchronize_rcu(); /* Allow already-started NMIs to complete. */ } static int op_nmi_timer_init(struct oprofile_operations *ops) From 0d4e68e2f3979c67a3596c61c118e0c73a2bdfe0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Oct 2018 16:30:07 -0700 Subject: [PATCH 009/112] s390/mm: Convert tlb_table_flush() to use call_rcu() Now that call_rcu()'s callback is not invoked until after all preempt-disable regions of code have completed (in addition to explicitly marked RCU read-side critical sections), call_rcu() can be used in place of call_rcu_sched(). This commit therefore makes that change. Signed-off-by: Paul E. McKenney Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: --- arch/s390/mm/pgalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 76d89ee8b42883..da64e4b9324e4a 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -350,7 +350,7 @@ void tlb_table_flush(struct mmu_gather *tlb) struct mmu_table_batch **batch = &tlb->batch; if (*batch) { - call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); + call_rcu(&(*batch)->rcu, tlb_remove_table_rcu); *batch = NULL; } } From 04229110adfba984950fc0209632640a76eb1de4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 Nov 2018 16:53:13 -0800 Subject: [PATCH 010/112] powerpc: Convert hugepd_free() to use call_rcu() Now that call_rcu()'s callback is not invoked until after all preempt-disable regions of code have completed (in addition to explicitly marked RCU read-side critical sections), call_rcu() can be used in place of call_rcu_sched(). This commit therefore makes that change. Signed-off-by: Paul E. McKenney Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: --- arch/powerpc/mm/hugetlbpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 8cf035e68378b7..4c01e9a01a7415 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -289,7 +289,7 @@ static void hugepd_free(struct mmu_gather *tlb, void *hugepte) (*batchp)->ptes[(*batchp)->index++] = hugepte; if ((*batchp)->index == HUGEPD_FREELIST_SIZE) { - call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback); + call_rcu(&(*batchp)->rcu, hugepd_free_rcu_callback); *batchp = NULL; } put_cpu_var(hugepd_freelist_cur); From 832aa35a65bac800a1adbf2eab0b42427032cab8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Aug 2018 13:37:47 -0700 Subject: [PATCH 011/112] doc: Set down forward-progress requirements This commit adds a section to the requirements documentation setting down requirements for grace-period and callback-invocation forward progress. Signed-off-by: Paul E. McKenney --- .../RCU/Design/Requirements/Requirements.html | 110 +++++++++++++++++- 1 file changed, 108 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 43c4e2f05f4020..7efc1c1da7afd1 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -1381,6 +1381,7 @@

Quality-of-Implementation R
  1. Specialization
  2. Performance and Scalability +
  3. Forward Progress
  4. Composability
  5. Corner Cases
@@ -1822,6 +1823,106 @@

Performance and Scalability

RCU thus provides a range of tools to allow updaters to strike the required tradeoff between latency, flexibility and CPU overhead. +

Forward Progress

+ +

+In theory, delaying grace-period completion and callback invocation +is harmless. +In practice, not only are memory sizes finite but also callbacks sometimes +do wakeups, and sufficiently deferred wakeups can be difficult +to distinguish from system hangs. +Therefore, RCU must provide a number of mechanisms to promote forward +progress. + +

+These mechanisms are not foolproof, nor can they be. +For one simple example, an infinite loop in an RCU read-side critical +section must by definition prevent later grace periods from ever completing. +For a more involved example, consider a 64-CPU system built with +CONFIG_RCU_NOCB_CPU=y and booted with rcu_nocbs=1-63, +where CPUs 1 through 63 spin in tight loops that invoke +call_rcu(). +Even if these tight loops also contain calls to cond_resched() +(thus allowing grace periods to complete), CPU 0 simply will +not be able to invoke callbacks as fast as the other 63 CPUs can +register them, at least not until the system runs out of memory. +In both of these examples, the Spiderman principle applies: With great +power comes great responsibility. +However, short of this level of abuse, RCU is required to +ensure timely completion of grace periods and timely invocation of +callbacks. + +

+RCU takes the following steps to encourage timely completion of +grace periods: + +

    +
  1. If a grace period fails to complete within 100 milliseconds, + RCU causes future invocations of cond_resched() on + the holdout CPUs to provide an RCU quiescent state. + RCU also causes those CPUs' need_resched() invocations + to return true, but only after the corresponding CPU's + next scheduling-clock. +
  2. CPUs mentioned in the nohz_full kernel boot parameter + can run indefinitely in the kernel without scheduling-clock + interrupts, which defeats the above need_resched() + strategem. + RCU will therefore invoke resched_cpu() on any + nohz_full CPUs still holding out after + 109 milliseconds. +
  3. In kernels built with CONFIG_RCU_BOOST=y, if a given + task that has been preempted within an RCU read-side critical + section is holding out for more than 500 milliseconds, + RCU will resort to priority boosting. +
  4. If a CPU is still holding out 10 seconds into the grace + period, RCU will invoke resched_cpu() on it regardless + of its nohz_full state. +
+ +

+The above values are defaults for systems running with HZ=1000. +They will vary as the value of HZ varies, and can also be +changed using the relevant Kconfig options and kernel boot parameters. +RCU currently does not do much sanity checking of these +parameters, so please use caution when changing them. +Note that these forward-progress measures are provided only for RCU, +not for +SRCU or +Tasks RCU. + +

+RCU takes the following steps in call_rcu() to encourage timely +invocation of callbacks when any given non-rcu_nocbs CPU has +10,000 callbacks, or has 10,000 more callbacks than it had the last time +encouragement was provided: + +

    +
  1. Starts a grace period, if one is not already in progress. +
  2. Forces immediate checking for quiescent states, rather than + waiting for three milliseconds to have elapsed since the + beginning of the grace period. +
  3. Immediately tags the CPU's callbacks with their grace period + completion numbers, rather than waiting for the RCU_SOFTIRQ + handler to get around to it. +
  4. Lifts callback-execution batch limits, which speeds up callback + invocation at the expense of degrading realtime response. +
+ +

+Again, these are default values when running at HZ=1000, +and can be overridden. +Again, these forward-progress measures are provided only for RCU, +not for +SRCU or +Tasks RCU. +Even for RCU, callback-invocation forward progress for rcu_nocbs +CPUs is much less well-developed, in part because workloads benefiting +from rcu_nocbs CPUs tend to invoke call_rcu() +relatively infrequently. +If workloads emerge that need both rcu_nocbs CPUs and high +call_rcu() invocation rates, then additional forward-progress +work will be required. +

Composability

@@ -2272,7 +2373,7 @@

Interrupts and NMIs

Furthermore, NMI handlers can be interrupted by what appear to RCU to be normal interrupts. One way that this can happen is for code that directly invokes -rcu_irq_enter() and rcu_irq_exit() to be called +rcu_irq_enter() and rcu_irq_exit() to be called from an NMI handler. This astonishing fact of life prompted the current code structure, which has rcu_irq_enter() invoking rcu_nmi_enter() @@ -2294,7 +2395,7 @@

Loadable Modules

Unfortunately, there is no way to cancel an RCU callback; once you invoke call_rcu(), the callback function is -going to eventually be invoked, unless the system goes down first. +eventually going to be invoked, unless the system goes down first. Because it is normally considered socially irresponsible to crash the system in response to a module unload request, we need some other way to deal with in-flight RCU callbacks. @@ -3233,6 +3334,11 @@

Possible Future Changes

originating call_rcu() instance, though probably not in production kernels. +

+Additional work may be required to provide reasonable forward-progress +guarantees under heavy load for grace periods and for callback +invocation. +

Summary

From 2d0350a8f0e6eb5494141c61c5c749b5155df33d Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 21 Sep 2018 18:31:53 -0400 Subject: [PATCH 012/112] doc: Clarify RCU data-structure comment about rcu_tree fanout RCU Data-Structures document describes a trick to test RCU with small number of CPUs but with a taller tree. It wasn't immediately clear how the document arrived at 16 CPUs which also requires setting the FANOUT_LEAF to 2 instead of the default of 16. This commit therefore provides the needed clarification. Signed-off-by: Joel Fernandes (Google) Cc: Signed-off-by: Paul E. McKenney --- .../RCU/Design/Data-Structures/Data-Structures.html | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html index 1d2051c0c3fc16..476b1ac38e4c2a 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -127,9 +127,11 @@

Data-Structure Relationships

RCU currently permits up to a four-level tree, which on a 64-bit system accommodates up to 4,194,304 CPUs, though only a mere 524,288 CPUs for 32-bit systems. -On the other hand, you can set CONFIG_RCU_FANOUT to be -as small as 2 if you wish, which would permit only 16 CPUs, which -is useful for testing. +On the other hand, you can set both CONFIG_RCU_FANOUT and +CONFIG_RCU_FANOUT_LEAF to be as small as 2, which would result +in a 16-CPU test using a 4-level tree. +This can be useful for testing large-system capabilities on small test +machines.

This multi-level combining tree allows us to get most of the performance and scalability From dd944caa8173a19c702076471aae17a2d793ebeb Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 22 Sep 2018 19:41:27 -0400 Subject: [PATCH 013/112] doc: Remove rcu_preempt_state reference in stallwarn Consolidation of RCU-bh, RCU-preempt, and RCU-sched into one RCU flavor to rule them all resulted in the removal of rcu_preempt_state. However, stallwarn.txt still mentions rcu_preempt_state. This commit therefore Updates stallwarn documentation accordingly. Signed-off-by: Joel Fernandes (Google) Cc: Signed-off-by: Paul E. McKenney --- Documentation/RCU/stallwarn.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 491043fd976f8b..b01bcafc64aa02 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -176,9 +176,8 @@ causing stalls, and that the stall was affecting RCU-sched. This message will normally be followed by stack dumps for each CPU. Please note that PREEMPT_RCU builds can be stalled by tasks as well as by CPUs, and that the tasks will be indicated by PID, for example, "P3421". It is even -possible for a rcu_preempt_state stall to be caused by both CPUs -and- -tasks, in which case the offending CPUs and tasks will all be called -out in the list. +possible for an rcu_state stall to be caused by both CPUs -and- tasks, +in which case the offending CPUs and tasks will all be called out in the list. CPU 2's "(3 GPs behind)" indicates that this CPU has not interacted with the RCU core for the past three grace periods. In contrast, CPU 16's "(0 From 8f15c682ac5a778feb8e343f9057b89beb40d85b Mon Sep 17 00:00:00 2001 From: Connor Shu Date: Wed, 22 Aug 2018 14:16:46 -0700 Subject: [PATCH 014/112] rcutorture: Automatically create initrd directory The rcutorture scripts currently expect the user to create the tools/testing/selftests/rcutorture/initrd directory. Should the user fail to do this, the kernel build will fail with obscure and confusing error messages. This commit therefore adds explicit checks for the tools/testing/selftests/rcutorture/initrd directory, and if not present, creates one on systems on which dracut is installed. If this directory could not be created, a less obscure error message is emitted and the test is aborted. Suggested-by: Thomas Gleixner Signed-off-by: Connor Shu [ paulmck: Adapt the script to fit into the rcutorture framework and severely abbreviate the initrd/init script. ] Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 8 +++ .../selftests/rcutorture/bin/mkinitrd.sh | 60 +++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100755 tools/testing/selftests/rcutorture/bin/mkinitrd.sh diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 5a7a62d76a50b9..19864f1cb27a42 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -194,6 +194,14 @@ do shift done +if test -z "$TORTURE_INITRD" || tools/testing/selftests/rcutorture/bin/mkinitrd.sh +then + : +else + echo No initrd and unable to create one, aborting test >&2 + exit 1 +fi + CONFIGFRAG=${KVM}/configs/${TORTURE_SUITE}; export CONFIGFRAG if test -z "$configs" diff --git a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh new file mode 100755 index 00000000000000..ae773760f39696 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# +# Create an initrd directory if one does not already exist. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (C) IBM Corporation, 2013 +# +# Author: Connor Shu + +D=tools/testing/selftests/rcutorture + +# Prerequisite checks +[ -z "$D" ] && echo >&2 "No argument supplied" && exit 1 +if [ ! -d "$D" ]; then + echo >&2 "$D does not exist: Malformed kernel source tree?" + exit 1 +fi +if [ -d "$D/initrd" ]; then + echo "$D/initrd already exists, no need to create it" + exit 0 +fi + +T=${TMPDIR-/tmp}/mkinitrd.sh.$$ +trap 'rm -rf $T' 0 2 +mkdir $T + +cat > $T/init << '__EOF___' +#!/bin/sh +while : +do + sleep 1000000 +done +__EOF___ + +# Try using dracut to create initrd +command -v dracut >/dev/null 2>&1 || { echo >&2 "Dracut not installed"; exit 1; } +echo Creating $D/initrd using dracut. + +# Filesystem creation +dracut --force --no-hostonly --no-hostonly-cmdline --module "base" $T/initramfs.img +cd $D +mkdir initrd +cd initrd +zcat $T/initramfs.img | cpio -id +cp $T/init init +echo Done creating $D/initrd using dracut +exit 0 From 38e630424ba304dbe07ae52aa78d1ed6d38d9f75 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 23 Aug 2018 10:48:18 -0700 Subject: [PATCH 015/112] rcutorture: Add initrd support for systems lacking dracut The support for creating initrd directories using dracut is a great improvement over having to always hand-create them, it is a bit annoying to have to install some otherwise irrelevant package just to be able to run rcutorture. This commit therefore adds support for creating initrd directories on systems innocent of dracut. You do need gcc, but then again you need that to build the kernel (or to build llvm) in any case. The idea is to create an initrd directory containing nothing but a statically linked binary having a for-loop over a long-term sleep(). The result is a Linux kernel with almost no userspace: even the time-honored /dev, /lib, /tmp, and /usr directories are gone. In fact, the only directory present is "/", but only because I don't know how to get rid of it, at least short of not having an initrd in the first place. Although statically linked binaries are much maligned, and rightly so, their disadvantages seem to be irrelevant for this particular use case. From https://www.akkadia.org/drepper/no_static_linking.html: 1. Fixes are difficult to apply to hordes of widely scattered statically linked binaries. But in this case, there is only one binary, but there would otherwise be no fewer than four libraries. 2. Security measures like local address randomization cannot be used. Prudence prevents me from asserting that it is impossible to base a remote attack on a networking-free rcutorture instance. Nevertheless, bonus points to the first person who comes up with such an attack! 3. More efficient use of physical memory. Not in this case, given that libc is 1.8MB and the statically linked binary "only" 800K. 4. Features such as locales, name service switch (NSS), internationalized domain names (IDN) tool, and so on require dynamic linking. Bonus points to the first person coming up with a valid rcutorture use case requiring these features in its initrd. 5. Accidental violations of (L)GPL. Actually, this change actually helps -avoid- such violations by reducing the temptation to pass around tarballs of rcutorture-ready initrd directories. After all, the rcutorture scripts automatically create an initrd directory for you, so why bother with the tarballs? 6. Tools and hacks like ltrace, LD_PRELOAD, LD_PROFILE, and LD_AUDIT don't work. Again, bonus points to the first person coming up with a valid rcutorture use case requiring these features in its initrd. Nevertheless, the script will use dracut if available, and will create the statically linked binary only when dracut are missing. Those preferring the smaller initrd directory resulting from the statically linked binary (like me) are free to hand-edit mkinitrd.sh to remove the code using dracut. ;-) Signed-off-by: Paul E. McKenney --- .../selftests/rcutorture/bin/mkinitrd.sh | 40 ++++++-- .../selftests/rcutorture/doc/initrd.txt | 99 +++---------------- 2 files changed, 45 insertions(+), 94 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh index ae773760f39696..87a87ffeaa85a1 100755 --- a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh +++ b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh @@ -46,15 +46,41 @@ done __EOF___ # Try using dracut to create initrd -command -v dracut >/dev/null 2>&1 || { echo >&2 "Dracut not installed"; exit 1; } -echo Creating $D/initrd using dracut. +if command -v dracut >/dev/null 2>&1 +then + echo Creating $D/initrd using dracut. + # Filesystem creation + dracut --force --no-hostonly --no-hostonly-cmdline --module "base" $T/initramfs.img + cd $D + mkdir initrd + cd initrd + zcat $T/initramfs.img | cpio -id + cp $T/init init + chmod +x init + echo Done creating $D/initrd using dracut + exit 0 +fi -# Filesystem creation -dracut --force --no-hostonly --no-hostonly-cmdline --module "base" $T/initramfs.img +# No dracut, so create a C-language initrd/init program and statically +# link it. This results in a very small initrd, but might be a bit less +# future-proof than dracut. +echo "Could not find dracut, attempting C initrd" cd $D mkdir initrd cd initrd -zcat $T/initramfs.img | cpio -id -cp $T/init init -echo Done creating $D/initrd using dracut +cat > init.c << '___EOF___' +#include + +int main(int argc, int argv[]) +{ + for (;;) + sleep(1000*1000*1000); /* One gigasecond is ~30 years. */ + return 0; +} +___EOF___ +gcc -static -Os -o init init.c +strip init +rm init.c +echo "Done creating a statically linked C-language initrd" + exit 0 diff --git a/tools/testing/selftests/rcutorture/doc/initrd.txt b/tools/testing/selftests/rcutorture/doc/initrd.txt index 833f826d6ec2c9..933b4fd1232715 100644 --- a/tools/testing/selftests/rcutorture/doc/initrd.txt +++ b/tools/testing/selftests/rcutorture/doc/initrd.txt @@ -1,9 +1,12 @@ -This document describes one way to create the initrd directory hierarchy -in order to allow an initrd to be built into your kernel. The trick -here is to steal the initrd file used on your Linux laptop, Ubuntu in -this case. There are probably much better ways of doing this. +The rcutorture scripting tools automatically create the needed initrd +directory using dracut. Failing that, this tool will create an initrd +containing a single statically linked binary named "init" that loops +over a very long sleep() call. In both cases, this creation is done +by tools/testing/selftests/rcutorture/bin/mkinitrd.sh. -That said, here are the commands: +However, if you are attempting to run rcutorture on a system that does +not have dracut installed, and if you don't like the notion of static +linking, you might wish to press an existing initrd into service: ------------------------------------------------------------------------ cd tools/testing/selftests/rcutorture @@ -11,22 +14,7 @@ zcat /initrd.img > /tmp/initrd.img.zcat mkdir initrd cd initrd cpio -id < /tmp/initrd.img.zcat ------------------------------------------------------------------------- - -Another way to create an initramfs image is using "dracut"[1], which is -available on many distros, however the initramfs dracut generates is a cpio -archive with another cpio archive in it, so an extra step is needed to create -the initrd directory hierarchy. - -Here are the commands to create a initrd directory for rcutorture using -dracut: - ------------------------------------------------------------------------- -dracut --no-hostonly --no-hostonly-cmdline --module "base bash shutdown" /tmp/initramfs.img -cd tools/testing/selftests/rcutorture -mkdir initrd -cd initrd -/usr/lib/dracut/skipcpio /tmp/initramfs.img | zcat | cpio -id < /tmp/initramfs.img +# Manually verify that initrd contains needed binaries and libraries. ------------------------------------------------------------------------ Interestingly enough, if you are running rcutorture, you don't really @@ -39,75 +27,12 @@ with 0755 mode. ------------------------------------------------------------------------ #!/bin/sh -[ -d /dev ] || mkdir -m 0755 /dev -[ -d /root ] || mkdir -m 0700 /root -[ -d /sys ] || mkdir /sys -[ -d /proc ] || mkdir /proc -[ -d /tmp ] || mkdir /tmp -mkdir -p /var/lock -mount -t sysfs -o nodev,noexec,nosuid sysfs /sys -mount -t proc -o nodev,noexec,nosuid proc /proc -# Some things don't work properly without /etc/mtab. -ln -sf /proc/mounts /etc/mtab - -# Note that this only becomes /dev on the real filesystem if udev's scripts -# are used; which they will be, but it's worth pointing out -if ! mount -t devtmpfs -o mode=0755 udev /dev; then - echo "W: devtmpfs not available, falling back to tmpfs for /dev" - mount -t tmpfs -o mode=0755 udev /dev - [ -e /dev/console ] || mknod --mode=600 /dev/console c 5 1 - [ -e /dev/kmsg ] || mknod --mode=644 /dev/kmsg c 1 11 - [ -e /dev/null ] || mknod --mode=666 /dev/null c 1 3 -fi - -mkdir /dev/pts -mount -t devpts -o noexec,nosuid,gid=5,mode=0620 devpts /dev/pts || true -mount -t tmpfs -o "nosuid,size=20%,mode=0755" tmpfs /run -mkdir /run/initramfs -# compatibility symlink for the pre-oneiric locations -ln -s /run/initramfs /dev/.initramfs - -# Export relevant variables -export ROOT= -export ROOTDELAY= -export ROOTFLAGS= -export ROOTFSTYPE= -export IP= -export BOOT= -export BOOTIF= -export UBIMTD= -export break= -export init=/sbin/init -export quiet=n -export readonly=y -export rootmnt=/root -export debug= -export panic= -export blacklist= -export resume= -export resume_offset= -export recovery= - -for i in /sys/devices/system/cpu/cpu*/online -do - case $i in - '/sys/devices/system/cpu/cpu0/online') - ;; - '/sys/devices/system/cpu/cpu*/online') - ;; - *) - echo 1 > $i - ;; - esac -done - while : do sleep 10 done ------------------------------------------------------------------------ -References: -[1]: https://dracut.wiki.kernel.org/index.php/Main_Page -[2]: http://blog.elastocloud.org/2015/06/rapid-linux-kernel-devtest-with-qemu.html -[3]: https://www.centos.org/forums/viewtopic.php?t=51621 +This approach also allows most of the binaries and libraries in the +initrd filesystem to be dispensed with, which can save significant +space in rcutorture's "res" directory. From 229ab0cb5be3bfbac5947df7240f6905470ca413 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 23 Aug 2018 15:23:23 -0700 Subject: [PATCH 016/112] rcutorture: Make initrd/init execute in userspace Currently, the initrd/init script and executable remain blocked almost all the time. However, it is necessary to test nohz_full userspace execution, which both variants of initrd/init fail to do. This commit therefore causes initrd/init to spend about a millisecond per second executing in userspace. Reported-by: Josh Triplett Signed-off-by: Paul E. McKenney --- .../selftests/rcutorture/bin/mkinitrd.sh | 43 +++++++++++++++++-- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh index 87a87ffeaa85a1..b48c504edfe12d 100755 --- a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh +++ b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh @@ -39,9 +39,22 @@ mkdir $T cat > $T/init << '__EOF___' #!/bin/sh +# Run in userspace a few milliseconds every second. This helps to +# exercise the NO_HZ_FULL portions of RCU. while : do - sleep 1000000 + q= + for i in \ + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a \ + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a \ + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a \ + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a \ + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a \ + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + do + q="$q $i" + done + sleep 1 done __EOF___ @@ -70,15 +83,37 @@ mkdir initrd cd initrd cat > init.c << '___EOF___' #include +#include + +volatile unsigned long delaycount; int main(int argc, int argv[]) { - for (;;) - sleep(1000*1000*1000); /* One gigasecond is ~30 years. */ + int i; + struct timeval tv; + struct timeval tvb; + + for (;;) { + sleep(1); + /* Need some userspace time. */ + if (gettimeofday(&tvb, NULL)) + continue; + do { + for (i = 0; i < 1000 * 100; i++) + delaycount = i * i; + if (gettimeofday(&tv, NULL)) + break; + tv.tv_sec -= tvb.tv_sec; + if (tv.tv_sec > 1) + break; + tv.tv_usec += tv.tv_sec * 1000 * 1000; + tv.tv_usec -= tvb.tv_usec; + } while (tv.tv_usec < 1000); + } return 0; } ___EOF___ -gcc -static -Os -o init init.c +cc -static -Os -o init init.c strip init rm init.c echo "Done creating a statically linked C-language initrd" From 70e9f504774b35aacd7b43d873b51ec5260e58ad Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Sep 2018 10:26:07 -0700 Subject: [PATCH 017/112] rcutorture: Add cross-compile capability to initrd.sh This adds the CROSS_COMPILE environment to the initrd.sh script's gcc command to enable cross compilation. Reported-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/mkinitrd.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh index b48c504edfe12d..70661457e3d64b 100755 --- a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh +++ b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh @@ -113,7 +113,7 @@ int main(int argc, int argv[]) return 0; } ___EOF___ -cc -static -Os -o init init.c +${CROSS_COMPILE}gcc -static -Os -o init init.c strip init rm init.c echo "Done creating a statically linked C-language initrd" From 18d7bf8ed3a1628ee653d3abde051703642ecd60 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sun, 9 Sep 2018 11:41:10 +0200 Subject: [PATCH 018/112] rcutorture: Always strip using the cross-compiler Strip using -s on the compiler command line instead of calling the "strip" utility as the latter isn't necessarily compatible with the target arch. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/mkinitrd.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh index 70661457e3d64b..dbb6f016028177 100755 --- a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh +++ b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh @@ -113,8 +113,7 @@ int main(int argc, int argv[]) return 0; } ___EOF___ -${CROSS_COMPILE}gcc -static -Os -o init init.c -strip init +${CROSS_COMPILE}gcc -s -static -Os -o init init.c rm init.c echo "Done creating a statically linked C-language initrd" From 825fa4cdfb10d8cbf784ebdadd6d5d93130a0cb5 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sun, 9 Sep 2018 11:46:48 +0200 Subject: [PATCH 019/112] rcutorture: Check initrd/init instead of initrd only If the build fails, we can end up with an empty initrd directory which prevents the build script from operating again. Better rely on the resulting init executable instead. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/mkinitrd.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh index dbb6f016028177..56a56ea06983dd 100755 --- a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh +++ b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh @@ -28,8 +28,8 @@ if [ ! -d "$D" ]; then echo >&2 "$D does not exist: Malformed kernel source tree?" exit 1 fi -if [ -d "$D/initrd" ]; then - echo "$D/initrd already exists, no need to create it" +if [ -s "$D/initrd/init" ]; then + echo "$D/initrd/init already exists, no need to create it" exit 0 fi @@ -65,7 +65,7 @@ then # Filesystem creation dracut --force --no-hostonly --no-hostonly-cmdline --module "base" $T/initramfs.img cd $D - mkdir initrd + mkdir -p initrd cd initrd zcat $T/initramfs.img | cpio -id cp $T/init init @@ -79,7 +79,7 @@ fi # future-proof than dracut. echo "Could not find dracut, attempting C initrd" cd $D -mkdir initrd +mkdir -p initrd cd initrd cat > init.c << '___EOF___' #include From 66b6f755ad45d354c5b74abd258f67aa8b40b3c7 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sun, 9 Sep 2018 13:26:04 +0200 Subject: [PATCH 020/112] rcutorture: Import a copy of nolibc This is a definition of the most common syscalls needed in minimalist init executables, allowing to statically build them with no external dependencies. It is sufficient in its current form to build rcutorture's init on x86_64, i386, arm, and arm64. Others have not been ported or tested. Updates may be found here : http://git.formilux.org/?p=people/willy/nolibc.git Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- .../testing/selftests/rcutorture/bin/nolibc.h | 2197 +++++++++++++++++ 1 file changed, 2197 insertions(+) create mode 100644 tools/testing/selftests/rcutorture/bin/nolibc.h diff --git a/tools/testing/selftests/rcutorture/bin/nolibc.h b/tools/testing/selftests/rcutorture/bin/nolibc.h new file mode 100644 index 00000000000000..f98f5b92d3eb31 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/nolibc.h @@ -0,0 +1,2197 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* nolibc.h + * Copyright (C) 2017-2018 Willy Tarreau + */ + +/* some archs (at least aarch64) don't expose the regular syscalls anymore by + * default, either because they have an "_at" replacement, or because there are + * more modern alternatives. For now we'd rather still use them. + */ +#define __ARCH_WANT_SYSCALL_NO_AT +#define __ARCH_WANT_SYSCALL_NO_FLAGS +#define __ARCH_WANT_SYSCALL_DEPRECATED + +#include +#include +#include +#include +#include + +#define NOLIBC + +/* Build a static executable this way : + * $ gcc -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \ + * -static -include nolibc.h -lgcc -o hello hello.c + * + * Useful calling convention table found here : + * http://man7.org/linux/man-pages/man2/syscall.2.html + * + * This doc is even better : + * https://w3challs.com/syscalls/ + */ + + +/* this way it will be removed if unused */ +static int errno; + +#ifndef NOLIBC_IGNORE_ERRNO +#define SET_ERRNO(v) do { errno = (v); } while (0) +#else +#define SET_ERRNO(v) do { } while (0) +#endif + +/* errno codes all ensure that they will not conflict with a valid pointer + * because they all correspond to the highest addressable memry page. + */ +#define MAX_ERRNO 4095 + +/* Declare a few quite common macros and types that usually are in stdlib.h, + * stdint.h, ctype.h, unistd.h and a few other common locations. + */ + +#define NULL ((void *)0) + +/* stdint types */ +typedef unsigned char uint8_t; +typedef signed char int8_t; +typedef unsigned short uint16_t; +typedef signed short int16_t; +typedef unsigned int uint32_t; +typedef signed int int32_t; +typedef unsigned long long uint64_t; +typedef signed long long int64_t; +typedef unsigned long size_t; +typedef signed long ssize_t; +typedef unsigned long uintptr_t; +typedef signed long intptr_t; +typedef signed long ptrdiff_t; + +/* for stat() */ +typedef unsigned int dev_t; +typedef unsigned long ino_t; +typedef unsigned int mode_t; +typedef signed int pid_t; +typedef unsigned int uid_t; +typedef unsigned int gid_t; +typedef unsigned long nlink_t; +typedef signed long off_t; +typedef signed long blksize_t; +typedef signed long blkcnt_t; +typedef signed long time_t; + +/* for poll() */ +struct pollfd { + int fd; + short int events; + short int revents; +}; + +/* for select() */ +struct timeval { + long tv_sec; + long tv_usec; +}; + +/* for pselect() */ +struct timespec { + long tv_sec; + long tv_nsec; +}; + +/* for gettimeofday() */ +struct timezone { + int tz_minuteswest; + int tz_dsttime; +}; + +/* for getdents64() */ +struct linux_dirent64 { + uint64_t d_ino; + int64_t d_off; + unsigned short d_reclen; + unsigned char d_type; + char d_name[]; +}; + +/* commonly an fd_set represents 256 FDs */ +#define FD_SETSIZE 256 +typedef struct { uint32_t fd32[FD_SETSIZE/32]; } fd_set; + +/* needed by wait4() */ +struct rusage { + struct timeval ru_utime; + struct timeval ru_stime; + long ru_maxrss; + long ru_ixrss; + long ru_idrss; + long ru_isrss; + long ru_minflt; + long ru_majflt; + long ru_nswap; + long ru_inblock; + long ru_oublock; + long ru_msgsnd; + long ru_msgrcv; + long ru_nsignals; + long ru_nvcsw; + long ru_nivcsw; +}; + +/* stat flags (WARNING, octal here) */ +#define S_IFDIR 0040000 +#define S_IFCHR 0020000 +#define S_IFBLK 0060000 +#define S_IFREG 0100000 +#define S_IFIFO 0010000 +#define S_IFLNK 0120000 +#define S_IFSOCK 0140000 +#define S_IFMT 0170000 + +#define S_ISDIR(mode) (((mode) & S_IFDIR) == S_IFDIR) +#define S_ISCHR(mode) (((mode) & S_IFCHR) == S_IFCHR) +#define S_ISBLK(mode) (((mode) & S_IFBLK) == S_IFBLK) +#define S_ISREG(mode) (((mode) & S_IFREG) == S_IFREG) +#define S_ISFIFO(mode) (((mode) & S_IFIFO) == S_IFIFO) +#define S_ISLNK(mode) (((mode) & S_IFLNK) == S_IFLNK) +#define S_ISSOCK(mode) (((mode) & S_IFSOCK) == S_IFSOCK) + +#define DT_UNKNOWN 0 +#define DT_FIFO 1 +#define DT_CHR 2 +#define DT_DIR 4 +#define DT_BLK 6 +#define DT_REG 8 +#define DT_LNK 10 +#define DT_SOCK 12 + +/* all the *at functions */ +#ifndef AT_FDWCD +#define AT_FDCWD -100 +#endif + +/* lseek */ +#define SEEK_SET 0 +#define SEEK_CUR 1 +#define SEEK_END 2 + +/* reboot */ +#define LINUX_REBOOT_MAGIC1 0xfee1dead +#define LINUX_REBOOT_MAGIC2 0x28121969 +#define LINUX_REBOOT_CMD_HALT 0xcdef0123 +#define LINUX_REBOOT_CMD_POWER_OFF 0x4321fedc +#define LINUX_REBOOT_CMD_RESTART 0x01234567 +#define LINUX_REBOOT_CMD_SW_SUSPEND 0xd000fce2 + + +/* The format of the struct as returned by the libc to the application, which + * significantly differs from the format returned by the stat() syscall flavours. + */ +struct stat { + dev_t st_dev; /* ID of device containing file */ + ino_t st_ino; /* inode number */ + mode_t st_mode; /* protection */ + nlink_t st_nlink; /* number of hard links */ + uid_t st_uid; /* user ID of owner */ + gid_t st_gid; /* group ID of owner */ + dev_t st_rdev; /* device ID (if special file) */ + off_t st_size; /* total size, in bytes */ + blksize_t st_blksize; /* blocksize for file system I/O */ + blkcnt_t st_blocks; /* number of 512B blocks allocated */ + time_t st_atime; /* time of last access */ + time_t st_mtime; /* time of last modification */ + time_t st_ctime; /* time of last status change */ +}; + +#define WEXITSTATUS(status) (((status) & 0xff00) >> 8) +#define WIFEXITED(status) (((status) & 0x7f) == 0) + + +/* Below comes the architecture-specific code. For each architecture, we have + * the syscall declarations and the _start code definition. This is the only + * global part. On all architectures the kernel puts everything in the stack + * before jumping to _start just above us, without any return address (_start + * is not a function but an entry pint). So at the stack pointer we find argc. + * Then argv[] begins, and ends at the first NULL. Then we have envp which + * starts and ends with a NULL as well. So envp=argv+argc+1. + */ + +#if defined(__x86_64__) +/* Syscalls for x86_64 : + * - registers are 64-bit + * - syscall number is passed in rax + * - arguments are in rdi, rsi, rdx, r10, r8, r9 respectively + * - the system call is performed by calling the syscall instruction + * - syscall return comes in rax + * - rcx and r8..r11 may be clobbered, others are preserved. + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + * - the syscall number is always specified last in order to allow to force + * some registers before (gcc refuses a %-register at the last position). + */ + +#define my_syscall0(num) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a" (_ret) \ + : "0"(_num) \ + : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a" (_ret) \ + : "r"(_arg1), \ + "0"(_num) \ + : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + register long _arg2 asm("rsi") = (long)(arg2); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), \ + "0"(_num) \ + : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + register long _arg2 asm("rsi") = (long)(arg2); \ + register long _arg3 asm("rdx") = (long)(arg3); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ + "0"(_num) \ + : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + register long _arg2 asm("rsi") = (long)(arg2); \ + register long _arg3 asm("rdx") = (long)(arg3); \ + register long _arg4 asm("r10") = (long)(arg4); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a" (_ret), "=r"(_arg4) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ + "0"(_num) \ + : "rcx", "r8", "r9", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + register long _arg2 asm("rsi") = (long)(arg2); \ + register long _arg3 asm("rdx") = (long)(arg3); \ + register long _arg4 asm("r10") = (long)(arg4); \ + register long _arg5 asm("r8") = (long)(arg5); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a" (_ret), "=r"(_arg4), "=r"(_arg5) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "0"(_num) \ + : "rcx", "r9", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + register long _arg2 asm("rsi") = (long)(arg2); \ + register long _arg3 asm("rdx") = (long)(arg3); \ + register long _arg4 asm("r10") = (long)(arg4); \ + register long _arg5 asm("r8") = (long)(arg5); \ + register long _arg6 asm("r9") = (long)(arg6); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a" (_ret), "=r"(_arg4), "=r"(_arg5) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_arg6), "0"(_num) \ + : "rcx", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +/* startup code */ +asm(".section .text\n" + ".global _start\n" + "_start:\n" + "pop %rdi\n" // argc (first arg, %rdi) + "mov %rsp, %rsi\n" // argv[] (second arg, %rsi) + "lea 8(%rsi,%rdi,8),%rdx\n" // then a NULL then envp (third arg, %rdx) + "and $-16, %rsp\n" // x86 ABI : esp must be 16-byte aligned when + "sub $8, %rsp\n" // entering the callee + "call main\n" // main() returns the status code, we'll exit with it. + "movzb %al, %rdi\n" // retrieve exit code from 8 lower bits + "mov $60, %rax\n" // NR_exit == 60 + "syscall\n" // really exit + "hlt\n" // ensure it does not return + ""); + +/* fcntl / open */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_CREAT 0x40 +#define O_EXCL 0x80 +#define O_NOCTTY 0x100 +#define O_TRUNC 0x200 +#define O_APPEND 0x400 +#define O_NONBLOCK 0x800 +#define O_DIRECTORY 0x10000 + +/* The struct returned by the stat() syscall, equivalent to stat64(). The + * syscall returns 116 bytes and stops in the middle of __unused. + */ +struct sys_stat_struct { + unsigned long st_dev; + unsigned long st_ino; + unsigned long st_nlink; + unsigned int st_mode; + unsigned int st_uid; + + unsigned int st_gid; + unsigned int __pad0; + unsigned long st_rdev; + long st_size; + long st_blksize; + + long st_blocks; + unsigned long st_atime; + unsigned long st_atime_nsec; + unsigned long st_mtime; + + unsigned long st_mtime_nsec; + unsigned long st_ctime; + unsigned long st_ctime_nsec; + long __unused[3]; +}; + +#elif defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) +/* Syscalls for i386 : + * - mostly similar to x86_64 + * - registers are 32-bit + * - syscall number is passed in eax + * - arguments are in ebx, ecx, edx, esi, edi, ebp respectively + * - all registers are preserved (except eax of course) + * - the system call is performed by calling int $0x80 + * - syscall return comes in eax + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + * - the syscall number is always specified last in order to allow to force + * some registers before (gcc refuses a %-register at the last position). + * + * Also, i386 supports the old_select syscall if newselect is not available + */ +#define __ARCH_WANT_SYS_OLD_SELECT + +#define my_syscall0(num) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + register long _arg1 asm("ebx") = (long)(arg1); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + register long _arg1 asm("ebx") = (long)(arg1); \ + register long _arg2 asm("ecx") = (long)(arg2); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + register long _arg1 asm("ebx") = (long)(arg1); \ + register long _arg2 asm("ecx") = (long)(arg2); \ + register long _arg3 asm("edx") = (long)(arg3); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + register long _arg1 asm("ebx") = (long)(arg1); \ + register long _arg2 asm("ecx") = (long)(arg2); \ + register long _arg3 asm("edx") = (long)(arg3); \ + register long _arg4 asm("esi") = (long)(arg4); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + register long _arg1 asm("ebx") = (long)(arg1); \ + register long _arg2 asm("ecx") = (long)(arg2); \ + register long _arg3 asm("edx") = (long)(arg3); \ + register long _arg4 asm("esi") = (long)(arg4); \ + register long _arg5 asm("edi") = (long)(arg5); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +/* startup code */ +asm(".section .text\n" + ".global _start\n" + "_start:\n" + "pop %eax\n" // argc (first arg, %eax) + "mov %esp, %ebx\n" // argv[] (second arg, %ebx) + "lea 4(%ebx,%eax,4),%ecx\n" // then a NULL then envp (third arg, %ecx) + "and $-16, %esp\n" // x86 ABI : esp must be 16-byte aligned when + "push %ecx\n" // push all registers on the stack so that we + "push %ebx\n" // support both regparm and plain stack modes + "push %eax\n" + "call main\n" // main() returns the status code in %eax + "movzbl %al, %ebx\n" // retrieve exit code from lower 8 bits + "movl $1, %eax\n" // NR_exit == 1 + "int $0x80\n" // exit now + "hlt\n" // ensure it does not + ""); + +/* fcntl / open */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_CREAT 0x40 +#define O_EXCL 0x80 +#define O_NOCTTY 0x100 +#define O_TRUNC 0x200 +#define O_APPEND 0x400 +#define O_NONBLOCK 0x800 +#define O_DIRECTORY 0x10000 + +/* The struct returned by the stat() syscall, 32-bit only, the syscall returns + * exactly 56 bytes (stops before the unused array). + */ +struct sys_stat_struct { + unsigned long st_dev; + unsigned long st_ino; + unsigned short st_mode; + unsigned short st_nlink; + unsigned short st_uid; + unsigned short st_gid; + + unsigned long st_rdev; + unsigned long st_size; + unsigned long st_blksize; + unsigned long st_blocks; + + unsigned long st_atime; + unsigned long st_atime_nsec; + unsigned long st_mtime; + unsigned long st_mtime_nsec; + + unsigned long st_ctime; + unsigned long st_ctime_nsec; + unsigned long __unused[2]; +}; + +#elif defined(__ARM_EABI__) +/* Syscalls for ARM in ARM or Thumb modes : + * - registers are 32-bit + * - stack is 8-byte aligned + * ( http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.faqs/ka4127.html) + * - syscall number is passed in r7 + * - arguments are in r0, r1, r2, r3, r4, r5 + * - the system call is performed by calling svc #0 + * - syscall return comes in r0. + * - only lr is clobbered. + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + * - the syscall number is always specified last in order to allow to force + * some registers before (gcc refuses a %-register at the last position). + * + * Also, ARM supports the old_select syscall if newselect is not available + */ +#define __ARCH_WANT_SYS_OLD_SELECT + +#define my_syscall0(num) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0"); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0") = (long)(arg1); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0") = (long)(arg1); \ + register long _arg2 asm("r1") = (long)(arg2); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0") = (long)(arg1); \ + register long _arg2 asm("r1") = (long)(arg2); \ + register long _arg3 asm("r2") = (long)(arg3); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0") = (long)(arg1); \ + register long _arg2 asm("r1") = (long)(arg2); \ + register long _arg3 asm("r2") = (long)(arg3); \ + register long _arg4 asm("r3") = (long)(arg4); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0") = (long)(arg1); \ + register long _arg2 asm("r1") = (long)(arg2); \ + register long _arg3 asm("r2") = (long)(arg3); \ + register long _arg4 asm("r3") = (long)(arg4); \ + register long _arg5 asm("r4") = (long)(arg5); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r" (_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +/* startup code */ +asm(".section .text\n" + ".global _start\n" + "_start:\n" +#if defined(__THUMBEB__) || defined(__THUMBEL__) + /* We enter here in 32-bit mode but if some previous functions were in + * 16-bit mode, the assembler cannot know, so we need to tell it we're in + * 32-bit now, then switch to 16-bit (is there a better way to do it than + * adding 1 by hand ?) and tell the asm we're now in 16-bit mode so that + * it generates correct instructions. Note that we do not support thumb1. + */ + ".code 32\n" + "add r0, pc, #1\n" + "bx r0\n" + ".code 16\n" +#endif + "pop {%r0}\n" // argc was in the stack + "mov %r1, %sp\n" // argv = sp + "add %r2, %r1, %r0, lsl #2\n" // envp = argv + 4*argc ... + "add %r2, %r2, $4\n" // ... + 4 + "and %r3, %r1, $-8\n" // AAPCS : sp must be 8-byte aligned in the + "mov %sp, %r3\n" // callee, an bl doesn't push (lr=pc) + "bl main\n" // main() returns the status code, we'll exit with it. + "and %r0, %r0, $0xff\n" // limit exit code to 8 bits + "movs r7, $1\n" // NR_exit == 1 + "svc $0x00\n" + ""); + +/* fcntl / open */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_CREAT 0x40 +#define O_EXCL 0x80 +#define O_NOCTTY 0x100 +#define O_TRUNC 0x200 +#define O_APPEND 0x400 +#define O_NONBLOCK 0x800 +#define O_DIRECTORY 0x4000 + +/* The struct returned by the stat() syscall, 32-bit only, the syscall returns + * exactly 56 bytes (stops before the unused array). In big endian, the format + * differs as devices are returned as short only. + */ +struct sys_stat_struct { +#if defined(__ARMEB__) + unsigned short st_dev; + unsigned short __pad1; +#else + unsigned long st_dev; +#endif + unsigned long st_ino; + unsigned short st_mode; + unsigned short st_nlink; + unsigned short st_uid; + unsigned short st_gid; +#if defined(__ARMEB__) + unsigned short st_rdev; + unsigned short __pad2; +#else + unsigned long st_rdev; +#endif + unsigned long st_size; + unsigned long st_blksize; + unsigned long st_blocks; + unsigned long st_atime; + unsigned long st_atime_nsec; + unsigned long st_mtime; + unsigned long st_mtime_nsec; + unsigned long st_ctime; + unsigned long st_ctime_nsec; + unsigned long __unused[2]; +}; + +#elif defined(__aarch64__) +/* Syscalls for AARCH64 : + * - registers are 64-bit + * - stack is 16-byte aligned + * - syscall number is passed in x8 + * - arguments are in x0, x1, x2, x3, x4, x5 + * - the system call is performed by calling svc 0 + * - syscall return comes in x0. + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + * + * On aarch64, select() is not implemented so we have to use pselect6(). + */ +#define __ARCH_WANT_SYS_PSELECT6 + +#define my_syscall0(num) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0"); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + register long _arg2 asm("x1") = (long)(arg2); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + register long _arg2 asm("x1") = (long)(arg2); \ + register long _arg3 asm("x2") = (long)(arg3); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + register long _arg2 asm("x1") = (long)(arg2); \ + register long _arg3 asm("x2") = (long)(arg3); \ + register long _arg4 asm("x3") = (long)(arg4); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + register long _arg2 asm("x1") = (long)(arg2); \ + register long _arg3 asm("x2") = (long)(arg3); \ + register long _arg4 asm("x3") = (long)(arg4); \ + register long _arg5 asm("x4") = (long)(arg5); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r" (_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + register long _arg2 asm("x1") = (long)(arg2); \ + register long _arg3 asm("x2") = (long)(arg3); \ + register long _arg4 asm("x3") = (long)(arg4); \ + register long _arg5 asm("x4") = (long)(arg5); \ + register long _arg6 asm("x5") = (long)(arg6); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r" (_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_arg6), "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +/* startup code */ +asm(".section .text\n" + ".global _start\n" + "_start:\n" + "ldr x0, [sp]\n" // argc (x0) was in the stack + "add x1, sp, 8\n" // argv (x1) = sp + "lsl x2, x0, 3\n" // envp (x2) = 8*argc ... + "add x2, x2, 8\n" // + 8 (skip null) + "add x2, x2, x1\n" // + argv + "and sp, x1, -16\n" // sp must be 16-byte aligned in the callee + "bl main\n" // main() returns the status code, we'll exit with it. + "and x0, x0, 0xff\n" // limit exit code to 8 bits + "mov x8, 93\n" // NR_exit == 93 + "svc #0\n" + ""); + +/* fcntl / open */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_CREAT 0x40 +#define O_EXCL 0x80 +#define O_NOCTTY 0x100 +#define O_TRUNC 0x200 +#define O_APPEND 0x400 +#define O_NONBLOCK 0x800 +#define O_DIRECTORY 0x4000 + +/* The struct returned by the newfstatat() syscall. Differs slightly from the + * x86_64's stat one by field ordering, so be careful. + */ +struct sys_stat_struct { + unsigned long st_dev; + unsigned long st_ino; + unsigned int st_mode; + unsigned int st_nlink; + unsigned int st_uid; + unsigned int st_gid; + + unsigned long st_rdev; + unsigned long __pad1; + long st_size; + int st_blksize; + int __pad2; + + long st_blocks; + long st_atime; + unsigned long st_atime_nsec; + long st_mtime; + + unsigned long st_mtime_nsec; + long st_ctime; + unsigned long st_ctime_nsec; + unsigned int __unused[2]; +}; + +#elif defined(__mips__) && defined(_ABIO32) +/* Syscalls for MIPS ABI O32 : + * - WARNING! there's always a delayed slot! + * - WARNING again, the syntax is different, registers take a '$' and numbers + * do not. + * - registers are 32-bit + * - stack is 8-byte aligned + * - syscall number is passed in v0 (starts at 0xfa0). + * - arguments are in a0, a1, a2, a3, then the stack. The caller needs to + * leave some room in the stack for the callee to save a0..a3 if needed. + * - Many registers are clobbered, in fact only a0..a2 and s0..s8 are + * preserved. See: https://www.linux-mips.org/wiki/Syscall as well as + * scall32-o32.S in the kernel sources. + * - the system call is performed by calling "syscall" + * - syscall return comes in v0, and register a3 needs to be checked to know + * if an error occured, in which case errno is in v0. + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + */ + +#define my_syscall0(num) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg4 asm("a3"); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r"(_num), "=r"(_arg4) \ + : "r"(_num) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg4 asm("a3"); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r"(_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg2 asm("a1") = (long)(arg2); \ + register long _arg4 asm("a3"); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r"(_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1), "r"(_arg2) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg2 asm("a1") = (long)(arg2); \ + register long _arg3 asm("a2") = (long)(arg3); \ + register long _arg4 asm("a3"); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r"(_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1), "r"(_arg2), "r"(_arg3) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg2 asm("a1") = (long)(arg2); \ + register long _arg3 asm("a2") = (long)(arg3); \ + register long _arg4 asm("a3") = (long)(arg4); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r" (_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg2 asm("a1") = (long)(arg2); \ + register long _arg3 asm("a2") = (long)(arg3); \ + register long _arg4 asm("a3") = (long)(arg4); \ + register long _arg5 = (long)(arg5); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "sw %7, 16($sp)\n" \ + "syscall\n " \ + "addiu $sp, $sp, 32\n" \ + : "=r" (_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + \ + ); \ + _arg4 ? -_num : _num; \ +}) + +/* startup code, note that it's called __start on MIPS */ +asm(".section .text\n" + ".set nomips16\n" + ".global __start\n" + ".set noreorder\n" + ".option pic0\n" + ".ent __start\n" + "__start:\n" + "lw $a0,($sp)\n" // argc was in the stack + "addiu $a1, $sp, 4\n" // argv = sp + 4 + "sll $a2, $a0, 2\n" // a2 = argc * 4 + "add $a2, $a2, $a1\n" // envp = argv + 4*argc ... + "addiu $a2, $a2, 4\n" // ... + 4 + "li $t0, -8\n" + "and $sp, $sp, $t0\n" // sp must be 8-byte aligned + "addiu $sp,$sp,-16\n" // the callee expects to save a0..a3 there! + "jal main\n" // main() returns the status code, we'll exit with it. + "nop\n" // delayed slot + "and $a0, $v0, 0xff\n" // limit exit code to 8 bits + "li $v0, 4001\n" // NR_exit == 4001 + "syscall\n" + ".end __start\n" + ""); + +/* fcntl / open */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_APPEND 0x0008 +#define O_NONBLOCK 0x0080 +#define O_CREAT 0x0100 +#define O_TRUNC 0x0200 +#define O_EXCL 0x0400 +#define O_NOCTTY 0x0800 +#define O_DIRECTORY 0x10000 + +/* The struct returned by the stat() syscall. 88 bytes are returned by the + * syscall. + */ +struct sys_stat_struct { + unsigned int st_dev; + long st_pad1[3]; + unsigned long st_ino; + unsigned int st_mode; + unsigned int st_nlink; + unsigned int st_uid; + unsigned int st_gid; + unsigned int st_rdev; + long st_pad2[2]; + long st_size; + long st_pad3; + long st_atime; + long st_atime_nsec; + long st_mtime; + long st_mtime_nsec; + long st_ctime; + long st_ctime_nsec; + long st_blksize; + long st_blocks; + long st_pad4[14]; +}; + +#endif + + +/* Below are the C functions used to declare the raw syscalls. They try to be + * architecture-agnostic, and return either a success or -errno. Declaring them + * static will lead to them being inlined in most cases, but it's still possible + * to reference them by a pointer if needed. + */ +static __attribute__((unused)) +void *sys_brk(void *addr) +{ + return (void *)my_syscall1(__NR_brk, addr); +} + +static __attribute__((noreturn,unused)) +void sys_exit(int status) +{ + my_syscall1(__NR_exit, status & 255); + while(1); // shut the "noreturn" warnings. +} + +static __attribute__((unused)) +int sys_chdir(const char *path) +{ + return my_syscall1(__NR_chdir, path); +} + +static __attribute__((unused)) +int sys_chmod(const char *path, mode_t mode) +{ +#ifdef __NR_fchmodat + return my_syscall4(__NR_fchmodat, AT_FDCWD, path, mode, 0); +#else + return my_syscall2(__NR_chmod, path, mode); +#endif +} + +static __attribute__((unused)) +int sys_chown(const char *path, uid_t owner, gid_t group) +{ +#ifdef __NR_fchownat + return my_syscall5(__NR_fchownat, AT_FDCWD, path, owner, group, 0); +#else + return my_syscall3(__NR_chown, path, owner, group); +#endif +} + +static __attribute__((unused)) +int sys_chroot(const char *path) +{ + return my_syscall1(__NR_chroot, path); +} + +static __attribute__((unused)) +int sys_close(int fd) +{ + return my_syscall1(__NR_close, fd); +} + +static __attribute__((unused)) +int sys_dup(int fd) +{ + return my_syscall1(__NR_dup, fd); +} + +static __attribute__((unused)) +int sys_dup2(int old, int new) +{ + return my_syscall2(__NR_dup2, old, new); +} + +static __attribute__((unused)) +int sys_execve(const char *filename, char *const argv[], char *const envp[]) +{ + return my_syscall3(__NR_execve, filename, argv, envp); +} + +static __attribute__((unused)) +pid_t sys_fork(void) +{ + return my_syscall0(__NR_fork); +} + +static __attribute__((unused)) +int sys_fsync(int fd) +{ + return my_syscall1(__NR_fsync, fd); +} + +static __attribute__((unused)) +int sys_getdents64(int fd, struct linux_dirent64 *dirp, int count) +{ + return my_syscall3(__NR_getdents64, fd, dirp, count); +} + +static __attribute__((unused)) +pid_t sys_getpgrp(void) +{ + return my_syscall0(__NR_getpgrp); +} + +static __attribute__((unused)) +pid_t sys_getpid(void) +{ + return my_syscall0(__NR_getpid); +} + +static __attribute__((unused)) +int sys_gettimeofday(struct timeval *tv, struct timezone *tz) +{ + return my_syscall2(__NR_gettimeofday, tv, tz); +} + +static __attribute__((unused)) +int sys_ioctl(int fd, unsigned long req, void *value) +{ + return my_syscall3(__NR_ioctl, fd, req, value); +} + +static __attribute__((unused)) +int sys_kill(pid_t pid, int signal) +{ + return my_syscall2(__NR_kill, pid, signal); +} + +static __attribute__((unused)) +int sys_link(const char *old, const char *new) +{ +#ifdef __NR_linkat + return my_syscall5(__NR_linkat, AT_FDCWD, old, AT_FDCWD, new, 0); +#else + return my_syscall2(__NR_link, old, new); +#endif +} + +static __attribute__((unused)) +off_t sys_lseek(int fd, off_t offset, int whence) +{ + return my_syscall3(__NR_lseek, fd, offset, whence); +} + +static __attribute__((unused)) +int sys_mkdir(const char *path, mode_t mode) +{ +#ifdef __NR_mkdirat + return my_syscall3(__NR_mkdirat, AT_FDCWD, path, mode); +#else + return my_syscall2(__NR_mkdir, path, mode); +#endif +} + +static __attribute__((unused)) +long sys_mknod(const char *path, mode_t mode, dev_t dev) +{ +#ifdef __NR_mknodat + return my_syscall4(__NR_mknodat, AT_FDCWD, path, mode, dev); +#else + return my_syscall3(__NR_mknod, path, mode, dev); +#endif +} + +static __attribute__((unused)) +int sys_mount(const char *src, const char *tgt, const char *fst, + unsigned long flags, const void *data) +{ + return my_syscall5(__NR_mount, src, tgt, fst, flags, data); +} + +static __attribute__((unused)) +int sys_open(const char *path, int flags, mode_t mode) +{ +#ifdef __NR_openat + return my_syscall4(__NR_openat, AT_FDCWD, path, flags, mode); +#else + return my_syscall3(__NR_open, path, flags, mode); +#endif +} + +static __attribute__((unused)) +int sys_pivot_root(const char *new, const char *old) +{ + return my_syscall2(__NR_pivot_root, new, old); +} + +static __attribute__((unused)) +int sys_poll(struct pollfd *fds, int nfds, int timeout) +{ + return my_syscall3(__NR_poll, fds, nfds, timeout); +} + +static __attribute__((unused)) +ssize_t sys_read(int fd, void *buf, size_t count) +{ + return my_syscall3(__NR_read, fd, buf, count); +} + +static __attribute__((unused)) +ssize_t sys_reboot(int magic1, int magic2, int cmd, void *arg) +{ + return my_syscall4(__NR_reboot, magic1, magic2, cmd, arg); +} + +static __attribute__((unused)) +int sys_sched_yield(void) +{ + return my_syscall0(__NR_sched_yield); +} + +static __attribute__((unused)) +int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) +{ +#if defined(__ARCH_WANT_SYS_OLD_SELECT) && !defined(__NR__newselect) + struct sel_arg_struct { + unsigned long n; + fd_set *r, *w, *e; + struct timeval *t; + } arg = { .n = nfds, .r = rfds, .w = wfds, .e = efds, .t = timeout }; + return my_syscall1(__NR_select, &arg); +#elif defined(__ARCH_WANT_SYS_PSELECT6) && defined(__NR_pselect6) + struct timespec t; + + if (timeout) { + t.tv_sec = timeout->tv_sec; + t.tv_nsec = timeout->tv_usec * 1000; + } + return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); +#else +#ifndef __NR__newselect +#define __NR__newselect __NR_select +#endif + return my_syscall5(__NR__newselect, nfds, rfds, wfds, efds, timeout); +#endif +} + +static __attribute__((unused)) +int sys_setpgid(pid_t pid, pid_t pgid) +{ + return my_syscall2(__NR_setpgid, pid, pgid); +} + +static __attribute__((unused)) +pid_t sys_setsid(void) +{ + return my_syscall0(__NR_setsid); +} + +static __attribute__((unused)) +int sys_stat(const char *path, struct stat *buf) +{ + struct sys_stat_struct stat; + long ret; + +#ifdef __NR_newfstatat + /* only solution for arm64 */ + ret = my_syscall4(__NR_newfstatat, AT_FDCWD, path, &stat, 0); +#else + ret = my_syscall2(__NR_stat, path, &stat); +#endif + buf->st_dev = stat.st_dev; + buf->st_ino = stat.st_ino; + buf->st_mode = stat.st_mode; + buf->st_nlink = stat.st_nlink; + buf->st_uid = stat.st_uid; + buf->st_gid = stat.st_gid; + buf->st_rdev = stat.st_rdev; + buf->st_size = stat.st_size; + buf->st_blksize = stat.st_blksize; + buf->st_blocks = stat.st_blocks; + buf->st_atime = stat.st_atime; + buf->st_mtime = stat.st_mtime; + buf->st_ctime = stat.st_ctime; + return ret; +} + + +static __attribute__((unused)) +int sys_symlink(const char *old, const char *new) +{ +#ifdef __NR_symlinkat + return my_syscall3(__NR_symlinkat, old, AT_FDCWD, new); +#else + return my_syscall2(__NR_symlink, old, new); +#endif +} + +static __attribute__((unused)) +mode_t sys_umask(mode_t mode) +{ + return my_syscall1(__NR_umask, mode); +} + +static __attribute__((unused)) +int sys_umount2(const char *path, int flags) +{ + return my_syscall2(__NR_umount2, path, flags); +} + +static __attribute__((unused)) +int sys_unlink(const char *path) +{ +#ifdef __NR_unlinkat + return my_syscall3(__NR_unlinkat, AT_FDCWD, path, 0); +#else + return my_syscall1(__NR_unlink, path); +#endif +} + +static __attribute__((unused)) +pid_t sys_wait4(pid_t pid, int *status, int options, struct rusage *rusage) +{ + return my_syscall4(__NR_wait4, pid, status, options, rusage); +} + +static __attribute__((unused)) +pid_t sys_waitpid(pid_t pid, int *status, int options) +{ + return sys_wait4(pid, status, options, 0); +} + +static __attribute__((unused)) +pid_t sys_wait(int *status) +{ + return sys_waitpid(-1, status, 0); +} + +static __attribute__((unused)) +ssize_t sys_write(int fd, const void *buf, size_t count) +{ + return my_syscall3(__NR_write, fd, buf, count); +} + + +/* Below are the libc-compatible syscalls which return x or -1 and set errno. + * They rely on the functions above. Similarly they're marked static so that it + * is possible to assign pointers to them if needed. + */ + +static __attribute__((unused)) +int brk(void *addr) +{ + void *ret = sys_brk(addr); + + if (!ret) { + SET_ERRNO(ENOMEM); + return -1; + } + return 0; +} + +static __attribute__((noreturn,unused)) +void exit(int status) +{ + sys_exit(status); +} + +static __attribute__((unused)) +int chdir(const char *path) +{ + int ret = sys_chdir(path); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int chmod(const char *path, mode_t mode) +{ + int ret = sys_chmod(path, mode); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int chown(const char *path, uid_t owner, gid_t group) +{ + int ret = sys_chown(path, owner, group); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int chroot(const char *path) +{ + int ret = sys_chroot(path); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int close(int fd) +{ + int ret = sys_close(fd); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int dup2(int old, int new) +{ + int ret = sys_dup2(old, new); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int execve(const char *filename, char *const argv[], char *const envp[]) +{ + int ret = sys_execve(filename, argv, envp); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +pid_t fork(void) +{ + pid_t ret = sys_fork(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int fsync(int fd) +{ + int ret = sys_fsync(fd); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int getdents64(int fd, struct linux_dirent64 *dirp, int count) +{ + int ret = sys_getdents64(fd, dirp, count); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +pid_t getpgrp(void) +{ + pid_t ret = sys_getpgrp(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +pid_t getpid(void) +{ + pid_t ret = sys_getpid(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int gettimeofday(struct timeval *tv, struct timezone *tz) +{ + int ret = sys_gettimeofday(tv, tz); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int ioctl(int fd, unsigned long req, void *value) +{ + int ret = sys_ioctl(fd, req, value); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int kill(pid_t pid, int signal) +{ + int ret = sys_kill(pid, signal); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int link(const char *old, const char *new) +{ + int ret = sys_link(old, new); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +off_t lseek(int fd, off_t offset, int whence) +{ + off_t ret = sys_lseek(fd, offset, whence); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int mkdir(const char *path, mode_t mode) +{ + int ret = sys_mkdir(path, mode); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int mknod(const char *path, mode_t mode, dev_t dev) +{ + int ret = sys_mknod(path, mode, dev); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int mount(const char *src, const char *tgt, + const char *fst, unsigned long flags, + const void *data) +{ + int ret = sys_mount(src, tgt, fst, flags, data); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int open(const char *path, int flags, mode_t mode) +{ + int ret = sys_open(path, flags, mode); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int pivot_root(const char *new, const char *old) +{ + int ret = sys_pivot_root(new, old); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int poll(struct pollfd *fds, int nfds, int timeout) +{ + int ret = sys_poll(fds, nfds, timeout); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +ssize_t read(int fd, void *buf, size_t count) +{ + ssize_t ret = sys_read(fd, buf, count); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int reboot(int cmd) +{ + int ret = sys_reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, cmd, 0); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +void *sbrk(intptr_t inc) +{ + void *ret; + + /* first call to find current end */ + if ((ret = sys_brk(0)) && (sys_brk(ret + inc) == ret + inc)) + return ret + inc; + + SET_ERRNO(ENOMEM); + return (void *)-1; +} + +static __attribute__((unused)) +int sched_yield(void) +{ + int ret = sys_sched_yield(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) +{ + int ret = sys_select(nfds, rfds, wfds, efds, timeout); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int setpgid(pid_t pid, pid_t pgid) +{ + int ret = sys_setpgid(pid, pgid); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +pid_t setsid(void) +{ + pid_t ret = sys_setsid(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +unsigned int sleep(unsigned int seconds) +{ + struct timeval my_timeval = { seconds, 0 }; + + if (sys_select(0, 0, 0, 0, &my_timeval) < 0) + return my_timeval.tv_sec + !!my_timeval.tv_usec; + else + return 0; +} + +static __attribute__((unused)) +int stat(const char *path, struct stat *buf) +{ + int ret = sys_stat(path, buf); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int symlink(const char *old, const char *new) +{ + int ret = sys_symlink(old, new); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int tcsetpgrp(int fd, pid_t pid) +{ + return ioctl(fd, TIOCSPGRP, &pid); +} + +static __attribute__((unused)) +mode_t umask(mode_t mode) +{ + return sys_umask(mode); +} + +static __attribute__((unused)) +int umount2(const char *path, int flags) +{ + int ret = sys_umount2(path, flags); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int unlink(const char *path) +{ + int ret = sys_unlink(path); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +pid_t wait4(pid_t pid, int *status, int options, struct rusage *rusage) +{ + pid_t ret = sys_wait4(pid, status, options, rusage); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +pid_t waitpid(pid_t pid, int *status, int options) +{ + pid_t ret = sys_waitpid(pid, status, options); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +pid_t wait(int *status) +{ + pid_t ret = sys_wait(status); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +ssize_t write(int fd, const void *buf, size_t count) +{ + ssize_t ret = sys_write(fd, buf, count); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +/* some size-optimized reimplementations of a few common str* and mem* + * functions. They're marked static, except memcpy() and raise() which are used + * by libgcc on ARM, so they are marked weak instead in order not to cause an + * error when building a program made of multiple files (not recommended). + */ + +static __attribute__((unused)) +void *memmove(void *dst, const void *src, size_t len) +{ + ssize_t pos = (dst <= src) ? -1 : (long)len; + void *ret = dst; + + while (len--) { + pos += (dst <= src) ? 1 : -1; + ((char *)dst)[pos] = ((char *)src)[pos]; + } + return ret; +} + +static __attribute__((unused)) +void *memset(void *dst, int b, size_t len) +{ + char *p = dst; + + while (len--) + *(p++) = b; + return dst; +} + +static __attribute__((unused)) +int memcmp(const void *s1, const void *s2, size_t n) +{ + size_t ofs = 0; + char c1 = 0; + + while (ofs < n && !(c1 = ((char *)s1)[ofs] - ((char *)s2)[ofs])) { + ofs++; + } + return c1; +} + +static __attribute__((unused)) +char *strcpy(char *dst, const char *src) +{ + char *ret = dst; + + while ((*dst++ = *src++)); + return ret; +} + +static __attribute__((unused)) +char *strchr(const char *s, int c) +{ + while (*s) { + if (*s == (char)c) + return (char *)s; + s++; + } + return NULL; +} + +static __attribute__((unused)) +char *strrchr(const char *s, int c) +{ + const char *ret = NULL; + + while (*s) { + if (*s == (char)c) + ret = s; + s++; + } + return (char *)ret; +} + +static __attribute__((unused)) +size_t nolibc_strlen(const char *str) +{ + size_t len; + + for (len = 0; str[len]; len++); + return len; +} + +#define strlen(str) ({ \ + __builtin_constant_p((str)) ? \ + __builtin_strlen((str)) : \ + nolibc_strlen((str)); \ +}) + +static __attribute__((unused)) +int isdigit(int c) +{ + return (unsigned int)(c - '0') <= 9; +} + +static __attribute__((unused)) +long atol(const char *s) +{ + unsigned long ret = 0; + unsigned long d; + int neg = 0; + + if (*s == '-') { + neg = 1; + s++; + } + + while (1) { + d = (*s++) - '0'; + if (d > 9) + break; + ret *= 10; + ret += d; + } + + return neg ? -ret : ret; +} + +static __attribute__((unused)) +int atoi(const char *s) +{ + return atol(s); +} + +static __attribute__((unused)) +const char *ltoa(long in) +{ + /* large enough for -9223372036854775808 */ + static char buffer[21]; + char *pos = buffer + sizeof(buffer) - 1; + int neg = in < 0; + unsigned long n = neg ? -in : in; + + *pos-- = '\0'; + do { + *pos-- = '0' + n % 10; + n /= 10; + if (pos < buffer) + return pos + 1; + } while (n); + + if (neg) + *pos-- = '-'; + return pos + 1; +} + +__attribute__((weak,unused)) +void *memcpy(void *dst, const void *src, size_t len) +{ + return memmove(dst, src, len); +} + +/* needed by libgcc for divide by zero */ +__attribute__((weak,unused)) +int raise(int signal) +{ + return kill(getpid(), signal); +} + +/* Here come a few helper functions */ + +static __attribute__((unused)) +void FD_ZERO(fd_set *set) +{ + memset(set, 0, sizeof(*set)); +} + +static __attribute__((unused)) +void FD_SET(int fd, fd_set *set) +{ + if (fd < 0 || fd >= FD_SETSIZE) + return; + set->fd32[fd / 32] |= 1 << (fd & 31); +} + +/* WARNING, it only deals with the 4096 first majors and 256 first minors */ +static __attribute__((unused)) +dev_t makedev(unsigned int major, unsigned int minor) +{ + return ((major & 0xfff) << 8) | (minor & 0xff); +} From b94ec36896dafc0a12106b1536fe87f99e9a0c5d Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sun, 9 Sep 2018 13:33:02 +0200 Subject: [PATCH 021/112] rcutorture: Make use of nolibc when available This reduces the size of the init executable from ~800 kB to ~800 bytes on x86_64. This is only implemented for x86_64, i386, arm and arm64. Others not tested. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- .../selftests/rcutorture/bin/mkinitrd.sh | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh index 56a56ea06983dd..da298394daa244 100755 --- a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh +++ b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh @@ -82,8 +82,10 @@ cd $D mkdir -p initrd cd initrd cat > init.c << '___EOF___' +#ifndef NOLIBC #include #include +#endif volatile unsigned long delaycount; @@ -113,7 +115,21 @@ int main(int argc, int argv[]) return 0; } ___EOF___ -${CROSS_COMPILE}gcc -s -static -Os -o init init.c + +# build using nolibc on supported archs (smaller executable) and fall +# back to regular glibc on other ones. +if echo -e "#if __x86_64__||__i386__||__i486__||__i586__||__i686__" \ + "||__ARM_EABI__||__aarch64__\nyes\n#endif" \ + | ${CROSS_COMPILE}gcc -E -nostdlib -xc - \ + | grep -q '^yes'; then + # architecture supported by nolibc + ${CROSS_COMPILE}gcc -fno-asynchronous-unwind-tables -fno-ident \ + -nostdlib -include ../bin/nolibc.h -lgcc -s -static -Os \ + -o init init.c +else + ${CROSS_COMPILE}gcc -s -static -Os -o init init.c +fi + rm init.c echo "Done creating a statically linked C-language initrd" From 868f7a09a4f385e5167fc0ff9ec4c3f817897f3a Mon Sep 17 00:00:00 2001 From: Lance Roy Date: Thu, 4 Oct 2018 23:45:38 -0700 Subject: [PATCH 022/112] x86/PCI: Replace spin_is_locked() with lockdep lockdep_assert_held() is better suited to checking locking requirements, since it only checks if the current thread holds the lock regardless of whether someone else does. This is also a step towards possibly removing spin_is_locked(). Signed-off-by: Lance Roy Cc: Bjorn Helgaas Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Cc: Signed-off-by: Paul E. McKenney --- arch/x86/pci/i386.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 8cd66152cdb0e8..9df652d3d9275b 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -59,7 +59,7 @@ static struct pcibios_fwaddrmap *pcibios_fwaddrmap_lookup(struct pci_dev *dev) { struct pcibios_fwaddrmap *map; - WARN_ON_SMP(!spin_is_locked(&pcibios_fwaddrmap_lock)); + lockdep_assert_held(&pcibios_fwaddrmap_lock); list_for_each_entry(map, &pcibios_fwaddrmappings, list) if (map->dev == dev) From f3e763c3e544b73ae5c4a3842cedb9ff6ca37715 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 3 Sep 2018 12:45:45 -0700 Subject: [PATCH 023/112] srcu: Fix kernel-doc missing notation Fix kernel-doc warnings for missing parameter descriptions: ../include/linux/srcu.h:175: warning: Function parameter or member 'p' not described in 'srcu_dereference_notrace' ../include/linux/srcu.h:175: warning: Function parameter or member 'sp' not described in 'srcu_dereference_notrace' Fixes: 0b764a6e4e19d ("srcu: Add notrace variant of srcu_dereference") Signed-off-by: Randy Dunlap Cc: Lai Jiangshan Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 67135d4a8a30e3..ebd5f1511690f1 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -171,6 +171,9 @@ static inline int srcu_read_lock_held(const struct srcu_struct *sp) /** * srcu_dereference_notrace - no tracing and no lockdep calls from here + * @p: the pointer to fetch and protect for later dereferencing + * @sp: pointer to the srcu_struct, which is used to check that we + * really are in an SRCU read-side critical section. */ #define srcu_dereference_notrace(p, sp) srcu_dereference_check((p), (sp), 1) From 0607ba8403c4cdb253f8c5200ecf654dfb7790cc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Apr 2018 13:01:25 -0700 Subject: [PATCH 024/112] srcu: Prevent __call_srcu() counter wrap with read-side critical section Ever since cdf7abc4610a ("srcu: Allow use of Tiny/Tree SRCU from both process and interrupt context"), it has been permissible to use SRCU read-side critical sections in interrupt context. This allows __call_srcu() to use SRCU read-side critical sections to prevent a new SRCU grace period from ending before the call to either srcu_funnel_gp_start() or srcu_funnel_exp_start completes, thus preventing SRCU grace-period counter overflow during that time. Note that this does not permit removal of the counter-wrap checks in srcu_gp_end(). These check are necessary to handle the case where a given CPU does not interact at all with SRCU for an extended time period. This commit therefore adds an SRCU read-side critical section to __call_srcu() in order to prevent grace period counter wrap during the funnel-locking process. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index a8846ed7f35298..60f3236beaf721 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -858,6 +858,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, rcu_callback_t func, bool do_norm) { unsigned long flags; + int idx; bool needexp = false; bool needgp = false; unsigned long s; @@ -871,6 +872,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, return; } rhp->func = func; + idx = srcu_read_lock(sp); local_irq_save(flags); sdp = this_cpu_ptr(sp->sda); spin_lock_rcu_node(sdp); @@ -892,6 +894,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, srcu_funnel_gp_start(sp, sdp, s, do_norm); else if (needexp) srcu_funnel_exp_start(sp, sdp->mynode, s); + srcu_read_unlock(sp, idx); } /** From 9cac83a57e99e4692315b7a91a81bab787961d97 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 Sep 2018 08:57:48 -0700 Subject: [PATCH 025/112] rcu: Stop expedited grace periods from relying on stop-machine The CPU-selection code in sync_rcu_exp_select_cpus() disables preemption to prevent the cpu_online_mask from changing. However, this relies on the stop-machine mechanism in the CPU-hotplug offline code, which is not desirable (it would be good to someday remove the stop-machine mechanism). This commit therefore instead uses the relevant leaf rcu_node structure's ->ffmask, which has a bit set for all CPUs that are fully functional. A given CPU's bit is cleared very early during offline processing by rcutree_offline_cpu() and set very late during online processing by rcutree_online_cpu(). Therefore, if a CPU's bit is set in this mask, and preemption is disabled, we have to be before the synchronize_sched() in the CPU-hotplug offline code, which means that the CPU is guaranteed to be workqueue-ready throughout the duration of the enclosing preempt_disable() region of code. This also has the side-effect of using WORK_CPU_UNBOUND if all the CPUs for this leaf rcu_node structure are offline, which is an acceptable difference in behavior. Reported-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 8d18c1014e2beb..e669ccf3751b1b 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -450,10 +450,12 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func) } INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); preempt_disable(); - cpu = cpumask_next(rnp->grplo - 1, cpu_online_mask); + cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); /* If all offline, queue the work on an unbound CPU. */ - if (unlikely(cpu > rnp->grphi)) + if (unlikely(cpu > rnp->grphi - rnp->grplo)) cpu = WORK_CPU_UNBOUND; + else + cpu += rnp->grplo; queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); preempt_enable(); rnp->exp_need_flush = true; From 9213784b48f8ba666b4695ca3f5d34f583daab83 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Oct 2018 08:26:00 -0700 Subject: [PATCH 026/112] rcu: Eliminate BUG_ON() for kernel/rcu/tree_plugin.h The tree_plugin.h file has a number of calls to BUG_ON(), which panics the kernel, which is not a good strategy for devices (like embedded) that don't have a way to capture console output. This commit therefore converts these BUG_ON() calls to WARN_ON_ONCE() and WARN_ONCE(). Reported-by: Linus Torvalds Signed-off-by: Paul E. McKenney [ paulmck: Fix typo: s/rcuo/rcub/. ] --- kernel/rcu/tree_plugin.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 05915e53633657..adda608874a869 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1464,7 +1464,8 @@ static void __init rcu_spawn_boost_kthreads(void) for_each_possible_cpu(cpu) per_cpu(rcu_cpu_has_work, cpu) = 0; - BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); + if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__)) + return; rcu_for_each_leaf_node(rnp) (void)rcu_spawn_one_boost_kthread(rnp); } @@ -2322,7 +2323,8 @@ static int rcu_nocb_kthread(void *arg) tail = rdp->nocb_follower_tail; rdp->nocb_follower_tail = &rdp->nocb_follower_head; raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - BUG_ON(!list); + if (WARN_ON_ONCE(!list)) + continue; trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeNonEmpty")); /* Each pass through the following loop invokes a callback. */ @@ -2495,7 +2497,8 @@ static void rcu_spawn_one_nocb_kthread(int cpu) /* Spawn the kthread for this CPU. */ t = kthread_run(rcu_nocb_kthread, rdp_spawn, "rcuo%c/%d", rcu_state.abbr, cpu); - BUG_ON(IS_ERR(t)); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo kthread, OOM is now expected behavior\n", __func__)) + return; WRITE_ONCE(rdp_spawn->nocb_kthread, t); } From f0ad56e876cdd67730065274625edbcfe0cca278 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Oct 2018 08:33:06 -0700 Subject: [PATCH 027/112] rcu: Eliminate BUG_ON() for kernel/rcu/update.c The update.c file has a number of calls to BUG_ON(), which panics the kernel, which is not a good strategy for devices (like embedded) that don't have a way to capture console output. This commit therefore converts these BUG_ON() calls to WARN_ON_ONCE() and WARN_ONCE(). Reported-by: Linus Torvalds Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f203b94f6b5be1..cb26de25658aa5 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -822,7 +822,8 @@ static int __init rcu_spawn_tasks_kthread(void) struct task_struct *t; t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); - BUG_ON(IS_ERR(t)); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__)) + return 0; smp_mb(); /* Ensure others see full kthread. */ WRITE_ONCE(rcu_tasks_kthread_ptr, t); return 0; From 5cc379a42acd7104747077db7aaf4b01115ee484 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 25 Sep 2018 11:25:57 -0700 Subject: [PATCH 028/112] doc: Update information about resched_cpu Since commit fced9c8cfe6b ("rcu: Avoid resched_cpu() when rescheduling the current CPU"), resched_cpu is not directly called from sync_sched_exp_handler. Update the documentation about the same. Signed-off-by: Joel Fernandes (Google) Cc: Signed-off-by: Paul E. McKenney --- .../Expedited-Grace-Periods/Expedited-Grace-Periods.html | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html index e62c7c34a369ee..8e4f873b979ffd 100644 --- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html +++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html @@ -160,9 +160,9 @@

If the CPU is idle, then sync_sched_exp_handler() reports the quiescent state. -

-Otherwise, the handler invokes resched_cpu(), which forces -a future context switch. +

Otherwise, the handler forces a future context switch by setting the +NEED_RESCHED flag of the current task's thread flag and the CPU preempt +counter. At the time of the context switch, the CPU reports the quiescent state. Should the CPU go offline first, it will report the quiescent state at that time. From c9b6f899e120c83ef144b3d4a8365413ef49cce4 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 3 Oct 2018 17:37:25 -0700 Subject: [PATCH 029/112] doc: Remove rcu_dynticks from Data-Structures rcu_dynticks was folded into rcu_data structure. Update the data structures RCU document accordingly. Signed-off-by: Joel Fernandes (Google) Cc: Signed-off-by: Paul E. McKenney --- .../BigTreeClassicRCUBHdyntick.svg | 695 ------------------ .../Data-Structures/Data-Structures.html | 90 +-- 2 files changed, 25 insertions(+), 760 deletions(-) delete mode 100644 Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg diff --git a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg deleted file mode 100644 index 21ba7823479d4f..00000000000000 --- a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg +++ /dev/null @@ -1,695 +0,0 @@ - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - rcu_bh - - struct - - rcu_node - - struct - - rcu_node - - rcu_node - - struct - - struct - - rcu_data - - struct - - rcu_data - - struct - - rcu_data - - struct - - rcu_data - - struct rcu_state - - struct - - rcu_dynticks - - struct - - rcu_dynticks - - struct - - rcu_dynticks - - struct - - rcu_dynticks - - rcu_sched - - - - - diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html index 476b1ac38e4c2a..4eb603e3a0057a 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -23,8 +23,6 @@

Introduction

The rcu_segcblist Structure
  • The rcu_data Structure -
  • - The rcu_dynticks Structure
  • The rcu_head Structure
  • @@ -174,16 +172,8 @@

    Data-Structure Relationships

    RCU must handle dyntick-idle CPUs specially because RCU would otherwise wake up each CPU on every grace period, which would defeat the whole purpose of CONFIG_NO_HZ_IDLE. -RCU uses the rcu_dynticks structure to track -which CPUs are in dyntick idle mode, as shown below: - -

    BigTreeClassicRCUBHdyntick.svg - -

    However, if a CPU is in dyntick-idle mode, it is in that mode -for all flavors of RCU. -Therefore, a single rcu_dynticks structure is allocated per -CPU, and all of a given CPU's rcu_data structures share -that rcu_dynticks, as shown in the figure. +RCU uses the dynticks related fields in the rcu_data structure +to track which CPUs are in dyntick idle mode.

    Kernels built with CONFIG_PREEMPT_RCU support rcu_preempt in addition to rcu_sched and rcu_bh, as shown below: @@ -216,9 +206,6 @@

    Data-Structure Relationships

  • Each rcu_node structure has a spinlock.
  • The fields in rcu_data are private to the corresponding CPU, although a few can be read and written by other CPUs. -
  • Similarly, the fields in rcu_dynticks are private - to the corresponding CPU, although a few can be read by - other CPUs.

    It is important to note that different data structures can have @@ -274,11 +261,6 @@

    Data-Structure Relationships

    access to this information from the corresponding CPU. Finally, this structure records past dyntick-idle state for the corresponding CPU and also tracks statistics. -
  • rcu_dynticks: - This per-CPU structure tracks the current dyntick-idle - state for the corresponding CPU. - Unlike the other three structures, the rcu_dynticks - structure is not replicated per RCU flavor.
  • rcu_head: This structure represents RCU callbacks, and is the only structure allocated and managed by RCU users. @@ -289,8 +271,8 @@

    Data-Structure Relationships

    If all you wanted from this article was a general notion of how RCU's data structures are related, you are done. Otherwise, each of the following sections give more details on -the rcu_state, rcu_node, rcu_data, -and rcu_dynticks data structures. +the rcu_state, rcu_node and rcu_data data +structures.

    The rcu_state Structure

    @@ -1017,30 +999,19 @@
    Connection to Other Data Structures
       1   int cpu;
    -  2   struct rcu_state *rsp;
    -  3   struct rcu_node *mynode;
    -  4   struct rcu_dynticks *dynticks;
    -  5   unsigned long grpmask;
    -  6   bool beenonline;
    +  2   struct rcu_node *mynode;
    +  3   unsigned long grpmask;
    +  4   bool beenonline;
     

    The ->cpu field contains the number of the -corresponding CPU, the ->rsp pointer references -the corresponding rcu_state structure (and is most frequently -used to locate the name of the corresponding flavor of RCU for tracing), -and the ->mynode field references the corresponding -rcu_node structure. +corresponding CPU and the ->mynode field references the +corresponding rcu_node structure. The ->mynode is used to propagate quiescent states up the combining tree. -

    The ->dynticks pointer references the -rcu_dynticks structure corresponding to this -CPU. -Recall that a single per-CPU instance of the rcu_dynticks -structure is shared among all flavors of RCU. -These first four fields are constant and therefore require not -synchronization. +These two fields are constant and therefore do not require synchronization. -

    The ->grpmask field indicates the bit in +

    The ->grpmask field indicates the bit in the ->mynode->qsmask corresponding to this rcu_data structure, and is also used when propagating quiescent states. @@ -1181,26 +1152,22 @@

    Dyntick-Idle Handling
    count the number of times this CPU is determined to be in dyntick-idle state, and is used for tracing and debugging purposes. -

    -The rcu_dynticks Structure

    - -

    The rcu_dynticks maintains the per-CPU dyntick-idle state -for the corresponding CPU. -Unlike the other structures, rcu_dynticks is not -replicated over the different flavors of RCU. -The fields in this structure may be accessed only from the corresponding -CPU (and from tracing) unless otherwise stated. -Its fields are as follows: +

    +This portion of the rcu_data structure is declared as follows:

       1   long dynticks_nesting;
       2   long dynticks_nmi_nesting;
       3   atomic_t dynticks;
       4   bool rcu_need_heavy_qs;
    -  5   unsigned long rcu_qs_ctr;
    -  6   bool rcu_urgent_qs;
    +  5   bool rcu_urgent_qs;
     
    +

    These fields in the rcu_data structure maintain the per-CPU dyntick-idle +state for the corresponding CPU. +The fields may be accessed only from the corresponding CPU (and from tracing) +unless otherwise stated. +

    The ->dynticks_nesting field counts the nesting depth of process execution, so that in normal circumstances this counter has value zero or one. @@ -1242,19 +1209,12 @@

    This flag is checked by RCU's context-switch and cond_resched() code, which provide a momentary idle sojourn in response. -

    The ->rcu_qs_ctr field is used to record -quiescent states from cond_resched(). -Because cond_resched() can execute quite frequently, this -must be quite lightweight, as in a non-atomic increment of this -per-CPU field. -

    Finally, the ->rcu_urgent_qs field is used to record -the fact that the RCU core code would really like to see a quiescent -state from the corresponding CPU, with the various other fields indicating -just how badly RCU wants this quiescent state. -This flag is checked by RCU's context-switch and cond_resched() -code, which, if nothing else, non-atomically increment ->rcu_qs_ctr -in response. +the fact that the RCU core code would really like to see a quiescent state from +the corresponding CPU, with the various other fields indicating just how badly +RCU wants this quiescent state. +This flag is checked by RCU's context-switch path +(rcu_note_context_switch) and the cond_resched code. @@ -1431,7 +1391,7 @@

    which contains a combining tree of rcu_node and rcu_data structures. Finally, in CONFIG_NO_HZ_IDLE kernels, each CPU's dyntick-idle -state is tracked by an rcu_dynticks structure. +state is tracked by dynticks-related fields in the rcu_data structure. If you made it this far, you are well prepared to read the code walkthroughs in the other articles in this series. From b54d9db26031d6dc96222164092eacbaa0329255 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 3 Oct 2018 17:40:28 -0700 Subject: [PATCH 030/112] doc: rcu: Update Data-Structures for RCU flavor consolidation This patch updates all Data-Structures document figures and text and removes some unwanted figures, to reflect the recent work Paul has been doing with consolidating all flavors of RCU. Signed-off-by: Joel Fernandes (Google) Cc: Signed-off-by: Paul E. McKenney --- .../Data-Structures/BigTreeClassicRCUBH.svg | 499 ----------- .../BigTreePreemptRCUBHdyntick.svg | 741 ---------------- .../BigTreePreemptRCUBHdyntickCB.svg | 834 +++++++----------- .../Data-Structures/Data-Structures.html | 49 +- .../RCU/Design/Data-Structures/blkd_task.svg | 676 +++++--------- 5 files changed, 559 insertions(+), 2240 deletions(-) delete mode 100644 Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg delete mode 100644 Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg diff --git a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg deleted file mode 100644 index 9bbb1944f962d5..00000000000000 --- a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg +++ /dev/null @@ -1,499 +0,0 @@ - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - rcu_bh - - struct - - rcu_node - - struct - - rcu_node - - rcu_node - - struct - - struct - - rcu_data - - struct - - rcu_data - - struct - - rcu_data - - struct - - rcu_data - - struct rcu_state - - rcu_sched - - - - - - - - - - - diff --git a/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg deleted file mode 100644 index 15adcac036c733..00000000000000 --- a/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg +++ /dev/null @@ -1,741 +0,0 @@ - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - rcu_bh - - struct - - rcu_node - - struct - - rcu_node - - rcu_node - - struct - - struct - - rcu_data - - struct - - rcu_data - - struct - - rcu_data - - struct - - rcu_data - - struct rcu_state - - struct - - rcu_dynticks - - struct - - rcu_dynticks - - struct - - rcu_dynticks - - struct - - rcu_dynticks - - rcu_preempt - - rcu_sched - - - - - diff --git a/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg index bbc3801470d095..3a1a4f85dc3a5d 100644 --- a/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg +++ b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg @@ -13,12 +13,12 @@ xmlns="http://www.w3.org/2000/svg" xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" - width="7.4in" - height="9.9in" - viewBox="-44 -44 8938 11938" + width="7.4000001in" + height="7.9000001in" + viewBox="-44 -44 8938 9526.283" id="svg2" version="1.1" - inkscape:version="0.48.4 r9939" + inkscape:version="0.92.2pre0 (973e216, 2017-07-25)" sodipodi:docname="BigTreePreemptRCUBHdyntickCB.svg"> @@ -37,15 +37,46 @@ + + + + + + + style="overflow:visible"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.00000003pt" + transform="matrix(-0.4,0,0,-0.4,-4,0)" + inkscape:connector-curvature="0" /> + style="fill:none;stroke-width:0.025in" + id="g4" + transform="translate(0,-2415.6743)"> - - - - - - - + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8" + id="polyline20" + transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> - + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8" + id="polyline24" + transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> - + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8" + id="polyline28" + transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> - + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8" + id="polyline32" + transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> - + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8" + id="polyline36" + transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> - + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8" + id="polyline40" + transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> + style="stroke:#00d1d1;stroke-width:29.99464035;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline46" + transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> + style="stroke:#00d1d1;stroke-width:29.99464035;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline50" + transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> + style="stroke:#00d1d1;stroke-width:29.99464035;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline54" + transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> + style="stroke:#00d1d1;stroke-width:29.99464035;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline58" + transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> + style="stroke:#00d1d1;stroke-width:29.99464035;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline62" + transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> - - - - - - + - + - + - + - + - + - + - + - + + style="stroke:#000000;stroke-width:30;stroke-linecap:butt;stroke-linejoin:miter;marker-end:url(#Arrow1Mend)" + id="polyline108" + transform="matrix(1,0,0,0.95854605,-604.69715,525.62477)" /> + style="stroke:#000000;stroke-width:30;stroke-linecap:butt;stroke-linejoin:miter;marker-end:url(#Arrow1Mend)" + id="polyline114" + transform="matrix(1,0,0,0.95854605,-604.69715,525.62477)" /> - - - - struct + id="text140" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">struct rcu_head + id="text142" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">rcu_head struct + id="text144" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">struct rcu_head + id="text146" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">rcu_head struct + id="text148" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">struct rcu_head + id="text150" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">rcu_head rcu_sched + id="text152" + style="font-style:normal;font-weight:normal;font-size:187.978302px;font-family:Helvetica;text-anchor:end;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">rcu_state - rcu_bh struct + id="text156" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">struct rcu_node + id="text158" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">rcu_node struct + id="text160" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">struct rcu_node + id="text162" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">rcu_node rcu_node + id="text164" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">rcu_node struct + id="text166" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">struct struct + id="text168" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">struct rcu_data + id="text170" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">rcu_data struct + id="text172" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">struct rcu_data + id="text174" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">rcu_data struct + id="text176" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">struct rcu_data + id="text178" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">rcu_data struct + id="text180" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">struct rcu_data + id="text182" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:middle;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">rcu_data struct rcu_state + id="text184" + style="font-style:normal;font-weight:bold;font-size:187.978302px;font-family:Courier;text-anchor:start;fill:#000000;stroke-width:0.02447634in" + transform="scale(1.0213945,0.97905363)">struct rcu_state - struct - rcu_dynticks - struct - rcu_dynticks - struct - rcu_dynticks - struct - rcu_dynticks - rcu_preempt + style="stroke:#00d1d1;stroke-width:29.99464035;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline204" + transform="matrix(1,0,0,0.95854605,12.340758,1579.9033)" /> + diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html index 4eb603e3a0057a..28b241074c86f6 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -154,36 +154,9 @@

    Data-Structure Relationships

    keeping lock contention under control at all tree levels regardless of the level of loading on the system. -

    The Linux kernel actually supports multiple flavors of RCU -running concurrently, so RCU builds separate data structures for each -flavor. -For example, for CONFIG_TREE_RCU=y kernels, RCU provides -rcu_sched and rcu_bh, as shown below: - -

    BigTreeClassicRCUBH.svg - -

    Energy efficiency is increasingly important, and for that -reason the Linux kernel provides CONFIG_NO_HZ_IDLE, which -turns off the scheduling-clock interrupts on idle CPUs, which in -turn allows those CPUs to attain deeper sleep states and to consume -less energy. -CPUs whose scheduling-clock interrupts have been turned off are -said to be in dyntick-idle mode. -RCU must handle dyntick-idle CPUs specially -because RCU would otherwise wake up each CPU on every grace period, -which would defeat the whole purpose of CONFIG_NO_HZ_IDLE. -RCU uses the dynticks related fields in the rcu_data structure -to track which CPUs are in dyntick idle mode. - -

    Kernels built with CONFIG_PREEMPT_RCU support -rcu_preempt in addition to rcu_sched and rcu_bh, as shown below: - -

    BigTreePreemptRCUBHdyntick.svg -

    RCU updaters wait for normal grace periods by registering RCU callbacks, either directly via call_rcu() and friends (namely call_rcu_bh() and call_rcu_sched()), -there being a separate interface per flavor of RCU) or indirectly via synchronize_rcu() and friends. RCU callbacks are represented by rcu_head structures, which are queued on rcu_data structures while they are @@ -278,7 +251,7 @@

    The rcu_state Structure

    The rcu_state structure is the base structure that -represents a flavor of RCU. +represents the state of RCU in the system. This structure forms the interconnection between the rcu_node and rcu_data structures, tracks grace periods, contains the lock used to @@ -373,7 +346,7 @@

    Grace-Period Tracking
    The bottom two bits are the state of the current grace period, which can be zero for not yet started or one for in progress. In other words, if the bottom two bits of ->gp_seq are -zero, the corresponding flavor of RCU is idle. +zero, then RCU is idle. Any other value in the bottom two bits indicates that something is broken. This field is protected by the root rcu_node structure's ->lock field. @@ -403,10 +376,10 @@
    Miscellaneous
    grace period in jiffies. It is protected by the root rcu_node's ->lock. -

    The ->name field points to the name of the RCU flavor -(for example, “rcu_sched”), and is constant. -The ->abbr field contains a one-character abbreviation, -for example, “s” for RCU-sched. +

    The ->name and ->abbr fields distinguish +between preemptible RCU (“rcu_preempt” and “p”) +and non-preemptible RCU (“rcu_sched” and “s”). +These fields are used for diagnostic and tracing purposes.

    The rcu_node Structure

    @@ -972,8 +945,7 @@

    The rcu_data Structure

    -

    The rcu_data maintains the per-CPU state for the -corresponding flavor of RCU. +

    The rcu_data maintains the per-CPU state for the RCU subsystem. The fields in this structure may be accessed only from the corresponding CPU (and from tracing) unless otherwise stated. This structure is the @@ -1030,7 +1002,6 @@

    Quiescent-State and Grace-Period Tracking
    3 bool cpu_no_qs; 4 bool core_needs_qs; 5 bool gpwrap; - 6 unsigned long rcu_qs_ctr_snap;

    The ->gp_seq and ->gp_seq_needed @@ -1076,10 +1047,6 @@

    Quiescent-State and Grace-Period Tracking
    gp_seq counter is in danger of overflow, which will cause the CPU to disregard the values of its counters on its next exit from idle. -Finally, the rcu_qs_ctr_snap field is used to detect -cases where a given operation has resulted in a quiescent state -for all flavors of RCU, for example, cond_resched() -when RCU has indicated a need for quiescent states.
    RCU Callback Handling
    @@ -1387,7 +1354,7 @@

    Summary

    -So each flavor of RCU is represented by an rcu_state structure, +So the state of RCU is represented by an rcu_state structure, which contains a combining tree of rcu_node and rcu_data structures. Finally, in CONFIG_NO_HZ_IDLE kernels, each CPU's dyntick-idle diff --git a/Documentation/RCU/Design/Data-Structures/blkd_task.svg b/Documentation/RCU/Design/Data-Structures/blkd_task.svg index 00e810bb84194a..bed13e9ecab899 100644 --- a/Documentation/RCU/Design/Data-Structures/blkd_task.svg +++ b/Documentation/RCU/Design/Data-Structures/blkd_task.svg @@ -14,12 +14,12 @@ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" width="10.1in" - height="8.6in" - viewBox="-44 -44 12088 10288" + height="6.5999999in" + viewBox="-44 -44 12088 7895.4414" id="svg2" version="1.1" - inkscape:version="0.48.4 r9939" - sodipodi:docname="blkd_task.fig"> + inkscape:version="0.92.2pre0 (973e216, 2017-07-25)" + sodipodi:docname="blkd_task.svg"> @@ -37,15 +37,16 @@ + style="overflow:visible"> + d="M 0,0 5,-5 -12.5,0 5,5 Z" + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.00000003pt" + transform="matrix(-0.4,0,0,-0.4,-4,0)" + inkscape:connector-curvature="0" /> + inkscape:cx="456.40569" + inkscape:cy="348.88682" + inkscape:window-x="0" + inkscape:window-y="0" + inkscape:window-maximized="1" + inkscape:current-layer="g4" + showguides="false" /> + style="fill:none;stroke-width:0.025in" + id="g4" + transform="translate(0,-2393.6637)"> - - - - + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8" + id="polyline14" + transform="translate(23.757862,2185.7233)" /> - + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8" + id="polyline18" + transform="translate(23.757862,2185.7233)" /> - + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8" + id="polyline22" + transform="translate(23.757862,2185.7233)" /> - + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8" + id="polyline26" + transform="translate(23.757862,2185.7233)" /> + style="stroke:#00d1d1;stroke-width:30.00057793;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline32" + transform="translate(23.757862,2185.7233)" /> + style="stroke:#00d1d1;stroke-width:30.00057793;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline36" + transform="translate(23.757862,2185.7233)" /> + style="stroke:#00d1d1;stroke-width:30.00057793;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline40" + transform="translate(23.757862,2185.7233)" /> + style="stroke:#00d1d1;stroke-width:30.00057793;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline44" + transform="translate(23.757862,2185.7233)" /> + style="stroke:#00d1d1;stroke-width:30.00057793;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline48" + transform="translate(23.757862,2185.7233)" /> - - - - - - - - + points="7350,2850 7350,5100 5550,4350 5550,3450 " + style="fill:#ffbfbf;stroke:#000000;stroke-width:14;stroke-linecap:butt;stroke-linejoin:miter;stroke-dasharray:120, 120" + id="polygon106" + transform="translate(23.757862,2185.7233)" /> + style="stroke:#000000;stroke-width:30.00057793;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline108" + transform="translate(23.757862,2185.7233)" /> + style="stroke:#000000;stroke-width:30.00057793;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline114" + transform="translate(23.757862,2185.7233)" /> + style="stroke:#000000;stroke-width:30.00057793;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline120" + transform="translate(23.757862,2185.7233)" /> + style="stroke:#000000;stroke-width:30.00057793;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline126" + transform="translate(23.757862,2185.7233)" /> + style="stroke:#000000;stroke-width:30.00057793;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline130" + transform="translate(23.757862,2185.7233)" /> - rcu_bh struct + id="text136" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">struct rcu_node + id="text138" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">rcu_node struct + id="text140" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">struct rcu_node + id="text142" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">rcu_node struct + id="text144" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">struct rcu_data + id="text146" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">rcu_data struct + id="text148" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">struct rcu_data + id="text150" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">rcu_data struct + id="text152" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">struct rcu_data + id="text154" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">rcu_data struct + id="text156" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">struct rcu_data + id="text158" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">rcu_data struct rcu_state + id="text160" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:start;fill:#000000">struct rcu_state - struct - rcu_dynticks - struct - rcu_dynticks - struct - rcu_dynticks - struct - rcu_dynticks rcu_sched + id="text178" + style="font-style:normal;font-weight:normal;font-size:192px;font-family:Helvetica;text-anchor:end;fill:#000000">rcu_state T3 + id="text180" + style="font-style:normal;font-weight:normal;font-size:216px;font-family:Helvetica;text-anchor:middle;fill:#000000">T3 T2 + id="text182" + style="font-style:normal;font-weight:normal;font-size:216px;font-family:Helvetica;text-anchor:middle;fill:#000000">T2 T1 + id="text184" + style="font-style:normal;font-weight:normal;font-size:216px;font-family:Helvetica;text-anchor:middle;fill:#000000">T1 + style="stroke:#00d1d1;stroke-width:30.00057793;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline186" + transform="translate(23.757862,2185.7233)" /> rcu_node + id="text198" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">rcu_node struct + id="text200" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:middle;fill:#000000">struct blkd_tasks + id="text202" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:start;fill:#000000">blkd_tasks gp_tasks + id="text204" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:start;fill:#000000">gp_tasks exp_tasks + id="text206" + style="font-style:normal;font-weight:bold;font-size:192px;font-family:Courier;text-anchor:start;fill:#000000">exp_tasks From 82eccec851478e55bfd398d7e9d03300026fc4a9 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 25 Sep 2018 11:26:00 -0700 Subject: [PATCH 031/112] doc: rcu: Better clarify the rcu_segcblist ->len field An important note under the rcu_segcblist description could use a more detailed description. Especially explanation of the scenario where the ->head field may be temporarily NULL making it not wise to rely on it to determine if callbacks are associated with the rcu_segcblist. Signed-off-by: Joel Fernandes (Google) Cc: Signed-off-by: Paul E. McKenney --- .../Data-Structures/Data-Structures.html | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html index 28b241074c86f6..3ed5f0182bc446 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -928,17 +928,24 @@

    pointer. The reason for this is that all the ready-to-invoke callbacks (that is, those in the RCU_DONE_TAIL segment) are extracted -all at once at callback-invocation time. +all at once at callback-invocation time (rcu_do_batch), due +to which ->head may be set to NULL if there are no not-done +callbacks remaining in the rcu_segcblist. If callback invocation must be postponed, for example, because a high-priority process just woke up on this CPU, then the remaining -callbacks are placed back on the RCU_DONE_TAIL segment. -Either way, the ->len and ->len_lazy counts -are adjusted after the corresponding callbacks have been invoked, and so -again it is the ->len count that accurately reflects whether -or not there are callbacks associated with this rcu_segcblist -structure. +callbacks are placed back on the RCU_DONE_TAIL segment and +->head once again points to the start of the segment. +In short, the head field can briefly be NULL even though the +CPU has callbacks present the entire time. +Therefore, it is not appropriate to test the ->head pointer +for NULL. + +

    In contrast, the ->len and ->len_lazy counts +are adjusted only after the corresponding callbacks have been invoked. +This means that the ->len count is zero only if +the rcu_segcblist structure really is devoid of callbacks. Of course, off-CPU sampling of the ->len count requires -the use of appropriate synchronization, for example, memory barriers. +careful use of appropriate synchronization, for example, memory barriers. This synchronization can be a bit subtle, particularly in the case of rcu_barrier(). From 70f0508caba2ccb564337e7a2ac4816b094abc00 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 25 Sep 2018 11:26:01 -0700 Subject: [PATCH 032/112] doc: rcu: Update description of gp_seq fields in rcu_data The rcu_state structure doesn't have a gp_seq_needed field. Update the description under rcu_data accordingly, to reflect this. Signed-off-by: Joel Fernandes (Google) Cc: Signed-off-by: Paul E. McKenney --- .../RCU/Design/Data-Structures/Data-Structures.html | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html index 3ed5f0182bc446..18f1798075633e 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -1011,9 +1011,10 @@

    Quiescent-State and Grace-Period Tracking
    5 bool gpwrap; -

    The ->gp_seq and ->gp_seq_needed -fields are the counterparts of the fields of the same name -in the rcu_state and rcu_node structures. +

    The ->gp_seq field is the counterpart of the field of the same +name in the rcu_state and rcu_node structures. The +->gp_seq_needed field is the counterpart of the field of the same +name in the rcu_node structure. They may each lag up to one behind their rcu_node counterparts, but in CONFIG_NO_HZ_IDLE and CONFIG_NO_HZ_FULL kernels can lag From ed8f6fb247785d98ffe6babcf93b7bedd2c88fd8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Oct 2018 08:38:54 -0700 Subject: [PATCH 033/112] doc: Document rcutorture forward-progress test kernel parameters Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 81d1d5a7472804..3823679deea533 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3773,6 +3773,23 @@ Set wait time between force_quiescent_state bursts in seconds. + rcutorture.fwd_progress= [KNL] + Enable RCU grace-period forward-progress testing + for the types of RCU supporting this notion. + + rcutorture.fwd_progress_div= [KNL] + Specify the fraction of a CPU-stall-warning + period to do tight-loop forward-progress testing. + + rcutorture.fwd_progress_holdoff= [KNL] + Number of seconds to wait between successive + forward-progress tests. + + rcutorture.fwd_progress_need_resched= [KNL] + Enclose cond_resched() calls within checks for + need_resched() during tight-loop forward-progress + testing. + rcutorture.gp_cond= [KNL] Use conditional/asynchronous update-side primitives, if available. From 3398496483df3508764d24917deaa8ab5176969e Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 5 Oct 2018 16:18:10 -0700 Subject: [PATCH 034/112] doc: rcu: Update core and full API in whatisRCU RCU consolidation effort causes the update side of the RCU API to be consistent across all the 3 RCU flavors (normal, sched, bh). This commit therefore updates the full API in the whatisRCU document, thus encouraging people to use the consolidated RCU update API instead of the old RCU-bh and RCU-sched update APIs. Also rcu_dereference is documented to be the same for all 3 mechanisms (even before the consolidation), however its actually different - as using the right rcu_dereference primitive (such as rcu_dereference_bh for bh) is needed to make lock debugging work correctly. This update also corrects that. Also, add local_bh_disable() and local_bh_enable() as softirq protection primitives and correct a grammar error in a quiz answer. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- Documentation/RCU/whatisRCU.txt | 55 +++++++++++++++++---------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 86d82f7f35005c..7c33445fd0e54e 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -322,28 +322,27 @@ to their callers and (2) call_rcu() callbacks may be invoked. Efficient implementations of the RCU infrastructure make heavy use of batching in order to amortize their overhead over many uses of the corresponding APIs. -There are no fewer than three RCU mechanisms in the Linux kernel; the -diagram above shows the first one, which is by far the most commonly used. -The rcu_dereference() and rcu_assign_pointer() primitives are used for -all three mechanisms, but different defer and protect primitives are -used as follows: +There are at least three flavors of RCU usage in the Linux kernel. The diagram +above shows the most common one. On the updater side, the rcu_assign_pointer(), +sychronize_rcu() and call_rcu() primitives used are the same for all three +flavors. However for protection (on the reader side), the primitives used vary +depending on the flavor: - Defer Protect +a. rcu_read_lock() / rcu_read_unlock() + rcu_dereference() -a. synchronize_rcu() rcu_read_lock() / rcu_read_unlock() - call_rcu() rcu_dereference() +b. rcu_read_lock_bh() / rcu_read_unlock_bh() + local_bh_disable() / local_bh_enable() + rcu_dereference_bh() -b. synchronize_rcu_bh() rcu_read_lock_bh() / rcu_read_unlock_bh() - call_rcu_bh() rcu_dereference_bh() +c. rcu_read_lock_sched() / rcu_read_unlock_sched() + preempt_disable() / preempt_enable() + local_irq_save() / local_irq_restore() + hardirq enter / hardirq exit + NMI enter / NMI exit + rcu_dereference_sched() -c. synchronize_sched() rcu_read_lock_sched() / rcu_read_unlock_sched() - call_rcu_sched() preempt_disable() / preempt_enable() - local_irq_save() / local_irq_restore() - hardirq enter / hardirq exit - NMI enter / NMI exit - rcu_dereference_sched() - -These three mechanisms are used as follows: +These three flavors are used as follows: a. RCU applied to normal data structures. @@ -867,18 +866,20 @@ RCU: Critical sections Grace period Barrier bh: Critical sections Grace period Barrier - rcu_read_lock_bh call_rcu_bh rcu_barrier_bh - rcu_read_unlock_bh synchronize_rcu_bh - rcu_dereference_bh synchronize_rcu_bh_expedited + rcu_read_lock_bh call_rcu rcu_barrier + rcu_read_unlock_bh synchronize_rcu + [local_bh_disable] synchronize_rcu_expedited + [and friends] + rcu_dereference_bh rcu_dereference_bh_check rcu_dereference_bh_protected rcu_read_lock_bh_held sched: Critical sections Grace period Barrier - rcu_read_lock_sched synchronize_sched rcu_barrier_sched - rcu_read_unlock_sched call_rcu_sched - [preempt_disable] synchronize_sched_expedited + rcu_read_lock_sched call_rcu rcu_barrier + rcu_read_unlock_sched synchronize_rcu + [preempt_disable] synchronize_rcu_expedited [and friends] rcu_read_lock_sched_notrace rcu_read_unlock_sched_notrace @@ -890,8 +891,8 @@ sched: Critical sections Grace period Barrier SRCU: Critical sections Grace period Barrier - srcu_read_lock synchronize_srcu srcu_barrier - srcu_read_unlock call_srcu + srcu_read_lock call_srcu srcu_barrier + srcu_read_unlock synchronize_srcu srcu_dereference synchronize_srcu_expedited srcu_dereference_check srcu_read_lock_held @@ -1034,7 +1035,7 @@ Answer: Just as PREEMPT_RT permits preemption of spinlock spinlocks blocking while in RCU read-side critical sections. - Why the apparent inconsistency? Because it is it + Why the apparent inconsistency? Because it is possible to use priority boosting to keep the RCU grace periods short if need be (for example, if running short of memory). In contrast, if blocking waiting From 090c1685fd628a8c191d77b5267a7dc226246a5b Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 5 Oct 2018 16:18:11 -0700 Subject: [PATCH 035/112] doc: rcu: Add more rationale for using rcu_read_lock_sched in checklist This commit explains why rcu_read_lock_sched is better than using preempt_disable. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- Documentation/RCU/checklist.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index 49747717d90579..8860ab2a897ae2 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -63,7 +63,7 @@ over a rather long period of time, but improvements are always welcome! pointer must be covered by rcu_read_lock(), rcu_read_lock_bh(), rcu_read_lock_sched(), or by the appropriate update-side lock. Disabling of preemption can serve as rcu_read_lock_sched(), but - is less readable. + is less readable and prevents lockdep from detecting locking issues. Letting RCU-protected pointers "leak" out of an RCU read-side critical section is every bid as bad as letting them leak out From bc2072c9adebd6ed1a192ed55ae195d8fb415c8d Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 5 Oct 2018 16:18:12 -0700 Subject: [PATCH 036/112] doc: rcu: Remove obsolete suggestion from checklist call_rcu_bh is now implemented in terms of call_rcu, so the suggestion to use a different API for speed benefits is not accurate anymore. This commit updates the document accordingly. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- Documentation/RCU/checklist.txt | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index 8860ab2a897ae2..cc22ce49618d7d 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -285,11 +285,7 @@ over a rather long period of time, but improvements are always welcome! here is that superuser already has lots of ways to crash the machine. - d. Use call_rcu_bh() rather than call_rcu(), in order to take - advantage of call_rcu_bh()'s faster grace periods. (This - is only a partial solution, though.) - - e. Periodically invoke synchronize_rcu(), permitting a limited + d. Periodically invoke synchronize_rcu(), permitting a limited number of updates per grace period. The same cautions apply to call_rcu_bh(), call_rcu_sched(), From e060a03a1c9288f169297c7461ae1e4790b6c53a Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 5 Oct 2018 16:18:13 -0700 Subject: [PATCH 037/112] doc: rcu: Remove obsolete checklist item about synchronize_rcu usage Since the RCU mechanisms have been consolidated, the checklist item warning that synchronize_rcu() waits only for RCU readers is obsolete. This commit therefore removes this checklist item. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- Documentation/RCU/checklist.txt | 37 +++++++-------------------------- 1 file changed, 7 insertions(+), 30 deletions(-) diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index cc22ce49618d7d..b90ad1b0665a02 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -320,37 +320,14 @@ over a rather long period of time, but improvements are always welcome! will break Alpha, cause aggressive compilers to generate bad code, and confuse people trying to read your code. -11. Note that synchronize_rcu() -only- guarantees to wait until - all currently executing rcu_read_lock()-protected RCU read-side - critical sections complete. It does -not- necessarily guarantee - that all currently running interrupts, NMIs, preempt_disable() - code, or idle loops will complete. Therefore, if your - read-side critical sections are protected by something other - than rcu_read_lock(), do -not- use synchronize_rcu(). - - Similarly, disabling preemption is not an acceptable substitute - for rcu_read_lock(). Code that attempts to use preemption - disabling where it should be using rcu_read_lock() will break - in CONFIG_PREEMPT=y kernel builds. - - If you want to wait for interrupt handlers, NMI handlers, and - code under the influence of preempt_disable(), you instead - need to use synchronize_irq() or synchronize_sched(). - - This same limitation also applies to synchronize_rcu_bh() - and synchronize_srcu(), as well as to the asynchronous and - expedited forms of the three primitives, namely call_rcu(), - call_rcu_bh(), call_srcu(), synchronize_rcu_expedited(), - synchronize_rcu_bh_expedited(), and synchronize_srcu_expedited(). - -12. Any lock acquired by an RCU callback must be acquired elsewhere +11. Any lock acquired by an RCU callback must be acquired elsewhere with softirq disabled, e.g., via spin_lock_irqsave(), spin_lock_bh(), etc. Failing to disable irq on a given acquisition of that lock will result in deadlock as soon as the RCU softirq handler happens to run your RCU callback while interrupting that acquisition's critical section. -13. RCU callbacks can be and are executed in parallel. In many cases, +12. RCU callbacks can be and are executed in parallel. In many cases, the callback code simply wrappers around kfree(), so that this is not an issue (or, more accurately, to the extent that it is an issue, the memory-allocator locking handles it). However, @@ -366,7 +343,7 @@ over a rather long period of time, but improvements are always welcome! not the case, a self-spawning RCU callback would prevent the victim CPU from ever going offline.) -14. Unlike other forms of RCU, it -is- permissible to block in an +13. Unlike other forms of RCU, it -is- permissible to block in an SRCU read-side critical section (demarked by srcu_read_lock() and srcu_read_unlock()), hence the "SRCU": "sleepable RCU". Please note that if you don't need to sleep in read-side critical @@ -410,7 +387,7 @@ over a rather long period of time, but improvements are always welcome! Note that rcu_dereference() and rcu_assign_pointer() relate to SRCU just as they do to other forms of RCU. -15. The whole point of call_rcu(), synchronize_rcu(), and friends +14. The whole point of call_rcu(), synchronize_rcu(), and friends is to wait until all pre-existing readers have finished before carrying out some otherwise-destructive operation. It is therefore critically important to -first- remove any path @@ -422,13 +399,13 @@ over a rather long period of time, but improvements are always welcome! is the caller's responsibility to guarantee that any subsequent readers will execute safely. -16. The various RCU read-side primitives do -not- necessarily contain +15. The various RCU read-side primitives do -not- necessarily contain memory barriers. You should therefore plan for the CPU and the compiler to freely reorder code into and out of RCU read-side critical sections. It is the responsibility of the RCU update-side primitives to deal with this. -17. Use CONFIG_PROVE_LOCKING, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the +16. Use CONFIG_PROVE_LOCKING, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the __rcu sparse checks to validate your RCU code. These can help find problems as follows: @@ -451,7 +428,7 @@ over a rather long period of time, but improvements are always welcome! These debugging aids can help you find problems that are otherwise extremely difficult to spot. -18. If you register a callback using call_rcu(), call_rcu_bh(), +17. If you register a callback using call_rcu(), call_rcu_bh(), call_rcu_sched(), or call_srcu(), and pass in a function defined within a loadable module, then it in necessary to wait for all pending callbacks to be invoked after the last invocation From 1c7d6d4411a1ce7530cbdc4605261c560e07d51a Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 5 Oct 2018 16:18:14 -0700 Subject: [PATCH 038/112] doc: rcu: Encourage use of rcu_barrier in checklist The checklist suggests rcu_barrier_bh() for RCU-bh and similarly for sched, however these APIs are now implemented as rcu_barrier() itself due to the RCU consolidation. This commit therefore corrects checklist.txt to encourage use of the underlying rcu_barrier() API. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- Documentation/RCU/checklist.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index b90ad1b0665a02..6f469864d9f59a 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -442,8 +442,8 @@ over a rather long period of time, but improvements are always welcome! You instead need to use one of the barrier functions: o call_rcu() -> rcu_barrier() - o call_rcu_bh() -> rcu_barrier_bh() - o call_rcu_sched() -> rcu_barrier_sched() + o call_rcu_bh() -> rcu_barrier() + o call_rcu_sched() -> rcu_barrier() o call_srcu() -> srcu_barrier() However, these barrier functions are absolutely -not- guaranteed From 93eb14201fc690687c2d94865bc38c1aa23356b8 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 8 Oct 2018 18:33:41 -0700 Subject: [PATCH 039/112] doc: Make reader aware of rcu_dereference_protected The whatisRCU.txt document says rcu_dereference() cannot be used outside of rcu_read_lock() protected sections. The commit adds a mention of rcu_dereference_protected(), so that the new reader knows that this API can be used to avoid update-side use of rcu_read_lock() and rcu_read_unlock(). Cc: tytso@mit.edu Suggested-by: tytso@mit.edu Signed-off-by: Joel Fernandes (Google) [ paulmck: Update wording, including further feedback from Joel. ] Signed-off-by: Paul E. McKenney --- Documentation/RCU/whatisRCU.txt | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 7c33445fd0e54e..4a6854318b17f0 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -266,7 +266,7 @@ rcu_dereference() unnecessary overhead on Alpha CPUs. Note that the value returned by rcu_dereference() is valid - only within the enclosing RCU read-side critical section. + only within the enclosing RCU read-side critical section [1]. For example, the following is -not- legal: rcu_read_lock(); @@ -292,6 +292,19 @@ rcu_dereference() typically used indirectly, via the _rcu list-manipulation primitives, such as list_for_each_entry_rcu(). + [1] The variant rcu_dereference_protected() can be used outside + of an RCU read-side critical section as long as the usage is + protected by locks acquired by the update-side code. This variant + avoids the lockdep warning that would happen when using (for + example) rcu_dereference() without rcu_read_lock() protection. + Using rcu_dereference_protected() also has the advantage + of permitting compiler optimizations that rcu_dereference() + must prohibit. The rcu_dereference_protected() variant takes + a lockdep expression to indicate which locks must be acquired + by the caller. If the indicated protection is not provided, + a lockdep splat is emitted. See RCU/Design/Requirements.html + and the API's code comments for more details and example usage. + The following diagram shows how each API communicates among the reader, updater, and reclaimer. From 8b9df28d7f2e50dc1be758e98dad61ec77d6f6b5 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 14 Oct 2018 14:29:55 -0700 Subject: [PATCH 040/112] doc: Remove obsolete (non-)requirement about disabling preemption The Requirements.html document says "Disabling Preemption Does Not Block Grace Periods". However this is no longer true with the RCU consolidation. This commit therefore removes the obsolete (non-)requirement entirely. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- .../RCU/Design/Requirements/Requirements.html | 50 ------------------- 1 file changed, 50 deletions(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 7efc1c1da7afd1..4fae55056c1d5b 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -900,8 +900,6 @@

    Fundamental Non-Requirements

    Grace Periods Don't Partition Read-Side Critical Sections
  • Read-Side Critical Sections Don't Partition Grace Periods -
  • - Disabling Preemption Does Not Block Grace Periods

    Readers Impose Minimal Ordering

    @@ -1259,54 +1257,6 @@

  •  
     
    -

    -Disabling Preemption Does Not Block Grace Periods

    - -

    -There was a time when disabling preemption on any given CPU would block -subsequent grace periods. -However, this was an accident of implementation and is not a requirement. -And in the current Linux-kernel implementation, disabling preemption -on a given CPU in fact does not block grace periods, as Oleg Nesterov -demonstrated. - -

    -If you need a preempt-disable region to block grace periods, you need to add -rcu_read_lock() and rcu_read_unlock(), for example -as follows: - -

    -
    - 1 preempt_disable();
    - 2 rcu_read_lock();
    - 3 do_something();
    - 4 rcu_read_unlock();
    - 5 preempt_enable();
    - 6
    - 7 /* Spinlocks implicitly disable preemption. */
    - 8 spin_lock(&mylock);
    - 9 rcu_read_lock();
    -10 do_something();
    -11 rcu_read_unlock();
    -12 spin_unlock(&mylock);
    -
    -
    - -

    -In theory, you could enter the RCU read-side critical section first, -but it is more efficient to keep the entire RCU read-side critical -section contained in the preempt-disable region as shown above. -Of course, RCU read-side critical sections that extend outside of -preempt-disable regions will work correctly, but such critical sections -can be preempted, which forces rcu_read_unlock() to do -more work. -And no, this is not an invitation to enclose all of your RCU -read-side critical sections within preempt-disable regions, because -doing so would degrade real-time response. - -

    -This non-requirement appeared with preemptible RCU. -

    Parallelism Facts of Life

    From 97949f0176da396c32e7c881cbfbc61642fb1266 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 14 Oct 2018 19:29:42 -0700 Subject: [PATCH 041/112] doc: Make listing in RCU perf/scale requirements use rcu_assign_pointer() The code listing under this section has a quick quiz that says line 19 uses rcu_access_pointer, but the code listing itself instead uses rcu_dereference(). This commit therefore makes the code listing match the quick quiz. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- Documentation/RCU/Design/Requirements/Requirements.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 4fae55056c1d5b..f74a2233865c16 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -1596,7 +1596,7 @@

    Performance and Scalability

    16 struct foo *p; 17 18 spin_lock(&gp_lock); -19 p = rcu_dereference(gp); +19 p = rcu_access_pointer(gp); 20 if (!p) { 21 spin_unlock(&gp_lock); 22 return false; From 97562c018135a9d01c59bd3bf95a9458548b79e2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 Oct 2018 10:54:13 -0700 Subject: [PATCH 042/112] doc: RCU scheduler spinlock rcu_read_unlock() restriction remains Given RCU flavor consolidation, when rcu_read_unlock() is invoked with interrupts disabled, the reporting of the corresponding quiescent state is deferred until interrupts are re-enabled. There was therefore some hope that this would allow dropping the restriction against holding scheduler spinlocks across an rcu_read_unlock() without disabling interrupts across the entire corresponding RCU read-side critical section. Unfortunately, the need to quickly provide a quiescent state to expedited grace periods sometimes requires a call to raise_softirq() during rcu_read_unlock() execution. Because raise_softirq() can sometimes acquire the scheduler spinlocks, the restriction must remain in effect. This commit therefore updates the RCU requirements documentation accordingly. Signed-off-by: Paul E. McKenney --- .../RCU/Design/Requirements/Requirements.html | 44 ++++++++++++------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index f74a2233865c16..9fca73e03a9895 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -2475,23 +2475,37 @@

    Scheduler and RCU

    but there is room for further improvement.

    -In the past, it was forbidden to disable interrupts across an -rcu_read_unlock() unless that interrupt-disabled region -of code also included the matching rcu_read_lock(). -Violating this restriction could result in deadlocks involving the -scheduler's runqueue and priority-inheritance spinlocks. -This restriction was lifted when interrupt-disabled calls to -rcu_read_unlock() started deferring the reporting of -the resulting RCU-preempt quiescent state until the end of that +It is forbidden to hold any of scheduler's runqueue or priority-inheritance +spinlocks across an rcu_read_unlock() unless interrupts have been +disabled across the entire RCU read-side critical section, that is, +up to and including the matching rcu_read_lock(). +Violating this restriction can result in deadlocks involving these +scheduler spinlocks. +There was hope that this restriction might be lifted when interrupt-disabled +calls to rcu_read_unlock() started deferring the reporting of +the resulting RCU-preempt quiescent state until the end of the corresponding interrupts-disabled region. -This deferred reporting means that the scheduler's runqueue and -priority-inheritance locks cannot be held while reporting an RCU-preempt -quiescent state, which lifts the earlier restriction, at least from -a deadlock perspective. -Unfortunately, real-time systems using RCU priority boosting may +Unfortunately, timely reporting of the corresponding quiescent state +to expedited grace periods requires a call to raise_softirq(), +which can acquire these scheduler spinlocks. +In addition, real-time systems using RCU priority boosting need this restriction to remain in effect because deferred -quiescent-state reporting also defers deboosting, which in turn -degrades real-time latencies. +quiescent-state reporting would also defer deboosting, which in turn +would degrade real-time latencies. + +

    +In theory, if a given RCU read-side critical section could be +guaranteed to be less than one second in duration, holding a scheduler +spinlock across that critical section's rcu_read_unlock() +would require only that preemption be disabled across the entire +RCU read-side critical section, not interrupts. +Unfortunately, given the possibility of vCPU preemption, long-running +interrupts, and so on, it is not possible in practice to guarantee +that a given RCU read-side critical section will complete in less than +one second. +Therefore, as noted above, if scheduler spinlocks are held across +a given call to rcu_read_unlock(), interrupts must be +disabled across the entire RCU read-side critical section.

    Tracing and RCU

    From a78ad16c7f0f948284d6927be95bc0e31a7b170b Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 29 Oct 2018 22:15:59 -0700 Subject: [PATCH 043/112] doc: Correct parameter in stallwarn The stallwarn document incorrectly mentions 'fps=' instead of 'fqs='. This commit orrects that. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- Documentation/RCU/stallwarn.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index b01bcafc64aa02..073dbc12d1ea06 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -205,7 +205,7 @@ handlers are no longer able to execute on this CPU. This can happen if the stalled CPU is spinning with interrupts are disabled, or, in -rt kernels, if a high-priority process is starving RCU's softirq handler. -The "fps=" shows the number of force-quiescent-state idle/offline +The "fqs=" shows the number of force-quiescent-state idle/offline detection passes that the grace-period kthread has made across this CPU since the last time that this CPU noted the beginning of a grace period. From 97b59370fa5959d5833a54f303f640d094af3d3c Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 27 Oct 2018 21:30:46 -0700 Subject: [PATCH 044/112] doc: Fix "struction" typo in RCU memory-ordering documentation This commit replaces "struction" with the correct "structure". Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- .../RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html index a346ce0116eb5e..e4d94fba6c89ad 100644 --- a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html +++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html @@ -77,7 +77,7 @@

    smp_mb__after_unlock_lock() immediately after successful acquisition of the lock. -

    Therefore, for any given rcu_node struction, any access +

    Therefore, for any given rcu_node structure, any access happening before one of the above lock-release functions will be seen by all CPUs as happening before any access happening after a later one of the above lock-acquisition functions. From 1dfddcdb95c426acbf9b54414fcc802e49a5aaca Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 14 Sep 2018 09:50:11 -0700 Subject: [PATCH 045/112] MAINTAINERS: Update from @linux.vnet.ibm.com to @linux.ibm.com IBM's patch-friendly email infrastructure is changing domains from @linux.vnet.ibm.com to @linux.ibm.com, which if nothing else might save a bit of typing. This commit therefore updates us stragglers' email addresses in the MAINTAINERS file. The old addresses are expected to continue to work for a few more months. While in the neighborhood, remove some obsolete entries, which results in an orphaned subsystem: "JSM Neo PCI based serial card". Signed-off-by: Paul E. McKenney Cc: Douglas Miller Cc: Eddie James Cc: Frank Haverkamp Cc: Frederic Barrat Cc: John Allen Cc: Manoj N. Kumar Cc: Matthew R. Ochs Cc: Michael Cyr Cc: Naveen N. Rao Cc: Paulo Flabiano Smorigo Cc: Philip Kelleher Cc: Steven Royer Cc: Thomas Falcon Cc: Tyrel Datwyler Cc: Uma Krishnan Acked-by: James Bottomley Acked-by: Mimi Zohar --- MAINTAINERS | 72 ++++++++++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 39 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index f4855974f32506..1b80a021f54260 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4033,7 +4033,7 @@ S: Supported F: drivers/net/ethernet/chelsio/cxgb4vf/ CXL (IBM Coherent Accelerator Processor Interface CAPI) DRIVER -M: Frederic Barrat +M: Frederic Barrat M: Andrew Donnellan L: linuxppc-dev@lists.ozlabs.org S: Supported @@ -4045,9 +4045,9 @@ F: Documentation/powerpc/cxl.txt F: Documentation/ABI/testing/sysfs-class-cxl CXLFLASH (IBM Coherent Accelerator Processor Interface CAPI Flash) SCSI DRIVER -M: Manoj N. Kumar -M: Matthew R. Ochs -M: Uma Krishnan +M: Manoj N. Kumar +M: Matthew R. Ochs +M: Uma Krishnan L: linux-scsi@vger.kernel.org S: Supported F: drivers/scsi/cxlflash/ @@ -5428,7 +5428,7 @@ S: Orphan F: fs/efs/ EHEA (IBM pSeries eHEA 10Gb ethernet adapter) DRIVER -M: Douglas Miller +M: Douglas Miller L: netdev@vger.kernel.org S: Maintained F: drivers/net/ethernet/ibm/ehea/ @@ -5565,7 +5565,7 @@ F: Documentation/filesystems/ext4/ext4.rst F: fs/ext4/ Extended Verification Module (EVM) -M: Mimi Zohar +M: Mimi Zohar L: linux-integrity@vger.kernel.org S: Supported F: security/integrity/evm/ @@ -5775,7 +5775,7 @@ F: include/linux/firmware.h FLASH ADAPTER DRIVER (IBM Flash Adapter 900GB Full Height PCI Flash Card) M: Joshua Morris -M: Philip Kelleher +M: Philip Kelleher S: Maintained F: drivers/block/rsxx/ @@ -6042,7 +6042,7 @@ F: include/linux/fscrypt*.h F: Documentation/filesystems/fscrypt.rst FSI-ATTACHED I2C DRIVER -M: Eddie James +M: Eddie James L: linux-i2c@vger.kernel.org L: openbmc@lists.ozlabs.org (moderated for non-subscribers) S: Maintained @@ -6218,8 +6218,7 @@ S: Supported F: drivers/uio/uio_pci_generic.c GENWQE (IBM Generic Workqueue Card) -M: Frank Haverkamp -M: Guilherme G. Piccoli +M: Frank Haverkamp S: Supported F: drivers/misc/genwqe/ @@ -7001,8 +7000,7 @@ F: crypto/842.c F: lib/842/ IBM Power in-Nest Crypto Acceleration -M: Leonidas S. Barbosa -M: Paulo Flabiano Smorigo +M: Paulo Flabiano Smorigo L: linux-crypto@vger.kernel.org S: Supported F: drivers/crypto/nx/Makefile @@ -7019,8 +7017,8 @@ S: Supported F: drivers/scsi/ipr.* IBM Power SRIOV Virtual NIC Device Driver -M: Thomas Falcon -M: John Allen +M: Thomas Falcon +M: John Allen L: netdev@vger.kernel.org S: Supported F: drivers/net/ethernet/ibm/ibmvnic.* @@ -7035,41 +7033,38 @@ F: arch/powerpc/include/asm/vas.h F: arch/powerpc/include/uapi/asm/vas.h IBM Power Virtual Ethernet Device Driver -M: Thomas Falcon +M: Thomas Falcon L: netdev@vger.kernel.org S: Supported F: drivers/net/ethernet/ibm/ibmveth.* IBM Power Virtual FC Device Drivers -M: Tyrel Datwyler +M: Tyrel Datwyler L: linux-scsi@vger.kernel.org S: Supported F: drivers/scsi/ibmvscsi/ibmvfc* IBM Power Virtual Management Channel Driver -M: Bryant G. Ly -M: Steven Royer +M: Steven Royer S: Supported F: drivers/misc/ibmvmc.* IBM Power Virtual SCSI Device Drivers -M: Tyrel Datwyler +M: Tyrel Datwyler L: linux-scsi@vger.kernel.org S: Supported F: drivers/scsi/ibmvscsi/ibmvscsi* F: include/scsi/viosrp.h IBM Power Virtual SCSI Device Target Driver -M: Bryant G. Ly -M: Michael Cyr +M: Michael Cyr L: linux-scsi@vger.kernel.org L: target-devel@vger.kernel.org S: Supported F: drivers/scsi/ibmvscsi_tgt/ IBM Power VMX Cryptographic instructions -M: Leonidas S. Barbosa -M: Paulo Flabiano Smorigo +M: Paulo Flabiano Smorigo L: linux-crypto@vger.kernel.org S: Supported F: drivers/crypto/vmx/Makefile @@ -7346,7 +7341,7 @@ S: Maintained L: linux-crypto@vger.kernel.org INTEGRITY MEASUREMENT ARCHITECTURE (IMA) -M: Mimi Zohar +M: Mimi Zohar M: Dmitry Kasatkin L: linux-integrity@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/zohar/linux-integrity.git @@ -7938,9 +7933,8 @@ S: Maintained F: drivers/media/platform/rcar_jpu.c JSM Neo PCI based serial card -M: Guilherme G. Piccoli L: linux-serial@vger.kernel.org -S: Maintained +S: Orphan F: drivers/tty/serial/jsm/ K10TEMP HARDWARE MONITORING DRIVER @@ -8170,7 +8164,7 @@ F: include/uapi/linux/kexec.h F: kernel/kexec* KEYS-ENCRYPTED -M: Mimi Zohar +M: Mimi Zohar L: linux-integrity@vger.kernel.org L: keyrings@vger.kernel.org S: Supported @@ -8179,9 +8173,9 @@ F: include/keys/encrypted-type.h F: security/keys/encrypted-keys/ KEYS-TRUSTED -M: James Bottomley +M: James Bottomley M: Jarkko Sakkinen -M: Mimi Zohar +M: Mimi Zohar L: linux-integrity@vger.kernel.org L: keyrings@vger.kernel.org S: Supported @@ -8234,7 +8228,7 @@ F: lib/test_kmod.c F: tools/testing/selftests/kmod/ KPROBES -M: Naveen N. Rao +M: Naveen N. Rao M: Anil S Keshavamurthy M: "David S. Miller" M: Masami Hiramatsu @@ -8590,7 +8584,7 @@ M: Nicholas Piggin M: David Howells M: Jade Alglave M: Luc Maranget -M: "Paul E. McKenney" +M: "Paul E. McKenney" R: Akira Yokosawa R: Daniel Lustig L: linux-kernel@vger.kernel.org @@ -9548,7 +9542,7 @@ F: drivers/platform/x86/mlx-platform.c MEMBARRIER SUPPORT M: Mathieu Desnoyers -M: "Paul E. McKenney" +M: "Paul E. McKenney" L: linux-kernel@vger.kernel.org S: Supported F: kernel/sched/membarrier.c @@ -10686,7 +10680,7 @@ S: Supported F: tools/objtool/ OCXL (Open Coherent Accelerator Processor Interface OpenCAPI) DRIVER -M: Frederic Barrat +M: Frederic Barrat M: Andrew Donnellan L: linuxppc-dev@lists.ozlabs.org S: Supported @@ -12487,7 +12481,7 @@ S: Orphan F: drivers/net/wireless/ray* RCUTORTURE TEST FRAMEWORK -M: "Paul E. McKenney" +M: "Paul E. McKenney" M: Josh Triplett R: Steven Rostedt R: Mathieu Desnoyers @@ -12534,7 +12528,7 @@ F: arch/x86/include/asm/intel_rdt_sched.h F: Documentation/x86/intel_rdt* READ-COPY UPDATE (RCU) -M: "Paul E. McKenney" +M: "Paul E. McKenney" M: Josh Triplett R: Steven Rostedt R: Mathieu Desnoyers @@ -12674,7 +12668,7 @@ F: include/linux/reset-controller.h RESTARTABLE SEQUENCES SUPPORT M: Mathieu Desnoyers M: Peter Zijlstra -M: "Paul E. McKenney" +M: "Paul E. McKenney" M: Boqun Feng L: linux-kernel@vger.kernel.org S: Supported @@ -13199,7 +13193,7 @@ F: drivers/scsi/sg.c F: include/scsi/sg.h SCSI SUBSYSTEM -M: "James E.J. Bottomley" +M: "James E.J. Bottomley" T: git git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi.git M: "Martin K. Petersen" T: git git://git.kernel.org/pub/scm/linux/kernel/git/mkp/scsi.git @@ -13634,7 +13628,7 @@ F: mm/sl?b* SLEEPABLE READ-COPY UPDATE (SRCU) M: Lai Jiangshan -M: "Paul E. McKenney" +M: "Paul E. McKenney" M: Josh Triplett R: Steven Rostedt R: Mathieu Desnoyers @@ -15057,7 +15051,7 @@ F: drivers/platform/x86/topstar-laptop.c TORTURE-TEST MODULES M: Davidlohr Bueso -M: "Paul E. McKenney" +M: "Paul E. McKenney" M: Josh Triplett L: linux-kernel@vger.kernel.org S: Supported From b3c1d9ec7c59feadd22693f43145d8285bd35b04 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Oct 2018 13:25:32 -0700 Subject: [PATCH 046/112] rcu: Avoid double multiply by HZ The rcu_check_gp_start_stall() function multiplies the return value from rcu_jiffies_till_stall_check() by HZ, but the units are already in jiffies. This commit therefore avoids the need for introduction of a jiffies-squared unit by removing the extraneous multiplication. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 121f833acd046c..4933f5d9d99289 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2603,7 +2603,7 @@ static void force_quiescent_state(void) static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp) { - const unsigned long gpssdelay = rcu_jiffies_till_stall_check() * HZ; + const unsigned long gpssdelay = rcu_jiffies_till_stall_check(); unsigned long flags; unsigned long j; struct rcu_node *rnp_root = rcu_get_root(); From 791416c47153b45f640d52baaf30995d9d396a08 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Oct 2018 15:42:44 -0700 Subject: [PATCH 047/112] rcu: Parameterize rcu_check_gp_start_stall() In order to debug forward-progress stalls, it is necessary to check for excessively delayed grace-period starts. This is currently done for RCU CPU stall warnings by rcu_check_gp_start_stall(), which checks to see if the start of a requested grace period has been delayed by an RCU CPU stall warning period. Because rcutorture will need to check for the time consumed by an RCU forward-progress delay, this commit promotes gpssdelay from a local variable to a formal parameter. It is not necessary to export rcu_check_gp_start_stall() because rcutorture will access it via a wrapper function. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4933f5d9d99289..36e30150e1e91c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2600,10 +2600,10 @@ static void force_quiescent_state(void) * This function checks for grace-period requests that fail to motivate * RCU to come out of its idle mode. */ -static void -rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp) +void +rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, + const unsigned long gpssdelay) { - const unsigned long gpssdelay = rcu_jiffies_till_stall_check(); unsigned long flags; unsigned long j; struct rcu_node *rnp_root = rcu_get_root(); @@ -2690,7 +2690,7 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused local_irq_restore(flags); } - rcu_check_gp_start_stall(rnp, rdp); + rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); /* If there are callbacks ready, invoke them. */ if (rcu_segcblist_ready_cbs(&rdp->cblist)) From 691960197e8daa39bf89886ba2e39de1e33f1ce4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Oct 2018 11:24:08 -0700 Subject: [PATCH 048/112] rcu: Add state name to show_rcu_gp_kthreads() output This commit adds the name of the RCU grace-period state to the show_rcu_gp_kthreads() output in order to ease debugging. This commit also moves gp_state_getname() up in the code so that show_rcu_gp_kthreads() can use it. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 36e30150e1e91c..ea78532183ac10 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -499,6 +499,16 @@ void rcu_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); +/* + * Convert a ->gp_state value to a character string. + */ +static const char *gp_state_getname(short gs) +{ + if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) + return "???"; + return gp_state_names[gs]; +} + /* * Show the state of the grace-period kthreads. */ @@ -508,8 +518,9 @@ void show_rcu_gp_kthreads(void) struct rcu_data *rdp; struct rcu_node *rnp; - pr_info("%s: wait state: %d ->state: %#lx\n", rcu_state.name, - rcu_state.gp_state, rcu_state.gp_kthread->state); + pr_info("%s: wait state: %s(%d) ->state: %#lx\n", rcu_state.name, + gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, + rcu_state.gp_kthread->state); rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) continue; @@ -1142,16 +1153,6 @@ static void record_gp_stall_check_time(void) rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); } -/* - * Convert a ->gp_state value to a character string. - */ -static const char *gp_state_getname(short gs) -{ - if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) - return "???"; - return gp_state_names[gs]; -} - /* * Complain about starvation of grace-period kthread. */ From c669c014d1dae9c7cdbfff049c798722f8650829 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Oct 2018 12:42:21 -0700 Subject: [PATCH 049/112] rcu: Add jiffies-since-GP-activity to show_rcu_gp_kthreads() This commit adds a printout of the number of jiffies since the last time that the RCU grace-period kthread did any processing. This can be useful when tracking down forward-progress issues. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ea78532183ac10..e7c9848d1e1bb1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -515,12 +515,14 @@ static const char *gp_state_getname(short gs) void show_rcu_gp_kthreads(void) { int cpu; + unsigned long j; struct rcu_data *rdp; struct rcu_node *rnp; - pr_info("%s: wait state: %s(%d) ->state: %#lx\n", rcu_state.name, - gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, - rcu_state.gp_kthread->state); + j = jiffies - READ_ONCE(rcu_state.gp_activity); + pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %ld\n", + rcu_state.name, gp_state_getname(rcu_state.gp_state), + rcu_state.gp_state, rcu_state.gp_kthread->state, j); rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) continue; From 2320bda26df75c02f86c9fa42c3355ebcd16d8ed Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Mon, 8 Oct 2018 06:50:41 +0000 Subject: [PATCH 050/112] rcu: Adjust the comment of function rcu_is_watching Because RCU avoids interrupting idle CPUs, rcu_is_watching() is used to test whether or not it is currently legal to run RCU read-side critical sections on this CPU. However, the first sentence and last sentences of current comment for rcu_is_watching have opposite meaning of what is expected. This commit therefore fixes this header comment. Signed-off-by: Zhouyi Zhou Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e7c9848d1e1bb1..429a46fb087a51 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -904,12 +904,12 @@ void rcu_irq_enter_irqson(void) } /** - * rcu_is_watching - see if RCU thinks that the current CPU is idle + * rcu_is_watching - see if RCU thinks that the current CPU is not idle * * Return true if RCU is watching the running CPU, which means that this * CPU can safely enter RCU read-side critical sections. In other words, - * if the current CPU is in its idle loop and is neither in an interrupt - * or NMI handler, return true. + * if the current CPU is not in its idle loop or is in an interrupt or + * NMI handler, return true. */ bool notrace rcu_is_watching(void) { From 0a89e5a402e957a97129423599a6ef6342da2764 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 Oct 2018 10:00:58 -0700 Subject: [PATCH 051/112] rcu: Trace end of grace period before end of grace period Currently, rcu_gp_cleanup() traces the end of the old grace period after the old grace period has officially ended. This might make intuitive sense, but it also makes for confusing event-trace output because the "end" trace displays not the old but instead the new grace-period number. This commit therefore traces the end of an old grace period just before that grace period officially ends. Reported-by: Aravinda Prasad Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 429a46fb087a51..61bae41b1afe17 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2035,9 +2035,9 @@ static void rcu_gp_cleanup(void) rnp = rcu_get_root(); raw_spin_lock_irq_rcu_node(rnp); /* GP before ->gp_seq update. */ - /* Declare grace period done. */ - rcu_seq_end(&rcu_state.gp_seq); + /* Declare grace period done, trace first to use old GP number. */ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end")); + rcu_seq_end(&rcu_state.gp_seq); rcu_state.gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(&rcu_data); From 05f415715ce45da07a0b1a5eac842765b733157f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 16 Oct 2018 04:12:58 -0700 Subject: [PATCH 052/112] rcu: Speed up expedited GPs when interrupting RCU reader In PREEMPT kernels, an expedited grace period might send an IPI to a CPU that is executing an RCU read-side critical section. In that case, it would be nice if the rcu_read_unlock() directly interacted with the RCU core code to immediately report the quiescent state. And this does happen in the case where the reader has been preempted. But it would also be a nice performance optimization if immediate reporting also happened in the preemption-free case. This commit therefore adds an ->exp_hint field to the task_struct structure's ->rcu_read_unlock_special field. The IPI handler sets this hint when it has interrupted an RCU read-side critical section, and this causes the outermost rcu_read_unlock() call to invoke rcu_read_unlock_special(), which, if preemption is enabled, reports the quiescent state immediately. If preemption is disabled, then the report is required to be deferred until preemption (or bottom halves or interrupts or whatever) is re-enabled. Because this is a hint, it does nothing for more complicated cases. For example, if the IPI interrupts an RCU reader, but interrupts are disabled across the rcu_read_unlock(), but another rcu_read_lock() is executed before interrupts are re-enabled, the hint will already have been cleared. If you do crazy things like this, reporting will be deferred until some later RCU_SOFTIRQ handler, context switch, cond_resched(), or similar. Reported-by: Joel Fernandes Signed-off-by: Paul E. McKenney Acked-by: Joel Fernandes (Google) --- include/linux/sched.h | 4 +++- kernel/rcu/tree_exp.h | 4 +++- kernel/rcu/tree_plugin.h | 14 +++++++++++--- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index a51c13c2b1a031..e4c7b6241088cc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -572,8 +572,10 @@ union rcu_special { struct { u8 blocked; u8 need_qs; + u8 exp_hint; /* Hint for performance. */ + u8 pad; /* No garbage from compiler! */ } b; /* Bits. */ - u16 s; /* Set of bits. */ + u32 s; /* Set of bits. */ }; enum perf_event_task_context { diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index e669ccf3751b1b..928fe5893a57d5 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -692,8 +692,10 @@ static void sync_rcu_exp_handler(void *unused) */ if (t->rcu_read_lock_nesting > 0) { raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (rnp->expmask & rdp->grpmask) + if (rnp->expmask & rdp->grpmask) { rdp->deferred_qs = true; + WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, true); + } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 05915e53633657..618956cc7a55f3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -642,13 +642,21 @@ static void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); irqs_were_disabled = irqs_disabled_flags(flags); - if ((preempt_bh_were_disabled || irqs_were_disabled) && - t->rcu_read_unlock_special.b.blocked) { + if (preempt_bh_were_disabled || irqs_were_disabled) { + WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); /* Need to defer quiescent state until everything is enabled. */ - raise_softirq_irqoff(RCU_SOFTIRQ); + if (irqs_were_disabled) { + /* Enabling irqs does not reschedule, so... */ + raise_softirq_irqoff(RCU_SOFTIRQ); + } else { + /* Enabling BH or preempt does reschedule, so... */ + set_tsk_need_resched(current); + set_preempt_need_resched(); + } local_irq_restore(flags); return; } + WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); rcu_preempt_deferred_qs_irqrestore(t, flags); } From 117f683c6e0104e1d6dfe8f143ea9c24ab069044 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 Nov 2018 14:20:57 -0800 Subject: [PATCH 053/112] rcu: Replace this_cpu_ptr() with __this_cpu_read() Because __this_cpu_read() can be lighter weight than equivalent uses of this_cpu_ptr(), this commit replaces the latter with the former. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 618956cc7a55f3..0bb1c1593ca41c 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -597,7 +597,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) */ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { - return (this_cpu_ptr(&rcu_data)->deferred_qs || + return (__this_cpu_read(rcu_data.deferred_qs) || READ_ONCE(t->rcu_read_unlock_special.s)) && t->rcu_read_lock_nesting <= 0; } From 5f1a6ef3746f536157922197d98676fa21154549 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Oct 2018 07:36:50 -0700 Subject: [PATCH 054/112] rcu: Avoid signed integer overflow in rcu_preempt_deferred_qs() Subtracting INT_MIN can be interpreted as unconditional signed integer overflow, which according to the C standard is undefined behavior. Therefore, kernel build arguments notwithstanding, it would be good to future-proof the code. This commit therefore substitutes INT_MAX for INT_MIN in order to avoid undefined behavior. While in the neighborhood, this commit also creates some meaningful names for INT_MAX and friends in order to improve readability, as suggested by Joel Fernandes. Reported-by: Ran Rozenstein Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0bb1c1593ca41c..3ed43f8cb02983 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -397,6 +397,11 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) return rnp->gp_tasks != NULL; } +/* Bias and limit values for ->rcu_read_lock_nesting. */ +#define RCU_NEST_BIAS INT_MAX +#define RCU_NEST_NMAX (-INT_MAX / 2) +#define RCU_NEST_PMAX (INT_MAX / 2) + /* * Preemptible RCU implementation for rcu_read_lock(). * Just increment ->rcu_read_lock_nesting, shared state will be updated @@ -405,6 +410,8 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) void __rcu_read_lock(void) { current->rcu_read_lock_nesting++; + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) + WARN_ON_ONCE(current->rcu_read_lock_nesting > RCU_NEST_PMAX); barrier(); /* critical section after entry code. */ } EXPORT_SYMBOL_GPL(__rcu_read_lock); @@ -424,20 +431,18 @@ void __rcu_read_unlock(void) --t->rcu_read_lock_nesting; } else { barrier(); /* critical section before exit code. */ - t->rcu_read_lock_nesting = INT_MIN; + t->rcu_read_lock_nesting = -RCU_NEST_BIAS; barrier(); /* assign before ->rcu_read_unlock_special load */ if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) rcu_read_unlock_special(t); barrier(); /* ->rcu_read_unlock_special load before assign */ t->rcu_read_lock_nesting = 0; } -#ifdef CONFIG_PROVE_LOCKING - { - int rrln = READ_ONCE(t->rcu_read_lock_nesting); + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { + int rrln = t->rcu_read_lock_nesting; - WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); + WARN_ON_ONCE(rrln < 0 && rrln > RCU_NEST_NMAX); } -#endif /* #ifdef CONFIG_PROVE_LOCKING */ } EXPORT_SYMBOL_GPL(__rcu_read_unlock); @@ -617,11 +622,11 @@ static void rcu_preempt_deferred_qs(struct task_struct *t) if (!rcu_preempt_need_deferred_qs(t)) return; if (couldrecurse) - t->rcu_read_lock_nesting -= INT_MIN; + t->rcu_read_lock_nesting -= RCU_NEST_BIAS; local_irq_save(flags); rcu_preempt_deferred_qs_irqrestore(t, flags); if (couldrecurse) - t->rcu_read_lock_nesting += INT_MIN; + t->rcu_read_lock_nesting += RCU_NEST_BIAS; } /* From b430c588bed984e4ed524e519a8c78862e94f1b3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 Nov 2018 21:58:13 -0800 Subject: [PATCH 055/112] MAINTAINERS: Add Joel Fernandes as RCU reviewer Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 1b80a021f54260..188809580d5e26 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12533,6 +12533,7 @@ M: Josh Triplett R: Steven Rostedt R: Mathieu Desnoyers R: Lai Jiangshan +R: Joel Fernandes L: linux-kernel@vger.kernel.org W: http://www.rdrop.com/users/paulmck/RCU/ S: Supported From 9189c7e706038a508567cb2e46ccdb68b08f4ac7 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 7 Sep 2018 15:26:18 -0700 Subject: [PATCH 056/112] checkpatch: Create table of obsolete APIs and apply to RCU This patch creates a deprecated_apis map, which allows such APIs to be flagged with suggested replacements more compactly and straightforwardly. It also uses this map to flag the old flavorful RCU APIs as deprecated, suggesting their vanilla-RCU counterparts as replacements. Signed-off-by: Joe Perches Cc: Andy Whitcroft Signed-off-by: Paul E. McKenney [ paulmck: Merged with earlier less-deft approach. ] --- scripts/checkpatch.pl | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index c883ec55654fb2..dd29e3c2816672 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -573,6 +573,27 @@ sub hash_show_words { } $mode_perms_search = "(?:${mode_perms_search})"; +our %deprecated_apis = ( + "synchronize_rcu_bh" => "synchronize_rcu", + "synchronize_rcu_bh_expedited" => "synchronize_rcu_expedited", + "call_rcu_bh" => "call_rcu", + "rcu_barrier_bh" => "rcu_barrier", + "synchronize_sched" => "synchronize_rcu", + "synchronize_sched_expedited" => "synchronize_rcu_expedited", + "call_rcu_sched" => "call_rcu", + "rcu_barrier_sched" => "rcu_barrier", + "get_state_synchronize_sched" => "get_state_synchronize_rcu", + "cond_synchronize_sched" => "cond_synchronize_rcu", +); + +#Create a search pattern for all these strings to speed up a loop below +our $deprecated_apis_search = ""; +foreach my $entry (keys %deprecated_apis) { + $deprecated_apis_search .= '|' if ($deprecated_apis_search ne ""); + $deprecated_apis_search .= $entry; +} +$deprecated_apis_search = "(?:${deprecated_apis_search})"; + our $mode_perms_world_writable = qr{ S_IWUGO | S_IWOTH | @@ -6368,6 +6389,14 @@ sub process { "please use device_initcall() or more appropriate function instead of __initcall() (see include/linux/init.h)\n" . $herecurr); } +# check for deprecated apis + if ($line =~ /\b($deprecated_apis_search)\b\s*\(/) { + my $deprecated_api = $1; + my $new_api = $deprecated_apis{$deprecated_api}; + WARN("DEPRECATED_API", + "Deprecated use of '$deprecated_api', prefer '$new_api' instead\n" . $herecurr); + } + # check for various structs that are normally const (ops, kgdb, device_tree) # and avoid what seem like struct definitions 'struct foo {' if ($line !~ /\bconst\b/ && From 3d709ab5a1767d422223e84cf5c20f0d33738274 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 11 Nov 2018 10:49:10 -0800 Subject: [PATCH 057/112] checkpatch.pl: Suggest lockdep instead of asserting !spin_is_locked() This commit points people who might otherwise code up something like WARN_ON(!spin_is_locked(&mylock)) to lockdep_assert_held(&mylock). Signed-off-by: Paul E. McKenney Cc: Andy Whitcroft Cc: Joe Perches --- scripts/checkpatch.pl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index dd29e3c2816672..377f373db6c002 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6389,6 +6389,12 @@ sub process { "please use device_initcall() or more appropriate function instead of __initcall() (see include/linux/init.h)\n" . $herecurr); } +# check for spin_is_locked(), suggest lockdep instead + if ($line =~ /\bspin_is_locked\(/) { + WARN("USE_LOCKDEP", + "Where possible, use lockdep_assert_held instead of assertions based on spin_is_locked\n" . $herecurr); + } + # check for deprecated apis if ($line =~ /\b($deprecated_apis_search)\b\s*\(/) { my $deprecated_api = $1; From 013ff4a6e7bc5aa64b0b937daae0244e3c93e49c Mon Sep 17 00:00:00 2001 From: Lance Roy Date: Thu, 4 Oct 2018 23:45:41 -0700 Subject: [PATCH 058/112] sfc: Replace spin_is_locked() with lockdep lockdep_assert_held() is better suited to checking locking requirements, since it only checks if the current thread holds the lock regardless of whether someone else does. This is also a step towards possibly removing spin_is_locked(). Signed-off-by: Lance Roy Cc: Solarflare linux maintainers Cc: Bert Kenward Cc: "David S. Miller" Cc: Signed-off-by: Paul E. McKenney Acked-by: Edward Cree --- drivers/net/ethernet/sfc/efx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 98fe7e762e1733..3643015a55cf20 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -3167,7 +3167,7 @@ struct hlist_head *efx_rps_hash_bucket(struct efx_nic *efx, { u32 hash = efx_filter_spec_hash(spec); - WARN_ON(!spin_is_locked(&efx->rps_hash_lock)); + lockdep_assert_held(&efx->rps_hash_lock); if (!efx->rps_hash_table) return NULL; return &efx->rps_hash_table[hash % EFX_ARFS_HASH_TABLE_SIZE]; From 97eeebea894283c290ead6694b76f575e7654773 Mon Sep 17 00:00:00 2001 From: Lance Roy Date: Thu, 4 Oct 2018 23:45:42 -0700 Subject: [PATCH 059/112] smsc: Replace spin_is_locked() with lockdep lockdep_assert_held() is better suited to checking locking requirements, since it only checks if the current thread holds the lock regardless of whether someone else does. This is also a step towards possibly removing spin_is_locked(). Signed-off-by: Lance Roy Cc: Steve Glendinning Cc: "David S. Miller" Cc: Signed-off-by: Paul E. McKenney --- drivers/net/ethernet/smsc/smsc911x.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/smsc/smsc911x.h b/drivers/net/ethernet/smsc/smsc911x.h index 8d75508acd2bfa..51b2fc1a395f17 100644 --- a/drivers/net/ethernet/smsc/smsc911x.h +++ b/drivers/net/ethernet/smsc/smsc911x.h @@ -67,7 +67,7 @@ #ifdef CONFIG_DEBUG_SPINLOCK #define SMSC_ASSERT_MAC_LOCK(pdata) \ - WARN_ON_SMP(!spin_is_locked(&pdata->mac_lock)) + lockdep_assert_held(&pdata->mac_lock) #else #define SMSC_ASSERT_MAC_LOCK(pdata) do {} while (0) #endif /* CONFIG_DEBUG_SPINLOCK */ From 456a737896b25e4853bbf3d6ca5a1c8ee4df4ee9 Mon Sep 17 00:00:00 2001 From: Lance Roy Date: Thu, 4 Oct 2018 23:45:44 -0700 Subject: [PATCH 060/112] userfaultfd: Replace spin_is_locked() with lockdep lockdep_assert_held() is better suited to checking locking requirements, since it only checks if the current thread holds the lock regardless of whether someone else does. This is also a step towards possibly removing spin_is_locked(). Signed-off-by: Lance Roy Cc: Alexander Viro Cc: Signed-off-by: Paul E. McKenney --- fs/userfaultfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 356d2b8568c142..681881dc8a9d04 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -926,7 +926,7 @@ static inline struct userfaultfd_wait_queue *find_userfault_in( wait_queue_entry_t *wq; struct userfaultfd_wait_queue *uwq; - VM_BUG_ON(!spin_is_locked(&wqh->lock)); + lockdep_assert_held(&wqh->lock); uwq = NULL; if (!waitqueue_active(wqh)) From 04547728b7b775333a4f6fbb3c55102a79dc4596 Mon Sep 17 00:00:00 2001 From: Lance Roy Date: Thu, 4 Oct 2018 23:45:46 -0700 Subject: [PATCH 061/112] locking/mutex: Replace spin_is_locked() with lockdep lockdep_assert_held() is better suited to checking locking requirements, since it only checks if the current thread holds the lock regardless of whether someone else does. This is also a step towards possibly removing spin_is_locked(). Signed-off-by: Lance Roy Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Will Deacon Signed-off-by: Paul E. McKenney --- kernel/locking/mutex-debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 9aa713629387c3..771d4ca96dda76 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -36,7 +36,7 @@ void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) { - SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); + lockdep_assert_held(&lock->wait_lock); DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list)); DEBUG_LOCKS_WARN_ON(waiter->magic != waiter); DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); @@ -51,7 +51,7 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter) void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task) { - SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); + lockdep_assert_held(&lock->wait_lock); /* Mark the current thread as blocked on the lock: */ task->blocked_on = waiter; From 35f3aa39f243e8c95e12a2b2d05b1d2e62ac58a4 Mon Sep 17 00:00:00 2001 From: Lance Roy Date: Thu, 4 Oct 2018 23:45:47 -0700 Subject: [PATCH 062/112] mm: Replace spin_is_locked() with lockdep lockdep_assert_held() is better suited to checking locking requirements, since it only checks if the current thread holds the lock regardless of whether someone else does. This is also a step towards possibly removing spin_is_locked(). Signed-off-by: Lance Roy Cc: Andrew Morton Cc: "Kirill A. Shutemov" Cc: Yang Shi Cc: Matthew Wilcox Cc: Mel Gorman Acked-by: Vlastimil Babka Cc: Jan Kara Cc: Shakeel Butt Cc: Signed-off-by: Paul E. McKenney --- mm/khugepaged.c | 4 ++-- mm/swap.c | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index c13625c1ad5e5e..7b86600a47c914 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1225,7 +1225,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) { struct mm_struct *mm = mm_slot->mm; - VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); + lockdep_assert_held(&khugepaged_mm_lock); if (khugepaged_test_exit(mm)) { /* free mm_slot */ @@ -1631,7 +1631,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int progress = 0; VM_BUG_ON(!pages); - VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); + lockdep_assert_held(&khugepaged_mm_lock); if (khugepaged_scan.mm_slot) mm_slot = khugepaged_scan.mm_slot; diff --git a/mm/swap.c b/mm/swap.c index aa483719922e73..5d786019eab957 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -823,8 +823,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, VM_BUG_ON_PAGE(!PageHead(page), page); VM_BUG_ON_PAGE(PageCompound(page_tail), page); VM_BUG_ON_PAGE(PageLRU(page_tail), page); - VM_BUG_ON(NR_CPUS != 1 && - !spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock)); + lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock); if (!list) SetPageLRU(page_tail); From d4d592a6eeda1e381f38f398e7a0474a599c11ed Mon Sep 17 00:00:00 2001 From: Lance Roy Date: Thu, 4 Oct 2018 23:45:50 -0700 Subject: [PATCH 063/112] KVM: arm/arm64: vgic: Replace spin_is_locked() with lockdep lockdep_assert_held() is better suited to checking locking requirements, since it only checks if the current thread holds the lock regardless of whether someone else does. This is also a step towards possibly removing spin_is_locked(). Signed-off-by: Lance Roy Cc: Marc Zyngier Cc: Eric Auger Cc: linux-arm-kernel@lists.infradead.org Cc: Signed-off-by: Paul E. McKenney Acked-by: Christoffer Dall --- virt/kvm/arm/vgic/vgic.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 7cfdfbc910e0c4..50e25438fb3c6f 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -196,7 +196,7 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active) */ static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) { - DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock)); + lockdep_assert_held(&irq->irq_lock); /* If the interrupt is active, it must stay on the current vcpu */ if (irq->active) @@ -273,7 +273,7 @@ static void vgic_sort_ap_list(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock)); + lockdep_assert_held(&vgic_cpu->ap_list_lock); list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp); } @@ -311,7 +311,7 @@ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, { struct kvm_vcpu *vcpu; - DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock)); + lockdep_assert_held(&irq->irq_lock); retry: vcpu = vgic_target_oracle(irq); @@ -702,7 +702,7 @@ static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu) static inline void vgic_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) { - DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock)); + lockdep_assert_held(&irq->irq_lock); if (kvm_vgic_global_state.type == VGIC_V2) vgic_v2_populate_lr(vcpu, irq, lr); @@ -736,7 +736,7 @@ static int compute_ap_list_depth(struct kvm_vcpu *vcpu, *multi_sgi = false; - DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock)); + lockdep_assert_held(&vgic_cpu->ap_list_lock); list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { int w; @@ -761,7 +761,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) bool multi_sgi; u8 prio = 0xff; - DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock)); + lockdep_assert_held(&vgic_cpu->ap_list_lock); count = compute_ap_list_depth(vcpu, &multi_sgi); if (count > kvm_vgic_global_state.nr_lr || multi_sgi) From a0076e1778c23de4a42d90fee4ecb4c21dbb5838 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 Nov 2018 16:57:40 -0800 Subject: [PATCH 064/112] crypto/pcrypt: Replace synchronize_rcu_bh() with synchronize_rcu() Now that synchronize_rcu() waits for bh-disable regions of code as well as RCU read-side critical sections, the synchronize_rcu_bh() in pcrypt_cpumask_change_notify() can be replaced by synchronize_rcu(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: Steffen Klassert Cc: Acked-by: Herbert Xu --- crypto/pcrypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c index f8ec3d4ba4a80f..8eb3c4c9ff6784 100644 --- a/crypto/pcrypt.c +++ b/crypto/pcrypt.c @@ -382,7 +382,7 @@ static int pcrypt_cpumask_change_notify(struct notifier_block *self, cpumask_copy(new_mask->mask, cpumask->cbcpu); rcu_assign_pointer(pcrypt->cb_cpumask, new_mask); - synchronize_rcu_bh(); + synchronize_rcu(); free_cpumask_var(old_mask->mask); kfree(old_mask); From 17c0eb74151ead3a7068e1691998de9950e91a08 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 Nov 2018 17:00:58 -0800 Subject: [PATCH 065/112] drivers/ipmi: Replace synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Acked-by: Corey Minyard --- drivers/char/ipmi/ipmi_si_intf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 677618e6f1f72f..dc8603d343209d 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -2187,7 +2187,7 @@ static void shutdown_smi(void *send_info) * handlers might have been running before we freed the * interrupt. */ - synchronize_sched(); + synchronize_rcu(); /* * Timeouts are stopped, now make sure the interrupts are off From fd8e688b0e8cc0c0a8ca18acb69cb8397ed62eec Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 Nov 2018 17:05:08 -0800 Subject: [PATCH 066/112] ethernet/sis: Replace synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: Francois Romieu Cc: "David S. Miller" Cc: --- drivers/net/ethernet/sis/sis190.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sis/sis190.c b/drivers/net/ethernet/sis/sis190.c index c2c50522b96d63..808cf98166737f 100644 --- a/drivers/net/ethernet/sis/sis190.c +++ b/drivers/net/ethernet/sis/sis190.c @@ -1142,7 +1142,7 @@ static void sis190_down(struct net_device *dev) if (!poll_locked) poll_locked++; - synchronize_sched(); + synchronize_rcu(); } while (SIS_R32(IntrMask)); From 16f11500842ce022fe654e38bd447e15790911dd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 Nov 2018 17:07:39 -0800 Subject: [PATCH 067/112] ethernet/realtek: Replace synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: Realtek linux nic maintainers Cc: "David S. Miller" Cc: --- drivers/net/ethernet/realtek/8139too.c | 2 +- drivers/net/ethernet/realtek/r8169.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c index ffd68a7bc9e1a3..69d752f0b621f4 100644 --- a/drivers/net/ethernet/realtek/8139too.c +++ b/drivers/net/ethernet/realtek/8139too.c @@ -1661,7 +1661,7 @@ static void rtl8139_tx_timeout_task (struct work_struct *work) napi_disable(&tp->napi); netif_stop_queue(dev); - synchronize_sched(); + synchronize_rcu(); netdev_dbg(dev, "Transmit timeout, status %02x %04x %04x media %02x\n", RTL_R8(ChipCmd), RTL_R16(IntrStatus), diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 1fd01688d37bdd..4f1d89f0dc2402 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -5866,7 +5866,7 @@ static void rtl_reset_work(struct rtl8169_private *tp) napi_disable(&tp->napi); netif_stop_queue(dev); - synchronize_sched(); + synchronize_rcu(); rtl8169_hw_reset(tp); @@ -6609,7 +6609,7 @@ static void rtl8169_down(struct net_device *dev) rtl8169_rx_missed(dev); /* Give a racing hard_start_xmit a few cycles to complete. */ - synchronize_sched(); + synchronize_rcu(); rtl8169_tx_clear(tp); From d05faa5f1ac50beef77b4ceba0e8e157d41146e2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 Nov 2018 17:14:53 -0800 Subject: [PATCH 068/112] drivers/vhost: Replace synchronize_rcu_bh() with synchronize_rcu() Now that synchronize_rcu() waits for bh-disable regions of code as well as RCU read-side critical sections, synchronize_rcu_bh() can be replaced by synchronize_rcu(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Cc: Cc: --- drivers/vhost/net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index ab11b2bee2739f..564ead864028a3 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -1359,7 +1359,7 @@ static int vhost_net_release(struct inode *inode, struct file *f) if (rx_sock) sockfd_put(rx_sock); /* Make sure no callbacks are outstanding */ - synchronize_rcu_bh(); + synchronize_rcu(); /* We do an extra flush before freeing memory, * since jobs can re-queue themselves. */ vhost_net_flush(n); From 09659af30860789b6f7d1c7dd35a7e829d530db5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 Nov 2018 17:17:47 -0800 Subject: [PATCH 069/112] cpufreq/intel_pstate: Replace synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: Srinivas Pandruvada Cc: Len Brown Cc: "Rafael J. Wysocki" Cc: Viresh Kumar Cc: --- drivers/cpufreq/intel_pstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 9578312e43f2f1..ed124d72db765a 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1930,7 +1930,7 @@ static void intel_pstate_clear_update_util_hook(unsigned int cpu) cpufreq_remove_update_util_hook(cpu); cpu_data->update_util_set = false; - synchronize_sched(); + synchronize_rcu(); } static int intel_pstate_get_max_freq(struct cpudata *cpu) From cc69b389fd7bfcd14ade19e302a771f0234e9c85 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 Nov 2018 17:23:56 -0800 Subject: [PATCH 070/112] cpufreq/cpufreq_governor: Replace synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney --- drivers/cpufreq/cpufreq_governor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 6d53f7d9fc7a92..ffa9adeaba31bf 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -346,7 +346,7 @@ static inline void gov_clear_update_util(struct cpufreq_policy *policy) for_each_cpu(i, policy->cpus) cpufreq_remove_update_util_hook(i); - synchronize_sched(); + synchronize_rcu(); } static struct policy_dbs_info *alloc_policy_dbs_info(struct cpufreq_policy *policy, From c93ffc15cceb057924410f9178e679120ee12353 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 Nov 2018 17:31:31 -0800 Subject: [PATCH 071/112] fs/file: Replace synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: Alexander Viro Cc: --- fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/file.c b/fs/file.c index 7ffd6e9d103d64..50304c7525eae8 100644 --- a/fs/file.c +++ b/fs/file.c @@ -158,7 +158,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) * or have finished their rcu_read_lock_sched() section. */ if (atomic_read(&files->count) > 1) - synchronize_sched(); + synchronize_rcu(); spin_lock(&files->file_lock); if (!new_fdt) From 7440172974e85b1828bdd84ac6b23b5bcad9c5eb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Nov 2018 18:44:52 -0800 Subject: [PATCH 072/112] tracing: Replace synchronize_sched() and call_rcu_sched() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). Similarly, call_rcu_sched() can be replaced by call_rcu(). This commit therefore makes these changes. Signed-off-by: Paul E. McKenney Cc: Ingo Molnar Cc: Acked-by: Steven Rostedt (VMware) --- include/linux/tracepoint.h | 2 +- kernel/trace/ftrace.c | 24 ++++++++++++------------ kernel/trace/ring_buffer.c | 12 ++++++------ kernel/trace/trace.c | 10 +++++----- kernel/trace/trace_events_filter.c | 4 ++-- kernel/trace/trace_kprobe.c | 2 +- kernel/tracepoint.c | 4 ++-- 7 files changed, 29 insertions(+), 29 deletions(-) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 538ba1a58f5b25..432080b59c266c 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -82,7 +82,7 @@ int unregister_tracepoint_module_notifier(struct notifier_block *nb) static inline void tracepoint_synchronize_unregister(void) { synchronize_srcu(&tracepoint_srcu); - synchronize_sched(); + synchronize_rcu(); } #else static inline void tracepoint_synchronize_unregister(void) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f536f601bd46e3..5b4f73e4fd56b5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -173,7 +173,7 @@ static void ftrace_sync(struct work_struct *work) { /* * This function is just a stub to implement a hard force - * of synchronize_sched(). This requires synchronizing + * of synchronize_rcu(). This requires synchronizing * tasks even in userspace and idle. * * Yes, function tracing is rude. @@ -934,7 +934,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, ftrace_profile_enabled = 0; /* * unregister_ftrace_profiler calls stop_machine - * so this acts like an synchronize_sched. + * so this acts like an synchronize_rcu. */ unregister_ftrace_profiler(); } @@ -1086,7 +1086,7 @@ struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr) /* * Some of the ops may be dynamically allocated, - * they are freed after a synchronize_sched(). + * they are freed after a synchronize_rcu(). */ preempt_disable_notrace(); @@ -1286,7 +1286,7 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) { if (!hash || hash == EMPTY_HASH) return; - call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); + call_rcu(&hash->rcu, __free_ftrace_hash_rcu); } void ftrace_free_filter(struct ftrace_ops *ops) @@ -1501,7 +1501,7 @@ static bool hash_contains_ip(unsigned long ip, * the ip is not in the ops->notrace_hash. * * This needs to be called with preemption disabled as - * the hashes are freed with call_rcu_sched(). + * the hashes are freed with call_rcu(). */ static int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) @@ -4496,7 +4496,7 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, if (ftrace_enabled && !ftrace_hash_empty(hash)) ftrace_run_modify_code(&probe->ops, FTRACE_UPDATE_CALLS, &old_hash_ops); - synchronize_sched(); + synchronize_rcu(); hlist_for_each_entry_safe(entry, tmp, &hhd, hlist) { hlist_del(&entry->hlist); @@ -5314,7 +5314,7 @@ ftrace_graph_release(struct inode *inode, struct file *file) mutex_unlock(&graph_lock); /* Wait till all users are no longer using the old hash */ - synchronize_sched(); + synchronize_rcu(); free_ftrace_hash(old_hash); } @@ -5707,7 +5707,7 @@ void ftrace_release_mod(struct module *mod) list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) { if (mod_map->mod == mod) { list_del_rcu(&mod_map->list); - call_rcu_sched(&mod_map->rcu, ftrace_free_mod_map); + call_rcu(&mod_map->rcu, ftrace_free_mod_map); break; } } @@ -5927,7 +5927,7 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, struct ftrace_mod_map *mod_map; const char *ret = NULL; - /* mod_map is freed via call_rcu_sched() */ + /* mod_map is freed via call_rcu() */ preempt_disable(); list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) { ret = ftrace_func_address_lookup(mod_map, addr, size, off, sym); @@ -6262,7 +6262,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, /* * Some of the ops may be dynamically allocated, - * they must be freed after a synchronize_sched(). + * they must be freed after a synchronize_rcu(). */ preempt_disable_notrace(); @@ -6433,7 +6433,7 @@ static void clear_ftrace_pids(struct trace_array *tr) rcu_assign_pointer(tr->function_pids, NULL); /* Wait till all users are no longer using pid filtering */ - synchronize_sched(); + synchronize_rcu(); trace_free_pid_list(pid_list); } @@ -6580,7 +6580,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, rcu_assign_pointer(tr->function_pids, pid_list); if (filtered_pids) { - synchronize_sched(); + synchronize_rcu(); trace_free_pid_list(filtered_pids); } else if (pid_list) { /* Register a probe to set whether to ignore the tracing of a task */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 65bd4616220db7..4f3247a532598e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1834,7 +1834,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, * There could have been a race between checking * record_disable and incrementing it. */ - synchronize_sched(); + synchronize_rcu(); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; rb_check_pages(cpu_buffer); @@ -3151,7 +3151,7 @@ static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) * This prevents all writes to the buffer. Any attempt to write * to the buffer after this will fail and return NULL. * - * The caller should call synchronize_sched() after this. + * The caller should call synchronize_rcu() after this. */ void ring_buffer_record_disable(struct ring_buffer *buffer) { @@ -3253,7 +3253,7 @@ bool ring_buffer_record_is_set_on(struct ring_buffer *buffer) * This prevents all writes to the buffer. Any attempt to write * to the buffer after this will fail and return NULL. * - * The caller should call synchronize_sched() after this. + * The caller should call synchronize_rcu() after this. */ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) { @@ -4191,7 +4191,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_prepare); void ring_buffer_read_prepare_sync(void) { - synchronize_sched(); + synchronize_rcu(); } EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); @@ -4363,7 +4363,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) atomic_inc(&cpu_buffer->record_disabled); /* Make sure all commits have finished */ - synchronize_sched(); + synchronize_rcu(); raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); @@ -4496,7 +4496,7 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, goto out; /* - * We can't do a synchronize_sched here because this + * We can't do a synchronize_rcu here because this * function can be called in atomic context. * Normally this will be called from the same CPU as cpu. * If not it's up to the caller to protect this. diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ff1c4b20cd0a6d..51612b4a603f2f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1681,7 +1681,7 @@ void tracing_reset(struct trace_buffer *buf, int cpu) ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ - synchronize_sched(); + synchronize_rcu(); ring_buffer_reset_cpu(buffer, cpu); ring_buffer_record_enable(buffer); @@ -1698,7 +1698,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf) ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ - synchronize_sched(); + synchronize_rcu(); buf->time_start = buffer_ftrace_now(buf, buf->cpu); @@ -2250,7 +2250,7 @@ void trace_buffered_event_disable(void) preempt_enable(); /* Wait for all current users to finish */ - synchronize_sched(); + synchronize_rcu(); for_each_tracing_cpu(cpu) { free_page((unsigned long)per_cpu(trace_buffered_event, cpu)); @@ -5398,7 +5398,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) if (tr->current_trace->reset) tr->current_trace->reset(tr); - /* Current trace needs to be nop_trace before synchronize_sched */ + /* Current trace needs to be nop_trace before synchronize_rcu */ tr->current_trace = &nop_trace; #ifdef CONFIG_TRACER_MAX_TRACE @@ -5412,7 +5412,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) * The update_max_tr is called from interrupts disabled * so a synchronized_sched() is sufficient. */ - synchronize_sched(); + synchronize_rcu(); free_snapshot(tr); } #endif diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 84a65173b1e91d..35f3aa55be859f 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1614,7 +1614,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, /* * The calls can still be using the old filters. - * Do a synchronize_sched() and to ensure all calls are + * Do a synchronize_rcu() and to ensure all calls are * done with them before we free them. */ tracepoint_synchronize_unregister(); @@ -1845,7 +1845,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, if (filter) { /* * No event actually uses the system filter - * we can free it without synchronize_sched(). + * we can free it without synchronize_rcu(). */ __free_filter(system->filter); system->filter = filter; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index fec67188c4d285..adc153ab51c0e5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -333,7 +333,7 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) * event_call related objects, which will be accessed in * the kprobe_trace_func/kretprobe_trace_func. */ - synchronize_sched(); + synchronize_rcu(); kfree(link); /* Ignored if link == NULL */ } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index a3be42304485fd..46f2ab1e08a9e9 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -92,7 +92,7 @@ static __init int release_early_probes(void) while (early_probes) { tmp = early_probes; early_probes = tmp->next; - call_rcu_sched(tmp, rcu_free_old_probes); + call_rcu(tmp, rcu_free_old_probes); } return 0; @@ -123,7 +123,7 @@ static inline void release_probes(struct tracepoint_func *old) * cover both cases. So let us chain the SRCU and sched RCU * callbacks to wait for both grace periods. */ - call_rcu_sched(&tp_probes->rcu, rcu_free_old_probes); + call_rcu(&tp_probes->rcu, rcu_free_old_probes); } } From ba180314253947f2a6057e21a0f92b5c314454b1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Nov 2018 18:58:01 -0800 Subject: [PATCH 073/112] main: Replace rcu_barrier_sched() with rcu_barrier() Now that all RCU flavors have been consolidated, rcu_barrier_sched() is but a synonym for rcu_barrier(). This commit therefore replaces the former with the latter. Signed-off-by: Paul E. McKenney Cc: Andrew Morton Cc: "Steven Rostedt (VMware)" Cc: Thomas Gleixner Cc: --- init/main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/init/main.c b/init/main.c index ee147103ba1bae..a4548633024387 100644 --- a/init/main.c +++ b/init/main.c @@ -1046,12 +1046,12 @@ static void mark_readonly(void) { if (rodata_enabled) { /* - * load_module() results in W+X mappings, which are cleaned up - * with call_rcu_sched(). Let's make sure that queued work is + * load_module() results in W+X mappings, which are cleaned + * up with call_rcu(). Let's make sure that queued work is * flushed so that we don't hit false positives looking for * insecure pages which are W+X. */ - rcu_barrier_sched(); + rcu_barrier(); mark_rodata_ro(); rodata_test(); } else From ae8b7ce7647bc5c56a9a9fc32b03e1cd0ae49629 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Nov 2018 19:04:39 -0800 Subject: [PATCH 074/112] kprobes: Replace synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: "Naveen N. Rao" Cc: Anil S Keshavamurthy Cc: "David S. Miller" Acked-by: Masami Hiramatsu --- kernel/kprobes.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 90e98e233647f1..08e31d86319146 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -229,7 +229,7 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c) struct kprobe_insn_page *kip, *next; /* Ensure no-one is interrupted on the garbages */ - synchronize_sched(); + synchronize_rcu(); list_for_each_entry_safe(kip, next, &c->pages, list) { int i; @@ -1382,7 +1382,7 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) if (ret) { ap->flags |= KPROBE_FLAG_DISABLED; list_del_rcu(&p->list); - synchronize_sched(); + synchronize_rcu(); } } } @@ -1597,7 +1597,7 @@ int register_kprobe(struct kprobe *p) ret = arm_kprobe(p); if (ret) { hlist_del_rcu(&p->hlist); - synchronize_sched(); + synchronize_rcu(); goto out; } } @@ -1776,7 +1776,7 @@ void unregister_kprobes(struct kprobe **kps, int num) kps[i]->addr = NULL; mutex_unlock(&kprobe_mutex); - synchronize_sched(); + synchronize_rcu(); for (i = 0; i < num; i++) if (kps[i]->addr) __unregister_kprobe_bottom(kps[i]); @@ -1966,7 +1966,7 @@ void unregister_kretprobes(struct kretprobe **rps, int num) rps[i]->kp.addr = NULL; mutex_unlock(&kprobe_mutex); - synchronize_sched(); + synchronize_rcu(); for (i = 0; i < num; i++) { if (rps[i]->kp.addr) { __unregister_kprobe_bottom(&rps[i]->kp); From 51959d85f32dde9041655936eef206cc3323dc12 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Nov 2018 19:06:51 -0800 Subject: [PATCH 075/112] lockdep: Replace synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Will Deacon --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 1efada2dd9dd88..ef27f98714c098 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4195,7 +4195,7 @@ void lockdep_free_key_range(void *start, unsigned long size) * * sync_sched() is sufficient because the read-side is IRQ disable. */ - synchronize_sched(); + synchronize_rcu(); /* * XXX at this point we could return the resources to the pool; From c9a863bbb1620dca3af57934cdbb002e852449fc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Nov 2018 19:09:14 -0800 Subject: [PATCH 076/112] sched/membarrier: synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: Mathieu Desnoyers Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/sched/membarrier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 388a7a6c1aa2cc..3cd8a3a795d2f6 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -210,7 +210,7 @@ static int membarrier_register_global_expedited(void) * future scheduler executions will observe the new * thread flag state for this mm. */ - synchronize_sched(); + synchronize_rcu(); } atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, &mm->membarrier_state); @@ -246,7 +246,7 @@ static int membarrier_register_private_expedited(int flags) * Ensure all future scheduler executions will observe the * new thread flag state for this process. */ - synchronize_sched(); + synchronize_rcu(); } atomic_or(state, &mm->membarrier_state); From cb2f55369d3a9e6cf5c34d2da39eb242279a582d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Nov 2018 19:17:01 -0800 Subject: [PATCH 077/112] modules: Replace synchronize_sched() and call_rcu_sched() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). Similarly, call_rcu_sched() can be replaced by call_rcu(). This commit therefore makes these changes. Signed-off-by: Paul E. McKenney Acked-by: Jessica Yu --- kernel/module.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/module.c b/kernel/module.c index 49a4058915870c..99b46c32d579ef 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2159,7 +2159,7 @@ static void free_module(struct module *mod) /* Remove this module from bug list, this uses list_del_rcu */ module_bug_cleanup(mod); /* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */ - synchronize_sched(); + synchronize_rcu(); mutex_unlock(&module_mutex); /* This may be empty, but that's OK */ @@ -3507,15 +3507,15 @@ static noinline int do_init_module(struct module *mod) /* * We want to free module_init, but be aware that kallsyms may be * walking this with preempt disabled. In all the failure paths, we - * call synchronize_sched(), but we don't want to slow down the success + * call synchronize_rcu(), but we don't want to slow down the success * path, so use actual RCU here. * Note that module_alloc() on most architectures creates W+X page * mappings which won't be cleaned up until do_free_init() runs. Any * code such as mark_rodata_ro() which depends on those mappings to * be cleaned up needs to sync with the queued work - ie - * rcu_barrier_sched() + * rcu_barrier() */ - call_rcu_sched(&freeinit->rcu, do_free_init); + call_rcu(&freeinit->rcu, do_free_init); mutex_unlock(&module_mutex); wake_up_all(&module_wq); @@ -3526,7 +3526,7 @@ static noinline int do_init_module(struct module *mod) fail: /* Try to protect us from buggy refcounters. */ mod->state = MODULE_STATE_GOING; - synchronize_sched(); + synchronize_rcu(); module_put(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); @@ -3819,7 +3819,7 @@ static int load_module(struct load_info *info, const char __user *uargs, ddebug_cleanup: ftrace_release_mod(mod); dynamic_debug_remove(mod, info->debug); - synchronize_sched(); + synchronize_rcu(); kfree(mod->args); free_arch_cleanup: module_arch_cleanup(mod); @@ -3834,7 +3834,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mod_tree_remove(mod); wake_up_all(&module_wq); /* Wait for RCU-sched synchronizing before releasing mod->list. */ - synchronize_sched(); + synchronize_rcu(); mutex_unlock(&module_mutex); free_module: /* Free lock-classes; relies on the preceding sync_rcu() */ From 25b0077511fe7cf1b876174f8481fb1742f4fb4d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Nov 2018 19:18:45 -0800 Subject: [PATCH 078/112] workqueue: Replace call_rcu_sched() with call_rcu() Now that call_rcu()'s callback is not invoked until after all preempt-disable regions of code have completed (in addition to explicitly marked RCU read-side critical sections), call_rcu() can be used in place of call_rcu_sched(). This commit therefore makes that change. Signed-off-by: Paul E. McKenney Cc: Lai Jiangshan Acked-by: Tejun Heo --- kernel/workqueue.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0280deac392e25..392be4b252f69d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3396,7 +3396,7 @@ static void put_unbound_pool(struct worker_pool *pool) del_timer_sync(&pool->mayday_timer); /* sched-RCU protected to allow dereferences from get_work_pool() */ - call_rcu_sched(&pool->rcu, rcu_free_pool); + call_rcu(&pool->rcu, rcu_free_pool); } /** @@ -3503,14 +3503,14 @@ static void pwq_unbound_release_workfn(struct work_struct *work) put_unbound_pool(pool); mutex_unlock(&wq_pool_mutex); - call_rcu_sched(&pwq->rcu, rcu_free_pwq); + call_rcu(&pwq->rcu, rcu_free_pwq); /* * If we're the last pwq going away, @wq is already dead and no one * is gonna access it anymore. Schedule RCU free. */ if (is_last) - call_rcu_sched(&wq->rcu, rcu_free_wq); + call_rcu(&wq->rcu, rcu_free_wq); } /** @@ -4195,7 +4195,7 @@ void destroy_workqueue(struct workqueue_struct *wq) * The base ref is never dropped on per-cpu pwqs. Directly * schedule RCU free. */ - call_rcu_sched(&wq->rcu, rcu_free_wq); + call_rcu(&wq->rcu, rcu_free_wq); } else { /* * We're the sole accessor of @wq at this point. Directly From 0809d95451f7d867d37cf2b526b8da923fd72891 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Nov 2018 19:20:05 -0800 Subject: [PATCH 079/112] events: Replace synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 84530ab358c37a..c4b90cf7734afb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9918,7 +9918,7 @@ static void account_event(struct perf_event *event) * call the perf scheduling hooks before proceeding to * install events that need them. */ - synchronize_sched(); + synchronize_rcu(); } /* * Now that we have waited for the sync_sched(), allow further From 36bd1a8e91c66e9def12958547548aa549de9cbf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Nov 2018 19:22:23 -0800 Subject: [PATCH 080/112] percpu-refcount: Replace call_rcu_sched() with call_rcu() Now that call_rcu()'s callback is not invoked until after all preempt-disable regions of code have completed (in addition to explicitly marked RCU read-side critical sections), call_rcu() can be used in place of call_rcu_sched(). This commit therefore makes that change. Signed-off-by: Paul E. McKenney Cc: Ming Lei Cc: Bart Van Assche Cc: Jens Axboe Acked-by: Tejun Heo --- lib/percpu-refcount.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index de10b8c0bff683..9877682e49c784 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -181,7 +181,7 @@ static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, ref->confirm_switch = confirm_switch ?: percpu_ref_noop_confirm_switch; percpu_ref_get(ref); /* put after confirmation */ - call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu); + call_rcu(&ref->rcu, percpu_ref_switch_to_atomic_rcu); } static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) From 6564a25e6c185e65ca3148ed6e18f80882f6798f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Nov 2018 19:24:33 -0800 Subject: [PATCH 081/112] slab: Replace synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrew Morton Cc: --- mm/slab.c | 4 ++-- mm/slab_common.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 2a5654bb3b3ff3..3abb9feb3818e5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -962,10 +962,10 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep, * To protect lockless access to n->shared during irq disabled context. * If n->shared isn't NULL in irq disabled context, accessing to it is * guaranteed to be valid until irq is re-enabled, because it will be - * freed after synchronize_sched(). + * freed after synchronize_rcu(). */ if (old_shared && force_change) - synchronize_sched(); + synchronize_rcu(); fail: kfree(old_shared); diff --git a/mm/slab_common.c b/mm/slab_common.c index 7eb8dc136c1cb8..9c11e8a937d2e4 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -724,7 +724,7 @@ void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, css_get(&s->memcg_params.memcg->css); s->memcg_params.deact_fn = deact_fn; - call_rcu_sched(&s->memcg_params.deact_rcu_head, kmemcg_deactivate_rcufn); + call_rcu(&s->memcg_params.deact_rcu_head, kmemcg_deactivate_rcufn); } void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) @@ -839,11 +839,11 @@ static void flush_memcg_workqueue(struct kmem_cache *s) mutex_unlock(&slab_mutex); /* - * SLUB deactivates the kmem_caches through call_rcu_sched. Make + * SLUB deactivates the kmem_caches through call_rcu. Make * sure all registered rcu callbacks have been invoked. */ if (IS_ENABLED(CONFIG_SLUB)) - rcu_barrier_sched(); + rcu_barrier(); /* * SLAB and SLUB create memcg kmem_caches through workqueue and SLUB From b401ec18485ab95c49c160b170514f7ab0f8f774 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Nov 2018 19:30:34 -0800 Subject: [PATCH 082/112] mm: Replace call_rcu_sched() with call_rcu() Now that call_rcu()'s callback is not invoked until after all preempt-disable regions of code have completed (in addition to explicitly marked RCU read-side critical sections), call_rcu() can be used in place of call_rcu_sched(). This commit therefore makes that change. Signed-off-by: Paul E. McKenney --- mm/mmu_gather.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 2a9fbc4a37d59e..f2f03c65580707 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -199,7 +199,7 @@ void tlb_table_flush(struct mmu_gather *tlb) if (*batch) { tlb_table_invalidate(tlb); - call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); + call_rcu(&(*batch)->rcu, tlb_remove_table_rcu); *batch = NULL; } } From eb4c2382272ae7ae5d81fdfa5b7a6c86146eaaa4 Mon Sep 17 00:00:00 2001 From: Dennis Krein Date: Fri, 26 Oct 2018 07:38:24 -0700 Subject: [PATCH 083/112] srcu: Lock srcu_data structure in srcu_gp_start() The srcu_gp_start() function is called with the srcu_struct structure's ->lock held, but not with the srcu_data structure's ->lock. This is problematic because this function accesses and updates the srcu_data structure's ->srcu_cblist, which is protected by that lock. Failing to hold this lock can result in corruption of the SRCU callback lists, which in turn can result in arbitrarily bad results. This commit therefore makes srcu_gp_start() acquire the srcu_data structure's ->lock across the calls to rcu_segcblist_advance() and rcu_segcblist_accelerate(), thus preventing this corruption. Reported-by: Bart Van Assche Reported-by: Christoph Hellwig Reported-by: Sebastian Kuzminsky Signed-off-by: Dennis Krein Signed-off-by: Paul E. McKenney Tested-by: Dennis Krein Cc: # 4.16.x --- kernel/rcu/srcutree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 60f3236beaf721..697a2d7e8e8ace 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -451,10 +451,12 @@ static void srcu_gp_start(struct srcu_struct *sp) lockdep_assert_held(&ACCESS_PRIVATE(sp, lock)); WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); + spin_lock_rcu_node(sdp); /* Interrupts already disabled. */ rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, rcu_seq_snap(&sp->srcu_gp_seq)); + spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ rcu_seq_start(&sp->srcu_gp_seq); state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); From aacb5d91ab1bfbb0e8123da59a2e333d52ba7f60 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 28 Oct 2018 10:32:51 -0700 Subject: [PATCH 084/112] srcu: Use "ssp" instead of "sp" for srcu_struct pointer In RCU, the distinction between "rsp", "rnp", and "rdp" has served well for a great many years, but in SRCU, "sp" vs. "sdp" has proven confusing. This commit therefore renames SRCU's "sp" pointers to "ssp", so that there is "ssp" for srcu_struct pointer, "snp" for srcu_node pointer, and "sdp" for srcu_data pointer. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 78 +++---- include/linux/srcutiny.h | 24 +- include/linux/srcutree.h | 8 +- kernel/rcu/srcutiny.c | 120 +++++----- kernel/rcu/srcutree.c | 488 +++++++++++++++++++-------------------- 5 files changed, 359 insertions(+), 359 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index ebd5f1511690f1..c614375cd264f8 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -38,20 +38,20 @@ struct srcu_struct; #ifdef CONFIG_DEBUG_LOCK_ALLOC -int __init_srcu_struct(struct srcu_struct *sp, const char *name, +int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key); -#define init_srcu_struct(sp) \ +#define init_srcu_struct(ssp) \ ({ \ static struct lock_class_key __srcu_key; \ \ - __init_srcu_struct((sp), #sp, &__srcu_key); \ + __init_srcu_struct((ssp), #ssp, &__srcu_key); \ }) #define __SRCU_DEP_MAP_INIT(srcu_name) .dep_map = { .name = #srcu_name }, #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -int init_srcu_struct(struct srcu_struct *sp); +int init_srcu_struct(struct srcu_struct *ssp); #define __SRCU_DEP_MAP_INIT(srcu_name) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ @@ -67,28 +67,28 @@ int init_srcu_struct(struct srcu_struct *sp); struct srcu_struct { }; #endif -void call_srcu(struct srcu_struct *sp, struct rcu_head *head, +void call_srcu(struct srcu_struct *ssp, struct rcu_head *head, void (*func)(struct rcu_head *head)); -void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced); -int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); -void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); -void synchronize_srcu(struct srcu_struct *sp); +void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced); +int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); +void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); +void synchronize_srcu(struct srcu_struct *ssp); /** * cleanup_srcu_struct - deconstruct a sleep-RCU structure - * @sp: structure to clean up. + * @ssp: structure to clean up. * * Must invoke this after you are finished using a given srcu_struct that * was initialized via init_srcu_struct(), else you leak memory. */ -static inline void cleanup_srcu_struct(struct srcu_struct *sp) +static inline void cleanup_srcu_struct(struct srcu_struct *ssp) { - _cleanup_srcu_struct(sp, false); + _cleanup_srcu_struct(ssp, false); } /** * cleanup_srcu_struct_quiesced - deconstruct a quiesced sleep-RCU structure - * @sp: structure to clean up. + * @ssp: structure to clean up. * * Must invoke this after you are finished using a given srcu_struct that * was initialized via init_srcu_struct(), else you leak memory. Also, @@ -103,16 +103,16 @@ static inline void cleanup_srcu_struct(struct srcu_struct *sp) * (with high probability, anyway), and will also cause the srcu_struct * to be leaked. */ -static inline void cleanup_srcu_struct_quiesced(struct srcu_struct *sp) +static inline void cleanup_srcu_struct_quiesced(struct srcu_struct *ssp) { - _cleanup_srcu_struct(sp, true); + _cleanup_srcu_struct(ssp, true); } #ifdef CONFIG_DEBUG_LOCK_ALLOC /** * srcu_read_lock_held - might we be in SRCU read-side critical section? - * @sp: The srcu_struct structure to check + * @ssp: The srcu_struct structure to check * * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an SRCU * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, @@ -126,16 +126,16 @@ static inline void cleanup_srcu_struct_quiesced(struct srcu_struct *sp) * relies on normal RCU, it can be called from the CPU which * is in the idle loop from an RCU point of view or offline. */ -static inline int srcu_read_lock_held(const struct srcu_struct *sp) +static inline int srcu_read_lock_held(const struct srcu_struct *ssp) { if (!debug_lockdep_rcu_enabled()) return 1; - return lock_is_held(&sp->dep_map); + return lock_is_held(&ssp->dep_map); } #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -static inline int srcu_read_lock_held(const struct srcu_struct *sp) +static inline int srcu_read_lock_held(const struct srcu_struct *ssp) { return 1; } @@ -145,7 +145,7 @@ static inline int srcu_read_lock_held(const struct srcu_struct *sp) /** * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing * @p: the pointer to fetch and protect for later dereferencing - * @sp: pointer to the srcu_struct, which is used to check that we + * @ssp: pointer to the srcu_struct, which is used to check that we * really are in an SRCU read-side critical section. * @c: condition to check for update-side use * @@ -154,32 +154,32 @@ static inline int srcu_read_lock_held(const struct srcu_struct *sp) * to 1. The @c argument will normally be a logical expression containing * lockdep_is_held() calls. */ -#define srcu_dereference_check(p, sp, c) \ - __rcu_dereference_check((p), (c) || srcu_read_lock_held(sp), __rcu) +#define srcu_dereference_check(p, ssp, c) \ + __rcu_dereference_check((p), (c) || srcu_read_lock_held(ssp), __rcu) /** * srcu_dereference - fetch SRCU-protected pointer for later dereferencing * @p: the pointer to fetch and protect for later dereferencing - * @sp: pointer to the srcu_struct, which is used to check that we + * @ssp: pointer to the srcu_struct, which is used to check that we * really are in an SRCU read-side critical section. * * Makes rcu_dereference_check() do the dirty work. If PROVE_RCU * is enabled, invoking this outside of an RCU read-side critical * section will result in an RCU-lockdep splat. */ -#define srcu_dereference(p, sp) srcu_dereference_check((p), (sp), 0) +#define srcu_dereference(p, ssp) srcu_dereference_check((p), (ssp), 0) /** * srcu_dereference_notrace - no tracing and no lockdep calls from here * @p: the pointer to fetch and protect for later dereferencing - * @sp: pointer to the srcu_struct, which is used to check that we + * @ssp: pointer to the srcu_struct, which is used to check that we * really are in an SRCU read-side critical section. */ -#define srcu_dereference_notrace(p, sp) srcu_dereference_check((p), (sp), 1) +#define srcu_dereference_notrace(p, ssp) srcu_dereference_check((p), (ssp), 1) /** * srcu_read_lock - register a new reader for an SRCU-protected structure. - * @sp: srcu_struct in which to register the new reader. + * @ssp: srcu_struct in which to register the new reader. * * Enter an SRCU read-side critical section. Note that SRCU read-side * critical sections may be nested. However, it is illegal to @@ -194,44 +194,44 @@ static inline int srcu_read_lock_held(const struct srcu_struct *sp) * srcu_read_unlock() in an irq handler if the matching srcu_read_lock() * was invoked in process context. */ -static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp) +static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) { int retval; - retval = __srcu_read_lock(sp); - rcu_lock_acquire(&(sp)->dep_map); + retval = __srcu_read_lock(ssp); + rcu_lock_acquire(&(ssp)->dep_map); return retval; } /* Used by tracing, cannot be traced and cannot invoke lockdep. */ static inline notrace int -srcu_read_lock_notrace(struct srcu_struct *sp) __acquires(sp) +srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) { int retval; - retval = __srcu_read_lock(sp); + retval = __srcu_read_lock(ssp); return retval; } /** * srcu_read_unlock - unregister a old reader from an SRCU-protected structure. - * @sp: srcu_struct in which to unregister the old reader. + * @ssp: srcu_struct in which to unregister the old reader. * @idx: return value from corresponding srcu_read_lock(). * * Exit an SRCU read-side critical section. */ -static inline void srcu_read_unlock(struct srcu_struct *sp, int idx) - __releases(sp) +static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) + __releases(ssp) { - rcu_lock_release(&(sp)->dep_map); - __srcu_read_unlock(sp, idx); + rcu_lock_release(&(ssp)->dep_map); + __srcu_read_unlock(ssp, idx); } /* Used by tracing, cannot be traced and cannot call lockdep. */ static inline notrace void -srcu_read_unlock_notrace(struct srcu_struct *sp, int idx) __releases(sp) +srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp) { - __srcu_read_unlock(sp, idx); + __srcu_read_unlock(ssp, idx); } /** diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index f41d2fb09f87bc..b19216aaaef2eb 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -60,7 +60,7 @@ void srcu_drive_gp(struct work_struct *wp); #define DEFINE_STATIC_SRCU(name) \ static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name) -void synchronize_srcu(struct srcu_struct *sp); +void synchronize_srcu(struct srcu_struct *ssp); /* * Counts the new reader in the appropriate per-CPU element of the @@ -68,36 +68,36 @@ void synchronize_srcu(struct srcu_struct *sp); * __srcu_read_unlock() must be in the same handler instance. Returns an * index that must be passed to the matching srcu_read_unlock(). */ -static inline int __srcu_read_lock(struct srcu_struct *sp) +static inline int __srcu_read_lock(struct srcu_struct *ssp) { int idx; - idx = READ_ONCE(sp->srcu_idx); - WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1); + idx = READ_ONCE(ssp->srcu_idx); + WRITE_ONCE(ssp->srcu_lock_nesting[idx], ssp->srcu_lock_nesting[idx] + 1); return idx; } -static inline void synchronize_srcu_expedited(struct srcu_struct *sp) +static inline void synchronize_srcu_expedited(struct srcu_struct *ssp) { - synchronize_srcu(sp); + synchronize_srcu(ssp); } -static inline void srcu_barrier(struct srcu_struct *sp) +static inline void srcu_barrier(struct srcu_struct *ssp) { - synchronize_srcu(sp); + synchronize_srcu(ssp); } /* Defined here to avoid size increase for non-torture kernels. */ -static inline void srcu_torture_stats_print(struct srcu_struct *sp, +static inline void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) { int idx; - idx = READ_ONCE(sp->srcu_idx) & 0x1; + idx = READ_ONCE(ssp->srcu_idx) & 0x1; pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", tt, tf, idx, - READ_ONCE(sp->srcu_lock_nesting[!idx]), - READ_ONCE(sp->srcu_lock_nesting[idx])); + READ_ONCE(ssp->srcu_lock_nesting[!idx]), + READ_ONCE(ssp->srcu_lock_nesting[idx])); } #endif diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 0ae91b3a7406ec..6f292bd3e7db70 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -51,7 +51,7 @@ struct srcu_data { unsigned long grpmask; /* Mask for leaf srcu_node */ /* ->srcu_data_have_cbs[]. */ int cpu; - struct srcu_struct *sp; + struct srcu_struct *ssp; }; /* @@ -138,8 +138,8 @@ struct srcu_struct { #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) -void synchronize_srcu_expedited(struct srcu_struct *sp); -void srcu_barrier(struct srcu_struct *sp); -void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf); +void synchronize_srcu_expedited(struct srcu_struct *ssp); +void srcu_barrier(struct srcu_struct *ssp); +void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf); #endif diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index b46e6683f8c9ec..32dfd652254850 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -37,30 +37,30 @@ int rcu_scheduler_active __read_mostly; static LIST_HEAD(srcu_boot_list); static bool srcu_init_done; -static int init_srcu_struct_fields(struct srcu_struct *sp) +static int init_srcu_struct_fields(struct srcu_struct *ssp) { - sp->srcu_lock_nesting[0] = 0; - sp->srcu_lock_nesting[1] = 0; - init_swait_queue_head(&sp->srcu_wq); - sp->srcu_cb_head = NULL; - sp->srcu_cb_tail = &sp->srcu_cb_head; - sp->srcu_gp_running = false; - sp->srcu_gp_waiting = false; - sp->srcu_idx = 0; - INIT_WORK(&sp->srcu_work, srcu_drive_gp); - INIT_LIST_HEAD(&sp->srcu_work.entry); + ssp->srcu_lock_nesting[0] = 0; + ssp->srcu_lock_nesting[1] = 0; + init_swait_queue_head(&ssp->srcu_wq); + ssp->srcu_cb_head = NULL; + ssp->srcu_cb_tail = &ssp->srcu_cb_head; + ssp->srcu_gp_running = false; + ssp->srcu_gp_waiting = false; + ssp->srcu_idx = 0; + INIT_WORK(&ssp->srcu_work, srcu_drive_gp); + INIT_LIST_HEAD(&ssp->srcu_work.entry); return 0; } #ifdef CONFIG_DEBUG_LOCK_ALLOC -int __init_srcu_struct(struct srcu_struct *sp, const char *name, +int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) { /* Don't re-initialize a lock while it is held. */ - debug_check_no_locks_freed((void *)sp, sizeof(*sp)); - lockdep_init_map(&sp->dep_map, name, key, 0); - return init_srcu_struct_fields(sp); + debug_check_no_locks_freed((void *)ssp, sizeof(*ssp)); + lockdep_init_map(&ssp->dep_map, name, key, 0); + return init_srcu_struct_fields(ssp); } EXPORT_SYMBOL_GPL(__init_srcu_struct); @@ -68,15 +68,15 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct); /* * init_srcu_struct - initialize a sleep-RCU structure - * @sp: structure to initialize. + * @ssp: structure to initialize. * * Must invoke this on a given srcu_struct before passing that srcu_struct * to any other function. Each srcu_struct represents a separate domain * of SRCU protection. */ -int init_srcu_struct(struct srcu_struct *sp) +int init_srcu_struct(struct srcu_struct *ssp) { - return init_srcu_struct_fields(sp); + return init_srcu_struct_fields(ssp); } EXPORT_SYMBOL_GPL(init_srcu_struct); @@ -84,22 +84,22 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); /* * cleanup_srcu_struct - deconstruct a sleep-RCU structure - * @sp: structure to clean up. + * @ssp: structure to clean up. * * Must invoke this after you are finished using a given srcu_struct that * was initialized via init_srcu_struct(), else you leak memory. */ -void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced) +void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) { - WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); + WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]); if (quiesced) - WARN_ON(work_pending(&sp->srcu_work)); + WARN_ON(work_pending(&ssp->srcu_work)); else - flush_work(&sp->srcu_work); - WARN_ON(sp->srcu_gp_running); - WARN_ON(sp->srcu_gp_waiting); - WARN_ON(sp->srcu_cb_head); - WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail); + flush_work(&ssp->srcu_work); + WARN_ON(ssp->srcu_gp_running); + WARN_ON(ssp->srcu_gp_waiting); + WARN_ON(ssp->srcu_cb_head); + WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail); } EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); @@ -107,13 +107,13 @@ EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); * Removes the count for the old reader from the appropriate element of * the srcu_struct. */ -void __srcu_read_unlock(struct srcu_struct *sp, int idx) +void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { - int newval = sp->srcu_lock_nesting[idx] - 1; + int newval = ssp->srcu_lock_nesting[idx] - 1; - WRITE_ONCE(sp->srcu_lock_nesting[idx], newval); - if (!newval && READ_ONCE(sp->srcu_gp_waiting)) - swake_up_one(&sp->srcu_wq); + WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); + if (!newval && READ_ONCE(ssp->srcu_gp_waiting)) + swake_up_one(&ssp->srcu_wq); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -127,24 +127,24 @@ void srcu_drive_gp(struct work_struct *wp) int idx; struct rcu_head *lh; struct rcu_head *rhp; - struct srcu_struct *sp; + struct srcu_struct *ssp; - sp = container_of(wp, struct srcu_struct, srcu_work); - if (sp->srcu_gp_running || !READ_ONCE(sp->srcu_cb_head)) + ssp = container_of(wp, struct srcu_struct, srcu_work); + if (ssp->srcu_gp_running || !READ_ONCE(ssp->srcu_cb_head)) return; /* Already running or nothing to do. */ /* Remove recently arrived callbacks and wait for readers. */ - WRITE_ONCE(sp->srcu_gp_running, true); + WRITE_ONCE(ssp->srcu_gp_running, true); local_irq_disable(); - lh = sp->srcu_cb_head; - sp->srcu_cb_head = NULL; - sp->srcu_cb_tail = &sp->srcu_cb_head; + lh = ssp->srcu_cb_head; + ssp->srcu_cb_head = NULL; + ssp->srcu_cb_tail = &ssp->srcu_cb_head; local_irq_enable(); - idx = sp->srcu_idx; - WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); - WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ - swait_event_exclusive(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); - WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ + idx = ssp->srcu_idx; + WRITE_ONCE(ssp->srcu_idx, !ssp->srcu_idx); + WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ + swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx])); + WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ /* Invoke the callbacks we removed above. */ while (lh) { @@ -161,9 +161,9 @@ void srcu_drive_gp(struct work_struct *wp) * at interrupt level, but the ->srcu_gp_running checks will * straighten that out. */ - WRITE_ONCE(sp->srcu_gp_running, false); - if (READ_ONCE(sp->srcu_cb_head)) - schedule_work(&sp->srcu_work); + WRITE_ONCE(ssp->srcu_gp_running, false); + if (READ_ONCE(ssp->srcu_cb_head)) + schedule_work(&ssp->srcu_work); } EXPORT_SYMBOL_GPL(srcu_drive_gp); @@ -171,7 +171,7 @@ EXPORT_SYMBOL_GPL(srcu_drive_gp); * Enqueue an SRCU callback on the specified srcu_struct structure, * initiating grace-period processing if it is not already running. */ -void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, +void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; @@ -179,14 +179,14 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, rhp->func = func; rhp->next = NULL; local_irq_save(flags); - *sp->srcu_cb_tail = rhp; - sp->srcu_cb_tail = &rhp->next; + *ssp->srcu_cb_tail = rhp; + ssp->srcu_cb_tail = &rhp->next; local_irq_restore(flags); - if (!READ_ONCE(sp->srcu_gp_running)) { + if (!READ_ONCE(ssp->srcu_gp_running)) { if (likely(srcu_init_done)) - schedule_work(&sp->srcu_work); - else if (list_empty(&sp->srcu_work.entry)) - list_add(&sp->srcu_work.entry, &srcu_boot_list); + schedule_work(&ssp->srcu_work); + else if (list_empty(&ssp->srcu_work.entry)) + list_add(&ssp->srcu_work.entry, &srcu_boot_list); } } EXPORT_SYMBOL_GPL(call_srcu); @@ -194,13 +194,13 @@ EXPORT_SYMBOL_GPL(call_srcu); /* * synchronize_srcu - wait for prior SRCU read-side critical-section completion */ -void synchronize_srcu(struct srcu_struct *sp) +void synchronize_srcu(struct srcu_struct *ssp) { struct rcu_synchronize rs; init_rcu_head_on_stack(&rs.head); init_completion(&rs.completion); - call_srcu(sp, &rs.head, wakeme_after_rcu); + call_srcu(ssp, &rs.head, wakeme_after_rcu); wait_for_completion(&rs.completion); destroy_rcu_head_on_stack(&rs.head); } @@ -219,13 +219,13 @@ void __init rcu_scheduler_starting(void) */ void __init srcu_init(void) { - struct srcu_struct *sp; + struct srcu_struct *ssp; srcu_init_done = true; while (!list_empty(&srcu_boot_list)) { - sp = list_first_entry(&srcu_boot_list, + ssp = list_first_entry(&srcu_boot_list, struct srcu_struct, srcu_work.entry); - list_del_init(&sp->srcu_work.entry); - schedule_work(&sp->srcu_work); + list_del_init(&ssp->srcu_work.entry); + schedule_work(&ssp->srcu_work); } } diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 697a2d7e8e8ace..3600d88d8956b4 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -56,7 +56,7 @@ static LIST_HEAD(srcu_boot_list); static bool __read_mostly srcu_init_done; static void srcu_invoke_callbacks(struct work_struct *work); -static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); +static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay); static void process_srcu(struct work_struct *work); /* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */ @@ -92,7 +92,7 @@ do { \ * srcu_read_unlock() running against them. So if the is_static parameter * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. */ -static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) +static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static) { int cpu; int i; @@ -103,13 +103,13 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) struct srcu_node *snp_first; /* Work out the overall tree geometry. */ - sp->level[0] = &sp->node[0]; + ssp->level[0] = &ssp->node[0]; for (i = 1; i < rcu_num_lvls; i++) - sp->level[i] = sp->level[i - 1] + num_rcu_lvl[i - 1]; + ssp->level[i] = ssp->level[i - 1] + num_rcu_lvl[i - 1]; rcu_init_levelspread(levelspread, num_rcu_lvl); /* Each pass through this loop initializes one srcu_node structure. */ - srcu_for_each_node_breadth_first(sp, snp) { + srcu_for_each_node_breadth_first(ssp, snp) { spin_lock_init(&ACCESS_PRIVATE(snp, lock)); WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); @@ -120,17 +120,17 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) snp->srcu_gp_seq_needed_exp = 0; snp->grplo = -1; snp->grphi = -1; - if (snp == &sp->node[0]) { + if (snp == &ssp->node[0]) { /* Root node, special case. */ snp->srcu_parent = NULL; continue; } /* Non-root node. */ - if (snp == sp->level[level + 1]) + if (snp == ssp->level[level + 1]) level++; - snp->srcu_parent = sp->level[level - 1] + - (snp - sp->level[level]) / + snp->srcu_parent = ssp->level[level - 1] + + (snp - ssp->level[level]) / levelspread[level - 1]; } @@ -141,14 +141,14 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != ARRAY_SIZE(sdp->srcu_unlock_count)); level = rcu_num_lvls - 1; - snp_first = sp->level[level]; + snp_first = ssp->level[level]; for_each_possible_cpu(cpu) { - sdp = per_cpu_ptr(sp->sda, cpu); + sdp = per_cpu_ptr(ssp->sda, cpu); spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); rcu_segcblist_init(&sdp->srcu_cblist); sdp->srcu_cblist_invoking = false; - sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; - sdp->srcu_gp_seq_needed_exp = sp->srcu_gp_seq; + sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq; + sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq; sdp->mynode = &snp_first[cpu / levelspread[level]]; for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) { if (snp->grplo < 0) @@ -157,7 +157,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) } sdp->cpu = cpu; INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks); - sdp->sp = sp; + sdp->ssp = ssp; sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); if (is_static) continue; @@ -176,35 +176,35 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) * parameter is passed through to init_srcu_struct_nodes(), and * also tells us that ->sda has already been wired up to srcu_data. */ -static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static) +static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) { - mutex_init(&sp->srcu_cb_mutex); - mutex_init(&sp->srcu_gp_mutex); - sp->srcu_idx = 0; - sp->srcu_gp_seq = 0; - sp->srcu_barrier_seq = 0; - mutex_init(&sp->srcu_barrier_mutex); - atomic_set(&sp->srcu_barrier_cpu_cnt, 0); - INIT_DELAYED_WORK(&sp->work, process_srcu); + mutex_init(&ssp->srcu_cb_mutex); + mutex_init(&ssp->srcu_gp_mutex); + ssp->srcu_idx = 0; + ssp->srcu_gp_seq = 0; + ssp->srcu_barrier_seq = 0; + mutex_init(&ssp->srcu_barrier_mutex); + atomic_set(&ssp->srcu_barrier_cpu_cnt, 0); + INIT_DELAYED_WORK(&ssp->work, process_srcu); if (!is_static) - sp->sda = alloc_percpu(struct srcu_data); - init_srcu_struct_nodes(sp, is_static); - sp->srcu_gp_seq_needed_exp = 0; - sp->srcu_last_gp_end = ktime_get_mono_fast_ns(); - smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */ - return sp->sda ? 0 : -ENOMEM; + ssp->sda = alloc_percpu(struct srcu_data); + init_srcu_struct_nodes(ssp, is_static); + ssp->srcu_gp_seq_needed_exp = 0; + ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); + smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */ + return ssp->sda ? 0 : -ENOMEM; } #ifdef CONFIG_DEBUG_LOCK_ALLOC -int __init_srcu_struct(struct srcu_struct *sp, const char *name, +int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) { /* Don't re-initialize a lock while it is held. */ - debug_check_no_locks_freed((void *)sp, sizeof(*sp)); - lockdep_init_map(&sp->dep_map, name, key, 0); - spin_lock_init(&ACCESS_PRIVATE(sp, lock)); - return init_srcu_struct_fields(sp, false); + debug_check_no_locks_freed((void *)ssp, sizeof(*ssp)); + lockdep_init_map(&ssp->dep_map, name, key, 0); + spin_lock_init(&ACCESS_PRIVATE(ssp, lock)); + return init_srcu_struct_fields(ssp, false); } EXPORT_SYMBOL_GPL(__init_srcu_struct); @@ -212,16 +212,16 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct); /** * init_srcu_struct - initialize a sleep-RCU structure - * @sp: structure to initialize. + * @ssp: structure to initialize. * * Must invoke this on a given srcu_struct before passing that srcu_struct * to any other function. Each srcu_struct represents a separate domain * of SRCU protection. */ -int init_srcu_struct(struct srcu_struct *sp) +int init_srcu_struct(struct srcu_struct *ssp) { - spin_lock_init(&ACCESS_PRIVATE(sp, lock)); - return init_srcu_struct_fields(sp, false); + spin_lock_init(&ACCESS_PRIVATE(ssp, lock)); + return init_srcu_struct_fields(ssp, false); } EXPORT_SYMBOL_GPL(init_srcu_struct); @@ -231,37 +231,37 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); * First-use initialization of statically allocated srcu_struct * structure. Wiring up the combining tree is more than can be * done with compile-time initialization, so this check is added - * to each update-side SRCU primitive. Use sp->lock, which -is- + * to each update-side SRCU primitive. Use ssp->lock, which -is- * compile-time initialized, to resolve races involving multiple * CPUs trying to garner first-use privileges. */ -static void check_init_srcu_struct(struct srcu_struct *sp) +static void check_init_srcu_struct(struct srcu_struct *ssp) { unsigned long flags; /* The smp_load_acquire() pairs with the smp_store_release(). */ - if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ + if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed))) /*^^^*/ return; /* Already initialized. */ - spin_lock_irqsave_rcu_node(sp, flags); - if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { - spin_unlock_irqrestore_rcu_node(sp, flags); + spin_lock_irqsave_rcu_node(ssp, flags); + if (!rcu_seq_state(ssp->srcu_gp_seq_needed)) { + spin_unlock_irqrestore_rcu_node(ssp, flags); return; } - init_srcu_struct_fields(sp, true); - spin_unlock_irqrestore_rcu_node(sp, flags); + init_srcu_struct_fields(ssp, true); + spin_unlock_irqrestore_rcu_node(ssp, flags); } /* * Returns approximate total of the readers' ->srcu_lock_count[] values * for the rank of per-CPU counters specified by idx. */ -static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) +static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx) { int cpu; unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); + struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); sum += READ_ONCE(cpuc->srcu_lock_count[idx]); } @@ -272,13 +272,13 @@ static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) * Returns approximate total of the readers' ->srcu_unlock_count[] values * for the rank of per-CPU counters specified by idx. */ -static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) { int cpu; unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); + struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); sum += READ_ONCE(cpuc->srcu_unlock_count[idx]); } @@ -289,11 +289,11 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) * Return true if the number of pre-existing readers is determined to * be zero. */ -static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) +static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) { unsigned long unlocks; - unlocks = srcu_readers_unlock_idx(sp, idx); + unlocks = srcu_readers_unlock_idx(ssp, idx); /* * Make sure that a lock is always counted if the corresponding @@ -329,25 +329,25 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient, * especially on 64-bit systems. */ - return srcu_readers_lock_idx(sp, idx) == unlocks; + return srcu_readers_lock_idx(ssp, idx) == unlocks; } /** * srcu_readers_active - returns true if there are readers. and false * otherwise - * @sp: which srcu_struct to count active readers (holding srcu_read_lock). + * @ssp: which srcu_struct to count active readers (holding srcu_read_lock). * * Note that this is not an atomic primitive, and can therefore suffer * severe errors when invoked on an active srcu_struct. That said, it * can be useful as an error check at cleanup time. */ -static bool srcu_readers_active(struct srcu_struct *sp) +static bool srcu_readers_active(struct srcu_struct *ssp) { int cpu; unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); + struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); sum += READ_ONCE(cpuc->srcu_lock_count[0]); sum += READ_ONCE(cpuc->srcu_lock_count[1]); @@ -363,44 +363,44 @@ static bool srcu_readers_active(struct srcu_struct *sp) * Return grace-period delay, zero if there are expedited grace * periods pending, SRCU_INTERVAL otherwise. */ -static unsigned long srcu_get_delay(struct srcu_struct *sp) +static unsigned long srcu_get_delay(struct srcu_struct *ssp) { - if (ULONG_CMP_LT(READ_ONCE(sp->srcu_gp_seq), - READ_ONCE(sp->srcu_gp_seq_needed_exp))) + if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), + READ_ONCE(ssp->srcu_gp_seq_needed_exp))) return 0; return SRCU_INTERVAL; } /* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */ -void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced) +void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) { int cpu; - if (WARN_ON(!srcu_get_delay(sp))) + if (WARN_ON(!srcu_get_delay(ssp))) return; /* Just leak it! */ - if (WARN_ON(srcu_readers_active(sp))) + if (WARN_ON(srcu_readers_active(ssp))) return; /* Just leak it! */ if (quiesced) { - if (WARN_ON(delayed_work_pending(&sp->work))) + if (WARN_ON(delayed_work_pending(&ssp->work))) return; /* Just leak it! */ } else { - flush_delayed_work(&sp->work); + flush_delayed_work(&ssp->work); } for_each_possible_cpu(cpu) if (quiesced) { - if (WARN_ON(delayed_work_pending(&per_cpu_ptr(sp->sda, cpu)->work))) + if (WARN_ON(delayed_work_pending(&per_cpu_ptr(ssp->sda, cpu)->work))) return; /* Just leak it! */ } else { - flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); + flush_delayed_work(&per_cpu_ptr(ssp->sda, cpu)->work); } - if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || - WARN_ON(srcu_readers_active(sp))) { + if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || + WARN_ON(srcu_readers_active(ssp))) { pr_info("%s: Active srcu_struct %p state: %d\n", - __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); + __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))); return; /* Caller forgot to stop doing call_srcu()? */ } - free_percpu(sp->sda); - sp->sda = NULL; + free_percpu(ssp->sda); + ssp->sda = NULL; } EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); @@ -409,12 +409,12 @@ EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); * srcu_struct. * Returns an index that must be passed to the matching srcu_read_unlock(). */ -int __srcu_read_lock(struct srcu_struct *sp) +int __srcu_read_lock(struct srcu_struct *ssp) { int idx; - idx = READ_ONCE(sp->srcu_idx) & 0x1; - this_cpu_inc(sp->sda->srcu_lock_count[idx]); + idx = READ_ONCE(ssp->srcu_idx) & 0x1; + this_cpu_inc(ssp->sda->srcu_lock_count[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ return idx; } @@ -425,10 +425,10 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); * element of the srcu_struct. Note that this may well be a different * CPU than that which was incremented by the corresponding srcu_read_lock(). */ -void __srcu_read_unlock(struct srcu_struct *sp, int idx) +void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_inc(sp->sda->srcu_unlock_count[idx]); + this_cpu_inc(ssp->sda->srcu_unlock_count[idx]); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -444,22 +444,22 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); /* * Start an SRCU grace period. */ -static void srcu_gp_start(struct srcu_struct *sp) +static void srcu_gp_start(struct srcu_struct *ssp) { - struct srcu_data *sdp = this_cpu_ptr(sp->sda); + struct srcu_data *sdp = this_cpu_ptr(ssp->sda); int state; - lockdep_assert_held(&ACCESS_PRIVATE(sp, lock)); - WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); + lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); + WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); spin_lock_rcu_node(sdp); /* Interrupts already disabled. */ rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&sp->srcu_gp_seq)); + rcu_seq_current(&ssp->srcu_gp_seq)); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&sp->srcu_gp_seq)); + rcu_seq_snap(&ssp->srcu_gp_seq)); spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ - rcu_seq_start(&sp->srcu_gp_seq); - state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); + rcu_seq_start(&ssp->srcu_gp_seq); + state = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)); WARN_ON_ONCE(state != SRCU_STATE_SCAN1); } @@ -513,7 +513,7 @@ static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) * just-completed grace period, the one corresponding to idx. If possible, * schedule this invocation on the corresponding CPUs. */ -static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp, +static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp, unsigned long mask, unsigned long delay) { int cpu; @@ -521,7 +521,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp, for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { if (!(mask & (1 << (cpu - snp->grplo)))) continue; - srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), delay); + srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay); } } @@ -534,7 +534,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp, * are initiating callback invocation. This allows the ->srcu_have_cbs[] * array to have a finite number of elements. */ -static void srcu_gp_end(struct srcu_struct *sp) +static void srcu_gp_end(struct srcu_struct *ssp) { unsigned long cbdelay; bool cbs; @@ -548,28 +548,28 @@ static void srcu_gp_end(struct srcu_struct *sp) struct srcu_node *snp; /* Prevent more than one additional grace period. */ - mutex_lock(&sp->srcu_cb_mutex); + mutex_lock(&ssp->srcu_cb_mutex); /* End the current grace period. */ - spin_lock_irq_rcu_node(sp); - idx = rcu_seq_state(sp->srcu_gp_seq); + spin_lock_irq_rcu_node(ssp); + idx = rcu_seq_state(ssp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); - cbdelay = srcu_get_delay(sp); - sp->srcu_last_gp_end = ktime_get_mono_fast_ns(); - rcu_seq_end(&sp->srcu_gp_seq); - gpseq = rcu_seq_current(&sp->srcu_gp_seq); - if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) - sp->srcu_gp_seq_needed_exp = gpseq; - spin_unlock_irq_rcu_node(sp); - mutex_unlock(&sp->srcu_gp_mutex); + cbdelay = srcu_get_delay(ssp); + ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); + rcu_seq_end(&ssp->srcu_gp_seq); + gpseq = rcu_seq_current(&ssp->srcu_gp_seq); + if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq)) + ssp->srcu_gp_seq_needed_exp = gpseq; + spin_unlock_irq_rcu_node(ssp); + mutex_unlock(&ssp->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ /* Initiate callback invocation as needed. */ idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); - srcu_for_each_node_breadth_first(sp, snp) { + srcu_for_each_node_breadth_first(ssp, snp) { spin_lock_irq_rcu_node(snp); cbs = false; - last_lvl = snp >= sp->level[rcu_num_lvls - 1]; + last_lvl = snp >= ssp->level[rcu_num_lvls - 1]; if (last_lvl) cbs = snp->srcu_have_cbs[idx] == gpseq; snp->srcu_have_cbs[idx] = gpseq; @@ -580,12 +580,12 @@ static void srcu_gp_end(struct srcu_struct *sp) snp->srcu_data_have_cbs[idx] = 0; spin_unlock_irq_rcu_node(snp); if (cbs) - srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); + srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay); /* Occasionally prevent srcu_data counter wrap. */ if (!(gpseq & counter_wrap_check) && last_lvl) for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { - sdp = per_cpu_ptr(sp->sda, cpu); + sdp = per_cpu_ptr(ssp->sda, cpu); spin_lock_irqsave_rcu_node(sdp, flags); if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) @@ -598,18 +598,18 @@ static void srcu_gp_end(struct srcu_struct *sp) } /* Callback initiation done, allow grace periods after next. */ - mutex_unlock(&sp->srcu_cb_mutex); + mutex_unlock(&ssp->srcu_cb_mutex); /* Start a new grace period if needed. */ - spin_lock_irq_rcu_node(sp); - gpseq = rcu_seq_current(&sp->srcu_gp_seq); + spin_lock_irq_rcu_node(ssp); + gpseq = rcu_seq_current(&ssp->srcu_gp_seq); if (!rcu_seq_state(gpseq) && - ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { - srcu_gp_start(sp); - spin_unlock_irq_rcu_node(sp); - srcu_reschedule(sp, 0); + ULONG_CMP_LT(gpseq, ssp->srcu_gp_seq_needed)) { + srcu_gp_start(ssp); + spin_unlock_irq_rcu_node(ssp); + srcu_reschedule(ssp, 0); } else { - spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(ssp); } } @@ -620,13 +620,13 @@ static void srcu_gp_end(struct srcu_struct *sp) * but without expediting. To start a completely new grace period, * whether expedited or not, use srcu_funnel_gp_start() instead. */ -static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, +static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp, unsigned long s) { unsigned long flags; for (; snp != NULL; snp = snp->srcu_parent) { - if (rcu_seq_done(&sp->srcu_gp_seq, s) || + if (rcu_seq_done(&ssp->srcu_gp_seq, s) || ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) return; spin_lock_irqsave_rcu_node(snp, flags); @@ -637,10 +637,10 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(snp, flags); } - spin_lock_irqsave_rcu_node(sp, flags); - if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) - sp->srcu_gp_seq_needed_exp = s; - spin_unlock_irqrestore_rcu_node(sp, flags); + spin_lock_irqsave_rcu_node(ssp, flags); + if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) + ssp->srcu_gp_seq_needed_exp = s; + spin_unlock_irqrestore_rcu_node(ssp, flags); } /* @@ -653,7 +653,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, * Note that this function also does the work of srcu_funnel_exp_start(), * in some cases by directly invoking it. */ -static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, +static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, unsigned long s, bool do_norm) { unsigned long flags; @@ -663,7 +663,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, /* Each pass through the loop does one level of the srcu_node tree. */ for (; snp != NULL; snp = snp->srcu_parent) { - if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode) + if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != sdp->mynode) return; /* GP already done and CBs recorded. */ spin_lock_irqsave_rcu_node(snp, flags); if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { @@ -678,7 +678,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, return; } if (!do_norm) - srcu_funnel_exp_start(sp, snp, s); + srcu_funnel_exp_start(ssp, snp, s); return; } snp->srcu_have_cbs[idx] = s; @@ -690,29 +690,29 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, } /* Top of tree, must ensure the grace period will be started. */ - spin_lock_irqsave_rcu_node(sp, flags); - if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) { + spin_lock_irqsave_rcu_node(ssp, flags); + if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load * acquire setting up for initialization. */ - smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/ + smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/ } - if (!do_norm && ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) - sp->srcu_gp_seq_needed_exp = s; + if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) + ssp->srcu_gp_seq_needed_exp = s; /* If grace period not already done and none in progress, start it. */ - if (!rcu_seq_done(&sp->srcu_gp_seq, s) && - rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { - WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); - srcu_gp_start(sp); + if (!rcu_seq_done(&ssp->srcu_gp_seq, s) && + rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) { + WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); + srcu_gp_start(ssp); if (likely(srcu_init_done)) - queue_delayed_work(rcu_gp_wq, &sp->work, - srcu_get_delay(sp)); - else if (list_empty(&sp->work.work.entry)) - list_add(&sp->work.work.entry, &srcu_boot_list); + queue_delayed_work(rcu_gp_wq, &ssp->work, + srcu_get_delay(ssp)); + else if (list_empty(&ssp->work.work.entry)) + list_add(&ssp->work.work.entry, &srcu_boot_list); } - spin_unlock_irqrestore_rcu_node(sp, flags); + spin_unlock_irqrestore_rcu_node(ssp, flags); } /* @@ -720,12 +720,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, * loop an additional time if there is an expedited grace period pending. * The caller must ensure that ->srcu_idx is not changed while checking. */ -static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) +static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) { for (;;) { - if (srcu_readers_active_idx_check(sp, idx)) + if (srcu_readers_active_idx_check(ssp, idx)) return true; - if (--trycount + !srcu_get_delay(sp) <= 0) + if (--trycount + !srcu_get_delay(ssp) <= 0) return false; udelay(SRCU_RETRY_CHECK_DELAY); } @@ -736,7 +736,7 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows * us to wait for pre-existing readers in a starvation-free manner. */ -static void srcu_flip(struct srcu_struct *sp) +static void srcu_flip(struct srcu_struct *ssp) { /* * Ensure that if this updater saw a given reader's increment @@ -748,7 +748,7 @@ static void srcu_flip(struct srcu_struct *sp) */ smp_mb(); /* E */ /* Pairs with B and C. */ - WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1); + WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); /* * Ensure that if the updater misses an __srcu_read_unlock() @@ -781,7 +781,7 @@ static void srcu_flip(struct srcu_struct *sp) * negligible when amoritized over that time period, and the extra latency * of a needlessly non-expedited grace period is similarly negligible. */ -static bool srcu_might_be_idle(struct srcu_struct *sp) +static bool srcu_might_be_idle(struct srcu_struct *ssp) { unsigned long curseq; unsigned long flags; @@ -790,7 +790,7 @@ static bool srcu_might_be_idle(struct srcu_struct *sp) /* If the local srcu_data structure has callbacks, not idle. */ local_irq_save(flags); - sdp = this_cpu_ptr(sp->sda); + sdp = this_cpu_ptr(ssp->sda); if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) { local_irq_restore(flags); return false; /* Callbacks already present, so not idle. */ @@ -806,17 +806,17 @@ static bool srcu_might_be_idle(struct srcu_struct *sp) /* First, see if enough time has passed since the last GP. */ t = ktime_get_mono_fast_ns(); if (exp_holdoff == 0 || - time_in_range_open(t, sp->srcu_last_gp_end, - sp->srcu_last_gp_end + exp_holdoff)) + time_in_range_open(t, ssp->srcu_last_gp_end, + ssp->srcu_last_gp_end + exp_holdoff)) return false; /* Too soon after last GP. */ /* Next, check for probable idleness. */ - curseq = rcu_seq_current(&sp->srcu_gp_seq); + curseq = rcu_seq_current(&ssp->srcu_gp_seq); smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */ - if (ULONG_CMP_LT(curseq, READ_ONCE(sp->srcu_gp_seq_needed))) + if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_gp_seq_needed))) return false; /* Grace period in progress, so not idle. */ smp_mb(); /* Order ->srcu_gp_seq with prior access. */ - if (curseq != rcu_seq_current(&sp->srcu_gp_seq)) + if (curseq != rcu_seq_current(&ssp->srcu_gp_seq)) return false; /* GP # changed, so not idle. */ return true; /* With reasonable probability, idle! */ } @@ -856,7 +856,7 @@ static void srcu_leak_callback(struct rcu_head *rhp) * srcu_read_lock(), and srcu_read_unlock() that are all passed the same * srcu_struct structure. */ -void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, +void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rcu_callback_t func, bool do_norm) { unsigned long flags; @@ -866,7 +866,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, unsigned long s; struct srcu_data *sdp; - check_init_srcu_struct(sp); + check_init_srcu_struct(ssp); if (debug_rcu_head_queue(rhp)) { /* Probable double call_srcu(), so leak the callback. */ WRITE_ONCE(rhp->func, srcu_leak_callback); @@ -874,14 +874,14 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, return; } rhp->func = func; - idx = srcu_read_lock(sp); + idx = srcu_read_lock(ssp); local_irq_save(flags); - sdp = this_cpu_ptr(sp->sda); + sdp = this_cpu_ptr(ssp->sda); spin_lock_rcu_node(sdp); rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&sp->srcu_gp_seq)); - s = rcu_seq_snap(&sp->srcu_gp_seq); + rcu_seq_current(&ssp->srcu_gp_seq)); + s = rcu_seq_snap(&ssp->srcu_gp_seq); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { sdp->srcu_gp_seq_needed = s; @@ -893,15 +893,15 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, } spin_unlock_irqrestore_rcu_node(sdp, flags); if (needgp) - srcu_funnel_gp_start(sp, sdp, s, do_norm); + srcu_funnel_gp_start(ssp, sdp, s, do_norm); else if (needexp) - srcu_funnel_exp_start(sp, sdp->mynode, s); - srcu_read_unlock(sp, idx); + srcu_funnel_exp_start(ssp, sdp->mynode, s); + srcu_read_unlock(ssp, idx); } /** * call_srcu() - Queue a callback for invocation after an SRCU grace period - * @sp: srcu_struct in queue the callback + * @ssp: srcu_struct in queue the callback * @rhp: structure to be used for queueing the SRCU callback. * @func: function to be invoked after the SRCU grace period * @@ -916,21 +916,21 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, * The callback will be invoked from process context, but must nevertheless * be fast and must not block. */ -void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, +void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rcu_callback_t func) { - __call_srcu(sp, rhp, func, true); + __call_srcu(ssp, rhp, func, true); } EXPORT_SYMBOL_GPL(call_srcu); /* * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). */ -static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) +static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm) { struct rcu_synchronize rcu; - RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || + RCU_LOCKDEP_WARN(lock_is_held(&ssp->dep_map) || lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), @@ -939,10 +939,10 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return; might_sleep(); - check_init_srcu_struct(sp); + check_init_srcu_struct(ssp); init_completion(&rcu.completion); init_rcu_head_on_stack(&rcu.head); - __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm); + __call_srcu(ssp, &rcu.head, wakeme_after_rcu, do_norm); wait_for_completion(&rcu.completion); destroy_rcu_head_on_stack(&rcu.head); @@ -958,7 +958,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) /** * synchronize_srcu_expedited - Brute-force SRCU grace period - * @sp: srcu_struct with which to synchronize. + * @ssp: srcu_struct with which to synchronize. * * Wait for an SRCU grace period to elapse, but be more aggressive about * spinning rather than blocking when waiting. @@ -966,15 +966,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) * Note that synchronize_srcu_expedited() has the same deadlock and * memory-ordering properties as does synchronize_srcu(). */ -void synchronize_srcu_expedited(struct srcu_struct *sp) +void synchronize_srcu_expedited(struct srcu_struct *ssp) { - __synchronize_srcu(sp, rcu_gp_is_normal()); + __synchronize_srcu(ssp, rcu_gp_is_normal()); } EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); /** * synchronize_srcu - wait for prior SRCU read-side critical-section completion - * @sp: srcu_struct with which to synchronize. + * @ssp: srcu_struct with which to synchronize. * * Wait for the count to drain to zero of both indexes. To avoid the * possible starvation of synchronize_srcu(), it waits for the count of @@ -1016,12 +1016,12 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * SRCU must also provide it. Note that detecting idleness is heuristic * and subject to both false positives and negatives. */ -void synchronize_srcu(struct srcu_struct *sp) +void synchronize_srcu(struct srcu_struct *ssp) { - if (srcu_might_be_idle(sp) || rcu_gp_is_expedited()) - synchronize_srcu_expedited(sp); + if (srcu_might_be_idle(ssp) || rcu_gp_is_expedited()) + synchronize_srcu_expedited(ssp); else - __synchronize_srcu(sp, true); + __synchronize_srcu(ssp, true); } EXPORT_SYMBOL_GPL(synchronize_srcu); @@ -1031,36 +1031,36 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); static void srcu_barrier_cb(struct rcu_head *rhp) { struct srcu_data *sdp; - struct srcu_struct *sp; + struct srcu_struct *ssp; sdp = container_of(rhp, struct srcu_data, srcu_barrier_head); - sp = sdp->sp; - if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt)) - complete(&sp->srcu_barrier_completion); + ssp = sdp->ssp; + if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) + complete(&ssp->srcu_barrier_completion); } /** * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. - * @sp: srcu_struct on which to wait for in-flight callbacks. + * @ssp: srcu_struct on which to wait for in-flight callbacks. */ -void srcu_barrier(struct srcu_struct *sp) +void srcu_barrier(struct srcu_struct *ssp) { int cpu; struct srcu_data *sdp; - unsigned long s = rcu_seq_snap(&sp->srcu_barrier_seq); + unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq); - check_init_srcu_struct(sp); - mutex_lock(&sp->srcu_barrier_mutex); - if (rcu_seq_done(&sp->srcu_barrier_seq, s)) { + check_init_srcu_struct(ssp); + mutex_lock(&ssp->srcu_barrier_mutex); + if (rcu_seq_done(&ssp->srcu_barrier_seq, s)) { smp_mb(); /* Force ordering following return. */ - mutex_unlock(&sp->srcu_barrier_mutex); + mutex_unlock(&ssp->srcu_barrier_mutex); return; /* Someone else did our work for us. */ } - rcu_seq_start(&sp->srcu_barrier_seq); - init_completion(&sp->srcu_barrier_completion); + rcu_seq_start(&ssp->srcu_barrier_seq); + init_completion(&ssp->srcu_barrier_completion); /* Initial count prevents reaching zero until all CBs are posted. */ - atomic_set(&sp->srcu_barrier_cpu_cnt, 1); + atomic_set(&ssp->srcu_barrier_cpu_cnt, 1); /* * Each pass through this loop enqueues a callback, but only @@ -1071,39 +1071,39 @@ void srcu_barrier(struct srcu_struct *sp) * grace period as the last callback already in the queue. */ for_each_possible_cpu(cpu) { - sdp = per_cpu_ptr(sp->sda, cpu); + sdp = per_cpu_ptr(ssp->sda, cpu); spin_lock_irq_rcu_node(sdp); - atomic_inc(&sp->srcu_barrier_cpu_cnt); + atomic_inc(&ssp->srcu_barrier_cpu_cnt); sdp->srcu_barrier_head.func = srcu_barrier_cb; debug_rcu_head_queue(&sdp->srcu_barrier_head); if (!rcu_segcblist_entrain(&sdp->srcu_cblist, &sdp->srcu_barrier_head, 0)) { debug_rcu_head_unqueue(&sdp->srcu_barrier_head); - atomic_dec(&sp->srcu_barrier_cpu_cnt); + atomic_dec(&ssp->srcu_barrier_cpu_cnt); } spin_unlock_irq_rcu_node(sdp); } /* Remove the initial count, at which point reaching zero can happen. */ - if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt)) - complete(&sp->srcu_barrier_completion); - wait_for_completion(&sp->srcu_barrier_completion); + if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) + complete(&ssp->srcu_barrier_completion); + wait_for_completion(&ssp->srcu_barrier_completion); - rcu_seq_end(&sp->srcu_barrier_seq); - mutex_unlock(&sp->srcu_barrier_mutex); + rcu_seq_end(&ssp->srcu_barrier_seq); + mutex_unlock(&ssp->srcu_barrier_mutex); } EXPORT_SYMBOL_GPL(srcu_barrier); /** * srcu_batches_completed - return batches completed. - * @sp: srcu_struct on which to report batch completion. + * @ssp: srcu_struct on which to report batch completion. * * Report the number of batches, correlated with, but not necessarily * precisely the same as, the number of grace periods that have elapsed. */ -unsigned long srcu_batches_completed(struct srcu_struct *sp) +unsigned long srcu_batches_completed(struct srcu_struct *ssp) { - return sp->srcu_idx; + return ssp->srcu_idx; } EXPORT_SYMBOL_GPL(srcu_batches_completed); @@ -1112,11 +1112,11 @@ EXPORT_SYMBOL_GPL(srcu_batches_completed); * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has * completed in that state. */ -static void srcu_advance_state(struct srcu_struct *sp) +static void srcu_advance_state(struct srcu_struct *ssp) { int idx; - mutex_lock(&sp->srcu_gp_mutex); + mutex_lock(&ssp->srcu_gp_mutex); /* * Because readers might be delayed for an extended period after @@ -1128,47 +1128,47 @@ static void srcu_advance_state(struct srcu_struct *sp) * The load-acquire ensures that we see the accesses performed * by the prior grace period. */ - idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ + idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq)); /* ^^^ */ if (idx == SRCU_STATE_IDLE) { - spin_lock_irq_rcu_node(sp); - if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { - WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq)); - spin_unlock_irq_rcu_node(sp); - mutex_unlock(&sp->srcu_gp_mutex); + spin_lock_irq_rcu_node(ssp); + if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) { + WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq)); + spin_unlock_irq_rcu_node(ssp); + mutex_unlock(&ssp->srcu_gp_mutex); return; } - idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); + idx = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)); if (idx == SRCU_STATE_IDLE) - srcu_gp_start(sp); - spin_unlock_irq_rcu_node(sp); + srcu_gp_start(ssp); + spin_unlock_irq_rcu_node(ssp); if (idx != SRCU_STATE_IDLE) { - mutex_unlock(&sp->srcu_gp_mutex); + mutex_unlock(&ssp->srcu_gp_mutex); return; /* Someone else started the grace period. */ } } - if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { - idx = 1 ^ (sp->srcu_idx & 1); - if (!try_check_zero(sp, idx, 1)) { - mutex_unlock(&sp->srcu_gp_mutex); + if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { + idx = 1 ^ (ssp->srcu_idx & 1); + if (!try_check_zero(ssp, idx, 1)) { + mutex_unlock(&ssp->srcu_gp_mutex); return; /* readers present, retry later. */ } - srcu_flip(sp); - rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2); + srcu_flip(ssp); + rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2); } - if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { + if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { /* * SRCU read-side critical sections are normally short, * so check at least twice in quick succession after a flip. */ - idx = 1 ^ (sp->srcu_idx & 1); - if (!try_check_zero(sp, idx, 2)) { - mutex_unlock(&sp->srcu_gp_mutex); + idx = 1 ^ (ssp->srcu_idx & 1); + if (!try_check_zero(ssp, idx, 2)) { + mutex_unlock(&ssp->srcu_gp_mutex); return; /* readers present, retry later. */ } - srcu_gp_end(sp); /* Releases ->srcu_gp_mutex. */ + srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */ } } @@ -1184,14 +1184,14 @@ static void srcu_invoke_callbacks(struct work_struct *work) struct rcu_cblist ready_cbs; struct rcu_head *rhp; struct srcu_data *sdp; - struct srcu_struct *sp; + struct srcu_struct *ssp; sdp = container_of(work, struct srcu_data, work.work); - sp = sdp->sp; + ssp = sdp->ssp; rcu_cblist_init(&ready_cbs); spin_lock_irq_rcu_node(sdp); rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&sp->srcu_gp_seq)); + rcu_seq_current(&ssp->srcu_gp_seq)); if (sdp->srcu_cblist_invoking || !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { spin_unlock_irq_rcu_node(sdp); @@ -1217,7 +1217,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) spin_lock_irq_rcu_node(sdp); rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&sp->srcu_gp_seq)); + rcu_seq_snap(&ssp->srcu_gp_seq)); sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); spin_unlock_irq_rcu_node(sdp); @@ -1229,24 +1229,24 @@ static void srcu_invoke_callbacks(struct work_struct *work) * Finished one round of SRCU grace period. Start another if there are * more SRCU callbacks queued, otherwise put SRCU into not-running state. */ -static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) +static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) { bool pushgp = true; - spin_lock_irq_rcu_node(sp); - if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { - if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { + spin_lock_irq_rcu_node(ssp); + if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) { + if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq))) { /* All requests fulfilled, time to go idle. */ pushgp = false; } - } else if (!rcu_seq_state(sp->srcu_gp_seq)) { + } else if (!rcu_seq_state(ssp->srcu_gp_seq)) { /* Outstanding request and no GP. Start one. */ - srcu_gp_start(sp); + srcu_gp_start(ssp); } - spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(ssp); if (pushgp) - queue_delayed_work(rcu_gp_wq, &sp->work, delay); + queue_delayed_work(rcu_gp_wq, &ssp->work, delay); } /* @@ -1254,41 +1254,41 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) */ static void process_srcu(struct work_struct *work) { - struct srcu_struct *sp; + struct srcu_struct *ssp; - sp = container_of(work, struct srcu_struct, work.work); + ssp = container_of(work, struct srcu_struct, work.work); - srcu_advance_state(sp); - srcu_reschedule(sp, srcu_get_delay(sp)); + srcu_advance_state(ssp); + srcu_reschedule(ssp, srcu_get_delay(ssp)); } void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *sp, int *flags, + struct srcu_struct *ssp, int *flags, unsigned long *gp_seq) { if (test_type != SRCU_FLAVOR) return; *flags = 0; - *gp_seq = rcu_seq_current(&sp->srcu_gp_seq); + *gp_seq = rcu_seq_current(&ssp->srcu_gp_seq); } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); -void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) +void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) { int cpu; int idx; unsigned long s0 = 0, s1 = 0; - idx = sp->srcu_idx & 0x1; + idx = ssp->srcu_idx & 0x1; pr_alert("%s%s Tree SRCU g%ld per-CPU(idx=%d):", - tt, tf, rcu_seq_current(&sp->srcu_gp_seq), idx); + tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), idx); for_each_possible_cpu(cpu) { unsigned long l0, l1; unsigned long u0, u1; long c0, c1; struct srcu_data *sdp; - sdp = per_cpu_ptr(sp->sda, cpu); + sdp = per_cpu_ptr(ssp->sda, cpu); u0 = sdp->srcu_unlock_count[!idx]; u1 = sdp->srcu_unlock_count[idx]; @@ -1323,14 +1323,14 @@ early_initcall(srcu_bootup_announce); void __init srcu_init(void) { - struct srcu_struct *sp; + struct srcu_struct *ssp; srcu_init_done = true; while (!list_empty(&srcu_boot_list)) { - sp = list_first_entry(&srcu_boot_list, struct srcu_struct, + ssp = list_first_entry(&srcu_boot_list, struct srcu_struct, work.work.entry); - check_init_srcu_struct(sp); - list_del_init(&sp->work.work.entry); - queue_work(rcu_gp_wq, &sp->work.work); + check_init_srcu_struct(ssp); + list_del_init(&ssp->work.work.entry); + queue_work(rcu_gp_wq, &ssp->work.work); } } From ae0e33494a601e13df79c4742d88d4d2bc2b0a87 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Nov 2018 19:40:39 -0800 Subject: [PATCH 085/112] net/sched: Replace call_rcu_bh() and rcu_barrier_bh() Now that call_rcu()'s callback is not invoked until after bh-disable regions of code have completed (in addition to explicitly marked RCU read-side critical sections), call_rcu() can be used in place of call_rcu_bh(). Similarly, rcu_barrier() can be used in place o frcu_barrier_bh(). This commit therefore makes these changes. Signed-off-by: Paul E. McKenney Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Cc: "David S. Miller" Cc: --- net/sched/sch_api.c | 2 +- net/sched/sch_generic.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index ca3b0f46de5303..016e628c6ac90a 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -540,7 +540,7 @@ void qdisc_put_stab(struct qdisc_size_table *tab) if (--tab->refcnt == 0) { list_del(&tab->list); - call_rcu_bh(&tab->rcu, stab_kfree_rcu); + call_rcu(&tab->rcu, stab_kfree_rcu); } } EXPORT_SYMBOL(qdisc_put_stab); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index de1663f7d3ad6e..66ba2ce2320f96 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1372,7 +1372,7 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, if (!tp_head) { RCU_INIT_POINTER(*miniqp->p_miniq, NULL); /* Wait for flying RCU callback before it is freed. */ - rcu_barrier_bh(); + rcu_barrier(); return; } @@ -1380,10 +1380,10 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, &miniqp->miniq1 : &miniqp->miniq2; /* We need to make sure that readers won't see the miniq - * we are about to modify. So wait until previous call_rcu_bh callback + * we are about to modify. So wait until previous call_rcu callback * is done. */ - rcu_barrier_bh(); + rcu_barrier(); miniq->filter_list = tp_head; rcu_assign_pointer(*miniqp->p_miniq, miniq); @@ -1392,7 +1392,7 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, * block potential new user of miniq_old until all readers * are not seeing it. */ - call_rcu_bh(&miniq_old->rcu, mini_qdisc_rcu_func); + call_rcu(&miniq_old->rcu, mini_qdisc_rcu_func); } EXPORT_SYMBOL(mini_qdisc_pair_swap); From 5da54c1810e52f7abba0ff7932dddd761f945875 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Nov 2018 19:43:32 -0800 Subject: [PATCH 086/112] net/core: Replace call_rcu_bh() and synchronize_rcu_bh() Now that call_rcu()'s callback is not invoked until after all bh-disable regions of code have completed (in addition to explicitly marked RCU read-side critical sections), call_rcu() can be used in place of call_rcu_bh(). Similarly, synchronize_rcu() can be used in place of synchronize_rcu_bh(). This commit therefore makes these changes. Signed-off-by: Paul E. McKenney Cc: "David S. Miller" Cc: Eric Dumazet Cc: --- net/core/netpoll.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 5da9552b186bc8..677d3f332172b3 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -800,7 +800,7 @@ void __netpoll_cleanup(struct netpoll *np) ops->ndo_netpoll_cleanup(np->dev); RCU_INIT_POINTER(np->dev->npinfo, NULL); - call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info); + call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info); } else RCU_INIT_POINTER(np->dev->npinfo, NULL); } @@ -811,7 +811,7 @@ void __netpoll_free(struct netpoll *np) ASSERT_RTNL(); /* Wait for transmitting packets to finish before freeing. */ - synchronize_rcu_bh(); + synchronize_rcu(); __netpoll_cleanup(np); kfree(np); } From 1a56f7d53b5c8e82442e86eeac0b5d549088ee42 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Nov 2018 19:45:50 -0800 Subject: [PATCH 087/112] net/bridge: Replace call_rcu_bh() and rcu_barrier_bh() Now that call_rcu()'s callback is not invoked until after all bh-disable regions of code have completed (in addition to explicitly marked RCU read-side critical sections), call_rcu() can be used in place of call_rcu_bh(). Similarly, rcu_barrier() can be used in place of rcu_barrier_bh(). This commit therefore makes these changes. Signed-off-by: Paul E. McKenney Cc: Roopa Prabhu Cc: "David S. Miller" Cc: Cc: Acked-by: Nikolay Aleksandrov --- net/bridge/br_mdb.c | 2 +- net/bridge/br_multicast.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index a7ea2d43171430..596ec6e7df11d7 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -728,7 +728,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) rcu_assign_pointer(*pp, p->next); hlist_del_init(&p->mglist); del_timer(&p->timer); - call_rcu_bh(&p->rcu, br_multicast_free_pg); + call_rcu(&p->rcu, br_multicast_free_pg); err = 0; if (!mp->ports && !mp->host_joined && diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 6bac0d6b7b9413..0255223f20017f 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -260,7 +260,7 @@ static void br_multicast_group_expired(struct timer_list *t) hlist_del_rcu(&mp->hlist[mdb->ver]); mdb->size--; - call_rcu_bh(&mp->rcu, br_multicast_free_group); + call_rcu(&mp->rcu, br_multicast_free_group); out: spin_unlock(&br->multicast_lock); @@ -291,7 +291,7 @@ static void br_multicast_del_pg(struct net_bridge *br, del_timer(&p->timer); br_mdb_notify(br->dev, p->port, &pg->addr, RTM_DELMDB, p->flags); - call_rcu_bh(&p->rcu, br_multicast_free_pg); + call_rcu(&p->rcu, br_multicast_free_pg); if (!mp->ports && !mp->host_joined && netif_running(br->dev)) @@ -358,7 +358,7 @@ static int br_mdb_rehash(struct net_bridge_mdb_htable __rcu **mdbp, int max, } br_mdb_rehash_seq++; - call_rcu_bh(&mdb->rcu, br_mdb_free); + call_rcu(&mdb->rcu, br_mdb_free); out: rcu_assign_pointer(*mdbp, mdb); @@ -1629,7 +1629,7 @@ br_multicast_leave_group(struct net_bridge *br, rcu_assign_pointer(*pp, p->next); hlist_del_init(&p->mglist); del_timer(&p->timer); - call_rcu_bh(&p->rcu, br_multicast_free_pg); + call_rcu(&p->rcu, br_multicast_free_pg); br_mdb_notify(br->dev, port, group, RTM_DELMDB, p->flags); @@ -2051,19 +2051,19 @@ void br_multicast_dev_del(struct net_bridge *br) hlist_for_each_entry_safe(mp, n, &mdb->mhash[i], hlist[ver]) { del_timer(&mp->timer); - call_rcu_bh(&mp->rcu, br_multicast_free_group); + call_rcu(&mp->rcu, br_multicast_free_group); } } if (mdb->old) { spin_unlock_bh(&br->multicast_lock); - rcu_barrier_bh(); + rcu_barrier(); spin_lock_bh(&br->multicast_lock); WARN_ON(mdb->old); } mdb->old = mdb; - call_rcu_bh(&mdb->rcu, br_mdb_free); + call_rcu(&mdb->rcu, br_mdb_free); out: spin_unlock_bh(&br->multicast_lock); From e3e740544173ef0dd8bffbf158182a7748e6c678 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Nov 2018 13:53:34 -0800 Subject: [PATCH 088/112] percpu-rwsem: Replace synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). This commit therefore makes this change, even though it is but a comment. Signed-off-by: Paul E. McKenney Cc: Dennis Zhou Cc: Christoph Lameter Acked-by: Tejun Heo --- include/linux/percpu-rwsem.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 79b99d653e030d..71b75643c43204 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -41,7 +41,7 @@ static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore * * cannot both change sem->state from readers_fast and start checking * counters while we are here. So if we see !sem->state, we know that * the writer won't be checking until we're past the preempt_enable() - * and that one the synchronize_sched() is done, the writer will see + * and that once the synchronize_rcu() is done, the writer will see * anything we did within this RCU-sched read-size critical section. */ __this_cpu_inc(*sem->read_count); From d5cccfc7b772b8a20b06557f1b7c066e7fc2c393 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Nov 2018 14:01:39 -0800 Subject: [PATCH 089/112] types: Remove call_rcu_bh() and call_rcu_sched() Now that call_rcu()'s callback is not invoked until after bh-disable and preempt-disable regions of code have completed (in addition to explicitly marked RCU read-side critical sections), call_rcu() can be used in place of call_rcu_bh() and call_rcu_sched(). This commit therefore removes these two API members from the callback_head structure's header comment. Signed-off-by: Paul E. McKenney Cc: Andrew Morton Cc: Pekka Enberg Cc: Masahiro Yamada Cc: Alexey Dobriyan --- include/linux/types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/types.h b/include/linux/types.h index 9834e90aa01007..c2615d6a019e73 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -212,8 +212,8 @@ struct ustat { * weird ABI and we need to ask it explicitly. * * The alignment is required to guarantee that bit 0 of @next will be - * clear under normal conditions -- as long as we use call_rcu(), - * call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue callback. + * clear under normal conditions -- as long as we use call_rcu() or + * call_srcu() to queue the callback. * * This guarantee is important for few reasons: * - future call_rcu_lazy() will make use of lower bits in the pointer; From 2af3024cd78f120d027cb44b454186ba9d7dab24 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Nov 2018 14:11:40 -0800 Subject: [PATCH 090/112] cgroups: Replace synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). This commit therefore makes this change, even though it is but a comment. Signed-off-by: Paul E. McKenney Cc: Jens Axboe Cc: Dennis Zhou Cc: Johannes Weiner Cc: "Dennis Zhou (Facebook)" Acked-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6aaf5dd5383bba..7a8429f8e2803f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5343,7 +5343,7 @@ int __init cgroup_init(void) cgroup_rstat_boot(); /* - * The latency of the synchronize_sched() is too high for cgroups, + * The latency of the synchronize_rcu() is too high for cgroups, * avoid it at the cost of forcing all readers into the slow path. */ rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); From 6932689e4145f545062ca8c86cf76f38854d63d0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Nov 2018 14:16:57 -0800 Subject: [PATCH 091/112] livepatch: Replace synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). This commit therefore makes this change, even though it is but a comment. Signed-off-by: Paul E. McKenney --- kernel/livepatch/patch.c | 4 ++-- kernel/livepatch/transition.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 82d584225dc68e..7702cb4064fce8 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -61,7 +61,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, ops = container_of(fops, struct klp_ops, fops); /* - * A variant of synchronize_sched() is used to allow patching functions + * A variant of synchronize_rcu() is used to allow patching functions * where RCU is not watching, see klp_synchronize_transition(). */ preempt_disable_notrace(); @@ -72,7 +72,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, /* * func should never be NULL because preemption should be disabled here * and unregister_ftrace_function() does the equivalent of a - * synchronize_sched() before the func_stack removal. + * synchronize_rcu() before the func_stack removal. */ if (WARN_ON_ONCE(!func)) goto unlock; diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 5bc349805e033f..304d5eb8a98cbf 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -52,7 +52,7 @@ static DECLARE_DELAYED_WORK(klp_transition_work, klp_transition_work_fn); /* * This function is just a stub to implement a hard force - * of synchronize_sched(). This requires synchronizing + * of synchronize_rcu(). This requires synchronizing * tasks even in userspace and idle. */ static void klp_sync(struct work_struct *work) @@ -175,7 +175,7 @@ void klp_cancel_transition(void) void klp_update_patch_state(struct task_struct *task) { /* - * A variant of synchronize_sched() is used to allow patching functions + * A variant of synchronize_rcu() is used to allow patching functions * where RCU is not watching, see klp_synchronize_transition(). */ preempt_disable_notrace(); From 0245b80e284d4fdabbf50589180482dc4d2cf277 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Nov 2018 15:09:25 -0800 Subject: [PATCH 092/112] net/core/skmsg: Replace call_rcu_sched() with call_rcu() Now that call_rcu()'s callback is not invoked until after all preempt-disable regions of code have completed (in addition to explicitly marked RCU read-side critical sections), call_rcu() can be used in place of call_rcu_sched(). This commit therefore makes that change. Signed-off-by: Paul E. McKenney Cc: John Fastabend Cc: Daniel Borkmann Cc: "David S. Miller" Cc: --- net/core/skmsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 56a99d0c9aa08d..c92d6ccce6104a 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -580,7 +580,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) write_unlock_bh(&sk->sk_callback_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); - call_rcu_sched(&psock->rcu, sk_psock_destroy); + call_rcu(&psock->rcu, sk_psock_destroy); } EXPORT_SYMBOL_GPL(sk_psock_drop); From dd06d25d06f4932c570540877eba88d868dbba9b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Nov 2018 15:12:34 -0800 Subject: [PATCH 093/112] net/decnet: Replace rcu_barrier_bh() with rcu_barrier() Now that all RCU flavors have been consolidated, rcu_barrier_bh() is but a synonym for rcu_barrier(). This commit therefore replaces the former with the latter. Signed-off-by: Paul E. McKenney Cc: "David S. Miller" Cc: Cc: --- net/decnet/af_decnet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 7d6ff983ba2cbb..dbd0f7bae00a60 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -2405,7 +2405,7 @@ static void __exit decnet_exit(void) proto_unregister(&dn_proto); - rcu_barrier_bh(); /* Wait for completion of call_rcu_bh()'s */ + rcu_barrier(); /* Wait for completion of call_rcu()'s */ } module_exit(decnet_exit); #endif From 4a67e3a79e3bdc47dfd0c85a1888067d95a0282c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Nov 2018 15:25:13 -0800 Subject: [PATCH 094/112] tools/kernel.h: Replace synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). This commit therefore makes this change, even though it is but a comment. Signed-off-by: Paul E. McKenney Cc: Matthew Wilcox Cc: --- tools/include/linux/kernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h index 6935ef94e77ab3..857d9e22826e6d 100644 --- a/tools/include/linux/kernel.h +++ b/tools/include/linux/kernel.h @@ -116,6 +116,6 @@ int scnprintf(char * buf, size_t size, const char * fmt, ...); #define round_down(x, y) ((x) & ~__round_mask(x, y)) #define current_gfp_context(k) 0 -#define synchronize_sched() +#define synchronize_rcu() #endif From df56e0f960627f606cfc6ccda25c6ab5c61953d5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Nov 2018 15:27:40 -0800 Subject: [PATCH 095/112] rcutorture/formal: Replace synchronize_sched() with synchronize_rcu() Now that synchronize_rcu() waits for preempt-disable regions of code as well as RCU read-side critical sections, synchronize_sched() can be replaced by synchronize_rcu(). This commit therefore makes this change, even though it is but a comment. Signed-off-by: Paul E. McKenney --- .../rcutorture/formal/srcu-cbmc/include/linux/types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/types.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/types.h index 891ad13e95b2dc..d27285f8ee82d4 100644 --- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/types.h +++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/types.h @@ -131,8 +131,8 @@ struct hlist_node { * weird ABI and we need to ask it explicitly. * * The alignment is required to guarantee that bits 0 and 1 of @next will be - * clear under normal conditions -- as long as we use call_rcu(), - * call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue callback. + * clear under normal conditions -- as long as we use call_rcu() or + * call_srcu() to queue callback. * * This guarantee is important for few reasons: * - future call_rcu_lazy() will make use of lower bits in the pointer; From 4871848531af1d62f30032bfb872c43b9afe03ad Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 15 Aug 2018 15:32:51 -0700 Subject: [PATCH 096/112] rcutorture: Add call_rcu() flooding forward-progress tests This commit adds a call_rcu() flooding loop to the forward-progress test. This emulates tight userspace loops that force call_rcu() invocations, for example, the infamous loop containing close(open()) that instigated the addition of blimit. If RCU does not make sufficient forward progress in invoking the resulting flood of callbacks, rcutorture emits a warning. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 129 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 127 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 210c774603656f..8cf700ca7845bf 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -259,6 +259,8 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); +static bool rcu_fwd_cb_nodelay; /* Short rcu_torture_delay() delays. */ + /* * Allocate an element from the rcu_tortures pool. */ @@ -348,7 +350,8 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) * period, and we want a long delay occasionally to trigger * force_quiescent_state. */ - if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { + if (!rcu_fwd_cb_nodelay && + !(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); if (preempt_count() & (SOFTIRQ_MASK | HARDIRQ_MASK)) @@ -1674,6 +1677,43 @@ static void rcu_torture_fwd_prog_cb(struct rcu_head *rhp) cur_ops->call(&fcsp->rh, rcu_torture_fwd_prog_cb); } +/* State for continuous-flood RCU callbacks. */ +struct rcu_fwd_cb { + struct rcu_head rh; + struct rcu_fwd_cb *rfc_next; + int rfc_gps; +}; +static DEFINE_SPINLOCK(rcu_fwd_lock); +static struct rcu_fwd_cb *rcu_fwd_cb_head; +static struct rcu_fwd_cb **rcu_fwd_cb_tail = &rcu_fwd_cb_head; +static long n_launders_cb; +static unsigned long rcu_fwd_startat; +#define MAX_FWD_CB_JIFFIES (8 * HZ) /* Maximum CB test duration. */ +#define MIN_FWD_CB_LAUNDERS 3 /* This many CB invocations to count. */ +#define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */ +static long n_launders_hist[2 * MAX_FWD_CB_JIFFIES / HZ]; + +/* Callback function for continuous-flood RCU callbacks. */ +static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) +{ + int i; + struct rcu_fwd_cb *rfcp = container_of(rhp, struct rcu_fwd_cb, rh); + struct rcu_fwd_cb **rfcpp; + + rfcp->rfc_next = NULL; + rfcp->rfc_gps++; + spin_lock(&rcu_fwd_lock); + rfcpp = rcu_fwd_cb_tail; + rcu_fwd_cb_tail = &rfcp->rfc_next; + WRITE_ONCE(*rfcpp, rfcp); + WRITE_ONCE(n_launders_cb, n_launders_cb + 1); + i = ((jiffies - rcu_fwd_startat) / HZ); + if (i >= ARRAY_SIZE(n_launders_hist)) + i = ARRAY_SIZE(n_launders_hist) - 1; + n_launders_hist[i]++; + spin_unlock(&rcu_fwd_lock); +} + /* Carry out grace-period forward-progress testing. */ static int rcu_torture_fwd_prog(void *args) { @@ -1681,11 +1721,21 @@ static int rcu_torture_fwd_prog(void *args) unsigned long dur; struct fwd_cb_state fcs; unsigned long gps; + int i; int idx; + int j; + long n_launders; + long n_launders_cb_snap; + long n_launders_sa; + long n_max_cbs; + long n_max_gps; + struct rcu_fwd_cb *rfcp; + struct rcu_fwd_cb *rfcpn; int sd; int sd4; bool selfpropcb = false; unsigned long stopat; + unsigned long stoppedat; int tested = 0; int tested_tries = 0; static DEFINE_TORTURE_RANDOM(trs); @@ -1699,6 +1749,8 @@ static int rcu_torture_fwd_prog(void *args) } do { schedule_timeout_interruptible(fwd_progress_holdoff * HZ); + + /* Tight loop containing cond_resched(). */ if (selfpropcb) { WRITE_ONCE(fcs.stop, 0); cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb); @@ -1708,7 +1760,8 @@ static int rcu_torture_fwd_prog(void *args) sd = cur_ops->stall_dur() + 1; sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div; dur = sd4 + torture_random(&trs) % (sd - sd4); - stopat = jiffies + dur; + rcu_fwd_startat = jiffies; + stopat = rcu_fwd_startat + dur; while (time_before(jiffies, stopat) && !torture_must_stop()) { idx = cur_ops->readlock(); udelay(10); @@ -1729,6 +1782,78 @@ static int rcu_torture_fwd_prog(void *args) cur_ops->sync(); /* Wait for running CB to complete. */ cur_ops->cb_barrier(); /* Wait for queued callbacks. */ } + + /* Loop continuously posting RCU callbacks. */ + WRITE_ONCE(rcu_fwd_cb_nodelay, true); + cur_ops->sync(); /* Later readers see above write. */ + rcu_fwd_startat = jiffies; + stopat = rcu_fwd_startat + MAX_FWD_CB_JIFFIES; + n_launders = 0; + n_launders_cb = 0; + n_launders_sa = 0; + n_max_cbs = 0; + n_max_gps = 0; + for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++) + n_launders_hist[i] = 0; + cver = READ_ONCE(rcu_torture_current_version); + gps = cur_ops->get_gp_seq(); + while (time_before(jiffies, stopat) && !torture_must_stop()) { + rfcp = READ_ONCE(rcu_fwd_cb_head); + rfcpn = NULL; + if (rfcp) + rfcpn = READ_ONCE(rfcp->rfc_next); + if (rfcpn) { + if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS && + ++n_max_gps >= MIN_FWD_CBS_LAUNDERED) + break; + rcu_fwd_cb_head = rfcpn; + n_launders++; + n_launders_sa++; + } else { + rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL); + if (WARN_ON_ONCE(!rfcp)) { + schedule_timeout_interruptible(1); + continue; + } + n_max_cbs++; + n_launders_sa = 0; + rfcp->rfc_gps = 0; + } + cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); + cond_resched(); + } + stoppedat = jiffies; + n_launders_cb_snap = READ_ONCE(n_launders_cb); + cver = READ_ONCE(rcu_torture_current_version) - cver; + gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); + cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ + for (;;) { + rfcp = rcu_fwd_cb_head; + if (!rfcp) + break; + rcu_fwd_cb_head = rfcp->rfc_next; + kfree(rfcp); + } + rcu_fwd_cb_tail = &rcu_fwd_cb_head; + WRITE_ONCE(rcu_fwd_cb_nodelay, false); + if (!torture_must_stop()) { + WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); + pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", + __func__, + stoppedat - rcu_fwd_startat, + jiffies - stoppedat, + n_launders + n_max_cbs - n_launders_cb_snap, + n_launders, n_launders_sa, + n_max_gps, n_max_cbs, cver, gps); + for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--) + if (n_launders_hist[i] > 0) + break; + pr_alert("Callback-invocation histogram:"); + for (j = 0; j <= i; j++) + pr_cont(" %ds: %ld", j + 1, n_launders_hist[j]); + pr_cont("\n"); + } + /* Avoid slow periods, better to test when busy. */ stutter_wait("rcu_torture_fwd_prog"); } while (!torture_must_stop()); From 28cf5952f56005325f269ccfe402a880cd741189 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 Aug 2018 15:27:16 -0700 Subject: [PATCH 097/112] torture: Bring any extra CPUs online during kernel startup Currently, the torture scripts rely on the initrd/init script to bring any extra CPUs online, for example, in the case where the kernel and qemu have different ideas about how many CPUs are present. This works, but is an unnecessary dependency on initrd, which needs to vary depending on the distro. This commit therefore causes torture_onoff() to check for additional CPUs, attempting to bring any found online. Errors are ignored, just as they are by the initrd/init script. Signed-off-by: Paul E. McKenney --- kernel/torture.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/torture.c b/kernel/torture.c index 17d91f5fba2acb..9410d1bf84d669 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -194,11 +194,23 @@ torture_onoff(void *arg) int cpu; int maxcpu = -1; DEFINE_TORTURE_RANDOM(rand); + int ret; VERBOSE_TOROUT_STRING("torture_onoff task started"); for_each_online_cpu(cpu) maxcpu = cpu; WARN_ON(maxcpu < 0); + if (!IS_MODULE(CONFIG_TORTURE_TEST)) + for_each_possible_cpu(cpu) { + if (cpu_online(cpu)) + continue; + ret = cpu_up(cpu); + if (ret && verbose) { + pr_alert("%s" TORTURE_FLAG + "%s: Initial online %d: errno %d\n", + __func__, torture_type, cpu, ret); + } + } if (maxcpu == 0) { VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled"); From fc6f9c57787e578473d47b7bbc846e317d17c1df Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Aug 2018 14:43:05 -0700 Subject: [PATCH 098/112] rcutorture: Remove cbflood facility Now that the forward-progress code does a full-bore continuous callback flood lasting multiple seconds, there is little point in also posting a mere 60,000 callbacks every second or so. This commit therefore removes the old cbflood testing. Over time, it may be desirable to concurrently do full-bore continuous callback floods on all CPUs simultaneously, but one dragon at a time. Signed-off-by: Paul E. McKenney --- .../admin-guide/kernel-parameters.txt | 18 ---- kernel/rcu/rcutorture.c | 86 +------------------ 2 files changed, 1 insertion(+), 103 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3823679deea533..86e825e0927a50 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3743,24 +3743,6 @@ in microseconds. The default of zero says no holdoff. - rcutorture.cbflood_inter_holdoff= [KNL] - Set holdoff time (jiffies) between successive - callback-flood tests. - - rcutorture.cbflood_intra_holdoff= [KNL] - Set holdoff time (jiffies) between successive - bursts of callbacks within a given callback-flood - test. - - rcutorture.cbflood_n_burst= [KNL] - Set the number of bursts making up a given - callback-flood test. Set this to zero to - disable callback-flood testing. - - rcutorture.cbflood_n_per_burst= [KNL] - Set the number of callbacks to be registered - in a given burst of a callback-flood test. - rcutorture.fqs_duration= [KNL] Set duration of force_quiescent_state bursts in microseconds. diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8cf700ca7845bf..17f480129a7801 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -80,13 +80,6 @@ MODULE_AUTHOR("Paul E. McKenney and Josh Triplett 0 && - cbflood_inter_holdoff > 0 && - cbflood_intra_holdoff > 0 && - cur_ops->call && - cur_ops->cb_barrier) { - rhp = vmalloc(array3_size(cbflood_n_burst, - cbflood_n_per_burst, - sizeof(*rhp))); - err = !rhp; - } - if (err) { - VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM"); - goto wait_for_stop; - } - VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started"); - do { - schedule_timeout_interruptible(cbflood_inter_holdoff); - atomic_long_inc(&n_cbfloods); - WARN_ON(signal_pending(current)); - for (i = 0; i < cbflood_n_burst; i++) { - for (j = 0; j < cbflood_n_per_burst; j++) { - cur_ops->call(&rhp[i * cbflood_n_per_burst + j], - rcu_torture_cbflood_cb); - } - schedule_timeout_interruptible(cbflood_intra_holdoff); - WARN_ON(signal_pending(current)); - } - cur_ops->cb_barrier(); - stutter_wait("rcu_torture_cbflood"); - } while (!torture_must_stop()); - vfree(rhp); -wait_for_stop: - torture_kthread_stopping("rcu_torture_cbflood"); - return 0; -} - /* * RCU torture force-quiescent-state kthread. Repeatedly induces * bursts of calls to force_quiescent_state(), increasing the probability @@ -1460,11 +1397,10 @@ rcu_torture_stats_print(void) n_rcu_torture_boosts, atomic_long_read(&n_rcu_torture_timers)); torture_onoff_stats(); - pr_cont("barrier: %ld/%ld:%ld ", + pr_cont("barrier: %ld/%ld:%ld\n", n_barrier_successes, n_barrier_attempts, n_rcu_torture_barrier_error); - pr_cont("cbflood: %ld\n", atomic_long_read(&n_cbfloods)); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) != 0 || @@ -2093,8 +2029,6 @@ rcu_torture_cleanup(void) cur_ops->name, gp_seq, flags); torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); - for (i = 0; i < ncbflooders; i++) - torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); if (rcu_torture_can_boost()) cpuhp_remove_state(rcutor_hp); @@ -2377,24 +2311,6 @@ rcu_torture_init(void) goto unwind; if (object_debug) rcu_test_debug_objects(); - if (cbflood_n_burst > 0) { - /* Create the cbflood threads */ - ncbflooders = (num_online_cpus() + 3) / 4; - cbflood_task = kcalloc(ncbflooders, sizeof(*cbflood_task), - GFP_KERNEL); - if (!cbflood_task) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < ncbflooders; i++) { - firsterr = torture_create_kthread(rcu_torture_cbflood, - NULL, - cbflood_task[i]); - if (firsterr) - goto unwind; - } - } torture_init_end(); return 0; From 6b3de7a172bc59010a9d8e425877d98c1f24555e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 28 Aug 2018 14:38:43 -0700 Subject: [PATCH 099/112] rcutorture: Break up too-long rcu_torture_fwd_prog() function This commit splits rcu_torture_fwd_prog_nr() and rcu_torture_fwd_prog_cr() functions out of rcu_torture_fwd_prog() in order to reduce indentation pain and because rcu_torture_fwd_prog() was getting a bit too long. In addition, this will enable easier conditional execution of the rcu_torture_fwd_prog_cr() function, which can give false-positive failures in some NO_HZ_FULL configurations due to overloading the housekeeping CPUs. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 254 +++++++++++++++++++++------------------- 1 file changed, 135 insertions(+), 119 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 17f480129a7801..bcc33bb8d9a69e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1650,15 +1650,70 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) spin_unlock(&rcu_fwd_lock); } -/* Carry out grace-period forward-progress testing. */ -static int rcu_torture_fwd_prog(void *args) +/* Carry out need_resched()/cond_resched() forward-progress testing. */ +static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) { unsigned long cver; unsigned long dur; struct fwd_cb_state fcs; unsigned long gps; - int i; int idx; + int sd; + int sd4; + bool selfpropcb = false; + unsigned long stopat; + static DEFINE_TORTURE_RANDOM(trs); + + if (cur_ops->call && cur_ops->sync && cur_ops->cb_barrier) { + init_rcu_head_on_stack(&fcs.rh); + selfpropcb = true; + } + + /* Tight loop containing cond_resched(). */ + if (selfpropcb) { + WRITE_ONCE(fcs.stop, 0); + cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb); + } + cver = READ_ONCE(rcu_torture_current_version); + gps = cur_ops->get_gp_seq(); + sd = cur_ops->stall_dur() + 1; + sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div; + dur = sd4 + torture_random(&trs) % (sd - sd4); + rcu_fwd_startat = jiffies; + stopat = rcu_fwd_startat + dur; + while (time_before(jiffies, stopat) && !torture_must_stop()) { + idx = cur_ops->readlock(); + udelay(10); + cur_ops->readunlock(idx); + if (!fwd_progress_need_resched || need_resched()) + cond_resched(); + } + (*tested_tries)++; + if (!time_before(jiffies, stopat) && !torture_must_stop()) { + (*tested)++; + cver = READ_ONCE(rcu_torture_current_version) - cver; + gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); + WARN_ON(!cver && gps < 2); + pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps); + } + if (selfpropcb) { + WRITE_ONCE(fcs.stop, 1); + cur_ops->sync(); /* Wait for running CB to complete. */ + cur_ops->cb_barrier(); /* Wait for queued callbacks. */ + } + + if (selfpropcb) { + WARN_ON(READ_ONCE(fcs.stop) != 2); + destroy_rcu_head_on_stack(&fcs.rh); + } +} + +/* Carry out call_rcu() forward-progress testing. */ +static void rcu_torture_fwd_prog_cr(void) +{ + unsigned long cver; + unsigned long gps; + int i; int j; long n_launders; long n_launders_cb_snap; @@ -1667,136 +1722,97 @@ static int rcu_torture_fwd_prog(void *args) long n_max_gps; struct rcu_fwd_cb *rfcp; struct rcu_fwd_cb *rfcpn; - int sd; - int sd4; - bool selfpropcb = false; unsigned long stopat; unsigned long stoppedat; + + /* Loop continuously posting RCU callbacks. */ + WRITE_ONCE(rcu_fwd_cb_nodelay, true); + cur_ops->sync(); /* Later readers see above write. */ + rcu_fwd_startat = jiffies; + stopat = rcu_fwd_startat + MAX_FWD_CB_JIFFIES; + n_launders = 0; + n_launders_cb = 0; + n_launders_sa = 0; + n_max_cbs = 0; + n_max_gps = 0; + for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++) + n_launders_hist[i] = 0; + cver = READ_ONCE(rcu_torture_current_version); + gps = cur_ops->get_gp_seq(); + while (time_before(jiffies, stopat) && !torture_must_stop()) { + rfcp = READ_ONCE(rcu_fwd_cb_head); + rfcpn = NULL; + if (rfcp) + rfcpn = READ_ONCE(rfcp->rfc_next); + if (rfcpn) { + if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS && + ++n_max_gps >= MIN_FWD_CBS_LAUNDERED) + break; + rcu_fwd_cb_head = rfcpn; + n_launders++; + n_launders_sa++; + } else { + rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL); + if (WARN_ON_ONCE(!rfcp)) { + schedule_timeout_interruptible(1); + continue; + } + n_max_cbs++; + n_launders_sa = 0; + rfcp->rfc_gps = 0; + } + cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); + cond_resched(); + } + stoppedat = jiffies; + n_launders_cb_snap = READ_ONCE(n_launders_cb); + cver = READ_ONCE(rcu_torture_current_version) - cver; + gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); + cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ + for (;;) { + rfcp = rcu_fwd_cb_head; + if (!rfcp) + break; + rcu_fwd_cb_head = rfcp->rfc_next; + kfree(rfcp); + } + rcu_fwd_cb_tail = &rcu_fwd_cb_head; + WRITE_ONCE(rcu_fwd_cb_nodelay, false); + if (!torture_must_stop()) { + WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); + pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", + __func__, + stoppedat - rcu_fwd_startat, jiffies - stoppedat, + n_launders + n_max_cbs - n_launders_cb_snap, + n_launders, n_launders_sa, + n_max_gps, n_max_cbs, cver, gps); + for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--) + if (n_launders_hist[i] > 0) + break; + pr_alert("Callback-invocation histogram:"); + for (j = 0; j <= i; j++) + pr_cont(" %ds: %ld", j + 1, n_launders_hist[j]); + pr_cont("\n"); + } +} + +/* Carry out grace-period forward-progress testing. */ +static int rcu_torture_fwd_prog(void *args) +{ int tested = 0; int tested_tries = 0; - static DEFINE_TORTURE_RANDOM(trs); VERBOSE_TOROUT_STRING("rcu_torture_fwd_progress task started"); if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST)) set_user_nice(current, MAX_NICE); - if (cur_ops->call && cur_ops->sync && cur_ops->cb_barrier) { - init_rcu_head_on_stack(&fcs.rh); - selfpropcb = true; - } do { schedule_timeout_interruptible(fwd_progress_holdoff * HZ); - - /* Tight loop containing cond_resched(). */ - if (selfpropcb) { - WRITE_ONCE(fcs.stop, 0); - cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb); - } - cver = READ_ONCE(rcu_torture_current_version); - gps = cur_ops->get_gp_seq(); - sd = cur_ops->stall_dur() + 1; - sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div; - dur = sd4 + torture_random(&trs) % (sd - sd4); - rcu_fwd_startat = jiffies; - stopat = rcu_fwd_startat + dur; - while (time_before(jiffies, stopat) && !torture_must_stop()) { - idx = cur_ops->readlock(); - udelay(10); - cur_ops->readunlock(idx); - if (!fwd_progress_need_resched || need_resched()) - cond_resched(); - } - tested_tries++; - if (!time_before(jiffies, stopat) && !torture_must_stop()) { - tested++; - cver = READ_ONCE(rcu_torture_current_version) - cver; - gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); - WARN_ON(!cver && gps < 2); - pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps); - } - if (selfpropcb) { - WRITE_ONCE(fcs.stop, 1); - cur_ops->sync(); /* Wait for running CB to complete. */ - cur_ops->cb_barrier(); /* Wait for queued callbacks. */ - } - - /* Loop continuously posting RCU callbacks. */ - WRITE_ONCE(rcu_fwd_cb_nodelay, true); - cur_ops->sync(); /* Later readers see above write. */ - rcu_fwd_startat = jiffies; - stopat = rcu_fwd_startat + MAX_FWD_CB_JIFFIES; - n_launders = 0; - n_launders_cb = 0; - n_launders_sa = 0; - n_max_cbs = 0; - n_max_gps = 0; - for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++) - n_launders_hist[i] = 0; - cver = READ_ONCE(rcu_torture_current_version); - gps = cur_ops->get_gp_seq(); - while (time_before(jiffies, stopat) && !torture_must_stop()) { - rfcp = READ_ONCE(rcu_fwd_cb_head); - rfcpn = NULL; - if (rfcp) - rfcpn = READ_ONCE(rfcp->rfc_next); - if (rfcpn) { - if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS && - ++n_max_gps >= MIN_FWD_CBS_LAUNDERED) - break; - rcu_fwd_cb_head = rfcpn; - n_launders++; - n_launders_sa++; - } else { - rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL); - if (WARN_ON_ONCE(!rfcp)) { - schedule_timeout_interruptible(1); - continue; - } - n_max_cbs++; - n_launders_sa = 0; - rfcp->rfc_gps = 0; - } - cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); - cond_resched(); - } - stoppedat = jiffies; - n_launders_cb_snap = READ_ONCE(n_launders_cb); - cver = READ_ONCE(rcu_torture_current_version) - cver; - gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); - cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ - for (;;) { - rfcp = rcu_fwd_cb_head; - if (!rfcp) - break; - rcu_fwd_cb_head = rfcp->rfc_next; - kfree(rfcp); - } - rcu_fwd_cb_tail = &rcu_fwd_cb_head; - WRITE_ONCE(rcu_fwd_cb_nodelay, false); - if (!torture_must_stop()) { - WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); - pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", - __func__, - stoppedat - rcu_fwd_startat, - jiffies - stoppedat, - n_launders + n_max_cbs - n_launders_cb_snap, - n_launders, n_launders_sa, - n_max_gps, n_max_cbs, cver, gps); - for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--) - if (n_launders_hist[i] > 0) - break; - pr_alert("Callback-invocation histogram:"); - for (j = 0; j <= i; j++) - pr_cont(" %ds: %ld", j + 1, n_launders_hist[j]); - pr_cont("\n"); - } + rcu_torture_fwd_prog_nr(&tested, &tested_tries); + rcu_torture_fwd_prog_cr(); /* Avoid slow periods, better to test when busy. */ stutter_wait("rcu_torture_fwd_prog"); } while (!torture_must_stop()); - if (selfpropcb) { - WARN_ON(READ_ONCE(fcs.stop) != 2); - destroy_rcu_head_on_stack(&fcs.rh); - } /* Short runs might not contain a valid forward-progress attempt. */ WARN_ON(!tested && tested_tries >= 5); pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries); From 5ab7ab8362fa8a4f7995d65ea05edf71530e8004 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Sep 2018 18:08:09 -0700 Subject: [PATCH 100/112] rcutorture: Affinity forward-progress test to avoid housekeeping CPUs This commit affinities the forward-progress tests to avoid hogging a housekeeping CPU on the theory that the offloaded callbacks will be running on those housekeeping CPUs. Signed-off-by: Paul E. McKenney [ paulmck: Fix NULL-pointer issue located by kbuild test robot. ] Tested-by: Rong Chen --- kernel/rcu/rcu.h | 2 ++ kernel/rcu/rcutorture.c | 1 + kernel/rcu/tree_plugin.h | 11 +++++++++++ 3 files changed, 14 insertions(+) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 2866166863f0f8..0f0f5ae8c3d4e1 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -539,8 +539,10 @@ extern struct workqueue_struct *rcu_par_gp_wq; #ifdef CONFIG_RCU_NOCB_CPU bool rcu_is_nocb_cpu(int cpu); +void rcu_bind_current_to_nocb(void); #else static inline bool rcu_is_nocb_cpu(int cpu) { return false; } +static inline void rcu_bind_current_to_nocb(void) { } #endif #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bcc33bb8d9a69e..36a3bc42782dc8 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1803,6 +1803,7 @@ static int rcu_torture_fwd_prog(void *args) int tested_tries = 0; VERBOSE_TOROUT_STRING("rcu_torture_fwd_progress task started"); + rcu_bind_current_to_nocb(); if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST)) set_user_nice(current, MAX_NICE); do { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 605ff3b06098da..25fe26c4e9c12a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2603,6 +2603,17 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) return true; } +/* + * Bind the current task to the offloaded CPUs. If there are no offloaded + * CPUs, leave the task unbound. Splat if the bind attempt fails. + */ +void rcu_bind_current_to_nocb(void) +{ + if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask)) + WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask)); +} +EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); + #else /* #ifdef CONFIG_RCU_NOCB_CPU */ static bool rcu_nocb_cpu_needs_barrier(int cpu) From 2a7d968816a94a4c52f0082c085c6714a5b3f1ec Mon Sep 17 00:00:00 2001 From: Pierce Griffiths Date: Fri, 21 Sep 2018 20:21:31 -0500 Subject: [PATCH 101/112] torture: Remove unnecessary "ret" variables Remove return variables (declared as "ret") in cases where, depending on whether a condition evaluates as true, the result of a function call can be immediately returned instead of storing the result in the return variable. When the condition evaluates as false, the constant initially stored in the return variable at declaration is returned instead. Signed-off-by: Pierce Griffiths Signed-off-by: Paul E. McKenney --- kernel/torture.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/kernel/torture.c b/kernel/torture.c index 9410d1bf84d669..bbf6d473e50c14 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -245,16 +245,15 @@ torture_onoff(void *arg) */ int torture_onoff_init(long ooholdoff, long oointerval) { - int ret = 0; - #ifdef CONFIG_HOTPLUG_CPU onoff_holdoff = ooholdoff; onoff_interval = oointerval; if (onoff_interval <= 0) return 0; - ret = torture_create_kthread(torture_onoff, NULL, onoff_task); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - return ret; + return torture_create_kthread(torture_onoff, NULL, onoff_task); +#else /* #ifdef CONFIG_HOTPLUG_CPU */ + return 0; +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ } EXPORT_SYMBOL_GPL(torture_onoff_init); @@ -525,15 +524,13 @@ static int torture_shutdown(void *arg) */ int torture_shutdown_init(int ssecs, void (*cleanup)(void)) { - int ret = 0; - torture_shutdown_hook = cleanup; if (ssecs > 0) { shutdown_time = ktime_add(ktime_get(), ktime_set(ssecs, 0)); - ret = torture_create_kthread(torture_shutdown, NULL, + return torture_create_kthread(torture_shutdown, NULL, shutdown_task); } - return ret; + return 0; } EXPORT_SYMBOL_GPL(torture_shutdown_init); @@ -632,13 +629,10 @@ static int torture_stutter(void *arg) /* * Initialize and kick off the torture_stutter kthread. */ -int torture_stutter_init(int s) +int torture_stutter_init(const int s) { - int ret; - stutter = s; - ret = torture_create_kthread(torture_stutter, NULL, stutter_task); - return ret; + return torture_create_kthread(torture_stutter, NULL, stutter_task); } EXPORT_SYMBOL_GPL(torture_stutter_init); From 61670adcb4a9f66ff3fa8a9e846a623d9a9e1553 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Oct 2018 13:27:41 -0700 Subject: [PATCH 102/112] rcutorture: Prepare for asynchronous access to rcu_fwd_startat Because rcutorture's forward-progress checking will trigger from an OOM notifier, this notifier will introduce asynchronous concurrent access to the rcu_fwd_startat variable. This commit therefore prepares for this by converting updates to WRITE_ONCE(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 36a3bc42782dc8..c4fd61dccedbef 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1679,7 +1679,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) sd = cur_ops->stall_dur() + 1; sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div; dur = sd4 + torture_random(&trs) % (sd - sd4); - rcu_fwd_startat = jiffies; + WRITE_ONCE(rcu_fwd_startat, jiffies); stopat = rcu_fwd_startat + dur; while (time_before(jiffies, stopat) && !torture_must_stop()) { idx = cur_ops->readlock(); @@ -1728,7 +1728,7 @@ static void rcu_torture_fwd_prog_cr(void) /* Loop continuously posting RCU callbacks. */ WRITE_ONCE(rcu_fwd_cb_nodelay, true); cur_ops->sync(); /* Later readers see above write. */ - rcu_fwd_startat = jiffies; + WRITE_ONCE(rcu_fwd_startat, jiffies); stopat = rcu_fwd_startat + MAX_FWD_CB_JIFFIES; n_launders = 0; n_launders_cb = 0; From e0aff97355575ac6a28a48a4217533a3953095c5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Oct 2018 17:40:54 -0700 Subject: [PATCH 103/112] rcutorture: Dump grace-period diagnostics upon forward-progress OOM This commit adds an OOM notifier during rcutorture forward-progress testing. If this notifier is invoked, it dumps out some grace-period state to help debug the forward-progress problem. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 2 ++ kernel/rcu/rcutorture.c | 31 ++++++++++++++++++++++++++++--- kernel/rcu/tree.c | 20 ++++++++++++++++++++ 3 files changed, 50 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 0f0f5ae8c3d4e1..a393e24a9195aa 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -526,12 +526,14 @@ srcu_batches_completed(struct srcu_struct *sp) { return 0; } static inline void rcu_force_quiescent_state(void) { } static inline void show_rcu_gp_kthreads(void) { } static inline int rcu_get_gp_kthreads_prio(void) { return 0; } +static inline void rcu_fwd_progress_check(unsigned long j) { } #else /* #ifdef CONFIG_TINY_RCU */ unsigned long rcu_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); unsigned long srcu_batches_completed(struct srcu_struct *sp); void show_rcu_gp_kthreads(void); int rcu_get_gp_kthreads_prio(void); +void rcu_fwd_progress_check(unsigned long j); void rcu_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; extern struct workqueue_struct *rcu_par_gp_wq; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index c4fd61dccedbef..f28b88ecb47a02 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "rcu.h" @@ -1624,6 +1625,7 @@ static struct rcu_fwd_cb *rcu_fwd_cb_head; static struct rcu_fwd_cb **rcu_fwd_cb_tail = &rcu_fwd_cb_head; static long n_launders_cb; static unsigned long rcu_fwd_startat; +static bool rcu_fwd_emergency_stop; #define MAX_FWD_CB_JIFFIES (8 * HZ) /* Maximum CB test duration. */ #define MIN_FWD_CB_LAUNDERS 3 /* This many CB invocations to count. */ #define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */ @@ -1681,7 +1683,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) dur = sd4 + torture_random(&trs) % (sd - sd4); WRITE_ONCE(rcu_fwd_startat, jiffies); stopat = rcu_fwd_startat + dur; - while (time_before(jiffies, stopat) && !torture_must_stop()) { + while (time_before(jiffies, stopat) && + !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { idx = cur_ops->readlock(); udelay(10); cur_ops->readunlock(idx); @@ -1689,7 +1692,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) cond_resched(); } (*tested_tries)++; - if (!time_before(jiffies, stopat) && !torture_must_stop()) { + if (!time_before(jiffies, stopat) && + !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { (*tested)++; cver = READ_ONCE(rcu_torture_current_version) - cver; gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); @@ -1739,7 +1743,8 @@ static void rcu_torture_fwd_prog_cr(void) n_launders_hist[i] = 0; cver = READ_ONCE(rcu_torture_current_version); gps = cur_ops->get_gp_seq(); - while (time_before(jiffies, stopat) && !torture_must_stop()) { + while (time_before(jiffies, stopat) && + !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { rfcp = READ_ONCE(rcu_fwd_cb_head); rfcpn = NULL; if (rfcp) @@ -1796,6 +1801,23 @@ static void rcu_torture_fwd_prog_cr(void) } } + +/* + * OOM notifier, but this only prints diagnostic information for the + * current forward-progress test. + */ +static int rcutorture_oom_notify(struct notifier_block *self, + unsigned long notused, void *nfreed) +{ + rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2)); + WRITE_ONCE(rcu_fwd_emergency_stop, true); + return NOTIFY_OK; +} + +static struct notifier_block rcutorture_oom_nb = { + .notifier_call = rcutorture_oom_notify +}; + /* Carry out grace-period forward-progress testing. */ static int rcu_torture_fwd_prog(void *args) { @@ -1808,8 +1830,11 @@ static int rcu_torture_fwd_prog(void *args) set_user_nice(current, MAX_NICE); do { schedule_timeout_interruptible(fwd_progress_holdoff * HZ); + WRITE_ONCE(rcu_fwd_emergency_stop, false); + register_oom_notifier(&rcutorture_oom_nb); rcu_torture_fwd_prog_nr(&tested, &tested_tries); rcu_torture_fwd_prog_cr(); + unregister_oom_notifier(&rcutorture_oom_nb); /* Avoid slow periods, better to test when busy. */ stutter_wait("rcu_torture_fwd_prog"); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6ec3abbe90e27b..853b79a6ff1013 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2657,6 +2657,26 @@ rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } +/* + * Do a forward-progress check for rcutorture. This is normally invoked + * due to an OOM event. The argument "j" gives the time period during + * which rcutorture would like progress to have been made. + */ +void rcu_fwd_progress_check(unsigned long j) +{ + struct rcu_data *rdp; + + if (rcu_gp_in_progress()) { + show_rcu_gp_kthreads(); + } else { + preempt_disable(); + rdp = this_cpu_ptr(&rcu_data); + rcu_check_gp_start_stall(rdp->mynode, rdp, j); + preempt_enable(); + } +} +EXPORT_SYMBOL_GPL(rcu_fwd_progress_check); + /* * This does the RCU core processing work for the specified rcu_data * structures. This may be called only from the CPU to whom the rdp From 903ee83d91776bc72d856147743687d4b6c99286 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Oct 2018 16:05:46 -0700 Subject: [PATCH 104/112] rcu: Account for nocb-CPU callback counts in RCU CPU stall warnings The RCU CPU stall warnings print an estimate of the total number of RCU callbacks queued in the system, but this estimate leaves out the callbacks queued for nocbs CPUs. This commit therefore introduces rcu_get_n_cbs_cpu(), which gives an accurate callback estimate for both nocbs and normal CPUs, and uses this new function as needed. This commit also introduces a rcu_get_n_cbs_nocb_cpu() helper function that returns the number of callbacks for nocbs CPUs or zero otherwise, and also uses this function in place of direct access to ->nocb_q_count while in the area (fewer characters, you see). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 19 +++++++++++++++---- kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 24 +++++++++++++++++++----- 3 files changed, 35 insertions(+), 9 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 853b79a6ff1013..0933d865089068 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -207,6 +207,19 @@ static int rcu_gp_in_progress(void) return rcu_seq_state(rcu_seq_current(&rcu_state.gp_seq)); } +/* + * Return the number of callbacks queued on the specified CPU. + * Handles both the nocbs and normal cases. + */ +static long rcu_get_n_cbs_cpu(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + if (rcu_segcblist_is_enabled(&rdp->cblist)) /* Online normal CPU? */ + return rcu_segcblist_n_cbs(&rdp->cblist); + return rcu_get_n_cbs_nocb_cpu(rdp); /* Works for offline, too. */ +} + void rcu_softirq_qs(void) { rcu_qs(); @@ -1265,8 +1278,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) print_cpu_stall_info_end(); for_each_possible_cpu(cpu) - totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(&rcu_data, - cpu)->cblist); + totqlen += rcu_get_n_cbs_cpu(cpu); pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rcu_state.gp_start), (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); @@ -1326,8 +1338,7 @@ static void print_cpu_stall(void) raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); print_cpu_stall_info_end(); for_each_possible_cpu(cpu) - totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(&rcu_data, - cpu)->cblist); + totqlen += rcu_get_n_cbs_cpu(cpu); pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n", jiffies - rcu_state.gp_start, (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index c3e2807a834ae4..a8f82b7dc5e2f6 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -455,6 +455,7 @@ static void __init rcu_spawn_nocb_kthreads(void); static void __init rcu_organize_nocb_kthreads(void); #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ static bool init_nocb_callback_list(struct rcu_data *rdp); +static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp); static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(void); static void rcu_dynticks_task_enter(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 25fe26c4e9c12a..1b3dd2fc0cd64b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2011,7 +2011,7 @@ static bool rcu_nocb_cpu_needs_barrier(int cpu) * (if a callback is in fact needed). This is associated with an * atomic_inc() in the caller. */ - ret = atomic_long_read(&rdp->nocb_q_count); + ret = rcu_get_n_cbs_nocb_cpu(rdp); #ifdef CONFIG_PROVE_RCU rhp = READ_ONCE(rdp->nocb_head); @@ -2066,7 +2066,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, TPS("WakeNotPoll")); return; } - len = atomic_long_read(&rdp->nocb_q_count); + len = rcu_get_n_cbs_nocb_cpu(rdp); if (old_rhpp == &rdp->nocb_head) { if (!irqs_disabled_flags(flags)) { /* ... if queue was empty ... */ @@ -2115,11 +2115,11 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, trace_rcu_kfree_callback(rcu_state.name, rhp, (unsigned long)rhp->func, -atomic_long_read(&rdp->nocb_q_count_lazy), - -atomic_long_read(&rdp->nocb_q_count)); + -rcu_get_n_cbs_nocb_cpu(rdp)); else trace_rcu_callback(rcu_state.name, rhp, -atomic_long_read(&rdp->nocb_q_count_lazy), - -atomic_long_read(&rdp->nocb_q_count)); + -rcu_get_n_cbs_nocb_cpu(rdp)); /* * If called from an extended quiescent state with interrupts @@ -2343,7 +2343,7 @@ static int rcu_nocb_kthread(void *arg) /* Each pass through the following loop invokes a callback. */ trace_rcu_batch_start(rcu_state.name, atomic_long_read(&rdp->nocb_q_count_lazy), - atomic_long_read(&rdp->nocb_q_count), -1); + rcu_get_n_cbs_nocb_cpu(rdp), -1); c = cl = 0; while (list) { next = list->next; @@ -2614,6 +2614,15 @@ void rcu_bind_current_to_nocb(void) } EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); +/* + * Return the number of RCU callbacks still queued from the specified + * CPU, which must be a nocbs CPU. + */ +static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) +{ + return atomic_long_read(&rdp->nocb_q_count); +} + #else /* #ifdef CONFIG_RCU_NOCB_CPU */ static bool rcu_nocb_cpu_needs_barrier(int cpu) @@ -2674,6 +2683,11 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) return false; } +static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) +{ + return 0; +} + #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ /* From bfcfcffc5f23e76a1b88f7412ee7efaec5107b28 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Oct 2018 17:09:56 -0700 Subject: [PATCH 105/112] rcu: Print per-CPU callback counts for forward-progress failures This commit prints out the non-zero per-CPU callback counts when a forware-progress error (OOM event) occurs. Signed-off-by: Paul E. McKenney [ paulmck: Fix a pair of uninitialized locals spotted by kbuild test robot. ] --- kernel/rcu/tree.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0933d865089068..90a67c75a447ba 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2675,6 +2675,10 @@ rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, */ void rcu_fwd_progress_check(unsigned long j) { + unsigned long cbs; + int cpu; + unsigned long max_cbs = 0; + int max_cpu = -1; struct rcu_data *rdp; if (rcu_gp_in_progress()) { @@ -2685,6 +2689,20 @@ void rcu_fwd_progress_check(unsigned long j) rcu_check_gp_start_stall(rdp->mynode, rdp, j); preempt_enable(); } + for_each_possible_cpu(cpu) { + cbs = rcu_get_n_cbs_cpu(cpu); + if (!cbs) + continue; + if (max_cpu < 0) + pr_info("%s: callbacks", __func__); + pr_cont(" %d: %lu", cpu, cbs); + if (cbs <= max_cbs) + continue; + max_cbs = cbs; + max_cpu = cpu; + } + if (max_cpu >= 0) + pr_cont("\n"); } EXPORT_SYMBOL_GPL(rcu_fwd_progress_check); From 8dd3b54689d9a2103c0817f2b7adc51760a45551 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Oct 2018 11:02:05 -0700 Subject: [PATCH 106/112] rcutorture: Print GP age upon forward-progress failure Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 90a67c75a447ba..f916315419651a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2682,6 +2682,8 @@ void rcu_fwd_progress_check(unsigned long j) struct rcu_data *rdp; if (rcu_gp_in_progress()) { + pr_info("%s: GP age %lu jiffies\n", + __func__, jiffies - rcu_state.gp_start); show_rcu_gp_kthreads(); } else { preempt_disable(); From 1a682754c7ed9df213069d5a0d3981f8360a32c2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Oct 2018 12:33:41 -0700 Subject: [PATCH 107/112] rcutorture: Print histogram of CB invocation at OOM time One reason why a forward-progress test might fail would be if something prevented or delayed callback invocation. This commit therefore adds a callback-invocation histogram printout when OOM is reported to rcutorture. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f28b88ecb47a02..329f4fb1312590 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1631,6 +1631,20 @@ static bool rcu_fwd_emergency_stop; #define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */ static long n_launders_hist[2 * MAX_FWD_CB_JIFFIES / HZ]; +static void rcu_torture_fwd_cb_hist(void) +{ + int i; + int j; + + for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--) + if (n_launders_hist[i] > 0) + break; + pr_alert("%s: Callback-invocation histogram:", __func__); + for (j = 0; j <= i; j++) + pr_cont(" %ds: %ld", j + 1, n_launders_hist[j]); + pr_cont("\n"); +} + /* Callback function for continuous-flood RCU callbacks. */ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) { @@ -1718,7 +1732,6 @@ static void rcu_torture_fwd_prog_cr(void) unsigned long cver; unsigned long gps; int i; - int j; long n_launders; long n_launders_cb_snap; long n_launders_sa; @@ -1791,13 +1804,7 @@ static void rcu_torture_fwd_prog_cr(void) n_launders + n_max_cbs - n_launders_cb_snap, n_launders, n_launders_sa, n_max_gps, n_max_cbs, cver, gps); - for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--) - if (n_launders_hist[i] > 0) - break; - pr_alert("Callback-invocation histogram:"); - for (j = 0; j <= i; j++) - pr_cont(" %ds: %ld", j + 1, n_launders_hist[j]); - pr_cont("\n"); + rcu_torture_fwd_cb_hist(); } } @@ -1809,6 +1816,7 @@ static void rcu_torture_fwd_prog_cr(void) static int rcutorture_oom_notify(struct notifier_block *self, unsigned long notused, void *nfreed) { + rcu_torture_fwd_cb_hist(); rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2)); WRITE_ONCE(rcu_fwd_emergency_stop, true); return NOTIFY_OK; From c51d7b5e6c94aa6b554c27bd2b0eb64ebef02334 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Oct 2018 17:25:33 -0700 Subject: [PATCH 108/112] rcutorture: Print time since GP end upon forward-progress failure If rcutorture's forward-progress tests fail while a grace period is not in progress, it is useful to print the time since the last grace period ended as a way to detect failure to launch a new grace period. This commit therefore makes this change. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 ++++- kernel/rcu/tree.h | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f916315419651a..9180158756d2c2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2000,7 +2000,8 @@ static void rcu_gp_cleanup(void) WRITE_ONCE(rcu_state.gp_activity, jiffies); raw_spin_lock_irq_rcu_node(rnp); - gp_duration = jiffies - rcu_state.gp_start; + rcu_state.gp_end = jiffies; + gp_duration = rcu_state.gp_end - rcu_state.gp_start; if (gp_duration > rcu_state.gp_max) rcu_state.gp_max = gp_duration; @@ -2686,6 +2687,8 @@ void rcu_fwd_progress_check(unsigned long j) __func__, jiffies - rcu_state.gp_start); show_rcu_gp_kthreads(); } else { + pr_info("%s: Last GP end %lu jiffies ago\n", + __func__, jiffies - rcu_state.gp_end); preempt_disable(); rdp = this_cpu_ptr(&rcu_data); rcu_check_gp_start_stall(rdp->mynode, rdp, j); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a8f82b7dc5e2f6..d90b02b53c0ec4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -328,6 +328,8 @@ struct rcu_state { /* force_quiescent_state(). */ unsigned long gp_start; /* Time at which GP started, */ /* but in jiffies. */ + unsigned long gp_end; /* Time last GP ended, again */ + /* in jiffies. */ unsigned long gp_activity; /* Time of last GP kthread */ /* activity in jiffies. */ unsigned long gp_req_activity; /* Time of last GP request */ From 73d665b1410afae405309ad4475a98924776ab13 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 Oct 2018 10:54:22 -0700 Subject: [PATCH 109/112] rcutorture: Print forward-progress test age upon failure This commit prints the age of the forward-progress test in jiffies, in order to allow better interpretation of the callback-invocation histograms. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 329f4fb1312590..080b5ac6340cd5 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1639,7 +1639,8 @@ static void rcu_torture_fwd_cb_hist(void) for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--) if (n_launders_hist[i] > 0) break; - pr_alert("%s: Callback-invocation histogram:", __func__); + pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):", + __func__, jiffies - rcu_fwd_startat); for (j = 0; j <= i; j++) pr_cont(" %ds: %ld", j + 1, n_launders_hist[j]); pr_cont("\n"); From 2667ccce9328e4e25ed77a83291c066d5e11e65a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Oct 2018 09:09:49 -0700 Subject: [PATCH 110/112] rcutorture: Recover from OOM during forward-progress tests This commit causes the OOM handler to do rcu_barrier() calls and to free up forward-progress callbacks in order to recover from OOM events. The current test is terminated, but subsequent forward-progress tests can proceed. This allows a long test to result in multiple forward-progress failures, greatly reducing the required testing time. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 60 +++++++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 11 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 080b5ac6340cd5..afa98162575d85 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1649,13 +1649,14 @@ static void rcu_torture_fwd_cb_hist(void) /* Callback function for continuous-flood RCU callbacks. */ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) { + unsigned long flags; int i; struct rcu_fwd_cb *rfcp = container_of(rhp, struct rcu_fwd_cb, rh); struct rcu_fwd_cb **rfcpp; rfcp->rfc_next = NULL; rfcp->rfc_gps++; - spin_lock(&rcu_fwd_lock); + spin_lock_irqsave(&rcu_fwd_lock, flags); rfcpp = rcu_fwd_cb_tail; rcu_fwd_cb_tail = &rfcp->rfc_next; WRITE_ONCE(*rfcpp, rfcp); @@ -1664,7 +1665,33 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) if (i >= ARRAY_SIZE(n_launders_hist)) i = ARRAY_SIZE(n_launders_hist) - 1; n_launders_hist[i]++; - spin_unlock(&rcu_fwd_lock); + spin_unlock_irqrestore(&rcu_fwd_lock, flags); +} + +/* + * Free all callbacks on the rcu_fwd_cb_head list, either because the + * test is over or because we hit an OOM event. + */ +static unsigned long rcu_torture_fwd_prog_cbfree(void) +{ + unsigned long flags; + unsigned long freed = 0; + struct rcu_fwd_cb *rfcp; + + for (;;) { + spin_lock_irqsave(&rcu_fwd_lock, flags); + rfcp = rcu_fwd_cb_head; + if (!rfcp) + break; + rcu_fwd_cb_head = rfcp->rfc_next; + if (!rcu_fwd_cb_head) + rcu_fwd_cb_tail = &rcu_fwd_cb_head; + spin_unlock_irqrestore(&rcu_fwd_lock, flags); + kfree(rfcp); + freed++; + } + spin_unlock_irqrestore(&rcu_fwd_lock, flags); + return freed; } /* Carry out need_resched()/cond_resched() forward-progress testing. */ @@ -1743,6 +1770,9 @@ static void rcu_torture_fwd_prog_cr(void) unsigned long stopat; unsigned long stoppedat; + if (READ_ONCE(rcu_fwd_emergency_stop)) + return; /* Get out of the way quickly, no GP wait! */ + /* Loop continuously posting RCU callbacks. */ WRITE_ONCE(rcu_fwd_cb_nodelay, true); cur_ops->sync(); /* Later readers see above write. */ @@ -1788,16 +1818,10 @@ static void rcu_torture_fwd_prog_cr(void) cver = READ_ONCE(rcu_torture_current_version) - cver; gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ - for (;;) { - rfcp = rcu_fwd_cb_head; - if (!rfcp) - break; - rcu_fwd_cb_head = rfcp->rfc_next; - kfree(rfcp); - } - rcu_fwd_cb_tail = &rcu_fwd_cb_head; + (void)rcu_torture_fwd_prog_cbfree(); + WRITE_ONCE(rcu_fwd_cb_nodelay, false); - if (!torture_must_stop()) { + if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) { WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", __func__, @@ -1817,9 +1841,23 @@ static void rcu_torture_fwd_prog_cr(void) static int rcutorture_oom_notify(struct notifier_block *self, unsigned long notused, void *nfreed) { + WARN(1, "%s invoked upon OOM during forward-progress testing.\n", + __func__); rcu_torture_fwd_cb_hist(); rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2)); WRITE_ONCE(rcu_fwd_emergency_stop, true); + smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ + pr_info("%s: Freed %lu RCU callbacks.\n", + __func__, rcu_torture_fwd_prog_cbfree()); + rcu_barrier(); + pr_info("%s: Freed %lu RCU callbacks.\n", + __func__, rcu_torture_fwd_prog_cbfree()); + rcu_barrier(); + pr_info("%s: Freed %lu RCU callbacks.\n", + __func__, rcu_torture_fwd_prog_cbfree()); + smp_mb(); /* Frees before return to avoid redoing OOM. */ + (*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */ + pr_info("%s returning after OOM processing.\n", __func__); return NOTIFY_OK; } From 2e57bf97a6856f2dc10fb4377c452cb08f844047 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Oct 2018 16:43:09 -0700 Subject: [PATCH 111/112] rcutorture: Use 100ms buckets for forward-progress callback histograms This commit narrows the scope of each bucket of the forward-progress callback-invocation histograms from one second to 100 milliseconds, which aids debugging of forward-progress problems by making shorter-duration callback-invocation stalls visible. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index afa98162575d85..a4c4a24bdcaa9f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1629,7 +1629,8 @@ static bool rcu_fwd_emergency_stop; #define MAX_FWD_CB_JIFFIES (8 * HZ) /* Maximum CB test duration. */ #define MIN_FWD_CB_LAUNDERS 3 /* This many CB invocations to count. */ #define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */ -static long n_launders_hist[2 * MAX_FWD_CB_JIFFIES / HZ]; +#define FWD_CBS_HIST_DIV 10 /* Histogram buckets/second. */ +static long n_launders_hist[2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)]; static void rcu_torture_fwd_cb_hist(void) { @@ -1642,7 +1643,8 @@ static void rcu_torture_fwd_cb_hist(void) pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):", __func__, jiffies - rcu_fwd_startat); for (j = 0; j <= i; j++) - pr_cont(" %ds: %ld", j + 1, n_launders_hist[j]); + pr_cont(" %ds/%d: %ld", + j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j]); pr_cont("\n"); } @@ -1661,7 +1663,7 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) rcu_fwd_cb_tail = &rfcp->rfc_next; WRITE_ONCE(*rfcpp, rfcp); WRITE_ONCE(n_launders_cb, n_launders_cb + 1); - i = ((jiffies - rcu_fwd_startat) / HZ); + i = ((jiffies - rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); if (i >= ARRAY_SIZE(n_launders_hist)) i = ARRAY_SIZE(n_launders_hist) - 1; n_launders_hist[i]++; From 5ac7cdc29897e5fc3f5e214f3f8c8b03ef8d7029 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 16 Oct 2018 05:46:58 -0700 Subject: [PATCH 112/112] rcutorture: Don't do busted forward-progress testing The "busted" rcutorture type is an intentionally broken implementation of RCU. Doing forward-progress testing on this implementation is not particularly meaningful on the one hand and can result in fatal abuse of the memory allocator on the other. This commit therefore disables forward-progress testing of the "busted" rcutorture type. Reported-by: kernel test robot Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index a4c4a24bdcaa9f..f6e85faa4ff4b3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1900,7 +1900,8 @@ static int __init rcu_torture_fwd_prog_init(void) { if (!fwd_progress) return 0; /* Not requested, so don't do it. */ - if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0) { + if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0 || + cur_ops == &rcu_busted_ops) { VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test"); return 0; }