forked from torvalds/linux
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtree_plugin.h
2609 lines (2329 loc) · 79.4 KB
/
tree_plugin.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Read-Copy Update mechanism for mutual exclusion (tree-based version)
* Internal non-public definitions that provide either classic
* or preemptible semantics.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, you can access it online at
* http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright Red Hat, 2009
* Copyright IBM Corporation, 2009
*
* Author: Ingo Molnar <[email protected]>
* Paul E. McKenney <[email protected]>
*/
#include <linux/delay.h>
#include <linux/gfp.h>
#include <linux/oom.h>
#include <linux/sched/debug.h>
#include <linux/smpboot.h>
#include <linux/sched/isolation.h>
#include <uapi/linux/sched/types.h>
#include "../time/tick-internal.h"
#ifdef CONFIG_RCU_BOOST
#include "../locking/rtmutex_common.h"
/*
* Control variables for per-CPU and per-rcu_node kthreads. These
* handle all flavors of RCU.
*/
static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
DEFINE_PER_CPU(char, rcu_cpu_has_work);
#else /* #ifdef CONFIG_RCU_BOOST */
/*
* Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST,
* all uses are in dead code. Provide a definition to keep the compiler
* happy, but add WARN_ON_ONCE() to complain if used in the wrong place.
* This probably needs to be excluded from -rt builds.
*/
#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1)
#endif /* #else #ifdef CONFIG_RCU_BOOST */
#ifdef CONFIG_RCU_NOCB_CPU
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
/*
* Check the RCU kernel configuration parameters and print informative
* messages about anything out of the ordinary.
*/
static void __init rcu_bootup_announce_oddness(void)
{
if (IS_ENABLED(CONFIG_RCU_TRACE))
pr_info("\tRCU event tracing is enabled.\n");
if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) ||
(!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32))
pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
RCU_FANOUT);
if (rcu_fanout_exact)
pr_info("\tHierarchical RCU autobalancing is disabled.\n");
if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
if (IS_ENABLED(CONFIG_PROVE_RCU))
pr_info("\tRCU lockdep checking is enabled.\n");
if (RCU_NUM_LVLS >= 4)
pr_info("\tFour(or more)-level hierarchy is enabled.\n");
if (RCU_FANOUT_LEAF != 16)
pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
RCU_FANOUT_LEAF);
if (rcu_fanout_leaf != RCU_FANOUT_LEAF)
pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
if (nr_cpu_ids != NR_CPUS)
pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids);
#ifdef CONFIG_RCU_BOOST
pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY);
#endif
if (blimit != DEFAULT_RCU_BLIMIT)
pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit);
if (qhimark != DEFAULT_RCU_QHIMARK)
pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark);
if (qlowmark != DEFAULT_RCU_QLOMARK)
pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark);
if (jiffies_till_first_fqs != ULONG_MAX)
pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs);
if (jiffies_till_next_fqs != ULONG_MAX)
pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs);
if (rcu_kick_kthreads)
pr_info("\tKick kthreads if too-long grace period.\n");
if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))
pr_info("\tRCU callback double-/use-after-free debug enabled.\n");
if (gp_preinit_delay)
pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay);
if (gp_init_delay)
pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
if (gp_cleanup_delay)
pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay);
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
pr_info("\tRCU debug extended QS entry/exit.\n");
rcupdate_announce_bootup_oddness();
}
#ifdef CONFIG_PREEMPT_RCU
RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
static struct rcu_state *const rcu_state_p = &rcu_preempt_state;
static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data;
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
bool wake);
/*
* Tell them what RCU they are running.
*/
static void __init rcu_bootup_announce(void)
{
pr_info("Preemptible hierarchical RCU implementation.\n");
rcu_bootup_announce_oddness();
}
/* Flags for rcu_preempt_ctxt_queue() decision table. */
#define RCU_GP_TASKS 0x8
#define RCU_EXP_TASKS 0x4
#define RCU_GP_BLKD 0x2
#define RCU_EXP_BLKD 0x1
/*
* Queues a task preempted within an RCU-preempt read-side critical
* section into the appropriate location within the ->blkd_tasks list,
* depending on the states of any ongoing normal and expedited grace
* periods. The ->gp_tasks pointer indicates which element the normal
* grace period is waiting on (NULL if none), and the ->exp_tasks pointer
* indicates which element the expedited grace period is waiting on (again,
* NULL if none). If a grace period is waiting on a given element in the
* ->blkd_tasks list, it also waits on all subsequent elements. Thus,
* adding a task to the tail of the list blocks any grace period that is
* already waiting on one of the elements. In contrast, adding a task
* to the head of the list won't block any grace period that is already
* waiting on one of the elements.
*
* This queuing is imprecise, and can sometimes make an ongoing grace
* period wait for a task that is not strictly speaking blocking it.
* Given the choice, we needlessly block a normal grace period rather than
* blocking an expedited grace period.
*
* Note that an endless sequence of expedited grace periods still cannot
* indefinitely postpone a normal grace period. Eventually, all of the
* fixed number of preempted tasks blocking the normal grace period that are
* not also blocking the expedited grace period will resume and complete
* their RCU read-side critical sections. At that point, the ->gp_tasks
* pointer will equal the ->exp_tasks pointer, at which point the end of
* the corresponding expedited grace period will also be the end of the
* normal grace period.
*/
static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
__releases(rnp->lock) /* But leaves rrupts disabled. */
{
int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) +
(rnp->exp_tasks ? RCU_EXP_TASKS : 0) +
(rnp->qsmask & rdp->grpmask ? RCU_GP_BLKD : 0) +
(rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0);
struct task_struct *t = current;
lockdep_assert_held(&rnp->lock);
WARN_ON_ONCE(rdp->mynode != rnp);
WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
/*
* Decide where to queue the newly blocked task. In theory,
* this could be an if-statement. In practice, when I tried
* that, it was quite messy.
*/
switch (blkd_state) {
case 0:
case RCU_EXP_TASKS:
case RCU_EXP_TASKS + RCU_GP_BLKD:
case RCU_GP_TASKS:
case RCU_GP_TASKS + RCU_EXP_TASKS:
/*
* Blocking neither GP, or first task blocking the normal
* GP but not blocking the already-waiting expedited GP.
* Queue at the head of the list to avoid unnecessarily
* blocking the already-waiting GPs.
*/
list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
break;
case RCU_EXP_BLKD:
case RCU_GP_BLKD:
case RCU_GP_BLKD + RCU_EXP_BLKD:
case RCU_GP_TASKS + RCU_EXP_BLKD:
case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
/*
* First task arriving that blocks either GP, or first task
* arriving that blocks the expedited GP (with the normal
* GP already waiting), or a task arriving that blocks
* both GPs with both GPs already waiting. Queue at the
* tail of the list to avoid any GP waiting on any of the
* already queued tasks that are not blocking it.
*/
list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks);
break;
case RCU_EXP_TASKS + RCU_EXP_BLKD:
case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD:
/*
* Second or subsequent task blocking the expedited GP.
* The task either does not block the normal GP, or is the
* first task blocking the normal GP. Queue just after
* the first task blocking the expedited GP.
*/
list_add(&t->rcu_node_entry, rnp->exp_tasks);
break;
case RCU_GP_TASKS + RCU_GP_BLKD:
case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD:
/*
* Second or subsequent task blocking the normal GP.
* The task does not block the expedited GP. Queue just
* after the first task blocking the normal GP.
*/
list_add(&t->rcu_node_entry, rnp->gp_tasks);
break;
default:
/* Yet another exercise in excessive paranoia. */
WARN_ON_ONCE(1);
break;
}
/*
* We have now queued the task. If it was the first one to
* block either grace period, update the ->gp_tasks and/or
* ->exp_tasks pointers, respectively, to reference the newly
* blocked tasks.
*/
if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD))
rnp->gp_tasks = &t->rcu_node_entry;
if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
rnp->exp_tasks = &t->rcu_node_entry;
WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) !=
!(rnp->qsmask & rdp->grpmask));
WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) !=
!(rnp->expmask & rdp->grpmask));
raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */
/*
* Report the quiescent state for the expedited GP. This expedited
* GP should not be able to end until we report, so there should be
* no need to check for a subsequent expedited GP. (Though we are
* still in a quiescent state in any case.)
*/
if (blkd_state & RCU_EXP_BLKD &&
t->rcu_read_unlock_special.b.exp_need_qs) {
t->rcu_read_unlock_special.b.exp_need_qs = false;
rcu_report_exp_rdp(rdp->rsp, rdp, true);
} else {
WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs);
}
}
/*
* Record a preemptible-RCU quiescent state for the specified CPU. Note
* that this just means that the task currently running on the CPU is
* not in a quiescent state. There might be any number of tasks blocked
* while in an RCU read-side critical section.
*
* As with the other rcu_*_qs() functions, callers to this function
* must disable preemption.
*/
static void rcu_preempt_qs(void)
{
RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_qs() invoked with preemption enabled!!!\n");
if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) {
trace_rcu_grace_period(TPS("rcu_preempt"),
__this_cpu_read(rcu_data_p->gpnum),
TPS("cpuqs"));
__this_cpu_write(rcu_data_p->cpu_no_qs.b.norm, false);
barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
current->rcu_read_unlock_special.b.need_qs = false;
}
}
/*
* We have entered the scheduler, and the current task might soon be
* context-switched away from. If this task is in an RCU read-side
* critical section, we will no longer be able to rely on the CPU to
* record that fact, so we enqueue the task on the blkd_tasks list.
* The task will dequeue itself when it exits the outermost enclosing
* RCU read-side critical section. Therefore, the current grace period
* cannot be permitted to complete until the blkd_tasks list entries
* predating the current grace period drain, in other words, until
* rnp->gp_tasks becomes NULL.
*
* Caller must disable interrupts.
*/
static void rcu_preempt_note_context_switch(bool preempt)
{
struct task_struct *t = current;
struct rcu_data *rdp;
struct rcu_node *rnp;
lockdep_assert_irqs_disabled();
WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
if (t->rcu_read_lock_nesting > 0 &&
!t->rcu_read_unlock_special.b.blocked) {
/* Possibly blocking in an RCU read-side critical section. */
rdp = this_cpu_ptr(rcu_state_p->rda);
rnp = rdp->mynode;
raw_spin_lock_rcu_node(rnp);
t->rcu_read_unlock_special.b.blocked = true;
t->rcu_blocked_node = rnp;
/*
* Verify the CPU's sanity, trace the preemption, and
* then queue the task as required based on the states
* of any ongoing and expedited grace periods.
*/
WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
trace_rcu_preempt_task(rdp->rsp->name,
t->pid,
(rnp->qsmask & rdp->grpmask)
? rnp->gpnum
: rnp->gpnum + 1);
rcu_preempt_ctxt_queue(rnp, rdp);
} else if (t->rcu_read_lock_nesting < 0 &&
t->rcu_read_unlock_special.s) {
/*
* Complete exit from RCU read-side critical section on
* behalf of preempted instance of __rcu_read_unlock().
*/
rcu_read_unlock_special(t);
}
/*
* Either we were not in an RCU read-side critical section to
* begin with, or we have now recorded that critical section
* globally. Either way, we can now note a quiescent state
* for this CPU. Again, if we were in an RCU read-side critical
* section, and if that critical section was blocking the current
* grace period, then the fact that the task has been enqueued
* means that we continue to block the current grace period.
*/
rcu_preempt_qs();
}
/*
* Check for preempted RCU readers blocking the current grace period
* for the specified rcu_node structure. If the caller needs a reliable
* answer, it must hold the rcu_node's ->lock.
*/
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
{
return rnp->gp_tasks != NULL;
}
/*
* Advance a ->blkd_tasks-list pointer to the next entry, instead
* returning NULL if at the end of the list.
*/
static struct list_head *rcu_next_node_entry(struct task_struct *t,
struct rcu_node *rnp)
{
struct list_head *np;
np = t->rcu_node_entry.next;
if (np == &rnp->blkd_tasks)
np = NULL;
return np;
}
/*
* Return true if the specified rcu_node structure has tasks that were
* preempted within an RCU read-side critical section.
*/
static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
{
return !list_empty(&rnp->blkd_tasks);
}
/*
* Handle special cases during rcu_read_unlock(), such as needing to
* notify RCU core processing or task having blocked during the RCU
* read-side critical section.
*/
void rcu_read_unlock_special(struct task_struct *t)
{
bool empty_exp;
bool empty_norm;
bool empty_exp_now;
unsigned long flags;
struct list_head *np;
bool drop_boost_mutex = false;
struct rcu_data *rdp;
struct rcu_node *rnp;
union rcu_special special;
/* NMI handlers cannot block and cannot safely manipulate state. */
if (in_nmi())
return;
local_irq_save(flags);
/*
* If RCU core is waiting for this CPU to exit its critical section,
* report the fact that it has exited. Because irqs are disabled,
* t->rcu_read_unlock_special cannot change.
*/
special = t->rcu_read_unlock_special;
if (special.b.need_qs) {
rcu_preempt_qs();
t->rcu_read_unlock_special.b.need_qs = false;
if (!t->rcu_read_unlock_special.s) {
local_irq_restore(flags);
return;
}
}
/*
* Respond to a request for an expedited grace period, but only if
* we were not preempted, meaning that we were running on the same
* CPU throughout. If we were preempted, the exp_need_qs flag
* would have been cleared at the time of the first preemption,
* and the quiescent state would be reported when we were dequeued.
*/
if (special.b.exp_need_qs) {
WARN_ON_ONCE(special.b.blocked);
t->rcu_read_unlock_special.b.exp_need_qs = false;
rdp = this_cpu_ptr(rcu_state_p->rda);
rcu_report_exp_rdp(rcu_state_p, rdp, true);
if (!t->rcu_read_unlock_special.s) {
local_irq_restore(flags);
return;
}
}
/* Hardware IRQ handlers cannot block, complain if they get here. */
if (in_irq() || in_serving_softirq()) {
lockdep_rcu_suspicious(__FILE__, __LINE__,
"rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
t->rcu_read_unlock_special.s,
t->rcu_read_unlock_special.b.blocked,
t->rcu_read_unlock_special.b.exp_need_qs,
t->rcu_read_unlock_special.b.need_qs);
local_irq_restore(flags);
return;
}
/* Clean up if blocked during RCU read-side critical section. */
if (special.b.blocked) {
t->rcu_read_unlock_special.b.blocked = false;
/*
* Remove this task from the list it blocked on. The task
* now remains queued on the rcu_node corresponding to the
* CPU it first blocked on, so there is no longer any need
* to loop. Retain a WARN_ON_ONCE() out of sheer paranoia.
*/
rnp = t->rcu_blocked_node;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
WARN_ON_ONCE(rnp != t->rcu_blocked_node);
WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
empty_exp = sync_rcu_preempt_exp_done(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
np = rcu_next_node_entry(t, rnp);
list_del_init(&t->rcu_node_entry);
t->rcu_blocked_node = NULL;
trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
rnp->gpnum, t->pid);
if (&t->rcu_node_entry == rnp->gp_tasks)
rnp->gp_tasks = np;
if (&t->rcu_node_entry == rnp->exp_tasks)
rnp->exp_tasks = np;
if (IS_ENABLED(CONFIG_RCU_BOOST)) {
/* Snapshot ->boost_mtx ownership w/rnp->lock held. */
drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
if (&t->rcu_node_entry == rnp->boost_tasks)
rnp->boost_tasks = np;
}
/*
* If this was the last task on the current list, and if
* we aren't waiting on any CPUs, report the quiescent state.
* Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
* so we must take a snapshot of the expedited state.
*/
empty_exp_now = sync_rcu_preempt_exp_done(rnp);
if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
rnp->gpnum,
0, rnp->qsmask,
rnp->level,
rnp->grplo,
rnp->grphi,
!!rnp->gp_tasks);
rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags);
} else {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
/* Unboost if we were boosted. */
if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
rt_mutex_futex_unlock(&rnp->boost_mtx);
/*
* If this was the last task on the expedited lists,
* then we need to report up the rcu_node hierarchy.
*/
if (!empty_exp && empty_exp_now)
rcu_report_exp_rnp(rcu_state_p, rnp, true);
} else {
local_irq_restore(flags);
}
}
/*
* Dump detailed information for all tasks blocking the current RCU
* grace period on the specified rcu_node structure.
*/
static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
{
unsigned long flags;
struct task_struct *t;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (!rcu_preempt_blocked_readers_cgp(rnp)) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
t = list_entry(rnp->gp_tasks->prev,
struct task_struct, rcu_node_entry);
list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
sched_show_task(t);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
/*
* Dump detailed information for all tasks blocking the current RCU
* grace period.
*/
static void rcu_print_detail_task_stall(struct rcu_state *rsp)
{
struct rcu_node *rnp = rcu_get_root(rsp);
rcu_print_detail_task_stall_rnp(rnp);
rcu_for_each_leaf_node(rsp, rnp)
rcu_print_detail_task_stall_rnp(rnp);
}
static void rcu_print_task_stall_begin(struct rcu_node *rnp)
{
pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
rnp->level, rnp->grplo, rnp->grphi);
}
static void rcu_print_task_stall_end(void)
{
pr_cont("\n");
}
/*
* Scan the current list of tasks blocked within RCU read-side critical
* sections, printing out the tid of each.
*/
static int rcu_print_task_stall(struct rcu_node *rnp)
{
struct task_struct *t;
int ndetected = 0;
if (!rcu_preempt_blocked_readers_cgp(rnp))
return 0;
rcu_print_task_stall_begin(rnp);
t = list_entry(rnp->gp_tasks->prev,
struct task_struct, rcu_node_entry);
list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
pr_cont(" P%d", t->pid);
ndetected++;
}
rcu_print_task_stall_end();
return ndetected;
}
/*
* Scan the current list of tasks blocked within RCU read-side critical
* sections, printing out the tid of each that is blocking the current
* expedited grace period.
*/
static int rcu_print_task_exp_stall(struct rcu_node *rnp)
{
struct task_struct *t;
int ndetected = 0;
if (!rnp->exp_tasks)
return 0;
t = list_entry(rnp->exp_tasks->prev,
struct task_struct, rcu_node_entry);
list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
pr_cont(" P%d", t->pid);
ndetected++;
}
return ndetected;
}
/*
* Check that the list of blocked tasks for the newly completed grace
* period is in fact empty. It is a serious bug to complete a grace
* period that still has RCU readers blocked! This function must be
* invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
* must be held by the caller.
*
* Also, if there are blocked tasks on the list, they automatically
* block the newly created grace period, so set up ->gp_tasks accordingly.
*/
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
struct task_struct *t;
RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
if (rcu_preempt_has_tasks(rnp)) {
rnp->gp_tasks = rnp->blkd_tasks.next;
t = container_of(rnp->gp_tasks, struct task_struct,
rcu_node_entry);
trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
rnp->gpnum, t->pid);
}
WARN_ON_ONCE(rnp->qsmask);
}
/*
* Check for a quiescent state from the current CPU. When a task blocks,
* the task is recorded in the corresponding CPU's rcu_node structure,
* which is checked elsewhere.
*
* Caller must disable hard irqs.
*/
static void rcu_preempt_check_callbacks(void)
{
struct task_struct *t = current;
if (t->rcu_read_lock_nesting == 0) {
rcu_preempt_qs();
return;
}
if (t->rcu_read_lock_nesting > 0 &&
__this_cpu_read(rcu_data_p->core_needs_qs) &&
__this_cpu_read(rcu_data_p->cpu_no_qs.b.norm))
t->rcu_read_unlock_special.b.need_qs = true;
}
#ifdef CONFIG_RCU_BOOST
static void rcu_preempt_do_callbacks(void)
{
rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
}
#endif /* #ifdef CONFIG_RCU_BOOST */
/**
* call_rcu() - Queue an RCU callback for invocation after a grace period.
* @head: structure to be used for queueing the RCU updates.
* @func: actual callback function to be invoked after the grace period
*
* The callback function will be invoked some time after a full grace
* period elapses, in other words after all pre-existing RCU read-side
* critical sections have completed. However, the callback function
* might well execute concurrently with RCU read-side critical sections
* that started after call_rcu() was invoked. RCU read-side critical
* sections are delimited by rcu_read_lock() and rcu_read_unlock(),
* and may be nested.
*
* Note that all CPUs must agree that the grace period extended beyond
* all pre-existing RCU read-side critical section. On systems with more
* than one CPU, this means that when "func()" is invoked, each CPU is
* guaranteed to have executed a full memory barrier since the end of its
* last RCU read-side critical section whose beginning preceded the call
* to call_rcu(). It also means that each CPU executing an RCU read-side
* critical section that continues beyond the start of "func()" must have
* executed a memory barrier after the call_rcu() but before the beginning
* of that RCU read-side critical section. Note that these guarantees
* include CPUs that are offline, idle, or executing in user mode, as
* well as CPUs that are executing in the kernel.
*
* Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
* resulting RCU callback function "func()", then both CPU A and CPU B are
* guaranteed to execute a full memory barrier during the time interval
* between the call to call_rcu() and the invocation of "func()" -- even
* if CPU A and CPU B are the same CPU (but again only if the system has
* more than one CPU).
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
__call_rcu(head, func, rcu_state_p, -1, 0);
}
EXPORT_SYMBOL_GPL(call_rcu);
/**
* synchronize_rcu - wait until a grace period has elapsed.
*
* Control will return to the caller some time after a full grace
* period has elapsed, in other words after all currently executing RCU
* read-side critical sections have completed. Note, however, that
* upon return from synchronize_rcu(), the caller might well be executing
* concurrently with new RCU read-side critical sections that began while
* synchronize_rcu() was waiting. RCU read-side critical sections are
* delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
*
* See the description of synchronize_sched() for more detailed
* information on memory-ordering guarantees. However, please note
* that -only- the memory-ordering guarantees apply. For example,
* synchronize_rcu() is -not- guaranteed to wait on things like code
* protected by preempt_disable(), instead, synchronize_rcu() is -only-
* guaranteed to wait on RCU read-side critical sections, that is, sections
* of code protected by rcu_read_lock().
*/
void synchronize_rcu(void)
{
RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
return;
if (rcu_gp_is_expedited())
synchronize_rcu_expedited();
else
wait_rcu_gp(call_rcu);
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
/**
* rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
*
* Note that this primitive does not necessarily wait for an RCU grace period
* to complete. For example, if there are no RCU callbacks queued anywhere
* in the system, then rcu_barrier() is within its rights to return
* immediately, without waiting for anything, much less an RCU grace period.
*/
void rcu_barrier(void)
{
_rcu_barrier(rcu_state_p);
}
EXPORT_SYMBOL_GPL(rcu_barrier);
/*
* Initialize preemptible RCU's state structures.
*/
static void __init __rcu_init_preempt(void)
{
rcu_init_one(rcu_state_p);
}
/*
* Check for a task exiting while in a preemptible-RCU read-side
* critical section, clean up if so. No need to issue warnings,
* as debug_check_no_locks_held() already does this if lockdep
* is enabled.
*/
void exit_rcu(void)
{
struct task_struct *t = current;
if (likely(list_empty(¤t->rcu_node_entry)))
return;
t->rcu_read_lock_nesting = 1;
barrier();
t->rcu_read_unlock_special.b.blocked = true;
__rcu_read_unlock();
}
#else /* #ifdef CONFIG_PREEMPT_RCU */
static struct rcu_state *const rcu_state_p = &rcu_sched_state;
/*
* Tell them what RCU they are running.
*/
static void __init rcu_bootup_announce(void)
{
pr_info("Hierarchical RCU implementation.\n");
rcu_bootup_announce_oddness();
}
/*
* Because preemptible RCU does not exist, we never have to check for
* CPUs being in quiescent states.
*/
static void rcu_preempt_note_context_switch(bool preempt)
{
}
/*
* Because preemptible RCU does not exist, there are never any preempted
* RCU readers.
*/
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
{
return 0;
}
/*
* Because there is no preemptible RCU, there can be no readers blocked.
*/
static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
{
return false;
}
/*
* Because preemptible RCU does not exist, we never have to check for
* tasks blocked within RCU read-side critical sections.
*/
static void rcu_print_detail_task_stall(struct rcu_state *rsp)
{
}
/*
* Because preemptible RCU does not exist, we never have to check for
* tasks blocked within RCU read-side critical sections.
*/
static int rcu_print_task_stall(struct rcu_node *rnp)
{
return 0;
}
/*
* Because preemptible RCU does not exist, we never have to check for
* tasks blocked within RCU read-side critical sections that are
* blocking the current expedited grace period.
*/
static int rcu_print_task_exp_stall(struct rcu_node *rnp)
{
return 0;
}
/*
* Because there is no preemptible RCU, there can be no readers blocked,
* so there is no need to check for blocked tasks. So check only for
* bogus qsmask values.
*/
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
WARN_ON_ONCE(rnp->qsmask);
}
/*
* Because preemptible RCU does not exist, it never has any callbacks
* to check.
*/
static void rcu_preempt_check_callbacks(void)
{
}
/*
* Because preemptible RCU does not exist, rcu_barrier() is just
* another name for rcu_barrier_sched().
*/
void rcu_barrier(void)
{
rcu_barrier_sched();
}
EXPORT_SYMBOL_GPL(rcu_barrier);
/*
* Because preemptible RCU does not exist, it need not be initialized.
*/
static void __init __rcu_init_preempt(void)
{
}
/*
* Because preemptible RCU does not exist, tasks cannot possibly exit
* while in preemptible RCU read-side critical sections.
*/
void exit_rcu(void)
{
}
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_RCU_BOOST
static void rcu_wake_cond(struct task_struct *t, int status)
{
/*
* If the thread is yielding, only wake it when this
* is invoked from idle
*/
if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
wake_up_process(t);
}
/*
* Carry out RCU priority boosting on the task indicated by ->exp_tasks
* or ->boost_tasks, advancing the pointer to the next task in the
* ->blkd_tasks list.
*
* Note that irqs must be enabled: boosting the task can block.
* Returns 1 if there are more tasks needing to be boosted.
*/
static int rcu_boost(struct rcu_node *rnp)
{
unsigned long flags;
struct task_struct *t;
struct list_head *tb;
if (READ_ONCE(rnp->exp_tasks) == NULL &&
READ_ONCE(rnp->boost_tasks) == NULL)
return 0; /* Nothing left to boost. */
raw_spin_lock_irqsave_rcu_node(rnp, flags);
/*
* Recheck under the lock: all tasks in need of boosting
* might exit their RCU read-side critical sections on their own.
*/
if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return 0;
}
/*
* Preferentially boost tasks blocking expedited grace periods.
* This cannot starve the normal grace periods because a second
* expedited grace period must boost all blocked tasks, including
* those blocking the pre-existing normal grace period.
*/
if (rnp->exp_tasks != NULL) {
tb = rnp->exp_tasks;
rnp->n_exp_boosts++;
} else {
tb = rnp->boost_tasks;
rnp->n_normal_boosts++;
}
rnp->n_tasks_boosted++;
/*
* We boost task t by manufacturing an rt_mutex that appears to
* be held by task t. We leave a pointer to that rt_mutex where
* task t can find it, and task t will release the mutex when it
* exits its outermost RCU read-side critical section. Then
* simply acquiring this artificial rt_mutex will boost task
* t's priority. (Thanks to tglx for suggesting this approach!)
*
* Note that task t must acquire rnp->lock to remove itself from
* the ->blkd_tasks list, which it will do from exit() if from
* nowhere else. We therefore are guaranteed that task t will
* stay around at least until we drop rnp->lock. Note that
* rnp->lock also resolves races between our priority boosting
* and task t's exiting its outermost RCU read-side critical
* section.
*/
t = container_of(tb, struct task_struct, rcu_node_entry);
rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/* Lock only for side effect: boosts task t's priority. */
rt_mutex_lock(&rnp->boost_mtx);
rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
return READ_ONCE(rnp->exp_tasks) != NULL ||
READ_ONCE(rnp->boost_tasks) != NULL;
}
/*
* Priority-boosting kthread, one per leaf rcu_node.
*/
static int rcu_boost_kthread(void *arg)