Skip to content

Commit

Permalink
sched/topology: Skip updating masks for non-online nodes
Browse files Browse the repository at this point in the history
The scheduler currently expects NUMA node distances to be stable from
init onwards, and as a consequence builds the related data structures
once-and-for-all at init (see sched_init_numa()).

Unfortunately, on some architectures node distance is unreliable for
offline nodes and may very well change upon onlining.

Skip over offline nodes during sched_init_numa(). Track nodes that have
been onlined at least once, and trigger a build of a node's NUMA masks
when it is first onlined post-init.

Reported-by: Geetika Moolchandani <[email protected]>
Signed-off-by: Srikar Dronamraju <[email protected]>
Signed-off-by: Valentin Schneider <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
  • Loading branch information
Valentin Schneider authored and Peter Zijlstra committed Aug 20, 2021
1 parent 746f5ea commit 0083242
Showing 1 changed file with 65 additions and 0 deletions.
65 changes: 65 additions & 0 deletions kernel/sched/topology.c
Original file line number Diff line number Diff line change
Expand Up @@ -1482,6 +1482,8 @@ int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;

static unsigned long __read_mostly *sched_numa_onlined_nodes;
#endif

/*
Expand Down Expand Up @@ -1833,6 +1835,16 @@ void sched_init_numa(void)
sched_domains_numa_masks[i][j] = mask;

for_each_node(k) {
/*
* Distance information can be unreliable for
* offline nodes, defer building the node
* masks to its bringup.
* This relies on all unique distance values
* still being visible at init time.
*/
if (!node_online(j))
continue;

if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
sched_numa_warn("Node-distance not symmetric");

Expand Down Expand Up @@ -1886,15 +1898,68 @@ void sched_init_numa(void)
sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];

init_numa_topology_type();

sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL);
if (!sched_numa_onlined_nodes)
return;

bitmap_zero(sched_numa_onlined_nodes, nr_node_ids);
for_each_online_node(i)
bitmap_set(sched_numa_onlined_nodes, i, 1);
}

static void __sched_domains_numa_masks_set(unsigned int node)
{
int i, j;

/*
* NUMA masks are not built for offline nodes in sched_init_numa().
* Thus, when a CPU of a never-onlined-before node gets plugged in,
* adding that new CPU to the right NUMA masks is not sufficient: the
* masks of that CPU's node must also be updated.
*/
if (test_bit(node, sched_numa_onlined_nodes))
return;

bitmap_set(sched_numa_onlined_nodes, node, 1);

for (i = 0; i < sched_domains_numa_levels; i++) {
for (j = 0; j < nr_node_ids; j++) {
if (!node_online(j) || node == j)
continue;

if (node_distance(j, node) > sched_domains_numa_distance[i])
continue;

/* Add remote nodes in our masks */
cpumask_or(sched_domains_numa_masks[i][node],
sched_domains_numa_masks[i][node],
sched_domains_numa_masks[0][j]);
}
}

/*
* A new node has been brought up, potentially changing the topology
* classification.
*
* Note that this is racy vs any use of sched_numa_topology_type :/
*/
init_numa_topology_type();
}

void sched_domains_numa_masks_set(unsigned int cpu)
{
int node = cpu_to_node(cpu);
int i, j;

__sched_domains_numa_masks_set(node);

for (i = 0; i < sched_domains_numa_levels; i++) {
for (j = 0; j < nr_node_ids; j++) {
if (!node_online(j))
continue;

/* Set ourselves in the remote node's masks */
if (node_distance(j, node) <= sched_domains_numa_distance[i])
cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
}
Expand Down

0 comments on commit 0083242

Please sign in to comment.