// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2016 Thomas Gleixner.
 * Copyright (C) 2016-2017 Christoph Hellwig.
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/sort.h>
#include <linux/group_cpus.h>

#ifdef CONFIG_SMP

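/*
 * grp_spread_init_one - move up to @cpus_per_grp CPUs from @nmsk into @irqmsk
 *
 * CPUs are taken from @nmsk one at a time; whenever a CPU is picked, its SMT
 * siblings that are still in @nmsk are preferred next, so that hyperthread
 * siblings land in the same group.
 */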
static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
				unsigned int cpus_per_grp)
{
	const struct cpumask *siblmsk;
	int cpu, sibl;

	for ( ; cpus_per_grp > 0; ) {
		cpu = cpumask_first(nmsk);

		/* Should not happen, but I'm too lazy to think about it */
		if (cpu >= nr_cpu_ids)
			return;

		cpumask_clear_cpu(cpu, nmsk);
		cpumask_set_cpu(cpu, irqmsk);
		cpus_per_grp--;

		/* If the cpu has siblings, use them first */
		siblmsk = topology_sibling_cpumask(cpu);
		for (sibl = -1; cpus_per_grp > 0; ) {
			sibl = cpumask_next(sibl, siblmsk);
			if (sibl >= nr_cpu_ids)
				break;
			if (!cpumask_test_and_clear_cpu(sibl, nmsk))
				continue;
			cpumask_set_cpu(sibl, irqmsk);
			cpus_per_grp--;
		}
	}
}

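/*
 * Allocate an array of nr_node_ids cpumasks, one per possible NUMA node.
 * Returns NULL on allocation failure; release with free_node_to_cpumask().
 */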
static cpumask_var_t *alloc_node_to_cpumask(void)
{
	cpumask_var_t *masks;
	int node;

	masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
	if (!masks)
		return NULL;

	for (node = 0; node < nr_node_ids; node++) {
		if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
			goto out_unwind;
	}

	return masks;

out_unwind:
	while (--node >= 0)
		free_cpumask_var(masks[node]);
	kfree(masks);
	return NULL;
}

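/* Free an array allocated by alloc_node_to_cpumask() */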
static void free_node_to_cpumask(cpumask_var_t *masks)
{
	int node;

	for (node = 0; node < nr_node_ids; node++)
		free_cpumask_var(masks[node]);
	kfree(masks);
}

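/* Fill masks[] so that masks[node] holds the possible CPUs of that node */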
static void build_node_to_cpumask(cpumask_var_t *masks)
{
	int cpu;

	for_each_possible_cpu(cpu)
		cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
}

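/*
 * Record in *@nodemsk which NUMA nodes have CPUs in @mask and return how
 * many such nodes there are.
 */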
static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
				const struct cpumask *mask, nodemask_t *nodemsk)
{
	int n, nodes = 0;

	/* Calculate the number of nodes in the supplied affinity mask */
	for_each_node(n) {
		if (cpumask_intersects(mask, node_to_cpumask[n])) {
			node_set(n, *nodemsk);
			nodes++;
		}
	}
	return nodes;
}

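/*
 * Per-node bookkeeping for alloc_nodes_groups(): @ncpus is the node's active
 * CPU count used while sorting, @ngroups is the number of groups finally
 * assigned to the node.  The two share storage because they are never needed
 * at the same time.
 */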
struct node_groups {
	unsigned id;

	union {
		unsigned ngroups;
		unsigned ncpus;
	};
};

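/* sort() helper: order nodes by ascending active CPU count */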
static int ncpus_cmp_func(const void *l, const void *r)
{
	const struct node_groups *ln = l;
	const struct node_groups *rn = r;

	return ln->ncpus - rn->ncpus;
}

/*
 * Allocate a number of groups for each node, so that for each node:
 *
 * 1) the allocated number is >= 1
 *
 * 2) the allocated number is <= the number of active CPUs of this node
 *
 * The total number of allocated groups may be less than @numgrps when
 * the total number of active CPUs is less than @numgrps.
 *
 * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
 * for each node.
 */
static void alloc_nodes_groups(unsigned int numgrps,
			       cpumask_var_t *node_to_cpumask,
			       const struct cpumask *cpu_mask,
			       const nodemask_t nodemsk,
			       struct cpumask *nmsk,
			       struct node_groups *node_groups)
{
	unsigned n, remaining_ncpus = 0;

	for (n = 0; n < nr_node_ids; n++) {
		node_groups[n].id = n;
		node_groups[n].ncpus = UINT_MAX;
	}

	for_each_node_mask(n, nodemsk) {
		unsigned ncpus;

		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
		ncpus = cpumask_weight(nmsk);

		if (!ncpus)
			continue;
		remaining_ncpus += ncpus;
		node_groups[n].ncpus = ncpus;
	}

	numgrps = min_t(unsigned, remaining_ncpus, numgrps);

	sort(node_groups, nr_node_ids, sizeof(node_groups[0]),
	     ncpus_cmp_func, NULL);

	/*
	 * Allocate groups for each node according to the ratio of this
	 * node's nr_cpus to the remaining un-assigned ncpus. 'numgrps' is
	 * bigger than the number of active numa nodes. Always start the
	 * allocation from the node with the smallest nr_cpus.
	 *
	 * This guarantees that each active node gets allocated at
	 * least one group, and the reasoning is simple: over-allocation
	 * only happens when a node is assigned a single group, so the
	 * other nodes will still be allocated >= 1 group each, since
	 * 'numgrps' is bigger than the number of numa nodes.
	 *
	 * One key invariant is that the number of groups allocated to
	 * each node is <= the CPU count of this node:
	 *
	 * 1) suppose there are two nodes: A and B
	 *	ncpu(X) is the CPU count of node X
	 *	grps(X) is the group count allocated to node X via this
	 *	algorithm
	 *
	 *	ncpu(A) <= ncpu(B)
	 *	ncpu(A) + ncpu(B) = N
	 *	grps(A) + grps(B) = G
	 *
	 *	grps(A) = max(1, round_down(G * ncpu(A) / N))
	 *	grps(B) = G - grps(A)
	 *
	 *	both N and G are integers, and 2 <= G <= N; suppose
	 *	G = N - delta, and 0 <= delta <= N - 2
	 *
	 * 2) obviously grps(A) <= ncpu(A) because:
	 *
	 *	if grps(A) is 1, then grps(A) <= ncpu(A) given
	 *	ncpu(A) >= 1
	 *
	 *	otherwise,
	 *		grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N
	 *
	 * 3) prove that grps(B) <= ncpu(B):
	 *
	 *	if round_down(G * ncpu(A) / N) == 0, grps(B) won't be
	 *	over-allocated, so grps(B) <= ncpu(B),
	 *
	 *	otherwise:
	 *
	 *	grps(A) =
	 *		round_down(G * ncpu(A) / N) =
	 *		round_down((N - delta) * ncpu(A) / N) =
	 *		round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
	 *		round_down((N * ncpu(A) - delta * N) / N) =
	 *		ncpu(A) - delta
	 *
	 *	then:
	 *
	 *	grps(A) - G >= ncpu(A) - delta - G
	 *	=>
	 *	G - grps(A) <= G + delta - ncpu(A)
	 *	=>
	 *	grps(B) <= N - ncpu(A)
	 *	=>
	 *	grps(B) <= ncpu(B)
	 *
	 * For three or more nodes the same argument applies by treating
	 * the smallest node as A and the remaining nodes combined as B,
	 * which is exactly how this algorithm proceeds: 'remaining_ncpus'
	 * and 'numgrps' are re-calculated after each node, so finally for
	 * each node X: grps(X) <= ncpu(X).
	 *
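	 * As a concrete illustration (numbers chosen for this example only):
	 * with ncpu(A) = 3 and ncpu(B) = 5 (so N = 8) and G = 4, we get
	 * grps(A) = max(1, round_down(4 * 3 / 8)) = 1 and grps(B) = 4 - 1 = 3,
	 * which indeed satisfies grps(A) <= ncpu(A) and grps(B) <= ncpu(B).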
	 */
	for (n = 0; n < nr_node_ids; n++) {
		unsigned ngroups, ncpus;

		if (node_groups[n].ncpus == UINT_MAX)
			continue;

		WARN_ON_ONCE(numgrps == 0);

		ncpus = node_groups[n].ncpus;
		ngroups = max_t(unsigned, 1,
				numgrps * ncpus / remaining_ncpus);
		WARN_ON_ONCE(ngroups > ncpus);

		node_groups[n].ngroups = ngroups;

		remaining_ncpus -= ncpus;
		numgrps -= ngroups;
	}
}

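/*
 * Spread the CPUs in @cpu_mask over @masks, filling groups from index
 * @startgrp and wrapping around at @numgrps.  Returns the number of groups
 * that received CPUs, or -ENOMEM on allocation failure.
 */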
static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
			       cpumask_var_t *node_to_cpumask,
			       const struct cpumask *cpu_mask,
			       struct cpumask *nmsk, struct cpumask *masks)
{
	unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0;
	unsigned int last_grp = numgrps;
	unsigned int curgrp = startgrp;
	nodemask_t nodemsk = NODE_MASK_NONE;
	struct node_groups *node_groups;

	if (cpumask_empty(cpu_mask))
		return 0;

	nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);

	/*
	 * If the number of nodes in the mask is greater than or equal to the
	 * number of groups, just spread the groups across the nodes.
	 */
	if (numgrps <= nodes) {
		for_each_node_mask(n, nodemsk) {
			/* Ensure that only CPUs which are in both masks are set */
			cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
			cpumask_or(&masks[curgrp], &masks[curgrp], nmsk);
			if (++curgrp == last_grp)
				curgrp = 0;
		}
		return numgrps;
	}

	node_groups = kcalloc(nr_node_ids,
			      sizeof(struct node_groups),
			      GFP_KERNEL);
	if (!node_groups)
		return -ENOMEM;

	/* allocate group number for each node */
	alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask,
			   nodemsk, nmsk, node_groups);
	for (i = 0; i < nr_node_ids; i++) {
		unsigned int ncpus, v;
		struct node_groups *nv = &node_groups[i];

		if (nv->ngroups == UINT_MAX)
			continue;

		/* Get the cpus on this node which are in the mask */
		cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
		ncpus = cpumask_weight(nmsk);
		if (!ncpus)
			continue;

		WARN_ON_ONCE(nv->ngroups > ncpus);

		/* Account for rounding errors */
		extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);

		/* Spread allocated groups on CPUs of the current node */
		for (v = 0; v < nv->ngroups; v++, curgrp++) {
			cpus_per_grp = ncpus / nv->ngroups;

			/* Account for extra groups to compensate for rounding errors */
			if (extra_grps) {
				cpus_per_grp++;
				--extra_grps;
			}

			/*
			 * wrapping has to be considered given 'startgrp'
			 * may start anywhere
			 */
			if (curgrp >= last_grp)
				curgrp = 0;
			grp_spread_init_one(&masks[curgrp], nmsk,
					    cpus_per_grp);
		}
		done += nv->ngroups;
	}
	kfree(node_groups);
	return done;
}

/**
 * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
 * @numgrps: number of groups
 *
 * Return: cpumask array if successful, NULL otherwise.  Each element of the
 * array holds the CPUs assigned to that group.
 *
 * Try to put CPUs that are close in terms of CPU and NUMA locality into the
 * same group, and run a two-stage spread:
 * 1) spread present CPUs over the groups evenly first
 * 2) spread the remaining possible CPUs over the groups evenly
 *
 * The resulting grouping guarantees that every CPU is covered and that no
 * CPU is assigned to more than one group.
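 *
 * Example (illustrative sketch only): a caller that wants one queue per
 * group might do something like:
 *
 *	masks = group_cpus_evenly(nr_queues);
 *	if (!masks)
 *		return -ENOMEM;
 *	for (i = 0; i < nr_queues; i++)
 *		setup_queue(i, &masks[i]);
 *	kfree(masks);
 *
 * where setup_queue() is a hypothetical per-queue helper; the returned
 * array itself is released with a plain kfree() once consumed.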
 */
struct cpumask *group_cpus_evenly(unsigned int numgrps)
{
	unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
	cpumask_var_t *node_to_cpumask;
	cpumask_var_t nmsk, npresmsk;
	int ret = -ENOMEM;
	struct cpumask *masks = NULL;

	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
		return NULL;

	if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
		goto fail_nmsk;

	node_to_cpumask = alloc_node_to_cpumask();
	if (!node_to_cpumask)
		goto fail_npresmsk;

	masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
	if (!masks)
		goto fail_node_to_cpumask;

	build_node_to_cpumask(node_to_cpumask);

	/*
	 * Make a local cache of 'cpu_present_mask' so that both stages of the
	 * spread observe a consistent snapshot without holding the CPU hotplug
	 * lock, which reduces the risk of deadlocking against the CPU hotplug
	 * code.
	 *
	 * CPU hotplug may still happen while 'cpu_present_mask' is read; that
	 * is acceptable because it only affects whether the hotplugged CPU is
	 * handled in the first or the second stage, and either way is correct
	 * from the API user's viewpoint since the two-stage spread is merely
	 * an optimization.
	 */
	cpumask_copy(npresmsk, data_race(cpu_present_mask));

	/* grouping present CPUs first */
	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
				  npresmsk, nmsk, masks);
	if (ret < 0)
		goto fail_build_affinity;
	nr_present = ret;

	/*
	 * Spread the non-present CPUs starting from the next group to be
	 * handled. If grouping the present CPUs already exhausted the group
	 * space, assign the non-present CPUs to the groups that have already
	 * been handed out.
	 */
	if (nr_present >= numgrps)
		curgrp = 0;
	else
		curgrp = nr_present;
	cpumask_andnot(npresmsk, cpu_possible_mask, npresmsk);
	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
				  npresmsk, nmsk, masks);
	if (ret >= 0)
		nr_others = ret;

 fail_build_affinity:
	if (ret >= 0)
		WARN_ON(nr_present + nr_others < numgrps);

 fail_node_to_cpumask:
	free_node_to_cpumask(node_to_cpumask);

 fail_npresmsk:
	free_cpumask_var(npresmsk);

 fail_nmsk:
	free_cpumask_var(nmsk);
	if (ret < 0) {
		kfree(masks);
		return NULL;
	}
	return masks;
}
#else /* CONFIG_SMP */
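/* Without CONFIG_SMP there is only one possible CPU, so one group suffices. */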
struct cpumask *group_cpus_evenly(unsigned int numgrps)
{
	struct cpumask *masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);

	if (!masks)
		return NULL;

	/* assign all CPUs (i.e. just CPU 0) to the first group only */
	cpumask_copy(&masks[0], cpu_possible_mask);
	return masks;
}
#endif /* CONFIG_SMP */
EXPORT_SYMBOL_GPL(group_cpus_evenly);