// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/lockdep.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/memory.h>
#include <linux/memory-tiers.h>
#include <linux/notifier.h>
#include <linux/sched/sysctl.h>

#include "internal.h"

struct memory_tier {
	/* hierarchy of memory tiers */
	struct list_head list;
	/* list of all memory types part of this tier */
	struct list_head memory_types;
	/*
	 * start value of abstract distance. memory tier maps
	 * an abstract distance range,
	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
	 */
	int adistance_start;
	struct device dev;
	/* All the nodes that are part of all the lower memory tiers. */
	nodemask_t lower_tier_mask;
};

struct demotion_nodes {
	nodemask_t preferred;
};

struct node_memory_type_map {
	struct memory_dev_type *memtype;
	int map_count;
};

static DEFINE_MUTEX(memory_tier_lock);
static LIST_HEAD(memory_tiers);
/*
 * The list is used to store all memory types that are not created
 * by a device driver.
 */
static LIST_HEAD(default_memory_types);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
struct memory_dev_type *default_dram_type;
nodemask_t default_dram_nodes __initdata = NODE_MASK_NONE;

static const struct bus_type memory_tier_subsys = {
	.name = "memory_tiering",
	.dev_name = "memory_tier",
};

#ifdef CONFIG_NUMA_BALANCING
/**
 * folio_use_access_time - check if a folio reuses cpupid for page access time
 * @folio: folio to check
 *
 * folio's _last_cpupid field is repurposed by memory tiering. In memory
 * tiering mode, cpupid of slow memory folio (not toptier memory) is used to
 * record page access time.
 *
 * Return: true if the folio's _last_cpupid field is used to record page
 * access time.
 */
bool folio_use_access_time(struct folio *folio)
{
	return (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
	       !node_is_toptier(folio_nid(folio));
}
#endif

#ifdef CONFIG_MIGRATION
static int top_tier_adistance;
/*
 * node_demotion[] examples:
 *
 * Example 1:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
 *
 * node distances:
 * node   0    1    2    3
 *    0  10   20   30   40
 *    1  20   10   40   30
 *    2  30   40   10   40
 *    3  40   30   40   10
 *
 * memory_tiers0 = 0-1
 * memory_tiers1 = 2-3
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 3
 * node_demotion[2].preferred = <empty>
 * node_demotion[3].preferred = <empty>
 *
 * Example 2:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 is a memory-only DRAM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   30
 *    2  30   30   10
 *
 * memory_tiers0 = 0-2
 *
 * node_demotion[0].preferred = <empty>
 * node_demotion[1].preferred = <empty>
 * node_demotion[2].preferred = <empty>
 *
 * Example 3:
 *
 * Node 0 is a CPU + DRAM node, node 1 is an HBM node, node 2 is a PMEM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   40
 *    2  30   40   10
 *
 * memory_tiers0 = 1
 * memory_tiers1 = 0
 * memory_tiers2 = 2
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 0
 * node_demotion[2].preferred = <empty>
 *
 */
static struct demotion_nodes *node_demotion __read_mostly;
#endif /* CONFIG_MIGRATION */

static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);

/* The lock is used to protect `default_dram_perf*` info and nid. */
static DEFINE_MUTEX(default_dram_perf_lock);
static bool default_dram_perf_error;
static struct access_coordinate default_dram_perf;
static int default_dram_perf_ref_nid = NUMA_NO_NODE;
static const char *default_dram_perf_ref_source;

static inline struct memory_tier *to_memory_tier(struct device *device)
{
	return container_of(device, struct memory_tier, dev);
}

static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
{
	nodemask_t nodes = NODE_MASK_NONE;
	struct memory_dev_type *memtype;

	list_for_each_entry(memtype, &memtier->memory_types, tier_sibling)
		nodes_or(nodes, nodes, memtype->nodes);

	return nodes;
}

static void memory_tier_device_release(struct device *dev)
{
	struct memory_tier *tier = to_memory_tier(dev);

	/*
	 * synchronize_rcu in clear_node_memory_tier makes sure
	 * we don't have rcu access to this memory tier.
	 */
	kfree(tier);
}

static ssize_t nodelist_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	int ret;
	nodemask_t nmask;

	mutex_lock(&memory_tier_lock);
	nmask = get_memtier_nodemask(to_memory_tier(dev));
	ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
	mutex_unlock(&memory_tier_lock);
	return ret;
}
static DEVICE_ATTR_RO(nodelist);

static struct attribute *memtier_dev_attrs[] = {
	&dev_attr_nodelist.attr,
	NULL
};

static const struct attribute_group memtier_dev_group = {
	.attrs = memtier_dev_attrs,
};

static const struct attribute_group *memtier_dev_groups[] = {
	&memtier_dev_group,
	NULL
};
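
/*
 * Illustrative note: with the attribute group above, each tier is exposed
 * through the "memory_tiering" virtual subsystem registered in
 * memory_tier_init(), so a tier with device id 4 typically appears as
 * /sys/devices/virtual/memory_tiering/memory_tier4/nodelist, listing the
 * NUMA nodes whose memory types belong to that tier.
 */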

static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
{
	int ret;
	bool found_slot = false;
	struct memory_tier *memtier, *new_memtier;
	int adistance = memtype->adistance;
	unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;

	lockdep_assert_held_once(&memory_tier_lock);

	adistance = round_down(adistance, memtier_adistance_chunk_size);
	/*
	 * If the memtype is already part of a memory tier,
	 * just return that.
	 */
	if (!list_empty(&memtype->tier_sibling)) {
		list_for_each_entry(memtier, &memory_tiers, list) {
			if (adistance == memtier->adistance_start)
				return memtier;
		}
		WARN_ON(1);
		return ERR_PTR(-EINVAL);
	}

	list_for_each_entry(memtier, &memory_tiers, list) {
		if (adistance == memtier->adistance_start) {
			goto link_memtype;
		} else if (adistance < memtier->adistance_start) {
			found_slot = true;
			break;
		}
	}

	new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
	if (!new_memtier)
		return ERR_PTR(-ENOMEM);

	new_memtier->adistance_start = adistance;
	INIT_LIST_HEAD(&new_memtier->list);
	INIT_LIST_HEAD(&new_memtier->memory_types);
	if (found_slot)
		list_add_tail(&new_memtier->list, &memtier->list);
	else
		list_add_tail(&new_memtier->list, &memory_tiers);

	new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
	new_memtier->dev.bus = &memory_tier_subsys;
	new_memtier->dev.release = memory_tier_device_release;
	new_memtier->dev.groups = memtier_dev_groups;

	ret = device_register(&new_memtier->dev);
	if (ret) {
		list_del(&new_memtier->list);
		put_device(&new_memtier->dev);
		return ERR_PTR(ret);
	}
	memtier = new_memtier;

link_memtype:
	list_add(&memtype->tier_sibling, &memtier->memory_types);
	return memtier;
}
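
/*
 * Worked example for the rounding above (assuming MEMTIER_CHUNK_BITS == 10,
 * i.e. MEMTIER_CHUNK_SIZE == 1024, as defined in <linux/memory-tiers.h> at
 * the time of writing): a memory type with adistance MEMTIER_ADISTANCE_DRAM
 * (4 * 1024 + 512 = 4608) is rounded down to 4096, so it joins or creates
 * the tier with adistance_start == 4096 and device id 4096 >> 10 == 4.
 */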

static struct memory_tier *__node_get_memory_tier(int node)
{
	pg_data_t *pgdat;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return NULL;
	/*
	 * Since we hold memory_tier_lock, we can avoid
	 * RCU read locks when accessing the details. No
	 * parallel updates are possible here.
	 */
	return rcu_dereference_check(pgdat->memtier,
				     lockdep_is_held(&memory_tier_lock));
}

#ifdef CONFIG_MIGRATION
bool node_is_toptier(int node)
{
	bool toptier;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (!memtier) {
		toptier = true;
		goto out;
	}
	if (memtier->adistance_start <= top_tier_adistance)
		toptier = true;
	else
		toptier = false;
out:
	rcu_read_unlock();
	return toptier;
}

void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
{
	struct memory_tier *memtier;

	/*
	 * pg_data_t.memtier updates include a synchronize_rcu(),
	 * which ensures that we either find NULL or a valid memtier
	 * in NODE_DATA. Protect the access via rcu_read_lock().
	 */
	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (memtier)
		*targets = memtier->lower_tier_mask;
	else
		*targets = NODE_MASK_NONE;
	rcu_read_unlock();
}

/**
 * next_demotion_node() - Get the next node in the demotion path
 * @node: The starting node to lookup the next node
 *
 * Return: node id for next memory node in the demotion path hierarchy
 * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
 * @node online or guarantee that it *continues* to be the next demotion
 * target.
 */
int next_demotion_node(int node)
{
	struct demotion_nodes *nd;
	int target;

	if (!node_demotion)
		return NUMA_NO_NODE;

	nd = &node_demotion[node];

	/*
	 * node_demotion[] is updated without excluding this
	 * function from running.
	 *
	 * Make sure to use RCU over entire code blocks if
	 * node_demotion[] reads need to be consistent.
	 */
	rcu_read_lock();
	/*
	 * If there are multiple target nodes, just select one
	 * target node randomly.
	 *
	 * In addition, we could use round-robin to select the
	 * target node, but that would require another variable
	 * in node_demotion[] to record the last selected target
	 * node, which may cause cache ping-pong as that node
	 * changes. Introducing per-cpu data to avoid the caching
	 * issue seems more complicated. So selecting the target
	 * node randomly seems better for now.
	 */
	target = node_random(&nd->preferred);
	rcu_read_unlock();

	return target;
}
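
/*
 * Usage sketch (illustrative): reclaim-driven demotion is expected to call
 * next_demotion_node(pgdat->node_id) to pick the allocation target for the
 * folios it demotes, and node_get_allowed_targets() for the wider fallback
 * mask; NUMA_NO_NODE means the node has nowhere left to demote to.
 */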

static void disable_all_demotion_targets(void)
{
	struct memory_tier *memtier;
	int node;

	for_each_node_state(node, N_MEMORY) {
		node_demotion[node].preferred = NODE_MASK_NONE;
		/*
		 * We are holding memory_tier_lock, it is safe
		 * to access pgdat->memtier.
		 */
		memtier = __node_get_memory_tier(node);
		if (memtier)
			memtier->lower_tier_mask = NODE_MASK_NONE;
	}
	/*
	 * Ensure that the "disable" is visible across the system.
	 * Readers will see either a combination of before+disable
	 * state or disable+after. They will never see before and
	 * after state together.
	 */
	synchronize_rcu();
}

static void dump_demotion_targets(void)
{
	int node;

	for_each_node_state(node, N_MEMORY) {
		struct memory_tier *memtier = __node_get_memory_tier(node);
		nodemask_t preferred = node_demotion[node].preferred;

		if (!memtier)
			continue;

		if (nodes_empty(preferred))
			pr_info("Demotion targets for Node %d: null\n", node);
		else
			pr_info("Demotion targets for Node %d: preferred: %*pbl, fallback: %*pbl\n",
				node, nodemask_pr_args(&preferred),
				nodemask_pr_args(&memtier->lower_tier_mask));
	}
}

/*
 * Find an automatic demotion target for all memory
 * nodes. Failing here is OK. It might just indicate
 * being at the end of a chain.
 */
static void establish_demotion_targets(void)
{
	struct memory_tier *memtier;
	struct demotion_nodes *nd;
	int target = NUMA_NO_NODE, node;
	int distance, best_distance;
	nodemask_t tier_nodes, lower_tier;

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_demotion)
		return;

	disable_all_demotion_targets();

	for_each_node_state(node, N_MEMORY) {
		best_distance = -1;
		nd = &node_demotion[node];

		memtier = __node_get_memory_tier(node);
		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
			continue;
		/*
		 * Get the lower memtier to find the demotion node list.
		 */
		memtier = list_next_entry(memtier, list);
		tier_nodes = get_memtier_nodemask(memtier);
		/*
		 * find_next_best_node() uses the 'used' nodemask as a skip
		 * list. Add all memory nodes except the selected memory tier
		 * nodelist to the skip list so that we find the best node
		 * from the memtier nodelist.
		 */
		nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);

		/*
		 * Find all the nodes in the memory tier node list of the same
		 * best distance and add them to the preferred mask. We
		 * randomly select between nodes in the preferred mask when
		 * allocating pages during demotion.
		 */
		do {
			target = find_next_best_node(node, &tier_nodes);
			if (target == NUMA_NO_NODE)
				break;

			distance = node_distance(node, target);
			if (distance == best_distance || best_distance == -1) {
				best_distance = distance;
				node_set(target, nd->preferred);
			} else {
				break;
			}
		} while (1);
	}
	/*
	 * Promotion is allowed from a memory tier to a higher
	 * memory tier only if the lower memory tier doesn't include
	 * compute. We want to skip promotion from a memory tier
	 * if any node that is part of that memory tier has CPUs.
	 * Once we detect such a memory tier, we consider that tier
	 * as the top tier from which promotion is not allowed.
	 */
	list_for_each_entry_reverse(memtier, &memory_tiers, list) {
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_and(tier_nodes, node_states[N_CPU], tier_nodes);
		if (!nodes_empty(tier_nodes)) {
			/*
			 * abstract distance below the max value of this
			 * memtier is considered toptier.
			 */
			top_tier_adistance = memtier->adistance_start +
						MEMTIER_CHUNK_SIZE - 1;
			break;
		}
	}
	/*
	 * Now build the lower_tier mask for each node, collecting the node
	 * mask from all memory tiers below it. This allows us to fall back
	 * demotion page allocation to a set of nodes that is closer to the
	 * above selected preferred node.
	 */
	lower_tier = node_states[N_MEMORY];
	list_for_each_entry(memtier, &memory_tiers, list) {
		/*
		 * Keep removing the current tier from the lower_tier nodes;
		 * this will remove all nodes in the current and above
		 * memory tiers from the lower_tier mask.
		 */
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_andnot(lower_tier, lower_tier, tier_nodes);
		memtier->lower_tier_mask = lower_tier;
	}

	dump_demotion_targets();
}

#else
static inline void establish_demotion_targets(void) {}
#endif /* CONFIG_MIGRATION */

static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	if (!node_memory_types[node].memtype)
		node_memory_types[node].memtype = memtype;
	/*
	 * For each device getting added in the same NUMA node
	 * with this specific memtype, bump the map count. We
	 * only take the memtype device reference once, so that
	 * changing a node's memtype can be done by dropping the
	 * only reference count taken here.
	 */

	if (node_memory_types[node].memtype == memtype) {
		if (!node_memory_types[node].map_count++)
			kref_get(&memtype->kref);
	}
}

static struct memory_tier *set_node_memory_tier(int node)
{
	struct memory_tier *memtier;
	struct memory_dev_type *memtype = default_dram_type;
	int adist = MEMTIER_ADISTANCE_DRAM;
	pg_data_t *pgdat = NODE_DATA(node);

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_state(node, N_MEMORY))
		return ERR_PTR(-EINVAL);

	mt_calc_adistance(node, &adist);
	if (!node_memory_types[node].memtype) {
		memtype = mt_find_alloc_memory_type(adist, &default_memory_types);
		if (IS_ERR(memtype)) {
			memtype = default_dram_type;
			pr_info("Failed to allocate a memory type. Fall back.\n");
		}
	}

	__init_node_memory_type(node, memtype);

	memtype = node_memory_types[node].memtype;
	node_set(node, memtype->nodes);
	memtier = find_create_memory_tier(memtype);
	if (!IS_ERR(memtier))
		rcu_assign_pointer(pgdat->memtier, memtier);
	return memtier;
}

static void destroy_memory_tier(struct memory_tier *memtier)
{
	list_del(&memtier->list);
	device_unregister(&memtier->dev);
}

static bool clear_node_memory_tier(int node)
{
	bool cleared = false;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	/*
	 * Make sure that anybody looking at NODE_DATA who finds
	 * a valid memtier finds memory_dev_types with nodes still
	 * linked to the memtier. We achieve this by waiting for
	 * the RCU read section to finish using synchronize_rcu().
	 * This also enables us to free the destroyed memory tier
	 * with kfree() instead of kfree_rcu().
	 */
	memtier = __node_get_memory_tier(node);
	if (memtier) {
		struct memory_dev_type *memtype;

		rcu_assign_pointer(pgdat->memtier, NULL);
		synchronize_rcu();
		memtype = node_memory_types[node].memtype;
		node_clear(node, memtype->nodes);
		if (nodes_empty(memtype->nodes)) {
			list_del_init(&memtype->tier_sibling);
			if (list_empty(&memtier->memory_types))
				destroy_memory_tier(memtier);
		}
		cleared = true;
	}
	return cleared;
}

static void release_memtype(struct kref *kref)
{
	struct memory_dev_type *memtype;

	memtype = container_of(kref, struct memory_dev_type, kref);
	kfree(memtype);
}

struct memory_dev_type *alloc_memory_type(int adistance)
{
	struct memory_dev_type *memtype;

	memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
	if (!memtype)
		return ERR_PTR(-ENOMEM);

	memtype->adistance = adistance;
	INIT_LIST_HEAD(&memtype->tier_sibling);
	memtype->nodes = NODE_MASK_NONE;
	kref_init(&memtype->kref);
	return memtype;
}
EXPORT_SYMBOL_GPL(alloc_memory_type);

void put_memory_type(struct memory_dev_type *memtype)
{
	kref_put(&memtype->kref, release_memtype);
}
EXPORT_SYMBOL_GPL(put_memory_type);

void init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	mutex_lock(&memory_tier_lock);
	__init_node_memory_type(node, memtype);
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(init_node_memory_type);

void clear_node_memory_type(int node, struct memory_dev_type *memtype)
{
	mutex_lock(&memory_tier_lock);
	if (node_memory_types[node].memtype == memtype || !memtype)
		node_memory_types[node].map_count--;
	/*
	 * If we unmapped all the attached devices from this node,
	 * clear the node memory type.
	 */
	if (!node_memory_types[node].map_count) {
		memtype = node_memory_types[node].memtype;
		node_memory_types[node].memtype = NULL;
		put_memory_type(memtype);
	}
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(clear_node_memory_type);
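
/*
 * Illustrative example of the map_count accounting above: if two memory
 * devices on the same NUMA node register the same memory_dev_type, the
 * first init_node_memory_type() call stores the pointer and takes the
 * single kref, the second only bumps map_count to 2; the type reference
 * is dropped again only after both devices have called
 * clear_node_memory_type().
 */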

struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head *memory_types)
{
	struct memory_dev_type *mtype;

	list_for_each_entry(mtype, memory_types, list)
		if (mtype->adistance == adist)
			return mtype;

	mtype = alloc_memory_type(adist);
	if (IS_ERR(mtype))
		return mtype;

	list_add(&mtype->list, memory_types);

	return mtype;
}
EXPORT_SYMBOL_GPL(mt_find_alloc_memory_type);

void mt_put_memory_types(struct list_head *memory_types)
{
	struct memory_dev_type *mtype, *mtn;

	list_for_each_entry_safe(mtype, mtn, memory_types, list) {
		list_del(&mtype->list);
		put_memory_type(mtype);
	}
}
EXPORT_SYMBOL_GPL(mt_put_memory_types);
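
/*
 * Usage sketch (illustrative): a driver that onlines device memory, such as
 * dax/kmem, is expected to keep its own list_head of memory types, look
 * them up or allocate them with mt_find_alloc_memory_type(adist, &list) per
 * abstract distance, and release the whole list with
 * mt_put_memory_types(&list) on module exit.
 */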

/*
 * This is invoked via `late_initcall()` to initialize memory tiers for
 * memory nodes, both with and without CPUs. By this point firmware and
 * device initialization have completed, so adistance algorithms are
 * expected to have been provided.
 */
static int __init memory_tier_late_init(void)
{
	int nid;
	struct memory_tier *memtier;

	get_online_mems();
	guard(mutex)(&memory_tier_lock);

	/* Assign each uninitialized N_MEMORY node to a memory tier. */
	for_each_node_state(nid, N_MEMORY) {
		/*
		 * Some device drivers may have initialized
		 * memory tiers, potentially bringing memory nodes
		 * online and configuring memory tiers.
		 * Exclude them here.
		 */
		if (node_memory_types[nid].memtype)
			continue;

		memtier = set_node_memory_tier(nid);
		if (IS_ERR(memtier))
			continue;
	}

	establish_demotion_targets();
	put_online_mems();

	return 0;
}
late_initcall(memory_tier_late_init);

static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix)
{
	pr_info(
"%sread_latency: %u, write_latency: %u, read_bandwidth: %u, write_bandwidth: %u\n",
		prefix, coord->read_latency, coord->write_latency,
		coord->read_bandwidth, coord->write_bandwidth);
}

int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
			     const char *source)
{
	guard(mutex)(&default_dram_perf_lock);
	if (default_dram_perf_error)
		return -EIO;

	if (perf->read_latency + perf->write_latency == 0 ||
	    perf->read_bandwidth + perf->write_bandwidth == 0)
		return -EINVAL;

	if (default_dram_perf_ref_nid == NUMA_NO_NODE) {
		default_dram_perf = *perf;
		default_dram_perf_ref_nid = nid;
		default_dram_perf_ref_source = kstrdup(source, GFP_KERNEL);
		return 0;
	}

	/*
	 * The performance of all default DRAM nodes is expected to be the
	 * same (that is, the variation is less than 10%). It will be used
	 * as the base to calculate the abstract distance of other memory
	 * nodes.
	 */
	if (abs(perf->read_latency - default_dram_perf.read_latency) * 10 >
	    default_dram_perf.read_latency ||
	    abs(perf->write_latency - default_dram_perf.write_latency) * 10 >
	    default_dram_perf.write_latency ||
	    abs(perf->read_bandwidth - default_dram_perf.read_bandwidth) * 10 >
	    default_dram_perf.read_bandwidth ||
	    abs(perf->write_bandwidth - default_dram_perf.write_bandwidth) * 10 >
	    default_dram_perf.write_bandwidth) {
		pr_info(
"memory-tiers: the performance of DRAM node %d mismatches that of the reference\n"
"DRAM node %d.\n", nid, default_dram_perf_ref_nid);
		pr_info("  performance of reference DRAM node %d from %s:\n",
			default_dram_perf_ref_nid, default_dram_perf_ref_source);
		dump_hmem_attrs(&default_dram_perf, "    ");
		pr_info("  performance of DRAM node %d from %s:\n", nid, source);
		dump_hmem_attrs(perf, "    ");
		pr_info(
"  disable default DRAM node performance based abstract distance algorithm.\n");
		default_dram_perf_error = true;
		return -EINVAL;
	}

	return 0;
}
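
/*
 * Worked example of the 10% check above (illustrative numbers): if the
 * reference DRAM node reports read_latency == 100, a later DRAM node
 * reporting read_latency == 115 fails the check (|115 - 100| * 10 == 150,
 * which is larger than 100), the mismatch is logged, and the perf-based
 * abstract distance algorithm is disabled.
 */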

int mt_perf_to_adistance(struct access_coordinate *perf, int *adist)
{
	guard(mutex)(&default_dram_perf_lock);
	if (default_dram_perf_error)
		return -EIO;

	if (perf->read_latency + perf->write_latency == 0 ||
	    perf->read_bandwidth + perf->write_bandwidth == 0)
		return -EINVAL;

	if (default_dram_perf_ref_nid == NUMA_NO_NODE)
		return -ENOENT;

	/*
	 * The abstract distance of a memory node is in direct proportion to
	 * its memory latency (read + write) and inversely proportional to its
	 * memory bandwidth (read + write). The abstract distance, memory
	 * latency, and memory bandwidth of the default DRAM nodes are used as
	 * the base.
	 */
	*adist = MEMTIER_ADISTANCE_DRAM *
		(perf->read_latency + perf->write_latency) /
		(default_dram_perf.read_latency + default_dram_perf.write_latency) *
		(default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) /
		(perf->read_bandwidth + perf->write_bandwidth);

	return 0;
}
EXPORT_SYMBOL_GPL(mt_perf_to_adistance);
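
/*
 * Worked example for the formula above (illustrative numbers): a memory
 * node with twice the combined read+write latency and half the combined
 * read+write bandwidth of the reference DRAM gets
 * *adist == MEMTIER_ADISTANCE_DRAM * 2 * 2, i.e. four times the DRAM
 * abstract distance, and therefore lands in a slower (higher adistance)
 * memory tier.
 */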

/**
 * register_mt_adistance_algorithm() - Register memory tiering abstract distance algorithm
 * @nb: The notifier block which describes the algorithm
 *
 * Return: 0 on success, errno on error.
 *
 * Every memory tiering abstract distance algorithm provider needs to
 * register the algorithm with register_mt_adistance_algorithm(). To
 * calculate the abstract distance for a specified memory node, the
 * notifier function will be called unless a higher-priority algorithm
 * has already provided a result. The prototype of the notifier
 * function is as follows,
 *
 *   int (*algorithm_notifier)(struct notifier_block *nb,
 *                             unsigned long nid, void *data);
 *
 * Where "nid" specifies the memory node, "data" is the pointer to the
 * returned abstract distance (that is, "int *adist"). If the
 * algorithm provides the result, NOTIFY_STOP should be returned.
 * Otherwise, return_value & %NOTIFY_STOP_MASK == 0 to allow the next
 * algorithm in the chain to provide the result.
 */
int register_mt_adistance_algorithm(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&mt_adistance_algorithms, nb);
}
EXPORT_SYMBOL_GPL(register_mt_adistance_algorithm);
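
/*
 * Minimal provider sketch following the prototype documented above (the
 * helper my_driver_owns_node() and the chosen adistance are hypothetical):
 *
 *	static int my_adist_notifier(struct notifier_block *nb,
 *				     unsigned long nid, void *data)
 *	{
 *		int *adist = data;
 *
 *		if (!my_driver_owns_node(nid))
 *			return NOTIFY_OK;	// let the next algorithm try
 *		*adist = MEMTIER_ADISTANCE_DRAM * 2;
 *		return NOTIFY_STOP;		// result provided via *adist
 *	}
 *
 *	static struct notifier_block my_adist_nb = {
 *		.notifier_call = my_adist_notifier,
 *	};
 *
 * Such a block would be registered with
 * register_mt_adistance_algorithm(&my_adist_nb) and torn down with
 * unregister_mt_adistance_algorithm(&my_adist_nb).
 */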

/**
 * unregister_mt_adistance_algorithm() - Unregister memory tiering abstract distance algorithm
 * @nb: the notifier block which describes the algorithm
 *
 * Return: 0 on success, errno on error.
 */
int unregister_mt_adistance_algorithm(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&mt_adistance_algorithms, nb);
}
EXPORT_SYMBOL_GPL(unregister_mt_adistance_algorithm);

/**
 * mt_calc_adistance() - Calculate abstract distance with registered algorithms
 * @node: the node to calculate abstract distance for
 * @adist: the returned abstract distance
 *
 * Return: if return_value & %NOTIFY_STOP_MASK != 0, then some
 * abstract distance algorithm provided the result, and it is returned via
 * @adist. Otherwise, no algorithm could provide the result and @adist
 * will be kept as it is.
 */
int mt_calc_adistance(int node, int *adist)
{
	return blocking_notifier_call_chain(&mt_adistance_algorithms, node, adist);
}
EXPORT_SYMBOL_GPL(mt_calc_adistance);

static int __meminit memtier_hotplug_callback(struct notifier_block *self,
					      unsigned long action, void *_arg)
{
	struct memory_tier *memtier;
	struct memory_notify *arg = _arg;

	/*
	 * Only update the node migration order when a node is
	 * changing status, like online->offline.
	 */
	if (arg->status_change_nid < 0)
		return notifier_from_errno(0);

	switch (action) {
	case MEM_OFFLINE:
		mutex_lock(&memory_tier_lock);
		if (clear_node_memory_tier(arg->status_change_nid))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	case MEM_ONLINE:
		mutex_lock(&memory_tier_lock);
		memtier = set_node_memory_tier(arg->status_change_nid);
		if (!IS_ERR(memtier))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	}

	return notifier_from_errno(0);
}

static int __init memory_tier_init(void)
{
	int ret;

	ret = subsys_virtual_register(&memory_tier_subsys, NULL);
	if (ret)
		panic("%s() failed to register memory tier subsystem\n", __func__);

#ifdef CONFIG_MIGRATION
	node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
				GFP_KERNEL);
	WARN_ON(!node_demotion);
#endif

	mutex_lock(&memory_tier_lock);
	/*
	 * For now we can have 4 faster memory tiers with smaller adistance
	 * than default DRAM tier.
	 */
	default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM,
						      &default_memory_types);
	mutex_unlock(&memory_tier_lock);
	if (IS_ERR(default_dram_type))
		panic("%s() failed to allocate default DRAM tier\n", __func__);

	/* Record nodes with memory and CPU to set default DRAM performance. */
	nodes_and(default_dram_nodes, node_states[N_MEMORY],
		  node_states[N_CPU]);

	hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
	return 0;
}
subsys_initcall(memory_tier_init);

bool numa_demotion_enabled = false;

#ifdef CONFIG_MIGRATION
#ifdef CONFIG_SYSFS
static ssize_t demotion_enabled_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n", str_true_false(numa_demotion_enabled));
}

static ssize_t demotion_enabled_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	ssize_t ret;

	ret = kstrtobool(buf, &numa_demotion_enabled);
	if (ret)
		return ret;

	return count;
}

static struct kobj_attribute numa_demotion_enabled_attr =
	__ATTR_RW(demotion_enabled);
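
/*
 * With CONFIG_SYSFS, this attribute hangs off the "numa" kobject created in
 * numa_init_sysfs() below, so it is typically reachable as
 * /sys/kernel/mm/numa/demotion_enabled, e.g.:
 *
 *	echo true > /sys/kernel/mm/numa/demotion_enabled
 */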

static struct attribute *numa_attrs[] = {
	&numa_demotion_enabled_attr.attr,
	NULL,
};

static const struct attribute_group numa_attr_group = {
	.attrs = numa_attrs,
};

static int __init numa_init_sysfs(void)
{
	int err;
	struct kobject *numa_kobj;

	numa_kobj = kobject_create_and_add("numa", mm_kobj);
	if (!numa_kobj) {
		pr_err("failed to create numa kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(numa_kobj, &numa_attr_group);
	if (err) {
		pr_err("failed to register numa group\n");
		goto delete_obj;
	}
	return 0;

delete_obj:
	kobject_put(numa_kobj);
	return err;
}
subsys_initcall(numa_init_sysfs);
#endif /* CONFIG_SYSFS */
#endif