// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/lockdep.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/memory.h>
#include <linux/memory-tiers.h>
#include <linux/notifier.h>

#include "internal.h"

struct memory_tier {
        /* hierarchy of memory tiers */
        struct list_head list;
        /* list of all memory types part of this tier */
        struct list_head memory_types;
        /*
         * start value of abstract distance. memory tier maps
         * an abstract distance range,
         * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
         */
        int adistance_start;
        struct device dev;
        /* All the nodes that are part of all the lower memory tiers. */
        nodemask_t lower_tier_mask;
};

struct demotion_nodes {
        nodemask_t preferred;
};

struct node_memory_type_map {
        struct memory_dev_type *memtype;
        int map_count;
};

static DEFINE_MUTEX(memory_tier_lock);
static LIST_HEAD(memory_tiers);
/*
 * The list is used to store all memory types that are not created
 * by a device driver.
 */
static LIST_HEAD(default_memory_types);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
struct memory_dev_type *default_dram_type;

static const struct bus_type memory_tier_subsys = {
        .name = "memory_tiering",
        .dev_name = "memory_tier",
};

#ifdef CONFIG_MIGRATION
static int top_tier_adistance;
/*
 * node_demotion[] examples:
 *
 * Example 1:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
 *
 * node distances:
 * node   0    1    2    3
 *    0  10   20   30   40
 *    1  20   10   40   30
 *    2  30   40   10   40
 *    3  40   30   40   10
 *
 * memory_tiers0 = 0-1
 * memory_tiers1 = 2-3
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 3
 * node_demotion[2].preferred = <empty>
 * node_demotion[3].preferred = <empty>
 *
 * Example 2:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 is a memory-only DRAM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   30
 *    2  30   30   10
 *
 * memory_tiers0 = 0-2
 *
 * node_demotion[0].preferred = <empty>
 * node_demotion[1].preferred = <empty>
 * node_demotion[2].preferred = <empty>
 *
 * Example 3:
 *
 * Node 0 is a CPU + DRAM node, node 1 is an HBM node, node 2 is a PMEM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   40
 *    2  30   40   10
 *
 * memory_tiers0 = 1
 * memory_tiers1 = 0
 * memory_tiers2 = 2
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 0
 * node_demotion[2].preferred = <empty>
 *
 */
static struct demotion_nodes *node_demotion __read_mostly;
#endif /* CONFIG_MIGRATION */

static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);

/* The lock is used to protect `default_dram_perf*` info and nid. */
static DEFINE_MUTEX(default_dram_perf_lock);
static bool default_dram_perf_error;
static struct access_coordinate default_dram_perf;
static int default_dram_perf_ref_nid = NUMA_NO_NODE;
static const char *default_dram_perf_ref_source;

static inline struct memory_tier *to_memory_tier(struct device *device)
{
        return container_of(device, struct memory_tier, dev);
}

static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
{
        nodemask_t nodes = NODE_MASK_NONE;
        struct memory_dev_type *memtype;

        list_for_each_entry(memtype, &memtier->memory_types, tier_sibling)
                nodes_or(nodes, nodes, memtype->nodes);

        return nodes;
}

static void memory_tier_device_release(struct device *dev)
{
        struct memory_tier *tier = to_memory_tier(dev);
        /*
         * synchronize_rcu in clear_node_memory_tier makes sure
         * we don't have rcu access to this memory tier.
         */
        kfree(tier);
}

static ssize_t nodelist_show(struct device *dev,
                             struct device_attribute *attr, char *buf)
{
        int ret;
        nodemask_t nmask;

        mutex_lock(&memory_tier_lock);
        nmask = get_memtier_nodemask(to_memory_tier(dev));
        ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
        mutex_unlock(&memory_tier_lock);
        return ret;
}
static DEVICE_ATTR_RO(nodelist);
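
/*
 * Note (added for clarity): because the tiers are registered on the
 * virtual "memory_tiering" bus below, this attribute is expected to
 * show up as /sys/devices/virtual/memory_tiering/memory_tierN/nodelist,
 * listing the NUMA nodes that belong to tier N.
 */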

static struct attribute *memtier_dev_attrs[] = {
        &dev_attr_nodelist.attr,
        NULL
};

static const struct attribute_group memtier_dev_group = {
        .attrs = memtier_dev_attrs,
};

static const struct attribute_group *memtier_dev_groups[] = {
        &memtier_dev_group,
        NULL
};

static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
{
        int ret;
        bool found_slot = false;
        struct memory_tier *memtier, *new_memtier;
        int adistance = memtype->adistance;
        unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;

        lockdep_assert_held_once(&memory_tier_lock);

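        /*
         * Note (added for clarity): rounding down to the chunk size below
         * means that all memory types whose abstract distance falls within
         * the same MEMTIER_CHUNK_SIZE-aligned window end up sharing one
         * memory tier.
         */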
        adistance = round_down(adistance, memtier_adistance_chunk_size);
        /*
         * If the memtype is already part of a memory tier,
         * just return that.
         */
        if (!list_empty(&memtype->tier_sibling)) {
                list_for_each_entry(memtier, &memory_tiers, list) {
                        if (adistance == memtier->adistance_start)
                                return memtier;
                }
                WARN_ON(1);
                return ERR_PTR(-EINVAL);
        }

        list_for_each_entry(memtier, &memory_tiers, list) {
                if (adistance == memtier->adistance_start) {
                        goto link_memtype;
                } else if (adistance < memtier->adistance_start) {
                        found_slot = true;
                        break;
                }
        }

        new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
        if (!new_memtier)
                return ERR_PTR(-ENOMEM);

        new_memtier->adistance_start = adistance;
        INIT_LIST_HEAD(&new_memtier->list);
        INIT_LIST_HEAD(&new_memtier->memory_types);
        if (found_slot)
                list_add_tail(&new_memtier->list, &memtier->list);
        else
                list_add_tail(&new_memtier->list, &memory_tiers);

        new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
        new_memtier->dev.bus = &memory_tier_subsys;
        new_memtier->dev.release = memory_tier_device_release;
        new_memtier->dev.groups = memtier_dev_groups;

        ret = device_register(&new_memtier->dev);
        if (ret) {
                list_del(&new_memtier->list);
                put_device(&new_memtier->dev);
                return ERR_PTR(ret);
        }
        memtier = new_memtier;

link_memtype:
        list_add(&memtype->tier_sibling, &memtier->memory_types);
        return memtier;
}

static struct memory_tier *__node_get_memory_tier(int node)
{
        pg_data_t *pgdat;

        pgdat = NODE_DATA(node);
        if (!pgdat)
                return NULL;
        /*
         * Since we hold memory_tier_lock, we can avoid
         * RCU read locks when accessing the details. No
         * parallel updates are possible here.
         */
        return rcu_dereference_check(pgdat->memtier,
                                     lockdep_is_held(&memory_tier_lock));
}

#ifdef CONFIG_MIGRATION
bool node_is_toptier(int node)
{
        bool toptier;
        pg_data_t *pgdat;
        struct memory_tier *memtier;

        pgdat = NODE_DATA(node);
        if (!pgdat)
                return false;

        rcu_read_lock();
        memtier = rcu_dereference(pgdat->memtier);
        if (!memtier) {
                toptier = true;
                goto out;
        }
        if (memtier->adistance_start <= top_tier_adistance)
                toptier = true;
        else
                toptier = false;
out:
        rcu_read_unlock();
        return toptier;
}

void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
{
        struct memory_tier *memtier;

        /*
         * pg_data_t.memtier updates include a synchronize_rcu()
         * which ensures that we either find NULL or a valid memtier
         * in NODE_DATA. Protect the access via rcu_read_lock().
         */
        rcu_read_lock();
        memtier = rcu_dereference(pgdat->memtier);
        if (memtier)
                *targets = memtier->lower_tier_mask;
        else
                *targets = NODE_MASK_NONE;
        rcu_read_unlock();
}

/**
 * next_demotion_node() - Get the next node in the demotion path
 * @node: The starting node to lookup the next node
 *
 * Return: node id for next memory node in the demotion path hierarchy
 * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
 * @node online or guarantee that it *continues* to be the next demotion
 * target.
 */
int next_demotion_node(int node)
{
        struct demotion_nodes *nd;
        int target;

        if (!node_demotion)
                return NUMA_NO_NODE;

        nd = &node_demotion[node];

        /*
         * node_demotion[] is updated without excluding this
         * function from running.
         *
         * Make sure to use RCU over entire code blocks if
         * node_demotion[] reads need to be consistent.
         */
        rcu_read_lock();
        /*
         * If there are multiple target nodes, just select one
         * target node randomly.
         *
         * Round-robin selection would need another variable in
         * node_demotion[] to record the last selected target node,
         * which may cause cache ping-pong as it keeps changing.
         * Per-CPU data could avoid that caching issue, but seems
         * more complicated. So selecting a target node randomly
         * seems better for now.
         */
        target = node_random(&nd->preferred);
        rcu_read_unlock();

        return target;
}

static void disable_all_demotion_targets(void)
{
        struct memory_tier *memtier;
        int node;

        for_each_node_state(node, N_MEMORY) {
                node_demotion[node].preferred = NODE_MASK_NONE;
                /*
                 * We are holding memory_tier_lock, it is safe
                 * to access pgdat->memtier.
                 */
                memtier = __node_get_memory_tier(node);
                if (memtier)
                        memtier->lower_tier_mask = NODE_MASK_NONE;
        }
        /*
         * Ensure that the "disable" is visible across the system.
         * Readers will see either a combination of before+disable
         * state or disable+after. They will never see before and
         * after state together.
         */
        synchronize_rcu();
}

static void dump_demotion_targets(void)
{
        int node;

        for_each_node_state(node, N_MEMORY) {
                struct memory_tier *memtier = __node_get_memory_tier(node);
                nodemask_t preferred = node_demotion[node].preferred;

                if (!memtier)
                        continue;

                if (nodes_empty(preferred))
                        pr_info("Demotion targets for Node %d: null\n", node);
                else
                        pr_info("Demotion targets for Node %d: preferred: %*pbl, fallback: %*pbl\n",
                                node, nodemask_pr_args(&preferred),
                                nodemask_pr_args(&memtier->lower_tier_mask));
        }
}

/*
 * Find an automatic demotion target for all memory
 * nodes. Failing here is OK. It might just indicate
 * being at the end of a chain.
 */
static void establish_demotion_targets(void)
{
        struct memory_tier *memtier;
        struct demotion_nodes *nd;
        int target = NUMA_NO_NODE, node;
        int distance, best_distance;
        nodemask_t tier_nodes, lower_tier;

        lockdep_assert_held_once(&memory_tier_lock);

        if (!node_demotion)
                return;

        disable_all_demotion_targets();

        for_each_node_state(node, N_MEMORY) {
                best_distance = -1;
                nd = &node_demotion[node];

                memtier = __node_get_memory_tier(node);
                if (!memtier || list_is_last(&memtier->list, &memory_tiers))
                        continue;
                /*
                 * Get the lower memtier to find the demotion node list.
                 */
                memtier = list_next_entry(memtier, list);
                tier_nodes = get_memtier_nodemask(memtier);
                /*
                 * find_next_best_node() uses the 'used' nodemask as a skip
                 * list. Add all memory nodes except the selected memory
                 * tier nodelist to the skip list so that we find the best
                 * node from the memtier nodelist.
                 */
                nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);

                /*
                 * Find all the nodes in the memory tier node list with the
                 * same best distance and add them to the preferred mask.
                 * We randomly select between nodes in the preferred mask
                 * when allocating pages during demotion.
                 */
                do {
                        target = find_next_best_node(node, &tier_nodes);
                        if (target == NUMA_NO_NODE)
                                break;

                        distance = node_distance(node, target);
                        if (distance == best_distance || best_distance == -1) {
                                best_distance = distance;
                                node_set(target, nd->preferred);
                        } else {
                                break;
                        }
                } while (1);
        }
        /*
         * Promotion is allowed from a memory tier to a higher
         * memory tier only if the memory tier doesn't include
         * compute. We want to skip promotion from a memory tier
         * if any node that is part of that memory tier has CPUs.
         * Once we detect such a memory tier, we consider that tier
         * as the top tier from which promotion is not allowed.
         */
        list_for_each_entry_reverse(memtier, &memory_tiers, list) {
                tier_nodes = get_memtier_nodemask(memtier);
                nodes_and(tier_nodes, node_states[N_CPU], tier_nodes);
                if (!nodes_empty(tier_nodes)) {
                        /*
                         * abstract distance below the max value of this
                         * memtier is considered toptier.
                         */
                        top_tier_adistance = memtier->adistance_start +
                                                MEMTIER_CHUNK_SIZE - 1;
                        break;
                }
        }
        /*
         * Now build the lower_tier mask for each node, collecting the node
         * mask from all memory tiers below it. This allows us to fall back
         * demotion page allocation to a set of nodes that is closer to the
         * above selected preferred node.
         */
        lower_tier = node_states[N_MEMORY];
        list_for_each_entry(memtier, &memory_tiers, list) {
                /*
                 * Keep removing the current tier from lower_tier nodes.
                 * This will remove all nodes in the current and above
                 * memory tiers from the lower_tier mask.
                 */
                tier_nodes = get_memtier_nodemask(memtier);
                nodes_andnot(lower_tier, lower_tier, tier_nodes);
                memtier->lower_tier_mask = lower_tier;
        }

        dump_demotion_targets();
}

#else
static inline void establish_demotion_targets(void) {}
#endif /* CONFIG_MIGRATION */

static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
{
        if (!node_memory_types[node].memtype)
                node_memory_types[node].memtype = memtype;
        /*
         * For each device getting added in the same NUMA node
         * with this specific memtype, bump the map count. We
         * only take a memtype device reference once, so that
         * changing a node memtype can be done by dropping the
         * only reference count taken here.
         */

        if (node_memory_types[node].memtype == memtype) {
                if (!node_memory_types[node].map_count++)
                        kref_get(&memtype->kref);
        }
}

static struct memory_tier *set_node_memory_tier(int node)
{
        struct memory_tier *memtier;
        struct memory_dev_type *memtype = default_dram_type;
        int adist = MEMTIER_ADISTANCE_DRAM;
        pg_data_t *pgdat = NODE_DATA(node);

        lockdep_assert_held_once(&memory_tier_lock);

        if (!node_state(node, N_MEMORY))
                return ERR_PTR(-EINVAL);

        mt_calc_adistance(node, &adist);
        if (!node_memory_types[node].memtype) {
                memtype = mt_find_alloc_memory_type(adist, &default_memory_types);
                if (IS_ERR(memtype)) {
                        memtype = default_dram_type;
                        pr_info("Failed to allocate a memory type. Fall back.\n");
                }
        }

        __init_node_memory_type(node, memtype);

        memtype = node_memory_types[node].memtype;
        node_set(node, memtype->nodes);
        memtier = find_create_memory_tier(memtype);
        if (!IS_ERR(memtier))
                rcu_assign_pointer(pgdat->memtier, memtier);
        return memtier;
}

static void destroy_memory_tier(struct memory_tier *memtier)
{
        list_del(&memtier->list);
        device_unregister(&memtier->dev);
}

static bool clear_node_memory_tier(int node)
{
        bool cleared = false;
        pg_data_t *pgdat;
        struct memory_tier *memtier;

        pgdat = NODE_DATA(node);
        if (!pgdat)
                return false;

        /*
         * Make sure that anybody looking at NODE_DATA who finds
         * a valid memtier finds memory_dev_types with nodes still
         * linked to the memtier. We achieve this by waiting for
         * the rcu read section to finish using synchronize_rcu.
         * This also enables us to free the destroyed memory tier
         * with kfree instead of kfree_rcu.
         */
        memtier = __node_get_memory_tier(node);
        if (memtier) {
                struct memory_dev_type *memtype;

                rcu_assign_pointer(pgdat->memtier, NULL);
                synchronize_rcu();
                memtype = node_memory_types[node].memtype;
                node_clear(node, memtype->nodes);
                if (nodes_empty(memtype->nodes)) {
                        list_del_init(&memtype->tier_sibling);
                        if (list_empty(&memtier->memory_types))
                                destroy_memory_tier(memtier);
                }
                cleared = true;
        }
        return cleared;
}

static void release_memtype(struct kref *kref)
{
        struct memory_dev_type *memtype;

        memtype = container_of(kref, struct memory_dev_type, kref);
        kfree(memtype);
}

struct memory_dev_type *alloc_memory_type(int adistance)
{
        struct memory_dev_type *memtype;

        memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
        if (!memtype)
                return ERR_PTR(-ENOMEM);

        memtype->adistance = adistance;
        INIT_LIST_HEAD(&memtype->tier_sibling);
        memtype->nodes = NODE_MASK_NONE;
        kref_init(&memtype->kref);
        return memtype;
}
EXPORT_SYMBOL_GPL(alloc_memory_type);

void put_memory_type(struct memory_dev_type *memtype)
{
        kref_put(&memtype->kref, release_memtype);
}
EXPORT_SYMBOL_GPL(put_memory_type);

void init_node_memory_type(int node, struct memory_dev_type *memtype)
{
        mutex_lock(&memory_tier_lock);
        __init_node_memory_type(node, memtype);
        mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(init_node_memory_type);

void clear_node_memory_type(int node, struct memory_dev_type *memtype)
{
        mutex_lock(&memory_tier_lock);
        if (node_memory_types[node].memtype == memtype || !memtype)
                node_memory_types[node].map_count--;
        /*
         * If we unmapped all the attached devices to this node,
         * clear the node memory type.
         */
        if (!node_memory_types[node].map_count) {
                memtype = node_memory_types[node].memtype;
                node_memory_types[node].memtype = NULL;
                put_memory_type(memtype);
        }
        mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(clear_node_memory_type);

struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head *memory_types)
{
        struct memory_dev_type *mtype;

        list_for_each_entry(mtype, memory_types, list)
                if (mtype->adistance == adist)
                        return mtype;

        mtype = alloc_memory_type(adist);
        if (IS_ERR(mtype))
                return mtype;

        list_add(&mtype->list, memory_types);

        return mtype;
}
EXPORT_SYMBOL_GPL(mt_find_alloc_memory_type);

void mt_put_memory_types(struct list_head *memory_types)
{
        struct memory_dev_type *mtype, *mtn;

        list_for_each_entry_safe(mtype, mtn, memory_types, list) {
                list_del(&mtype->list);
                put_memory_type(mtype);
        }
}
EXPORT_SYMBOL_GPL(mt_put_memory_types);

/*
 * This is invoked via `late_initcall()` to initialize memory tiers for
 * CPU-less memory nodes after driver initialization, which is
 * expected to provide `adistance` algorithms.
 */
static int __init memory_tier_late_init(void)
{
        int nid;

        guard(mutex)(&memory_tier_lock);
        for_each_node_state(nid, N_MEMORY) {
                /*
                 * Some device drivers may have initialized memory tiers
                 * between `memory_tier_init()` and `memory_tier_late_init()`,
                 * potentially bringing online memory nodes and
                 * configuring memory tiers. Exclude them here.
                 */
                if (node_memory_types[nid].memtype)
                        continue;

                set_node_memory_tier(nid);
        }

        establish_demotion_targets();

        return 0;
}
late_initcall(memory_tier_late_init);

static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix)
{
        pr_info(
"%sread_latency: %u, write_latency: %u, read_bandwidth: %u, write_bandwidth: %u\n",
                prefix, coord->read_latency, coord->write_latency,
                coord->read_bandwidth, coord->write_bandwidth);
}

int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
                             const char *source)
{
        guard(mutex)(&default_dram_perf_lock);
        if (default_dram_perf_error)
                return -EIO;

        if (perf->read_latency + perf->write_latency == 0 ||
            perf->read_bandwidth + perf->write_bandwidth == 0)
                return -EINVAL;

        if (default_dram_perf_ref_nid == NUMA_NO_NODE) {
                default_dram_perf = *perf;
                default_dram_perf_ref_nid = nid;
                default_dram_perf_ref_source = kstrdup(source, GFP_KERNEL);
                return 0;
        }

        /*
         * The performance of all default DRAM nodes is expected to be
         * the same (that is, the variation is less than 10%). And it
         * will be used as the base to calculate the abstract distance
         * of other memory nodes.
         */
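        /*
         * Illustrative example (added for clarity, numbers are made up):
         * with a reference read_latency of 100, the check below rejects a
         * new DRAM node whose read_latency differs from the reference by
         * more than 10, i.e. anything outside the 90..110 range, and the
         * same 10% bound applies to the other three attributes.
         */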
        if (abs(perf->read_latency - default_dram_perf.read_latency) * 10 >
            default_dram_perf.read_latency ||
            abs(perf->write_latency - default_dram_perf.write_latency) * 10 >
            default_dram_perf.write_latency ||
            abs(perf->read_bandwidth - default_dram_perf.read_bandwidth) * 10 >
            default_dram_perf.read_bandwidth ||
            abs(perf->write_bandwidth - default_dram_perf.write_bandwidth) * 10 >
            default_dram_perf.write_bandwidth) {
                pr_info(
"memory-tiers: the performance of DRAM node %d mismatches that of the reference\n"
"DRAM node %d.\n", nid, default_dram_perf_ref_nid);
                pr_info(" performance of reference DRAM node %d:\n",
                        default_dram_perf_ref_nid);
                dump_hmem_attrs(&default_dram_perf, " ");
                pr_info(" performance of DRAM node %d:\n", nid);
                dump_hmem_attrs(perf, " ");
                pr_info(
" disable default DRAM node performance based abstract distance algorithm.\n");
                default_dram_perf_error = true;
                return -EINVAL;
        }

        return 0;
}

int mt_perf_to_adistance(struct access_coordinate *perf, int *adist)
{
        guard(mutex)(&default_dram_perf_lock);
        if (default_dram_perf_error)
                return -EIO;

        if (perf->read_latency + perf->write_latency == 0 ||
            perf->read_bandwidth + perf->write_bandwidth == 0)
                return -EINVAL;

        if (default_dram_perf_ref_nid == NUMA_NO_NODE)
                return -ENOENT;

        /*
         * The abstract distance of a memory node is in direct proportion to
         * its memory latency (read + write) and inversely proportional to its
         * memory bandwidth (read + write). The abstract distance, memory
         * latency, and memory bandwidth of the default DRAM nodes are used as
         * the base.
         */
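        /*
         * Worked example (added for clarity, with made-up numbers): if the
         * reference DRAM has read+write latency 200 and read+write bandwidth
         * 40000, a device with latency 800 and bandwidth 10000 gets
         *
         *   adist = MEMTIER_ADISTANCE_DRAM * (800 / 200) * (40000 / 10000)
         *         = MEMTIER_ADISTANCE_DRAM * 16
         *
         * i.e. 4x the latency and 1/4 the bandwidth each scale the abstract
         * distance up, placing such a device in a slower tier than DRAM.
         */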
        *adist = MEMTIER_ADISTANCE_DRAM *
                (perf->read_latency + perf->write_latency) /
                (default_dram_perf.read_latency + default_dram_perf.write_latency) *
                (default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) /
                (perf->read_bandwidth + perf->write_bandwidth);

        return 0;
}
EXPORT_SYMBOL_GPL(mt_perf_to_adistance);

/**
 * register_mt_adistance_algorithm() - Register memory tiering abstract distance algorithm
 * @nb: The notifier block which describes the algorithm
 *
 * Return: 0 on success, errno on error.
 *
 * Every memory tiering abstract distance algorithm provider needs to
 * register the algorithm with register_mt_adistance_algorithm(). To
 * calculate the abstract distance for a specified memory node, the
 * notifier function will be called unless some higher priority
 * algorithm has provided a result. The prototype of the notifier
 * function is as follows,
 *
 *   int (*algorithm_notifier)(struct notifier_block *nb,
 *                             unsigned long nid, void *data);
 *
 * Where "nid" specifies the memory node, "data" is the pointer to the
 * returned abstract distance (that is, "int *adist"). If the
 * algorithm provides the result, NOTIFY_STOP should be returned.
 * Otherwise, return_value & %NOTIFY_STOP_MASK == 0 to allow the next
 * algorithm in the chain to provide the result.
 */
int register_mt_adistance_algorithm(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&mt_adistance_algorithms, nb);
}
EXPORT_SYMBOL_GPL(register_mt_adistance_algorithm);
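
/*
 * Illustrative sketch (added for clarity, not part of the original file):
 * a provider would typically register a callback along these lines, where
 * example_driver_owns_node() is a hypothetical helper. Returning NOTIFY_DONE
 * lets the next algorithm in the chain try; NOTIFY_STOP reports a result.
 *
 *   static int example_adistance_notifier(struct notifier_block *nb,
 *                                         unsigned long nid, void *data)
 *   {
 *           int *adist = data;
 *
 *           if (!example_driver_owns_node(nid))
 *                   return NOTIFY_DONE;
 *
 *           *adist = MEMTIER_ADISTANCE_DRAM * 5;
 *           return NOTIFY_STOP;
 *   }
 *
 *   static struct notifier_block example_adistance_nb = {
 *           .notifier_call = example_adistance_notifier,
 *   };
 *
 *   register_mt_adistance_algorithm(&example_adistance_nb);
 */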

/**
 * unregister_mt_adistance_algorithm() - Unregister memory tiering abstract distance algorithm
 * @nb: the notifier block which describes the algorithm
 *
 * Return: 0 on success, errno on error.
 */
int unregister_mt_adistance_algorithm(struct notifier_block *nb)
{
        return blocking_notifier_chain_unregister(&mt_adistance_algorithms, nb);
}
EXPORT_SYMBOL_GPL(unregister_mt_adistance_algorithm);

/**
 * mt_calc_adistance() - Calculate abstract distance with registered algorithms
 * @node: the node to calculate abstract distance for
 * @adist: the returned abstract distance
 *
 * Return: if return_value & %NOTIFY_STOP_MASK != 0, then some
 * abstract distance algorithm provides the result, and return it via
 * @adist. Otherwise, no algorithm can provide the result and @adist
 * will be kept as it is.
 */
int mt_calc_adistance(int node, int *adist)
{
        return blocking_notifier_call_chain(&mt_adistance_algorithms, node, adist);
}
EXPORT_SYMBOL_GPL(mt_calc_adistance);

static int __meminit memtier_hotplug_callback(struct notifier_block *self,
                                              unsigned long action, void *_arg)
{
        struct memory_tier *memtier;
        struct memory_notify *arg = _arg;

        /*
         * Only update the node migration order when a node is
         * changing status, like online->offline.
         */
        if (arg->status_change_nid < 0)
                return notifier_from_errno(0);

        switch (action) {
        case MEM_OFFLINE:
                mutex_lock(&memory_tier_lock);
                if (clear_node_memory_tier(arg->status_change_nid))
                        establish_demotion_targets();
                mutex_unlock(&memory_tier_lock);
                break;
        case MEM_ONLINE:
                mutex_lock(&memory_tier_lock);
                memtier = set_node_memory_tier(arg->status_change_nid);
                if (!IS_ERR(memtier))
                        establish_demotion_targets();
                mutex_unlock(&memory_tier_lock);
                break;
        }

        return notifier_from_errno(0);
}

static int __init memory_tier_init(void)
{
        int ret, node;
        struct memory_tier *memtier;

        ret = subsys_virtual_register(&memory_tier_subsys, NULL);
        if (ret)
                panic("%s() failed to register memory tier subsystem\n", __func__);

#ifdef CONFIG_MIGRATION
        node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
                                GFP_KERNEL);
        WARN_ON(!node_demotion);
#endif
        mutex_lock(&memory_tier_lock);
        /*
         * For now we can have 4 faster memory tiers with smaller adistance
         * than the default DRAM tier.
         */
        default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM,
                                                      &default_memory_types);
        if (IS_ERR(default_dram_type))
                panic("%s() failed to allocate default DRAM tier\n", __func__);

        /*
         * Look at all the existing N_MEMORY nodes and add them to
         * the default memory tier or to a tier if we already have
         * memory types assigned.
         */
        for_each_node_state(node, N_MEMORY) {
                if (!node_state(node, N_CPU))
                        /*
                         * Defer memory tier initialization on
                         * CPUless numa nodes. These will be initialized
                         * after firmware and devices are initialized.
                         */
                        continue;

                memtier = set_node_memory_tier(node);
                if (IS_ERR(memtier))
                        /*
                         * Continue with the memtiers we were able to set up.
                         */
                        break;
        }
        establish_demotion_targets();
        mutex_unlock(&memory_tier_lock);

        hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
        return 0;
}
subsys_initcall(memory_tier_init);

bool numa_demotion_enabled = false;

#ifdef CONFIG_MIGRATION
#ifdef CONFIG_SYSFS
static ssize_t demotion_enabled_show(struct kobject *kobj,
                                     struct kobj_attribute *attr, char *buf)
{
        return sysfs_emit(buf, "%s\n",
                          numa_demotion_enabled ? "true" : "false");
}

static ssize_t demotion_enabled_store(struct kobject *kobj,
                                      struct kobj_attribute *attr,
                                      const char *buf, size_t count)
{
        ssize_t ret;

        ret = kstrtobool(buf, &numa_demotion_enabled);
        if (ret)
                return ret;

        return count;
}

static struct kobj_attribute numa_demotion_enabled_attr =
        __ATTR_RW(demotion_enabled);
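
/*
 * Note (added for clarity): the "numa" kobject created in
 * numa_init_sysfs() below sits under mm_kobj, so this knob is expected
 * to appear as /sys/kernel/mm/numa/demotion_enabled; for example,
 * "echo true > /sys/kernel/mm/numa/demotion_enabled" turns demotion on.
 */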

static struct attribute *numa_attrs[] = {
        &numa_demotion_enabled_attr.attr,
        NULL,
};

static const struct attribute_group numa_attr_group = {
        .attrs = numa_attrs,
};

static int __init numa_init_sysfs(void)
{
        int err;
        struct kobject *numa_kobj;

        numa_kobj = kobject_create_and_add("numa", mm_kobj);
        if (!numa_kobj) {
                pr_err("failed to create numa kobject\n");
                return -ENOMEM;
        }
        err = sysfs_create_group(numa_kobj, &numa_attr_group);
        if (err) {
                pr_err("failed to register numa group\n");
                goto delete_obj;
        }
        return 0;

delete_obj:
        kobject_put(numa_kobj);
        return err;
}
subsys_initcall(numa_init_sysfs);
#endif /* CONFIG_SYSFS */
#endif