/*
 * NUMA parameter parsing routines
 *
 * Copyright (c) 2014 Fujitsu Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu/units.h"
#include "sysemu/hostmem.h"
#include "sysemu/numa.h"
#include "sysemu/sysemu.h"
#include "exec/cpu-common.h"
#include "exec/ramlist.h"
#include "qemu/bitmap.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/opts-visitor.h"
#include "qapi/qapi-visit-machine.h"
#include "sysemu/qtest.h"
#include "hw/core/cpu.h"
#include "hw/mem/pc-dimm.h"
#include "migration/vmstate.h"
#include "hw/boards.h"
#include "hw/mem/memory-device.h"
#include "qemu/option.h"
#include "qemu/config-file.h"
#include "qemu/cutils.h"

QemuOptsList qemu_numa_opts = {
    .name = "numa",
    .implied_opt_name = "type",
    .head = QTAILQ_HEAD_INITIALIZER(qemu_numa_opts.head),
    .desc = { { 0 } } /* validated with OptsVisitor */
};
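
/*
 * The "numa" option group accepts several forms on the command line; a few
 * illustrative examples (see qemu-options for the authoritative syntax):
 *
 *   -numa node,nodeid=0,cpus=0-3,memdev=m0
 *   -numa dist,src=0,dst=1,val=20
 *   -numa cpu,node-id=0,socket-id=1
 *   -numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,latency=5
 *   -numa hmat-cache,node-id=0,size=10K,level=1,associativity=direct,policy=write-back,line=8
 *
 * The implied "type" key selects which of the parsers below handles the
 * remaining keys.
 */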

static int have_memdevs;
bool numa_uses_legacy_mem(void)
{
    return !have_memdevs;
}

static int have_mem;
static int max_numa_nodeid; /* Highest specified NUMA node ID, plus one.
                             * For all nodes, nodeid < max_numa_nodeid
                             */

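/*
 * Register one NUMA node from a "-numa node,..." option: bind the listed
 * CPUs to the node, record its RAM (legacy mem= or a memdev= backend) and
 * optional HMAT initiator, and mark the node as present.
 */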
static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
                            Error **errp)
{
    Error *err = NULL;
    uint16_t nodenr;
    uint16List *cpus = NULL;
    MachineClass *mc = MACHINE_GET_CLASS(ms);
    unsigned int max_cpus = ms->smp.max_cpus;
    NodeInfo *numa_info = ms->numa_state->nodes;

    if (node->has_nodeid) {
        nodenr = node->nodeid;
    } else {
        nodenr = ms->numa_state->num_nodes;
    }

    if (nodenr >= MAX_NODES) {
        error_setg(errp, "Max number of NUMA nodes reached: %"
                   PRIu16 "", nodenr);
        return;
    }

    if (numa_info[nodenr].present) {
        error_setg(errp, "Duplicate NUMA nodeid: %" PRIu16, nodenr);
        return;
    }

    for (cpus = node->cpus; cpus; cpus = cpus->next) {
        CpuInstanceProperties props;
        if (cpus->value >= max_cpus) {
            error_setg(errp,
                       "CPU index (%" PRIu16 ")"
                       " should be smaller than maxcpus (%d)",
                       cpus->value, max_cpus);
            return;
        }
        props = mc->cpu_index_to_instance_props(ms, cpus->value);
        props.node_id = nodenr;
        props.has_node_id = true;
        machine_set_cpu_numa_node(ms, &props, &err);
        if (err) {
            error_propagate(errp, err);
            return;
        }
    }

    have_memdevs = have_memdevs ? : node->has_memdev;
    have_mem = have_mem ? : node->has_mem;
    if ((node->has_mem && have_memdevs) || (node->has_memdev && have_mem)) {
        error_setg(errp, "numa configuration should use either mem= or memdev=,"
                   " mixing both is not allowed");
        return;
    }

    if (node->has_mem) {
        numa_info[nodenr].node_mem = node->mem;
        if (!qtest_enabled()) {
            warn_report("Parameter -numa node,mem is deprecated,"
                        " use -numa node,memdev instead");
        }
    }
    if (node->has_memdev) {
        Object *o;
        o = object_resolve_path_type(node->memdev, TYPE_MEMORY_BACKEND, NULL);
        if (!o) {
            error_setg(errp, "memdev=%s is ambiguous", node->memdev);
            return;
        }

        object_ref(o);
        numa_info[nodenr].node_mem = object_property_get_uint(o, "size", NULL);
        numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
    }

    /*
     * If the initiator is not set, default it to MAX_NODES.  If HMAT is
     * enabled and this node has no CPUs, QEMU will raise an error.
     */
    numa_info[nodenr].initiator = MAX_NODES;
    if (node->has_initiator) {
        if (!ms->numa_state->hmat_enabled) {
            error_setg(errp, "ACPI Heterogeneous Memory Attribute Table "
                       "(HMAT) is disabled, enable it with -machine hmat=on "
                       "before using any of hmat specific options");
            return;
        }

        if (node->initiator >= MAX_NODES) {
            error_setg(errp, "The initiator id %" PRIu16 " expects an integer "
                       "between 0 and %d", node->initiator,
                       MAX_NODES - 1);
            return;
        }

        numa_info[nodenr].initiator = node->initiator;
    }
    numa_info[nodenr].present = true;
    max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
    ms->numa_state->num_nodes++;
}

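/*
 * Record one entry of the guest-visible NUMA distance table, as given by
 * "-numa dist,src=S,dst=D,val=V".  Distances below NUMA_DISTANCE_MIN are
 * rejected, and a node's distance to itself must be exactly
 * NUMA_DISTANCE_MIN.
 */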
static
void parse_numa_distance(MachineState *ms, NumaDistOptions *dist, Error **errp)
{
    uint16_t src = dist->src;
    uint16_t dst = dist->dst;
    uint8_t val = dist->val;
    NodeInfo *numa_info = ms->numa_state->nodes;

    if (src >= MAX_NODES || dst >= MAX_NODES) {
        error_setg(errp, "Parameter '%s' expects an integer between 0 and %d",
                   src >= MAX_NODES ? "src" : "dst", MAX_NODES - 1);
        return;
    }

    if (!numa_info[src].present || !numa_info[dst].present) {
        error_setg(errp, "Source/Destination NUMA node is missing. "
                   "Please use '-numa node' option to declare it first.");
        return;
    }

    if (val < NUMA_DISTANCE_MIN) {
        error_setg(errp, "NUMA distance (%" PRIu8 ") is invalid, "
                   "it shouldn't be less than %d.",
                   val, NUMA_DISTANCE_MIN);
        return;
    }

    if (src == dst && val != NUMA_DISTANCE_MIN) {
        error_setg(errp, "Local distance of node %d should be %d.",
                   src, NUMA_DISTANCE_MIN);
        return;
    }

    numa_info[src].distance[dst] = val;
    ms->numa_state->have_numa_distance = true;
}

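/*
 * Record one ACPI HMAT latency/bandwidth entry, illustratively:
 *
 *   -numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,latency=5
 *
 * The HMAT "System Locality Latency and Bandwidth Information Structure"
 * stores each value as a 16-bit multiple of a common base unit, so the code
 * below keeps a running base (a power of ten for latencies, a power of two
 * for bandwidths) shared by all entries of the same hierarchy/data-type and
 * rejects values that cannot be expressed that way.
 */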
void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node,
                        Error **errp)
{
    int i, first_bit, last_bit;
    uint64_t max_entry, temp_base, bitmap_copy;
    NodeInfo *numa_info = numa_state->nodes;
    HMAT_LB_Info *hmat_lb =
        numa_state->hmat_lb[node->hierarchy][node->data_type];
    HMAT_LB_Data lb_data = {};
    HMAT_LB_Data *lb_temp;

    /* Error checking */
    if (node->initiator > numa_state->num_nodes) {
        error_setg(errp, "Invalid initiator=%d, it should be less than %d",
                   node->initiator, numa_state->num_nodes);
        return;
    }
    if (node->target > numa_state->num_nodes) {
        error_setg(errp, "Invalid target=%d, it should be less than %d",
                   node->target, numa_state->num_nodes);
        return;
    }
    if (!numa_info[node->initiator].has_cpu) {
        error_setg(errp, "Invalid initiator=%d, it isn't an "
                   "initiator proximity domain", node->initiator);
        return;
    }
    if (!numa_info[node->target].present) {
        error_setg(errp, "The target=%d should point to an existing node",
                   node->target);
        return;
    }

    if (!hmat_lb) {
        hmat_lb = g_malloc0(sizeof(*hmat_lb));
        numa_state->hmat_lb[node->hierarchy][node->data_type] = hmat_lb;
        hmat_lb->list = g_array_new(false, true, sizeof(HMAT_LB_Data));
    }
    hmat_lb->hierarchy = node->hierarchy;
    hmat_lb->data_type = node->data_type;
    lb_data.initiator = node->initiator;
    lb_data.target = node->target;

    if (node->data_type <= HMATLB_DATA_TYPE_WRITE_LATENCY) {
        /* Input latency data */

        if (!node->has_latency) {
            error_setg(errp, "Missing 'latency' option");
            return;
        }
        if (node->has_bandwidth) {
            error_setg(errp, "Invalid option 'bandwidth' since "
                       "the data type is latency");
            return;
        }

        /* Detect duplicate configuration */
        for (i = 0; i < hmat_lb->list->len; i++) {
            lb_temp = &g_array_index(hmat_lb->list, HMAT_LB_Data, i);

            if (node->initiator == lb_temp->initiator &&
                node->target == lb_temp->target) {
                error_setg(errp, "Duplicate configuration of the latency for "
                           "initiator=%d and target=%d", node->initiator,
                           node->target);
                return;
            }
        }

        hmat_lb->base = hmat_lb->base ? hmat_lb->base : UINT64_MAX;

        if (node->latency) {
            /* Calculate the temporary base and compressed latency */
            max_entry = node->latency;
            temp_base = 1;
            while (QEMU_IS_ALIGNED(max_entry, 10)) {
                max_entry /= 10;
                temp_base *= 10;
            }

            /* Calculate the max compressed latency */
            temp_base = MIN(hmat_lb->base, temp_base);
            max_entry = node->latency / hmat_lb->base;
            max_entry = MAX(hmat_lb->range_bitmap, max_entry);

            /*
             * For latency hmat_lb->range_bitmap record the max compressed
             * latency which should be less than 0xFFFF (UINT16_MAX)
             */
            if (max_entry >= UINT16_MAX) {
293 | error_setg(errp, "Latency %" PRIu64 " between initiator=%d and " | |
294 | "target=%d should not differ from previously entered " | |
295 | "min or max values on more than %d", node->latency, | |
296 | node->initiator, node->target, UINT16_MAX - 1); | |
                return;
            } else {
                hmat_lb->base = temp_base;
                hmat_lb->range_bitmap = max_entry;
            }

            /*
             * Set lb_info_provided bit 0 as 1,
             * latency information is provided
             */
            numa_info[node->target].lb_info_provided |= BIT(0);
        }
        lb_data.data = node->latency;
    } else if (node->data_type >= HMATLB_DATA_TYPE_ACCESS_BANDWIDTH) {
        /* Input bandwidth data */
        if (!node->has_bandwidth) {
            error_setg(errp, "Missing 'bandwidth' option");
            return;
        }
        if (node->has_latency) {
            error_setg(errp, "Invalid option 'latency' since "
                       "the data type is bandwidth");
            return;
        }
        if (!QEMU_IS_ALIGNED(node->bandwidth, MiB)) {
            error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d and "
                       "target=%d should be 1MB aligned", node->bandwidth,
                       node->initiator, node->target);
            return;
        }

        /* Detect duplicate configuration */
        for (i = 0; i < hmat_lb->list->len; i++) {
            lb_temp = &g_array_index(hmat_lb->list, HMAT_LB_Data, i);

            if (node->initiator == lb_temp->initiator &&
                node->target == lb_temp->target) {
                error_setg(errp, "Duplicate configuration of the bandwidth for "
                           "initiator=%d and target=%d", node->initiator,
                           node->target);
                return;
            }
        }

        hmat_lb->base = hmat_lb->base ? hmat_lb->base : 1;

        if (node->bandwidth) {
            /* Keep bitmap unchanged when bandwidth out of range */
            bitmap_copy = hmat_lb->range_bitmap;
            bitmap_copy |= node->bandwidth;
            first_bit = ctz64(bitmap_copy);
            temp_base = UINT64_C(1) << first_bit;
            max_entry = node->bandwidth / temp_base;
            last_bit = 64 - clz64(bitmap_copy);

            /*
             * For bandwidth, first_bit record the base unit of bandwidth bits,
             * last_bit record the last bit of the max bandwidth. The max
             * compressed bandwidth should be less than 0xFFFF (UINT16_MAX)
             */
            if ((last_bit - first_bit) > UINT16_BITS ||
                max_entry >= UINT16_MAX) {
359 | error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d " | |
360 | "and target=%d should not differ from previously " | |
361 | "entered values on more than %d", node->bandwidth, | |
362 | node->initiator, node->target, UINT16_MAX - 1); | |
                return;
            } else {
                hmat_lb->base = temp_base;
                hmat_lb->range_bitmap = bitmap_copy;
            }

            /*
             * Set lb_info_provided bit 1 as 1,
             * bandwidth information is provided
             */
            numa_info[node->target].lb_info_provided |= BIT(1);
        }
        lb_data.data = node->bandwidth;
    } else {
        assert(0);
    }

    g_array_append_val(hmat_lb->list, lb_data);
}

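/*
 * Record one ACPI HMAT memory-side cache description, illustratively:
 *
 *   -numa hmat-cache,node-id=0,size=10K,level=1,associativity=direct,policy=write-back,line=8
 *
 * The node's latency and bandwidth must have been configured first, and
 * cache sizes must shrink as the level number grows (level N smaller than
 * level N-1).
 */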
void parse_numa_hmat_cache(MachineState *ms, NumaHmatCacheOptions *node,
                           Error **errp)
{
    int nb_numa_nodes = ms->numa_state->num_nodes;
    NodeInfo *numa_info = ms->numa_state->nodes;
    NumaHmatCacheOptions *hmat_cache = NULL;

    if (node->node_id >= nb_numa_nodes) {
        error_setg(errp, "Invalid node-id=%" PRIu32 ", it should be less "
                   "than %d", node->node_id, nb_numa_nodes);
        return;
    }

    if (numa_info[node->node_id].lb_info_provided != (BIT(0) | BIT(1))) {
        error_setg(errp, "The latency and bandwidth information of "
                   "node-id=%" PRIu32 " should be provided before memory side "
                   "cache attributes", node->node_id);
        return;
    }

    if (node->level < 1 || node->level >= HMAT_LB_LEVELS) {
        error_setg(errp, "Invalid level=%" PRIu8 ", it should be larger than 0 "
                   "and less than or equal to %d", node->level,
                   HMAT_LB_LEVELS - 1);
        return;
    }

    assert(node->associativity < HMAT_CACHE_ASSOCIATIVITY__MAX);
    assert(node->policy < HMAT_CACHE_WRITE_POLICY__MAX);
    if (ms->numa_state->hmat_cache[node->node_id][node->level]) {
        error_setg(errp, "Duplicate configuration of the side cache for "
                   "node-id=%" PRIu32 " and level=%" PRIu8,
                   node->node_id, node->level);
        return;
    }

    if ((node->level > 1) &&
        ms->numa_state->hmat_cache[node->node_id][node->level - 1] &&
        (node->size >=
            ms->numa_state->hmat_cache[node->node_id][node->level - 1]->size)) {
        error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8
                   " should be less than the size(%" PRIu64 ") of "
                   "level=%u", node->size, node->level,
                   ms->numa_state->hmat_cache[node->node_id]
                                             [node->level - 1]->size,
                   node->level - 1);
        return;
    }

    if ((node->level < HMAT_LB_LEVELS - 1) &&
        ms->numa_state->hmat_cache[node->node_id][node->level + 1] &&
        (node->size <=
            ms->numa_state->hmat_cache[node->node_id][node->level + 1]->size)) {
        error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8
                   " should be larger than the size(%" PRIu64 ") of "
                   "level=%u", node->size, node->level,
                   ms->numa_state->hmat_cache[node->node_id]
                                             [node->level + 1]->size,
                   node->level + 1);
        return;
    }

    hmat_cache = g_malloc0(sizeof(*hmat_cache));
    memcpy(hmat_cache, node, sizeof(*hmat_cache));
    ms->numa_state->hmat_cache[node->node_id][node->level] = hmat_cache;
}

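/*
 * Dispatch one parsed NumaOptions object to the matching handler above
 * (node, dist, cpu, hmat-lb or hmat-cache).  The HMAT variants are only
 * accepted when the machine was started with "-machine hmat=on".
 */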
void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp)
{
    Error *err = NULL;

    if (!ms->numa_state) {
        error_setg(errp, "NUMA is not supported by this machine-type");
        goto end;
    }

    switch (object->type) {
    case NUMA_OPTIONS_TYPE_NODE:
        parse_numa_node(ms, &object->u.node, &err);
        if (err) {
            goto end;
        }
        break;
    case NUMA_OPTIONS_TYPE_DIST:
        parse_numa_distance(ms, &object->u.dist, &err);
        if (err) {
            goto end;
        }
        break;
    case NUMA_OPTIONS_TYPE_CPU:
        if (!object->u.cpu.has_node_id) {
            error_setg(&err, "Missing mandatory node-id property");
            goto end;
        }
        if (!ms->numa_state->nodes[object->u.cpu.node_id].present) {
            error_setg(&err, "Invalid node-id=%" PRId64 ", NUMA node must be "
                       "defined with -numa node,nodeid=ID before it's used with "
                       "-numa cpu,node-id=ID", object->u.cpu.node_id);
            goto end;
        }

        machine_set_cpu_numa_node(ms, qapi_NumaCpuOptions_base(&object->u.cpu),
                                  &err);
        break;
    case NUMA_OPTIONS_TYPE_HMAT_LB:
        if (!ms->numa_state->hmat_enabled) {
            error_setg(errp, "ACPI Heterogeneous Memory Attribute Table "
                       "(HMAT) is disabled, enable it with -machine hmat=on "
                       "before using any of hmat specific options");
            return;
        }

        parse_numa_hmat_lb(ms->numa_state, &object->u.hmat_lb, &err);
        if (err) {
            goto end;
        }
        break;
    case NUMA_OPTIONS_TYPE_HMAT_CACHE:
        if (!ms->numa_state->hmat_enabled) {
            error_setg(errp, "ACPI Heterogeneous Memory Attribute Table "
                       "(HMAT) is disabled, enable it with -machine hmat=on "
                       "before using any of hmat specific options");
            return;
        }

        parse_numa_hmat_cache(ms, &object->u.hmat_cache, &err);
        if (err) {
            goto end;
        }
        break;
    default:
        abort();
    }

end:
    error_propagate(errp, err);
}

static int parse_numa(void *opaque, QemuOpts *opts, Error **errp)
{
    NumaOptions *object = NULL;
    MachineState *ms = MACHINE(opaque);
    Error *err = NULL;
    Visitor *v = opts_visitor_new(opts);

    visit_type_NumaOptions(v, NULL, &object, &err);
    visit_free(v);
    if (err) {
        goto end;
    }

    /* Fix up legacy suffix-less format */
    if ((object->type == NUMA_OPTIONS_TYPE_NODE) && object->u.node.has_mem) {
        const char *mem_str = qemu_opt_get(opts, "mem");
        qemu_strtosz_MiB(mem_str, NULL, &object->u.node.mem);
    }

    set_numa_options(ms, object, &err);

end:
    qapi_free_NumaOptions(object);
    if (err) {
        error_propagate(errp, err);
        return -1;
    }

    return 0;
}

/* If all node pair distances are symmetric, then only distances
 * in one direction are enough. If there is even one asymmetric
 * pair, though, then all distances must be provided. The
 * distance from a node to itself is always NUMA_DISTANCE_MIN,
 * so providing it is never necessary.
 */
static void validate_numa_distance(MachineState *ms)
{
    int src, dst;
    bool is_asymmetrical = false;
    int nb_numa_nodes = ms->numa_state->num_nodes;
    NodeInfo *numa_info = ms->numa_state->nodes;

    for (src = 0; src < nb_numa_nodes; src++) {
        for (dst = src; dst < nb_numa_nodes; dst++) {
            if (numa_info[src].distance[dst] == 0 &&
                numa_info[dst].distance[src] == 0) {
                if (src != dst) {
                    error_report("The distance between node %d and %d is "
                                 "missing; at least one distance value "
                                 "between each pair of nodes should be provided.",
                                 src, dst);
                    exit(EXIT_FAILURE);
                }
            }

            if (numa_info[src].distance[dst] != 0 &&
                numa_info[dst].distance[src] != 0 &&
                numa_info[src].distance[dst] !=
                numa_info[dst].distance[src]) {
                is_asymmetrical = true;
            }
        }
    }

    if (is_asymmetrical) {
        for (src = 0; src < nb_numa_nodes; src++) {
            for (dst = 0; dst < nb_numa_nodes; dst++) {
                if (src != dst && numa_info[src].distance[dst] == 0) {
                    error_report("At least one asymmetrical pair of "
                                 "distances is given, please provide distances "
                                 "for both directions of all node pairs.");
                    exit(EXIT_FAILURE);
                }
            }
        }
    }
}

static void complete_init_numa_distance(MachineState *ms)
{
    int src, dst;
    NodeInfo *numa_info = ms->numa_state->nodes;

    /* Fill in any missing distances using the symmetric policy.  If the
     * table is asymmetric it must already be complete, apart from the
     * node-to-itself entries, which validate_numa_distance() has verified
     * above.
     */
    for (src = 0; src < ms->numa_state->num_nodes; src++) {
        for (dst = 0; dst < ms->numa_state->num_nodes; dst++) {
            if (numa_info[src].distance[dst] == 0) {
                if (src == dst) {
                    numa_info[src].distance[dst] = NUMA_DISTANCE_MIN;
                } else {
                    numa_info[src].distance[dst] = numa_info[dst].distance[src];
                }
            }
        }
    }
}

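/*
 * The two helpers below split the machine RAM across nodes when the user
 * gave no per-node sizes.  Both round every node but the last down to the
 * machine's alignment; the default variant additionally carries the
 * rounding remainder forward so node sizes stay close together.  A toy
 * illustration with size=10, nb_nodes=3 and a hypothetical 2-byte
 * alignment (numa_mem_align_shift of 1):
 *
 *   legacy:  10/3 = 3, rounded to 2 and 2, remainder 6  ->  2, 2, 6
 *   default: 2 (carry 1), then 3+1 = 4 (carry 0), rest  ->  2, 4, 4
 */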
void numa_legacy_auto_assign_ram(MachineClass *mc, NodeInfo *nodes,
                                 int nb_nodes, ram_addr_t size)
{
    int i;
    uint64_t usedmem = 0;

    /* Align each node according to the alignment
     * requirements of the machine class
     */

    for (i = 0; i < nb_nodes - 1; i++) {
        nodes[i].node_mem = (size / nb_nodes) &
                            ~((1 << mc->numa_mem_align_shift) - 1);
        usedmem += nodes[i].node_mem;
    }
    nodes[i].node_mem = size - usedmem;
}

void numa_default_auto_assign_ram(MachineClass *mc, NodeInfo *nodes,
                                  int nb_nodes, ram_addr_t size)
{
    int i;
    uint64_t usedmem = 0, node_mem;
    uint64_t granularity = size / nb_nodes;
    uint64_t propagate = 0;

    for (i = 0; i < nb_nodes - 1; i++) {
        node_mem = (granularity + propagate) &
                   ~((1 << mc->numa_mem_align_shift) - 1);
        propagate = granularity + propagate - node_mem;
        nodes[i].node_mem = node_mem;
        usedmem += node_mem;
    }
    nodes[i].node_mem = size - usedmem;
}

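/*
 * Assemble the machine RAM as a container region: each node's memdev
 * backend becomes a subregion, laid out contiguously in node order.
 */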
static void numa_init_memdev_container(MachineState *ms, MemoryRegion *ram)
{
    int i;
    uint64_t addr = 0;

    for (i = 0; i < ms->numa_state->num_nodes; i++) {
        uint64_t size = ms->numa_state->nodes[i].node_mem;
        HostMemoryBackend *backend = ms->numa_state->nodes[i].node_memdev;
        if (!backend) {
            continue;
        }
        MemoryRegion *seg = machine_consume_memdev(ms, backend);
        memory_region_add_subregion(ram, addr, seg);
        addr += size;
    }
}

void numa_complete_configuration(MachineState *ms)
{
    int i;
    MachineClass *mc = MACHINE_GET_CLASS(ms);
    NodeInfo *numa_info = ms->numa_state->nodes;

    /*
     * If memory hotplug is enabled (slots > 0) but no '-numa' options were
     * given explicitly on the CLI, guests will break.
     *
     * Windows: won't enable memory hotplug without an SRAT table at all.
     *
     * Linux: if QEMU is started with initial memory all below 4Gb
     * and no SRAT table is present, the guest kernel will use nommu DMA ops,
     * which breaks 32bit hw drivers when memory is hotplugged and the
     * guest tries to use it with those drivers.
     *
     * Enable NUMA implicitly by adding a new NUMA node automatically.
     *
     * Or if MachineClass::auto_enable_numa is true and there are no NUMA
     * nodes, assume there is just one node with the whole RAM.
     */
    if (ms->numa_state->num_nodes == 0 &&
        ((ms->ram_slots > 0 &&
        mc->auto_enable_numa_with_memhp) ||
         mc->auto_enable_numa)) {
        NumaNodeOptions node = { };
        parse_numa_node(ms, &node, &error_abort);
        numa_info[0].node_mem = ram_size;
    }

    assert(max_numa_nodeid <= MAX_NODES);

    /* No support for sparse NUMA node IDs yet: */
    for (i = max_numa_nodeid - 1; i >= 0; i--) {
        /* Report large node IDs first, to make mistakes easier to spot */
        if (!numa_info[i].present) {
            error_report("numa: Node ID missing: %d", i);
            exit(1);
        }
    }

    /* This must be always true if all nodes are present: */
    assert(ms->numa_state->num_nodes == max_numa_nodeid);

    if (ms->numa_state->num_nodes > 0) {
        uint64_t numa_total;

        if (ms->numa_state->num_nodes > MAX_NODES) {
            ms->numa_state->num_nodes = MAX_NODES;
        }

        /* If no memory size is given for any node, assume the default case
         * and distribute the available memory equally across all nodes
         */
        for (i = 0; i < ms->numa_state->num_nodes; i++) {
            if (numa_info[i].node_mem != 0) {
                break;
            }
        }
        if (i == ms->numa_state->num_nodes) {
            assert(mc->numa_auto_assign_ram);
            mc->numa_auto_assign_ram(mc, numa_info,
                                     ms->numa_state->num_nodes, ram_size);
            if (!qtest_enabled()) {
                warn_report("Default splitting of RAM between nodes is "
                            "deprecated, use '-numa node,memdev' to "
                            "explicitly define RAM allocation per node");
            }
        }

        numa_total = 0;
        for (i = 0; i < ms->numa_state->num_nodes; i++) {
            numa_total += numa_info[i].node_mem;
        }
        if (numa_total != ram_size) {
            error_report("total memory for NUMA nodes (0x%" PRIx64 ")"
                         " should equal RAM size (0x" RAM_ADDR_FMT ")",
                         numa_total, ram_size);
            exit(1);
        }

        if (!numa_uses_legacy_mem() && mc->default_ram_id) {
            ms->ram = g_new(MemoryRegion, 1);
            memory_region_init(ms->ram, OBJECT(ms), mc->default_ram_id,
                               ram_size);
            numa_init_memdev_container(ms, ms->ram);
        }
        /* QEMU needs at least all unique node pair distances to build
         * the whole NUMA distance table. QEMU treats the distance table
         * as symmetric by default, i.e. distance A->B == distance B->A.
         * Thus, QEMU is able to complete the distance table
         * initialization even though only distance A->B is provided and
         * distance B->A is not. QEMU knows the distance of a node to
         * itself is always 10, so A->A distances may be omitted. When
         * the distances of two nodes of a pair differ, i.e. distance
         * A->B != distance B->A, then that means the distance table is
         * asymmetric. In this case, the distances for both directions
         * of all node pairs are required.
         */
        if (ms->numa_state->have_numa_distance) {
            /* Validate enough NUMA distance information was provided. */
            validate_numa_distance(ms);

            /* Validation succeeded, now fill in any missing distances. */
            complete_init_numa_distance(ms);
        }
    }
}

void parse_numa_opts(MachineState *ms)
{
    qemu_opts_foreach(qemu_find_opts("numa"), parse_numa, ms, &error_fatal);
}

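/*
 * Called by board code before a CPU is (pre-)plugged: check the device's
 * node-id property against the board's CPU slot table, and fill it in when
 * the caller (notably older libvirt) left it unset.
 */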
void numa_cpu_pre_plug(const CPUArchId *slot, DeviceState *dev, Error **errp)
{
    int node_id = object_property_get_int(OBJECT(dev), "node-id", &error_abort);

    if (node_id == CPU_UNSET_NUMA_NODE_ID) {
        /* due to a bug in libvirt, it doesn't pass node-id from props on
         * device_add as expected, so we have to fix it up here */
        if (slot->props.has_node_id) {
            object_property_set_int(OBJECT(dev), slot->props.node_id,
                                    "node-id", errp);
        }
    } else if (node_id != slot->props.node_id) {
        error_setg(errp, "invalid node-id, must be %"PRId64,
                   slot->props.node_id);
    }
}

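/*
 * Add the sizes of plugged memory devices (DIMM, NVDIMM, virtio-pmem) to
 * the per-node totals, e.g. for the monitor's "info numa" output.
 */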
static void numa_stat_memory_devices(NumaNodeMem node_mem[])
{
    MemoryDeviceInfoList *info_list = qmp_memory_device_list();
    MemoryDeviceInfoList *info;
    PCDIMMDeviceInfo *pcdimm_info;
    VirtioPMEMDeviceInfo *vpi;

    for (info = info_list; info; info = info->next) {
        MemoryDeviceInfo *value = info->value;

        if (value) {
            switch (value->type) {
            case MEMORY_DEVICE_INFO_KIND_DIMM:
            case MEMORY_DEVICE_INFO_KIND_NVDIMM:
                pcdimm_info = value->type == MEMORY_DEVICE_INFO_KIND_DIMM ?
                              value->u.dimm.data : value->u.nvdimm.data;
                node_mem[pcdimm_info->node].node_mem += pcdimm_info->size;
                node_mem[pcdimm_info->node].node_plugged_mem +=
                    pcdimm_info->size;
                break;
            case MEMORY_DEVICE_INFO_KIND_VIRTIO_PMEM:
                vpi = value->u.virtio_pmem.data;
                /* TODO: once we support numa, assign to right node */
                node_mem[0].node_mem += vpi->size;
                node_mem[0].node_plugged_mem += vpi->size;
                break;
            default:
                g_assert_not_reached();
            }
        }
    }
    qapi_free_MemoryDeviceInfoList(info_list);
}

void query_numa_node_mem(NumaNodeMem node_mem[], MachineState *ms)
{
    int i;

    if (ms->numa_state == NULL || ms->numa_state->num_nodes <= 0) {
        return;
    }

    numa_stat_memory_devices(node_mem);
    for (i = 0; i < ms->numa_state->num_nodes; i++) {
        node_mem[i].node_mem += ms->numa_state->nodes[i].node_mem;
    }
}

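/*
 * RAM block notifier plumbing: callers register a RAMBlockNotifier to be
 * told whenever a RAM block is added to or removed from the global
 * ram_list.
 */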
void ram_block_notifier_add(RAMBlockNotifier *n)
{
    QLIST_INSERT_HEAD(&ram_list.ramblock_notifiers, n, next);
}

void ram_block_notifier_remove(RAMBlockNotifier *n)
{
    QLIST_REMOVE(n, next);
}

void ram_block_notify_add(void *host, size_t size)
{
    RAMBlockNotifier *notifier;

    QLIST_FOREACH(notifier, &ram_list.ramblock_notifiers, next) {
        notifier->ram_block_added(notifier, host, size);
    }
}

void ram_block_notify_remove(void *host, size_t size)
{
    RAMBlockNotifier *notifier;

    QLIST_FOREACH(notifier, &ram_list.ramblock_notifiers, next) {
        notifier->ram_block_removed(notifier, host, size);
    }
}