]>
Commit | Line | Data |
---|---|---|
96d0e26c WG |
1 | /* |
2 | * NUMA parameter parsing routines | |
3 | * | |
4 | * Copyright (c) 2014 Fujitsu Ltd. | |
5 | * | |
6 | * Permission is hereby granted, free of charge, to any person obtaining a copy | |
7 | * of this software and associated documentation files (the "Software"), to deal | |
8 | * in the Software without restriction, including without limitation the rights | |
9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
10 | * copies of the Software, and to permit persons to whom the Software is | |
11 | * furnished to do so, subject to the following conditions: | |
12 | * | |
13 | * The above copyright notice and this permission notice shall be included in | |
14 | * all copies or substantial portions of the Software. | |
15 | * | |
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
19 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
22 | * THE SOFTWARE. | |
23 | */ | |
24 | ||
d38ea87a | 25 | #include "qemu/osdep.h" |
9b12dfa0 | 26 | #include "qemu/units.h" |
b58c5c2d | 27 | #include "sysemu/hostmem.h" |
e35704ba | 28 | #include "sysemu/numa.h" |
46517dd4 | 29 | #include "sysemu/sysemu.h" |
96d0e26c | 30 | #include "exec/cpu-common.h" |
0987d735 | 31 | #include "exec/ramlist.h" |
96d0e26c | 32 | #include "qemu/bitmap.h" |
2b631ec2 | 33 | #include "qemu/error-report.h" |
e688df6b | 34 | #include "qapi/error.h" |
0042109a | 35 | #include "qapi/opts-visitor.h" |
8ac25c84 | 36 | #include "qapi/qapi-visit-machine.h" |
f8123f22 | 37 | #include "sysemu/qtest.h" |
2e5b09fd | 38 | #include "hw/core/cpu.h" |
5b009e40 | 39 | #include "hw/mem/pc-dimm.h" |
d6454270 | 40 | #include "migration/vmstate.h" |
12e9493d | 41 | #include "hw/boards.h" |
2cc0e2e8 | 42 | #include "hw/mem/memory-device.h" |
7dcd1d70 EH |
43 | #include "qemu/option.h" |
44 | #include "qemu/config-file.h" | |
cc001888 | 45 | #include "qemu/cutils.h" |
0042109a WG |
46 | |
/*
 * Option list backing the "-numa" command line option.  The first bare
 * value is treated as the "type" key (node/dist/cpu/hmat-lb/hmat-cache);
 * no static descriptors are given because the OptsVisitor validates the
 * fields against the QAPI NumaOptions schema at parse time.
 */
QemuOptsList qemu_numa_opts = {
    .name = "numa",
    .implied_opt_name = "type",
    .head = QTAILQ_HEAD_INITIALIZER(qemu_numa_opts.head),
    .desc = { { 0 } } /* validated with OptsVisitor */
};
53 | ||
/* Latched to non-zero by parse_numa_node() once any node uses memdev=
 * (have_memdevs) or mem= (have_mem); used to reject configurations that
 * mix the two ways of assigning node memory. */
static int have_memdevs;
static int have_mem;
static int max_numa_nodeid; /* Highest specified NUMA node ID, plus one.
                             * For all nodes, nodeid < max_numa_nodeid
                             */
e75e2a14 | 59 | |
64c2a8f6 | 60 | static void parse_numa_node(MachineState *ms, NumaNodeOptions *node, |
cc001888 | 61 | Error **errp) |
96d0e26c | 62 | { |
a22528b9 | 63 | Error *err = NULL; |
0042109a WG |
64 | uint16_t nodenr; |
65 | uint16List *cpus = NULL; | |
64c2a8f6 | 66 | MachineClass *mc = MACHINE_GET_CLASS(ms); |
5cc8767d | 67 | unsigned int max_cpus = ms->smp.max_cpus; |
7e721e7b | 68 | NodeInfo *numa_info = ms->numa_state->nodes; |
96d0e26c | 69 | |
0042109a WG |
70 | if (node->has_nodeid) { |
71 | nodenr = node->nodeid; | |
96d0e26c | 72 | } else { |
aa570207 | 73 | nodenr = ms->numa_state->num_nodes; |
96d0e26c WG |
74 | } |
75 | ||
0042109a WG |
76 | if (nodenr >= MAX_NODES) { |
77 | error_setg(errp, "Max number of NUMA nodes reached: %" | |
01bbbcf4 | 78 | PRIu16 "", nodenr); |
0042109a | 79 | return; |
96d0e26c WG |
80 | } |
81 | ||
1945b9d8 EH |
82 | if (numa_info[nodenr].present) { |
83 | error_setg(errp, "Duplicate NUMA nodeid: %" PRIu16, nodenr); | |
84 | return; | |
85 | } | |
86 | ||
0042109a | 87 | for (cpus = node->cpus; cpus; cpus = cpus->next) { |
7c88e65d | 88 | CpuInstanceProperties props; |
8979c945 EH |
89 | if (cpus->value >= max_cpus) { |
90 | error_setg(errp, | |
91 | "CPU index (%" PRIu16 ")" | |
92 | " should be smaller than maxcpus (%d)", | |
93 | cpus->value, max_cpus); | |
0042109a WG |
94 | return; |
95 | } | |
7c88e65d IM |
96 | props = mc->cpu_index_to_instance_props(ms, cpus->value); |
97 | props.node_id = nodenr; | |
98 | props.has_node_id = true; | |
a22528b9 MA |
99 | machine_set_cpu_numa_node(ms, &props, &err); |
100 | if (err) { | |
101 | error_propagate(errp, err); | |
102 | return; | |
103 | } | |
96d0e26c WG |
104 | } |
105 | ||
b69239e0 IM |
106 | have_memdevs = have_memdevs ? : node->has_memdev; |
107 | have_mem = have_mem ? : node->has_mem; | |
108 | if ((node->has_mem && have_memdevs) || (node->has_memdev && have_mem)) { | |
109 | error_setg(errp, "numa configuration should use either mem= or memdev=," | |
110 | "mixing both is not allowed"); | |
7febe36f PB |
111 | return; |
112 | } | |
113 | ||
0042109a | 114 | if (node->has_mem) { |
cc001888 | 115 | numa_info[nodenr].node_mem = node->mem; |
f8123f22 EH |
116 | if (!qtest_enabled()) { |
117 | warn_report("Parameter -numa node,mem is deprecated," | |
118 | " use -numa node,memdev instead"); | |
119 | } | |
0042109a | 120 | } |
7febe36f PB |
121 | if (node->has_memdev) { |
122 | Object *o; | |
123 | o = object_resolve_path_type(node->memdev, TYPE_MEMORY_BACKEND, NULL); | |
124 | if (!o) { | |
125 | error_setg(errp, "memdev=%s is ambiguous", node->memdev); | |
126 | return; | |
127 | } | |
128 | ||
129 | object_ref(o); | |
61d7c144 | 130 | numa_info[nodenr].node_mem = object_property_get_uint(o, "size", NULL); |
7febe36f PB |
131 | numa_info[nodenr].node_memdev = MEMORY_BACKEND(o); |
132 | } | |
244b3f44 TX |
133 | |
134 | /* | |
135 | * If not set the initiator, set it to MAX_NODES. And if | |
136 | * HMAT is enabled and this node has no cpus, QEMU will raise error. | |
137 | */ | |
138 | numa_info[nodenr].initiator = MAX_NODES; | |
139 | if (node->has_initiator) { | |
140 | if (!ms->numa_state->hmat_enabled) { | |
141 | error_setg(errp, "ACPI Heterogeneous Memory Attribute Table " | |
142 | "(HMAT) is disabled, enable it with -machine hmat=on " | |
143 | "before using any of hmat specific options"); | |
144 | return; | |
145 | } | |
146 | ||
147 | if (node->initiator >= MAX_NODES) { | |
148 | error_report("The initiator id %" PRIu16 " expects an integer " | |
149 | "between 0 and %d", node->initiator, | |
150 | MAX_NODES - 1); | |
151 | return; | |
152 | } | |
153 | ||
154 | numa_info[nodenr].initiator = node->initiator; | |
155 | } | |
1af878e0 EH |
156 | numa_info[nodenr].present = true; |
157 | max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1); | |
aa570207 | 158 | ms->numa_state->num_nodes++; |
96d0e26c WG |
159 | } |
160 | ||
aa570207 TX |
161 | static |
162 | void parse_numa_distance(MachineState *ms, NumaDistOptions *dist, Error **errp) | |
0f203430 HC |
163 | { |
164 | uint16_t src = dist->src; | |
165 | uint16_t dst = dist->dst; | |
166 | uint8_t val = dist->val; | |
7e721e7b | 167 | NodeInfo *numa_info = ms->numa_state->nodes; |
0f203430 HC |
168 | |
169 | if (src >= MAX_NODES || dst >= MAX_NODES) { | |
74f38e96 IM |
170 | error_setg(errp, "Parameter '%s' expects an integer between 0 and %d", |
171 | src >= MAX_NODES ? "src" : "dst", MAX_NODES - 1); | |
0f203430 HC |
172 | return; |
173 | } | |
174 | ||
175 | if (!numa_info[src].present || !numa_info[dst].present) { | |
176 | error_setg(errp, "Source/Destination NUMA node is missing. " | |
177 | "Please use '-numa node' option to declare it first."); | |
178 | return; | |
179 | } | |
180 | ||
181 | if (val < NUMA_DISTANCE_MIN) { | |
182 | error_setg(errp, "NUMA distance (%" PRIu8 ") is invalid, " | |
183 | "it shouldn't be less than %d.", | |
184 | val, NUMA_DISTANCE_MIN); | |
185 | return; | |
186 | } | |
187 | ||
188 | if (src == dst && val != NUMA_DISTANCE_MIN) { | |
189 | error_setg(errp, "Local distance of node %d should be %d.", | |
190 | src, NUMA_DISTANCE_MIN); | |
191 | return; | |
192 | } | |
193 | ||
194 | numa_info[src].distance[dst] = val; | |
118154b7 | 195 | ms->numa_state->have_numa_distance = true; |
0f203430 HC |
196 | } |
197 | ||
9b12dfa0 LJ |
198 | void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, |
199 | Error **errp) | |
200 | { | |
201 | int i, first_bit, last_bit; | |
202 | uint64_t max_entry, temp_base, bitmap_copy; | |
203 | NodeInfo *numa_info = numa_state->nodes; | |
204 | HMAT_LB_Info *hmat_lb = | |
205 | numa_state->hmat_lb[node->hierarchy][node->data_type]; | |
206 | HMAT_LB_Data lb_data = {}; | |
207 | HMAT_LB_Data *lb_temp; | |
208 | ||
209 | /* Error checking */ | |
210 | if (node->initiator > numa_state->num_nodes) { | |
211 | error_setg(errp, "Invalid initiator=%d, it should be less than %d", | |
212 | node->initiator, numa_state->num_nodes); | |
213 | return; | |
214 | } | |
215 | if (node->target > numa_state->num_nodes) { | |
216 | error_setg(errp, "Invalid target=%d, it should be less than %d", | |
217 | node->target, numa_state->num_nodes); | |
218 | return; | |
219 | } | |
220 | if (!numa_info[node->initiator].has_cpu) { | |
221 | error_setg(errp, "Invalid initiator=%d, it isn't an " | |
222 | "initiator proximity domain", node->initiator); | |
223 | return; | |
224 | } | |
225 | if (!numa_info[node->target].present) { | |
226 | error_setg(errp, "The target=%d should point to an existing node", | |
227 | node->target); | |
228 | return; | |
229 | } | |
230 | ||
231 | if (!hmat_lb) { | |
232 | hmat_lb = g_malloc0(sizeof(*hmat_lb)); | |
233 | numa_state->hmat_lb[node->hierarchy][node->data_type] = hmat_lb; | |
234 | hmat_lb->list = g_array_new(false, true, sizeof(HMAT_LB_Data)); | |
235 | } | |
236 | hmat_lb->hierarchy = node->hierarchy; | |
237 | hmat_lb->data_type = node->data_type; | |
238 | lb_data.initiator = node->initiator; | |
239 | lb_data.target = node->target; | |
240 | ||
241 | if (node->data_type <= HMATLB_DATA_TYPE_WRITE_LATENCY) { | |
242 | /* Input latency data */ | |
243 | ||
244 | if (!node->has_latency) { | |
245 | error_setg(errp, "Missing 'latency' option"); | |
246 | return; | |
247 | } | |
248 | if (node->has_bandwidth) { | |
249 | error_setg(errp, "Invalid option 'bandwidth' since " | |
250 | "the data type is latency"); | |
251 | return; | |
252 | } | |
253 | ||
254 | /* Detect duplicate configuration */ | |
255 | for (i = 0; i < hmat_lb->list->len; i++) { | |
256 | lb_temp = &g_array_index(hmat_lb->list, HMAT_LB_Data, i); | |
257 | ||
258 | if (node->initiator == lb_temp->initiator && | |
259 | node->target == lb_temp->target) { | |
260 | error_setg(errp, "Duplicate configuration of the latency for " | |
261 | "initiator=%d and target=%d", node->initiator, | |
262 | node->target); | |
263 | return; | |
264 | } | |
265 | } | |
266 | ||
267 | hmat_lb->base = hmat_lb->base ? hmat_lb->base : UINT64_MAX; | |
268 | ||
269 | if (node->latency) { | |
270 | /* Calculate the temporary base and compressed latency */ | |
271 | max_entry = node->latency; | |
272 | temp_base = 1; | |
273 | while (QEMU_IS_ALIGNED(max_entry, 10)) { | |
274 | max_entry /= 10; | |
275 | temp_base *= 10; | |
276 | } | |
277 | ||
278 | /* Calculate the max compressed latency */ | |
279 | temp_base = MIN(hmat_lb->base, temp_base); | |
280 | max_entry = node->latency / hmat_lb->base; | |
281 | max_entry = MAX(hmat_lb->range_bitmap, max_entry); | |
282 | ||
283 | /* | |
284 | * For latency hmat_lb->range_bitmap record the max compressed | |
285 | * latency which should be less than 0xFFFF (UINT16_MAX) | |
286 | */ | |
287 | if (max_entry >= UINT16_MAX) { | |
288 | error_setg(errp, "Latency %" PRIu64 " between initiator=%d and " | |
289 | "target=%d should not differ from previously entered " | |
290 | "min or max values on more than %d", node->latency, | |
291 | node->initiator, node->target, UINT16_MAX - 1); | |
292 | return; | |
293 | } else { | |
294 | hmat_lb->base = temp_base; | |
295 | hmat_lb->range_bitmap = max_entry; | |
296 | } | |
297 | ||
298 | /* | |
299 | * Set lb_info_provided bit 0 as 1, | |
300 | * latency information is provided | |
301 | */ | |
302 | numa_info[node->target].lb_info_provided |= BIT(0); | |
303 | } | |
304 | lb_data.data = node->latency; | |
305 | } else if (node->data_type >= HMATLB_DATA_TYPE_ACCESS_BANDWIDTH) { | |
306 | /* Input bandwidth data */ | |
307 | if (!node->has_bandwidth) { | |
308 | error_setg(errp, "Missing 'bandwidth' option"); | |
309 | return; | |
310 | } | |
311 | if (node->has_latency) { | |
312 | error_setg(errp, "Invalid option 'latency' since " | |
313 | "the data type is bandwidth"); | |
314 | return; | |
315 | } | |
316 | if (!QEMU_IS_ALIGNED(node->bandwidth, MiB)) { | |
317 | error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d and " | |
318 | "target=%d should be 1MB aligned", node->bandwidth, | |
319 | node->initiator, node->target); | |
320 | return; | |
321 | } | |
322 | ||
323 | /* Detect duplicate configuration */ | |
324 | for (i = 0; i < hmat_lb->list->len; i++) { | |
325 | lb_temp = &g_array_index(hmat_lb->list, HMAT_LB_Data, i); | |
326 | ||
327 | if (node->initiator == lb_temp->initiator && | |
328 | node->target == lb_temp->target) { | |
329 | error_setg(errp, "Duplicate configuration of the bandwidth for " | |
330 | "initiator=%d and target=%d", node->initiator, | |
331 | node->target); | |
332 | return; | |
333 | } | |
334 | } | |
335 | ||
336 | hmat_lb->base = hmat_lb->base ? hmat_lb->base : 1; | |
337 | ||
338 | if (node->bandwidth) { | |
339 | /* Keep bitmap unchanged when bandwidth out of range */ | |
340 | bitmap_copy = hmat_lb->range_bitmap; | |
341 | bitmap_copy |= node->bandwidth; | |
342 | first_bit = ctz64(bitmap_copy); | |
343 | temp_base = UINT64_C(1) << first_bit; | |
344 | max_entry = node->bandwidth / temp_base; | |
345 | last_bit = 64 - clz64(bitmap_copy); | |
346 | ||
347 | /* | |
348 | * For bandwidth, first_bit record the base unit of bandwidth bits, | |
349 | * last_bit record the last bit of the max bandwidth. The max | |
350 | * compressed bandwidth should be less than 0xFFFF (UINT16_MAX) | |
351 | */ | |
352 | if ((last_bit - first_bit) > UINT16_BITS || | |
353 | max_entry >= UINT16_MAX) { | |
354 | error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d " | |
355 | "and target=%d should not differ from previously " | |
356 | "entered values on more than %d", node->bandwidth, | |
357 | node->initiator, node->target, UINT16_MAX - 1); | |
358 | return; | |
359 | } else { | |
360 | hmat_lb->base = temp_base; | |
361 | hmat_lb->range_bitmap = bitmap_copy; | |
362 | } | |
363 | ||
364 | /* | |
365 | * Set lb_info_provided bit 1 as 1, | |
366 | * bandwidth information is provided | |
367 | */ | |
368 | numa_info[node->target].lb_info_provided |= BIT(1); | |
369 | } | |
370 | lb_data.data = node->bandwidth; | |
371 | } else { | |
372 | assert(0); | |
373 | } | |
374 | ||
375 | g_array_append_val(hmat_lb->list, lb_data); | |
376 | } | |
377 | ||
c412a48d LJ |
378 | void parse_numa_hmat_cache(MachineState *ms, NumaHmatCacheOptions *node, |
379 | Error **errp) | |
380 | { | |
381 | int nb_numa_nodes = ms->numa_state->num_nodes; | |
382 | NodeInfo *numa_info = ms->numa_state->nodes; | |
383 | NumaHmatCacheOptions *hmat_cache = NULL; | |
384 | ||
385 | if (node->node_id >= nb_numa_nodes) { | |
386 | error_setg(errp, "Invalid node-id=%" PRIu32 ", it should be less " | |
387 | "than %d", node->node_id, nb_numa_nodes); | |
388 | return; | |
389 | } | |
390 | ||
391 | if (numa_info[node->node_id].lb_info_provided != (BIT(0) | BIT(1))) { | |
392 | error_setg(errp, "The latency and bandwidth information of " | |
393 | "node-id=%" PRIu32 " should be provided before memory side " | |
394 | "cache attributes", node->node_id); | |
395 | return; | |
396 | } | |
397 | ||
398 | if (node->level < 1 || node->level >= HMAT_LB_LEVELS) { | |
399 | error_setg(errp, "Invalid level=%" PRIu8 ", it should be larger than 0 " | |
400 | "and less than or equal to %d", node->level, | |
401 | HMAT_LB_LEVELS - 1); | |
402 | return; | |
403 | } | |
404 | ||
405 | assert(node->associativity < HMAT_CACHE_ASSOCIATIVITY__MAX); | |
406 | assert(node->policy < HMAT_CACHE_WRITE_POLICY__MAX); | |
407 | if (ms->numa_state->hmat_cache[node->node_id][node->level]) { | |
408 | error_setg(errp, "Duplicate configuration of the side cache for " | |
409 | "node-id=%" PRIu32 " and level=%" PRIu8, | |
410 | node->node_id, node->level); | |
411 | return; | |
412 | } | |
413 | ||
414 | if ((node->level > 1) && | |
415 | ms->numa_state->hmat_cache[node->node_id][node->level - 1] && | |
416 | (node->size >= | |
417 | ms->numa_state->hmat_cache[node->node_id][node->level - 1]->size)) { | |
418 | error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8 | |
419 | " should be less than the size(%" PRIu64 ") of " | |
420 | "level=%u", node->size, node->level, | |
421 | ms->numa_state->hmat_cache[node->node_id] | |
422 | [node->level - 1]->size, | |
423 | node->level - 1); | |
424 | return; | |
425 | } | |
426 | ||
427 | if ((node->level < HMAT_LB_LEVELS - 1) && | |
428 | ms->numa_state->hmat_cache[node->node_id][node->level + 1] && | |
429 | (node->size <= | |
430 | ms->numa_state->hmat_cache[node->node_id][node->level + 1]->size)) { | |
431 | error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8 | |
432 | " should be larger than the size(%" PRIu64 ") of " | |
433 | "level=%u", node->size, node->level, | |
434 | ms->numa_state->hmat_cache[node->node_id] | |
435 | [node->level + 1]->size, | |
436 | node->level + 1); | |
437 | return; | |
438 | } | |
439 | ||
440 | hmat_cache = g_malloc0(sizeof(*hmat_cache)); | |
441 | memcpy(hmat_cache, node, sizeof(*hmat_cache)); | |
442 | ms->numa_state->hmat_cache[node->node_id][node->level] = hmat_cache; | |
443 | } | |
444 | ||
3319b4ef | 445 | void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp) |
96d0e26c | 446 | { |
0042109a | 447 | Error *err = NULL; |
aa570207 | 448 | |
5275db59 | 449 | if (!ms->numa_state) { |
aa570207 TX |
450 | error_setg(errp, "NUMA is not supported by this machine-type"); |
451 | goto end; | |
452 | } | |
96d0e26c | 453 | |
1fd5d4fe | 454 | switch (object->type) { |
d081a49a | 455 | case NUMA_OPTIONS_TYPE_NODE: |
cc001888 | 456 | parse_numa_node(ms, &object->u.node, &err); |
0042109a | 457 | if (err) { |
157e94e8 | 458 | goto end; |
96d0e26c | 459 | } |
0042109a | 460 | break; |
0f203430 | 461 | case NUMA_OPTIONS_TYPE_DIST: |
aa570207 | 462 | parse_numa_distance(ms, &object->u.dist, &err); |
0f203430 HC |
463 | if (err) { |
464 | goto end; | |
465 | } | |
466 | break; | |
419fcdec IM |
467 | case NUMA_OPTIONS_TYPE_CPU: |
468 | if (!object->u.cpu.has_node_id) { | |
469 | error_setg(&err, "Missing mandatory node-id property"); | |
470 | goto end; | |
471 | } | |
7e721e7b | 472 | if (!ms->numa_state->nodes[object->u.cpu.node_id].present) { |
419fcdec IM |
473 | error_setg(&err, "Invalid node-id=%" PRId64 ", NUMA node must be " |
474 | "defined with -numa node,nodeid=ID before it's used with " | |
475 | "-numa cpu,node-id=ID", object->u.cpu.node_id); | |
476 | goto end; | |
477 | } | |
478 | ||
479 | machine_set_cpu_numa_node(ms, qapi_NumaCpuOptions_base(&object->u.cpu), | |
480 | &err); | |
481 | break; | |
9b12dfa0 LJ |
482 | case NUMA_OPTIONS_TYPE_HMAT_LB: |
483 | if (!ms->numa_state->hmat_enabled) { | |
484 | error_setg(errp, "ACPI Heterogeneous Memory Attribute Table " | |
485 | "(HMAT) is disabled, enable it with -machine hmat=on " | |
486 | "before using any of hmat specific options"); | |
487 | return; | |
488 | } | |
489 | ||
490 | parse_numa_hmat_lb(ms->numa_state, &object->u.hmat_lb, &err); | |
491 | if (err) { | |
492 | goto end; | |
493 | } | |
494 | break; | |
c412a48d LJ |
495 | case NUMA_OPTIONS_TYPE_HMAT_CACHE: |
496 | if (!ms->numa_state->hmat_enabled) { | |
497 | error_setg(errp, "ACPI Heterogeneous Memory Attribute Table " | |
498 | "(HMAT) is disabled, enable it with -machine hmat=on " | |
499 | "before using any of hmat specific options"); | |
500 | return; | |
501 | } | |
502 | ||
503 | parse_numa_hmat_cache(ms, &object->u.hmat_cache, &err); | |
504 | if (err) { | |
505 | goto end; | |
506 | } | |
507 | break; | |
0042109a WG |
508 | default: |
509 | abort(); | |
510 | } | |
96d0e26c | 511 | |
3319b4ef IM |
512 | end: |
513 | error_propagate(errp, err); | |
514 | } | |
515 | ||
4f7ec696 | 516 | static int parse_numa(void *opaque, QemuOpts *opts, Error **errp) |
3319b4ef IM |
517 | { |
518 | NumaOptions *object = NULL; | |
519 | MachineState *ms = MACHINE(opaque); | |
520 | Error *err = NULL; | |
521 | Visitor *v = opts_visitor_new(opts); | |
522 | ||
523 | visit_type_NumaOptions(v, NULL, &object, &err); | |
524 | visit_free(v); | |
525 | if (err) { | |
526 | goto end; | |
527 | } | |
528 | ||
529 | /* Fix up legacy suffix-less format */ | |
530 | if ((object->type == NUMA_OPTIONS_TYPE_NODE) && object->u.node.has_mem) { | |
531 | const char *mem_str = qemu_opt_get(opts, "mem"); | |
532 | qemu_strtosz_MiB(mem_str, NULL, &object->u.node.mem); | |
533 | } | |
534 | ||
535 | set_numa_options(ms, object, &err); | |
536 | ||
157e94e8 | 537 | end: |
96a1616c | 538 | qapi_free_NumaOptions(object); |
157e94e8 | 539 | if (err) { |
4f7ec696 | 540 | error_propagate(errp, err); |
157e94e8 MAL |
541 | return -1; |
542 | } | |
0042109a | 543 | |
157e94e8 | 544 | return 0; |
96d0e26c WG |
545 | } |
546 | ||
0f203430 HC |
547 | /* If all node pair distances are symmetric, then only distances |
548 | * in one direction are enough. If there is even one asymmetric | |
549 | * pair, though, then all distances must be provided. The | |
550 | * distance from a node to itself is always NUMA_DISTANCE_MIN, | |
551 | * so providing it is never necessary. | |
552 | */ | |
aa570207 | 553 | static void validate_numa_distance(MachineState *ms) |
3ef71975 | 554 | { |
0f203430 HC |
555 | int src, dst; |
556 | bool is_asymmetrical = false; | |
aa570207 | 557 | int nb_numa_nodes = ms->numa_state->num_nodes; |
7e721e7b | 558 | NodeInfo *numa_info = ms->numa_state->nodes; |
0f203430 HC |
559 | |
560 | for (src = 0; src < nb_numa_nodes; src++) { | |
561 | for (dst = src; dst < nb_numa_nodes; dst++) { | |
562 | if (numa_info[src].distance[dst] == 0 && | |
563 | numa_info[dst].distance[src] == 0) { | |
564 | if (src != dst) { | |
565 | error_report("The distance between node %d and %d is " | |
566 | "missing, at least one distance value " | |
567 | "between each nodes should be provided.", | |
568 | src, dst); | |
569 | exit(EXIT_FAILURE); | |
570 | } | |
571 | } | |
3ef71975 | 572 | |
0f203430 HC |
573 | if (numa_info[src].distance[dst] != 0 && |
574 | numa_info[dst].distance[src] != 0 && | |
575 | numa_info[src].distance[dst] != | |
576 | numa_info[dst].distance[src]) { | |
577 | is_asymmetrical = true; | |
578 | } | |
579 | } | |
580 | } | |
581 | ||
582 | if (is_asymmetrical) { | |
583 | for (src = 0; src < nb_numa_nodes; src++) { | |
584 | for (dst = 0; dst < nb_numa_nodes; dst++) { | |
585 | if (src != dst && numa_info[src].distance[dst] == 0) { | |
586 | error_report("At least one asymmetrical pair of " | |
587 | "distances is given, please provide distances " | |
588 | "for both directions of all node pairs."); | |
589 | exit(EXIT_FAILURE); | |
590 | } | |
591 | } | |
592 | } | |
3ef71975 | 593 | } |
3ef71975 EH |
594 | } |
595 | ||
aa570207 | 596 | static void complete_init_numa_distance(MachineState *ms) |
3ef71975 | 597 | { |
0f203430 | 598 | int src, dst; |
7e721e7b | 599 | NodeInfo *numa_info = ms->numa_state->nodes; |
3ef71975 | 600 | |
0f203430 HC |
601 | /* Fixup NUMA distance by symmetric policy because if it is an |
602 | * asymmetric distance table, it should be a complete table and | |
603 | * there would not be any missing distance except local node, which | |
604 | * is verified by validate_numa_distance above. | |
605 | */ | |
aa570207 TX |
606 | for (src = 0; src < ms->numa_state->num_nodes; src++) { |
607 | for (dst = 0; dst < ms->numa_state->num_nodes; dst++) { | |
0f203430 HC |
608 | if (numa_info[src].distance[dst] == 0) { |
609 | if (src == dst) { | |
610 | numa_info[src].distance[dst] = NUMA_DISTANCE_MIN; | |
611 | } else { | |
612 | numa_info[src].distance[dst] = numa_info[dst].distance[src]; | |
613 | } | |
614 | } | |
3ef71975 | 615 | } |
3ef71975 | 616 | } |
0f203430 | 617 | } |
549fc54b | 618 | |
3bfe5716 LV |
619 | void numa_legacy_auto_assign_ram(MachineClass *mc, NodeInfo *nodes, |
620 | int nb_nodes, ram_addr_t size) | |
621 | { | |
622 | int i; | |
623 | uint64_t usedmem = 0; | |
624 | ||
625 | /* Align each node according to the alignment | |
626 | * requirements of the machine class | |
627 | */ | |
628 | ||
629 | for (i = 0; i < nb_nodes - 1; i++) { | |
630 | nodes[i].node_mem = (size / nb_nodes) & | |
631 | ~((1 << mc->numa_mem_align_shift) - 1); | |
632 | usedmem += nodes[i].node_mem; | |
549fc54b | 633 | } |
3bfe5716 | 634 | nodes[i].node_mem = size - usedmem; |
3ef71975 EH |
635 | } |
636 | ||
3bfe5716 LV |
637 | void numa_default_auto_assign_ram(MachineClass *mc, NodeInfo *nodes, |
638 | int nb_nodes, ram_addr_t size) | |
96d0e26c | 639 | { |
12d6e464 | 640 | int i; |
3bfe5716 LV |
641 | uint64_t usedmem = 0, node_mem; |
642 | uint64_t granularity = size / nb_nodes; | |
643 | uint64_t propagate = 0; | |
644 | ||
645 | for (i = 0; i < nb_nodes - 1; i++) { | |
646 | node_mem = (granularity + propagate) & | |
647 | ~((1 << mc->numa_mem_align_shift) - 1); | |
648 | propagate = granularity + propagate - node_mem; | |
649 | nodes[i].node_mem = node_mem; | |
650 | usedmem += node_mem; | |
651 | } | |
652 | nodes[i].node_mem = size - usedmem; | |
653 | } | |
12d6e464 | 654 | |
/*
 * Finalize NUMA configuration after all -numa options have been parsed:
 *  - implicitly create one node when the machine class asks for it
 *    (memory hotplug with no -numa given, or auto_enable_numa);
 *  - reject sparse node IDs;
 *  - auto-split RAM across nodes when no per-node size was given;
 *  - verify the per-node sizes sum to ram_size (a global from sysemu);
 *  - validate and complete the distance table if any distance was set.
 * Exits QEMU on configuration errors.
 */
void numa_complete_configuration(MachineState *ms)
{
    int i;
    MachineClass *mc = MACHINE_GET_CLASS(ms);
    NodeInfo *numa_info = ms->numa_state->nodes;

    /*
     * If memory hotplug is enabled (slots > 0) but without '-numa'
     * options explicitly on CLI, guestes will break.
     *
     * Windows: won't enable memory hotplug without SRAT table at all
     *
     * Linux: if QEMU is started with initial memory all below 4Gb
     * and no SRAT table present, guest kernel will use nommu DMA ops,
     * which breaks 32bit hw drivers when memory is hotplugged and
     * guest tries to use it with that drivers.
     *
     * Enable NUMA implicitly by adding a new NUMA node automatically.
     *
     * Or if MachineClass::auto_enable_numa is true and no NUMA nodes,
     * assume there is just one node with whole RAM.
     */
    if (ms->numa_state->num_nodes == 0 &&
        ((ms->ram_slots > 0 &&
        mc->auto_enable_numa_with_memhp) ||
         mc->auto_enable_numa)) {
            NumaNodeOptions node = { };
            parse_numa_node(ms, &node, &error_abort);
            numa_info[0].node_mem = ram_size;
    }

    assert(max_numa_nodeid <= MAX_NODES);

    /* No support for sparse NUMA node IDs yet: */
    for (i = max_numa_nodeid - 1; i >= 0; i--) {
        /* Report large node IDs first, to make mistakes easier to spot */
        if (!numa_info[i].present) {
            error_report("numa: Node ID missing: %d", i);
            exit(1);
        }
    }

    /* This must be always true if all nodes are present: */
    assert(ms->numa_state->num_nodes == max_numa_nodeid);

    if (ms->numa_state->num_nodes > 0) {
        uint64_t numa_total;

        if (ms->numa_state->num_nodes > MAX_NODES) {
            ms->numa_state->num_nodes = MAX_NODES;
        }

        /* If no memory size is given for any node, assume the default case
         * and distribute the available memory equally across all nodes
         */
        for (i = 0; i < ms->numa_state->num_nodes; i++) {
            if (numa_info[i].node_mem != 0) {
                break;
            }
        }
        if (i == ms->numa_state->num_nodes) {
            /* No node had an explicit size: delegate the split to the
             * machine class hook (legacy or default policy). */
            assert(mc->numa_auto_assign_ram);
            mc->numa_auto_assign_ram(mc, numa_info,
                                     ms->numa_state->num_nodes, ram_size);
            if (!qtest_enabled()) {
                warn_report("Default splitting of RAM between nodes is deprecated,"
                            " Use '-numa node,memdev' to explictly define RAM"
                            " allocation per node");
            }
        }

        numa_total = 0;
        for (i = 0; i < ms->numa_state->num_nodes; i++) {
            numa_total += numa_info[i].node_mem;
        }
        if (numa_total != ram_size) {
            error_report("total memory for NUMA nodes (0x%" PRIx64 ")"
                         " should equal RAM size (0x" RAM_ADDR_FMT ")",
                         numa_total, ram_size);
            exit(1);
        }

        /* QEMU needs at least all unique node pair distances to build
         * the whole NUMA distance table. QEMU treats the distance table
         * as symmetric by default, i.e. distance A->B == distance B->A.
         * Thus, QEMU is able to complete the distance table
         * initialization even though only distance A->B is provided and
         * distance B->A is not. QEMU knows the distance of a node to
         * itself is always 10, so A->A distances may be omitted. When
         * the distances of two nodes of a pair differ, i.e. distance
         * A->B != distance B->A, then that means the distance table is
         * asymmetric. In this case, the distances for both directions
         * of all node pairs are required.
         */
        if (ms->numa_state->have_numa_distance) {
            /* Validate enough NUMA distance information was provided. */
            validate_numa_distance(ms);

            /* Validation succeeded, now fill in any missing distances. */
            complete_init_numa_distance(ms);
        }
    }
}
dfabb8b9 | 758 | |
7a3099fc IM |
/* Parse every "-numa" option registered in qemu_numa_opts and apply it to
 * the machine.  Errors are fatal: &error_fatal makes QEMU exit. */
void parse_numa_opts(MachineState *ms)
{
    qemu_opts_foreach(qemu_find_opts("numa"), parse_numa, ms, &error_fatal);
}
763 | ||
a0ceb640 IM |
764 | void numa_cpu_pre_plug(const CPUArchId *slot, DeviceState *dev, Error **errp) |
765 | { | |
a0ceb640 IM |
766 | int node_id = object_property_get_int(OBJECT(dev), "node-id", &error_abort); |
767 | ||
a0ceb640 IM |
768 | if (node_id == CPU_UNSET_NUMA_NODE_ID) { |
769 | /* due to bug in libvirt, it doesn't pass node-id from props on | |
770 | * device_add as expected, so we have to fix it up here */ | |
d41f3e75 IM |
771 | if (slot->props.has_node_id) { |
772 | object_property_set_int(OBJECT(dev), slot->props.node_id, | |
773 | "node-id", errp); | |
774 | } | |
775 | } else if (node_id != slot->props.node_id) { | |
a5bf9fbc LV |
776 | error_setg(errp, "invalid node-id, must be %"PRId64, |
777 | slot->props.node_id); | |
a0ceb640 IM |
778 | } |
779 | } | |
780 | ||
7febe36f PB |
/*
 * Back the system memory region without NUMA: either from the -mem-path
 * file (Linux only) or from ordinary anonymous RAM, then register it for
 * migration.
 *
 * When file-backed allocation fails: with -mem-prealloc the failure is
 * fatal; otherwise QEMU warns and falls back to regular RAM allocation
 * (legacy behavior), clearing mem_path so later allocations skip the file
 * path too.
 */
static void allocate_system_memory_nonnuma(MemoryRegion *mr, Object *owner,
                                           const char *name,
                                           uint64_t ram_size)
{
    if (mem_path) {
#ifdef __linux__
        Error *err = NULL;
        memory_region_init_ram_from_file(mr, owner, name, ram_size, 0, 0,
                                         mem_path, &err);
        if (err) {
            error_report_err(err);
            if (mem_prealloc) {
                exit(1);
            }
            warn_report("falling back to regular RAM allocation");
            error_printf("This is deprecated. Make sure that -mem-path "
                         " specified path has sufficient resources to allocate"
                         " -m specified RAM amount\n");
            /* Legacy behavior: if allocation failed, fall back to
             * regular RAM allocation.
             */
            mem_path = NULL;
            memory_region_init_ram_nomigrate(mr, owner, name, ram_size, &error_fatal);
        }
#else
        /* -mem-path is only implemented for Linux hosts. */
        fprintf(stderr, "-mem-path not supported on this host\n");
        exit(1);
#endif
    } else {
        memory_region_init_ram_nomigrate(mr, owner, name, ram_size, &error_fatal);
    }
    vmstate_register_ram_global(mr);
}
814 | ||
dfabb8b9 PB |
815 | void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner, |
816 | const char *name, | |
817 | uint64_t ram_size) | |
818 | { | |
7febe36f PB |
819 | uint64_t addr = 0; |
820 | int i; | |
aa570207 | 821 | MachineState *ms = MACHINE(qdev_get_machine()); |
7febe36f | 822 | |
aa570207 TX |
823 | if (ms->numa_state == NULL || |
824 | ms->numa_state->num_nodes == 0 || !have_memdevs) { | |
7febe36f PB |
825 | allocate_system_memory_nonnuma(mr, owner, name, ram_size); |
826 | return; | |
827 | } | |
828 | ||
829 | memory_region_init(mr, owner, name, ram_size); | |
aa570207 | 830 | for (i = 0; i < ms->numa_state->num_nodes; i++) { |
7e721e7b TX |
831 | uint64_t size = ms->numa_state->nodes[i].node_mem; |
832 | HostMemoryBackend *backend = ms->numa_state->nodes[i].node_memdev; | |
7febe36f PB |
833 | if (!backend) { |
834 | continue; | |
835 | } | |
7943e97b | 836 | MemoryRegion *seg = host_memory_backend_get_memory(backend); |
7febe36f | 837 | |
0462faee HT |
838 | if (memory_region_is_mapped(seg)) { |
839 | char *path = object_get_canonical_path_component(OBJECT(backend)); | |
840 | error_report("memory backend %s is used multiple times. Each " | |
841 | "-numa option must use a different memdev value.", | |
842 | path); | |
2920bd64 | 843 | g_free(path); |
0462faee HT |
844 | exit(1); |
845 | } | |
846 | ||
0b217571 | 847 | host_memory_backend_set_mapped(backend, true); |
7febe36f PB |
848 | memory_region_add_subregion(mr, addr, seg); |
849 | vmstate_register_ram_global(seg); | |
850 | addr += size; | |
851 | } | |
dfabb8b9 | 852 | } |
76b5d850 | 853 | |
31959e82 | 854 | static void numa_stat_memory_devices(NumaNodeMem node_mem[]) |
5b009e40 | 855 | { |
2cc0e2e8 | 856 | MemoryDeviceInfoList *info_list = qmp_memory_device_list(); |
5b009e40 | 857 | MemoryDeviceInfoList *info; |
31959e82 | 858 | PCDIMMDeviceInfo *pcdimm_info; |
cae02c34 | 859 | VirtioPMEMDeviceInfo *vpi; |
5b009e40 | 860 | |
5b009e40 HZ |
861 | for (info = info_list; info; info = info->next) { |
862 | MemoryDeviceInfo *value = info->value; | |
863 | ||
864 | if (value) { | |
1fd5d4fe | 865 | switch (value->type) { |
6388e18d | 866 | case MEMORY_DEVICE_INFO_KIND_DIMM: |
6388e18d | 867 | case MEMORY_DEVICE_INFO_KIND_NVDIMM: |
cae02c34 DH |
868 | pcdimm_info = value->type == MEMORY_DEVICE_INFO_KIND_DIMM ? |
869 | value->u.dimm.data : value->u.nvdimm.data; | |
31959e82 | 870 | node_mem[pcdimm_info->node].node_mem += pcdimm_info->size; |
178003ea DH |
871 | node_mem[pcdimm_info->node].node_plugged_mem += |
872 | pcdimm_info->size; | |
cae02c34 DH |
873 | break; |
874 | case MEMORY_DEVICE_INFO_KIND_VIRTIO_PMEM: | |
875 | vpi = value->u.virtio_pmem.data; | |
876 | /* TODO: once we support numa, assign to right node */ | |
877 | node_mem[0].node_mem += vpi->size; | |
878 | node_mem[0].node_plugged_mem += vpi->size; | |
879 | break; | |
880 | default: | |
881 | g_assert_not_reached(); | |
5b009e40 HZ |
882 | } |
883 | } | |
884 | } | |
885 | qapi_free_MemoryDeviceInfoList(info_list); | |
886 | } | |
887 | ||
aa570207 | 888 | void query_numa_node_mem(NumaNodeMem node_mem[], MachineState *ms) |
5b009e40 HZ |
889 | { |
890 | int i; | |
891 | ||
aa570207 | 892 | if (ms->numa_state == NULL || ms->numa_state->num_nodes <= 0) { |
5b009e40 HZ |
893 | return; |
894 | } | |
895 | ||
896 | numa_stat_memory_devices(node_mem); | |
aa570207 | 897 | for (i = 0; i < ms->numa_state->num_nodes; i++) { |
7e721e7b | 898 | node_mem[i].node_mem += ms->numa_state->nodes[i].node_mem; |
5b009e40 HZ |
899 | } |
900 | } | |
901 | ||
0987d735 PB |
/* Register @n to be called back on RAM block add/remove events. */
void ram_block_notifier_add(RAMBlockNotifier *n)
{
    QLIST_INSERT_HEAD(&ram_list.ramblock_notifiers, n, next);
}
906 | ||
/* Unregister a previously added RAM block notifier. */
void ram_block_notifier_remove(RAMBlockNotifier *n)
{
    QLIST_REMOVE(n, next);
}
911 | ||
912 | void ram_block_notify_add(void *host, size_t size) | |
913 | { | |
914 | RAMBlockNotifier *notifier; | |
915 | ||
916 | QLIST_FOREACH(notifier, &ram_list.ramblock_notifiers, next) { | |
917 | notifier->ram_block_added(notifier, host, size); | |
918 | } | |
919 | } | |
920 | ||
921 | void ram_block_notify_remove(void *host, size_t size) | |
922 | { | |
923 | RAMBlockNotifier *notifier; | |
924 | ||
925 | QLIST_FOREACH(notifier, &ram_list.ramblock_notifiers, next) { | |
926 | notifier->ram_block_removed(notifier, host, size); | |
927 | } | |
928 | } |