X-Git-Url: https://repo.jachan.dev/qemu.git/blobdiff_plain/0462faee67eb9ee39e51f764891fb6b767602eed..2115f2a1d8c77c2a735add1ed02ae0c1feae0d9a:/numa.c

diff --git a/numa.c b/numa.c
index 7bf7834b7f..c975fb2682 100644
--- a/numa.c
+++ b/numa.c
@@ -22,7 +22,7 @@
  * THE SOFTWARE.
  */
 
-#include "sysemu/sysemu.h"
+#include "sysemu/numa.h"
 #include "exec/cpu-common.h"
 #include "qemu/bitmap.h"
 #include "qom/cpu.h"
@@ -35,6 +35,9 @@
 #include "hw/boards.h"
 #include "sysemu/hostmem.h"
 #include "qmp-commands.h"
+#include "hw/mem/pc-dimm.h"
+#include "qemu/option.h"
+#include "qemu/config-file.h"
 
 QemuOptsList qemu_numa_opts = {
     .name = "numa",
@@ -44,6 +47,11 @@ QemuOptsList qemu_numa_opts = {
 };
 
 static int have_memdevs = -1;
+static int max_numa_nodeid; /* Highest specified NUMA node ID, plus one.
+                             * For all nodes, nodeid < max_numa_nodeid
+                             */
+int nb_numa_nodes;
+NodeInfo numa_info[MAX_NODES];
 
 static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
 {
@@ -58,7 +66,7 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
 
     if (nodenr >= MAX_NODES) {
         error_setg(errp, "Max number of NUMA nodes reached: %"
-                   PRIu16 "\n", nodenr);
+                   PRIu16 "", nodenr);
         return;
     }
 
@@ -68,16 +76,18 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
     }
 
     for (cpus = node->cpus; cpus; cpus = cpus->next) {
-        if (cpus->value > MAX_CPUMASK_BITS) {
-            error_setg(errp, "CPU number %" PRIu16 " is bigger than %d",
-                       cpus->value, MAX_CPUMASK_BITS);
+        if (cpus->value >= max_cpus) {
+            error_setg(errp,
+                       "CPU index (%" PRIu16 ")"
+                       " should be smaller than maxcpus (%d)",
+                       cpus->value, max_cpus);
             return;
         }
         bitmap_set(numa_info[nodenr].node_cpu, cpus->value, 1);
     }
 
     if (node->has_mem && node->has_memdev) {
-        error_setg(errp, "qemu: cannot specify both mem= and memdev=\n");
+        error_setg(errp, "qemu: cannot specify both mem= and memdev=");
         return;
     }
 
@@ -86,7 +96,7 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
     }
     if (node->has_memdev != have_memdevs) {
         error_setg(errp, "qemu: memdev option must be specified for either "
-                   "all or no nodes\n");
+                   "all or no nodes");
         return;
     }
 
@@ -115,7 +125,7 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
     max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
 }
 
-int numa_init_func(QemuOpts *opts, void *opaque)
+static int parse_numa(QemuOpts *opts, void *opaque)
 {
     NumaOptions *object = NULL;
     Error *err = NULL;
@@ -145,8 +155,7 @@ int numa_init_func(QemuOpts *opts, void *opaque)
     return 0;
 
 error:
-    qerror_report_err(err);
-    error_free(err);
+    error_report_err(err);
 
     if (object) {
         QapiDeallocVisitor *dv = qapi_dealloc_visitor_new();
@@ -158,10 +167,60 @@ error:
     return -1;
 }
 
-void set_numa_nodes(void)
+static char *enumerate_cpus(unsigned long *cpus, int max_cpus)
+{
+    int cpu;
+    bool first = true;
+    GString *s = g_string_new(NULL);
+
+    for (cpu = find_first_bit(cpus, max_cpus);
+         cpu < max_cpus;
+         cpu = find_next_bit(cpus, max_cpus, cpu + 1)) {
+        g_string_append_printf(s, "%s%d", first ? "" : " ", cpu);
+        first = false;
+    }
+    return g_string_free(s, FALSE);
+}
+
+static void validate_numa_cpus(void)
+{
+    int i;
+    DECLARE_BITMAP(seen_cpus, MAX_CPUMASK_BITS);
+
+    bitmap_zero(seen_cpus, MAX_CPUMASK_BITS);
+    for (i = 0; i < nb_numa_nodes; i++) {
+        if (bitmap_intersects(seen_cpus, numa_info[i].node_cpu,
+                              MAX_CPUMASK_BITS)) {
+            bitmap_and(seen_cpus, seen_cpus,
+                       numa_info[i].node_cpu, MAX_CPUMASK_BITS);
+            error_report("CPU(s) present in multiple NUMA nodes: %s",
+                         enumerate_cpus(seen_cpus, max_cpus));;
+            exit(EXIT_FAILURE);
+        }
+        bitmap_or(seen_cpus, seen_cpus,
+                  numa_info[i].node_cpu, MAX_CPUMASK_BITS);
+    }
+
+    if (!bitmap_full(seen_cpus, max_cpus)) {
+        char *msg;
+        bitmap_complement(seen_cpus, seen_cpus, max_cpus);
+        msg = enumerate_cpus(seen_cpus, max_cpus);
+        error_report("warning: CPU(s) not present in any NUMA nodes: %s", msg);
+        error_report("warning: All CPU(s) up to maxcpus should be described "
+                     "in NUMA config");
+        g_free(msg);
+    }
+}
+
+void parse_numa_opts(MachineClass *mc)
 {
     int i;
 
+    if (qemu_opts_foreach(qemu_find_opts("numa"), parse_numa,
+                          NULL, 1) != 0) {
+        exit(1);
+    }
+
     assert(max_numa_nodeid <= MAX_NODES);
 
     /* No support for sparse NUMA node IDs yet: */
@@ -210,8 +269,8 @@ void set_numa_nodes(void)
             numa_total += numa_info[i].node_mem;
         }
         if (numa_total != ram_size) {
-            error_report("total memory for NUMA nodes (%" PRIu64 ")"
-                         " should equal RAM size (" RAM_ADDR_FMT ")",
+            error_report("total memory for NUMA nodes (0x%" PRIx64 ")"
+                         " should equal RAM size (0x" RAM_ADDR_FMT ")",
                          numa_total, ram_size);
             exit(1);
         }
@@ -221,19 +280,29 @@ void set_numa_nodes(void)
                 break;
             }
         }
-        /* assigning the VCPUs round-robin is easier to implement, guest OSes
-         * must cope with this anyway, because there are BIOSes out there in
-         * real machines which also use this scheme.
+        /* Historically VCPUs were assigned in round-robin order to NUMA
+         * nodes. However it causes issues with guest not handling it nice
+         * in case where cores/threads from a multicore CPU appear on
+         * different nodes. So allow boards to override default distribution
+         * rule grouping VCPUs by socket so that VCPUs from the same socket
+         * would be on the same node.
          */
         if (i == nb_numa_nodes) {
             for (i = 0; i < max_cpus; i++) {
-                set_bit(i, numa_info[i % nb_numa_nodes].node_cpu);
+                unsigned node_id = i % nb_numa_nodes;
+                if (mc->cpu_index_to_socket_id) {
+                    node_id = mc->cpu_index_to_socket_id(i) % nb_numa_nodes;
+                }
+
+                set_bit(i, numa_info[node_id].node_cpu);
             }
         }
+
+        validate_numa_cpus();
     }
 }
 
-void set_numa_modes(void)
+void numa_post_machine_init(void)
 {
     CPUState *cpu;
     int i;
@@ -261,16 +330,15 @@ static void allocate_system_memory_nonnuma(MemoryRegion *mr, Object *owner,
          * regular RAM allocation.
          */
         if (err) {
-            qerror_report_err(err);
-            error_free(err);
-            memory_region_init_ram(mr, owner, name, ram_size);
+            error_report_err(err);
+            memory_region_init_ram(mr, owner, name, ram_size, &error_abort);
         }
 #else
         fprintf(stderr, "-mem-path not supported on this host\n");
        exit(1);
 #endif
     } else {
-        memory_region_init_ram(mr, owner, name, ram_size);
+        memory_region_init_ram(mr, owner, name, ram_size, &error_abort);
     }
     vmstate_register_ram_global(mr);
 }
@@ -297,7 +365,7 @@ void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner,
         }
         MemoryRegion *seg = host_memory_backend_get_memory(backend, &local_err);
         if (local_err) {
-            qerror_report_err(local_err);
+            error_report_err(local_err);
             exit(1);
         }
 
@@ -315,13 +383,51 @@ void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner,
     }
 }
 
+static void numa_stat_memory_devices(uint64_t node_mem[])
+{
+    MemoryDeviceInfoList *info_list = NULL;
+    MemoryDeviceInfoList **prev = &info_list;
+    MemoryDeviceInfoList *info;
+
+    qmp_pc_dimm_device_list(qdev_get_machine(), &prev);
+    for (info = info_list; info; info = info->next) {
+        MemoryDeviceInfo *value = info->value;
+
+        if (value) {
+            switch (value->kind) {
+            case MEMORY_DEVICE_INFO_KIND_DIMM:
+                node_mem[value->dimm->node] += value->dimm->size;
+                break;
+            default:
+                break;
+            }
+        }
+    }
+    qapi_free_MemoryDeviceInfoList(info_list);
+}
+
+void query_numa_node_mem(uint64_t node_mem[])
+{
+    int i;
+
+    if (nb_numa_nodes <= 0) {
+        return;
+    }
+
+    numa_stat_memory_devices(node_mem);
+    for (i = 0; i < nb_numa_nodes; i++) {
+        node_mem[i] += numa_info[i].node_mem;
+    }
+}
+
 static int query_memdev(Object *obj, void *opaque)
 {
     MemdevList **list = opaque;
+    MemdevList *m = NULL;
     Error *err = NULL;
 
     if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
-        MemdevList *m = g_malloc0(sizeof(*m));
+        m = g_malloc0(sizeof(*m));
 
         m->value = g_malloc0(sizeof(*m->value));
 
@@ -369,13 +475,16 @@ static int query_memdev(Object *obj, void *opaque)
     return 0;
 
 error:
+    g_free(m->value);
+    g_free(m);
+
     return -1;
 }
 
 MemdevList *qmp_query_memdev(Error **errp)
 {
     Object *obj;
-    MemdevList *list = NULL, *m;
+    MemdevList *list = NULL;
 
     obj = object_resolve_path("/objects", NULL);
     if (obj == NULL) {
@@ -389,11 +498,6 @@ MemdevList *qmp_query_memdev(Error **errp)
     return list;
 
 error:
-    while (list) {
-        m = list;
-        list = list->next;
-        g_free(m->value);
-        g_free(m);
-    }
+    qapi_free_MemdevList(list);
 
     return NULL;
 }
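
For context: the socket-aware VCPU placement added in parse_numa_opts() only takes effect when a board sets the MachineClass cpu_index_to_socket_id hook that the new code consults; otherwise the historical round-robin assignment is kept. A minimal sketch of how a board might wire that hook is shown below; the function names and the fixed cores-per-socket value are hypothetical and not part of this diff.

#include "hw/boards.h"

/* Hypothetical board code assuming a flat topology with a fixed number of
 * cores per socket.  With this hook installed, parse_numa_opts() places all
 * VCPUs of one socket on the same NUMA node instead of spreading them
 * round-robin across nodes. */
static unsigned example_cpu_index_to_socket_id(unsigned cpu_index)
{
    const unsigned cores_per_socket = 4;    /* placeholder topology value */

    return cpu_index / cores_per_socket;
}

static void example_machine_class_init(ObjectClass *oc, void *data)
{
    MachineClass *mc = MACHINE_CLASS(oc);

    mc->cpu_index_to_socket_id = example_cpu_index_to_socket_id;
}

Grouping by socket addresses the problem described in the updated comment: guests can misbehave when cores/threads of a single multicore CPU end up on different NUMA nodes.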