Git Repo - qemu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* NUMA parameter parsing routines
	3	*
	4	* Copyright (c) 2014 Fujitsu Ltd.
	5	*
	6	* Permission is hereby granted, free of charge, to any person obtaining a copy
	7	* of this software and associated documentation files (the "Software"), to deal
	8	* in the Software without restriction, including without limitation the rights
	9	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	10	* copies of the Software, and to permit persons to whom the Software is
	11	* furnished to do so, subject to the following conditions:
	12	*
	13	* The above copyright notice and this permission notice shall be included in
	14	* all copies or substantial portions of the Software.
	15	*
	16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	19	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	22	* THE SOFTWARE.
	23	*/
	24
	25	#include "sysemu/numa.h"
	26	#include "exec/cpu-common.h"
	27	#include "qemu/bitmap.h"
	28	#include "qom/cpu.h"
	29	#include "qemu/error-report.h"
	30	#include "include/exec/cpu-common.h" /* for RAM_ADDR_FMT */
	31	#include "qapi-visit.h"
	32	#include "qapi/opts-visitor.h"
	33	#include "qapi/dealloc-visitor.h"
	34	#include "qapi/qmp/qerror.h"
	35	#include "hw/boards.h"
	36	#include "sysemu/hostmem.h"
	37	#include "qmp-commands.h"
	38	#include "hw/mem/pc-dimm.h"
	39	#include "qemu/option.h"
	40	#include "qemu/config-file.h"
	41
	42	QemuOptsList qemu_numa_opts = {
	43	.name = "numa",
	44	.implied_opt_name = "type",
	45	.head = QTAILQ_HEAD_INITIALIZER(qemu_numa_opts.head),
	46	.desc = { { 0 } } /* validated with OptsVisitor */
	47	};
	48
	49	static int have_memdevs = -1;
	50	static int max_numa_nodeid; /* Highest specified NUMA node ID, plus one.
	51	* For all nodes, nodeid < max_numa_nodeid
	52	*/
	53	int nb_numa_nodes;
	54	NodeInfo numa_info[MAX_NODES];
	55
	56	static void numa_node_parse(NumaNodeOptions node, QemuOpts opts, Error **errp)
	57	{
	58	uint16_t nodenr;
	59	uint16List *cpus = NULL;
	60
	61	if (node->has_nodeid) {
	62	nodenr = node->nodeid;
	63	} else {
	64	nodenr = nb_numa_nodes;
	65	}
	66
	67	if (nodenr >= MAX_NODES) {
	68	error_setg(errp, "Max number of NUMA nodes reached: %"
	69	PRIu16 "", nodenr);
	70	return;
	71	}
	72
	73	if (numa_info[nodenr].present) {
	74	error_setg(errp, "Duplicate NUMA nodeid: %" PRIu16, nodenr);
	75	return;
	76	}
	77
	78	for (cpus = node->cpus; cpus; cpus = cpus->next) {
	79	if (cpus->value >= max_cpus) {
	80	error_setg(errp,
	81	"CPU index (%" PRIu16 ")"
	82	" should be smaller than maxcpus (%d)",
	83	cpus->value, max_cpus);
	84	return;
	85	}
	86	bitmap_set(numa_info[nodenr].node_cpu, cpus->value, 1);
	87	}
	88
	89	if (node->has_mem && node->has_memdev) {
	90	error_setg(errp, "qemu: cannot specify both mem= and memdev=");
	91	return;
	92	}
	93
	94	if (have_memdevs == -1) {
	95	have_memdevs = node->has_memdev;
	96	}
	97	if (node->has_memdev != have_memdevs) {
	98	error_setg(errp, "qemu: memdev option must be specified for either "
	99	"all or no nodes");
	100	return;
	101	}
	102
	103	if (node->has_mem) {
	104	uint64_t mem_size = node->mem;
	105	const char *mem_str = qemu_opt_get(opts, "mem");
	106	/* Fix up legacy suffix-less format */
	107	if (g_ascii_isdigit(mem_str[strlen(mem_str) - 1])) {
	108	mem_size <<= 20;
	109	}
	110	numa_info[nodenr].node_mem = mem_size;
	111	}
	112	if (node->has_memdev) {
	113	Object *o;
	114	o = object_resolve_path_type(node->memdev, TYPE_MEMORY_BACKEND, NULL);
	115	if (!o) {
	116	error_setg(errp, "memdev=%s is ambiguous", node->memdev);
	117	return;
	118	}
	119
	120	object_ref(o);
	121	numa_info[nodenr].node_mem = object_property_get_int(o, "size", NULL);
	122	numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
	123	}
	124	numa_info[nodenr].present = true;
	125	max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
	126	}
	127
	128	static int parse_numa(void opaque, QemuOpts opts, Error **errp)
	129	{
	130	NumaOptions *object = NULL;
	131	Error *err = NULL;
	132
	133	{
	134	OptsVisitor *ov = opts_visitor_new(opts);
	135	visit_type_NumaOptions(opts_get_visitor(ov), &object, NULL, &err);
	136	opts_visitor_cleanup(ov);
	137	}
	138
	139	if (err) {
	140	goto error;
	141	}
	142
	143	switch (object->kind) {
	144	case NUMA_OPTIONS_KIND_NODE:
	145	numa_node_parse(object->node, opts, &err);
	146	if (err) {
	147	goto error;
	148	}
	149	nb_numa_nodes++;
	150	break;
	151	default:
	152	abort();
	153	}
	154
	155	return 0;
	156
	157	error:
	158	error_report_err(err);
	159
	160	if (object) {
	161	QapiDeallocVisitor *dv = qapi_dealloc_visitor_new();
	162	visit_type_NumaOptions(qapi_dealloc_get_visitor(dv),
	163	&object, NULL, NULL);
	164	qapi_dealloc_visitor_cleanup(dv);
	165	}
	166
	167	return -1;
	168	}
	169
	170	static char enumerate_cpus(unsigned long cpus, int max_cpus)
	171	{
	172	int cpu;
	173	bool first = true;
	174	GString *s = g_string_new(NULL);
	175
	176	for (cpu = find_first_bit(cpus, max_cpus);
	177	cpu < max_cpus;
	178	cpu = find_next_bit(cpus, max_cpus, cpu + 1)) {
	179	g_string_append_printf(s, "%s%d", first ? "" : " ", cpu);
	180	first = false;
	181	}
	182	return g_string_free(s, FALSE);
	183	}
	184
	185	static void validate_numa_cpus(void)
	186	{
	187	int i;
	188	DECLARE_BITMAP(seen_cpus, MAX_CPUMASK_BITS);
	189
	190	bitmap_zero(seen_cpus, MAX_CPUMASK_BITS);
	191	for (i = 0; i < nb_numa_nodes; i++) {
	192	if (bitmap_intersects(seen_cpus, numa_info[i].node_cpu,
	193	MAX_CPUMASK_BITS)) {
	194	bitmap_and(seen_cpus, seen_cpus,
	195	numa_info[i].node_cpu, MAX_CPUMASK_BITS);
	196	error_report("CPU(s) present in multiple NUMA nodes: %s",
	197	enumerate_cpus(seen_cpus, max_cpus));;
	198	exit(EXIT_FAILURE);
	199	}
	200	bitmap_or(seen_cpus, seen_cpus,
	201	numa_info[i].node_cpu, MAX_CPUMASK_BITS);
	202	}
	203
	204	if (!bitmap_full(seen_cpus, max_cpus)) {
	205	char *msg;
	206	bitmap_complement(seen_cpus, seen_cpus, max_cpus);
	207	msg = enumerate_cpus(seen_cpus, max_cpus);
	208	error_report("warning: CPU(s) not present in any NUMA nodes: %s", msg);
	209	error_report("warning: All CPU(s) up to maxcpus should be described "
	210	"in NUMA config");
	211	g_free(msg);
	212	}
	213	}
	214
	215	void parse_numa_opts(MachineClass *mc)
	216	{
	217	int i;
	218
	219	if (qemu_opts_foreach(qemu_find_opts("numa"), parse_numa, NULL, NULL)) {
	220	exit(1);
	221	}
	222
	223	assert(max_numa_nodeid <= MAX_NODES);
	224
	225	/* No support for sparse NUMA node IDs yet: */
	226	for (i = max_numa_nodeid - 1; i >= 0; i--) {
	227	/* Report large node IDs first, to make mistakes easier to spot */
	228	if (!numa_info[i].present) {
	229	error_report("numa: Node ID missing: %d", i);
	230	exit(1);
	231	}
	232	}
	233
	234	/* This must be always true if all nodes are present: */
	235	assert(nb_numa_nodes == max_numa_nodeid);
	236
	237	if (nb_numa_nodes > 0) {
	238	uint64_t numa_total;
	239
	240	if (nb_numa_nodes > MAX_NODES) {
	241	nb_numa_nodes = MAX_NODES;
	242	}
	243
	244	/* If no memory size is given for any node, assume the default case
	245	* and distribute the available memory equally across all nodes
	246	*/
	247	for (i = 0; i < nb_numa_nodes; i++) {
	248	if (numa_info[i].node_mem != 0) {
	249	break;
	250	}
	251	}
	252	if (i == nb_numa_nodes) {
	253	uint64_t usedmem = 0;
	254
	255	/* On Linux, each node's border has to be 8MB aligned,
	256	* the final node gets the rest.
	257	*/
	258	for (i = 0; i < nb_numa_nodes - 1; i++) {
	259	numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
	260	~((1 << 23UL) - 1);
	261	usedmem += numa_info[i].node_mem;
	262	}
	263	numa_info[i].node_mem = ram_size - usedmem;
	264	}
	265
	266	numa_total = 0;
	267	for (i = 0; i < nb_numa_nodes; i++) {
	268	numa_total += numa_info[i].node_mem;
	269	}
	270	if (numa_total != ram_size) {
	271	error_report("total memory for NUMA nodes (0x%" PRIx64 ")"
	272	" should equal RAM size (0x" RAM_ADDR_FMT ")",
	273	numa_total, ram_size);
	274	exit(1);
	275	}
	276
	277	for (i = 0; i < nb_numa_nodes; i++) {
	278	if (!bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
	279	break;
	280	}
	281	}
	282	/* Historically VCPUs were assigned in round-robin order to NUMA
	283	* nodes. However it causes issues with guest not handling it nice
	284	* in case where cores/threads from a multicore CPU appear on
	285	* different nodes. So allow boards to override default distribution
	286	* rule grouping VCPUs by socket so that VCPUs from the same socket
	287	* would be on the same node.
	288	*/
	289	if (i == nb_numa_nodes) {
	290	for (i = 0; i < max_cpus; i++) {
	291	unsigned node_id = i % nb_numa_nodes;
	292	if (mc->cpu_index_to_socket_id) {
	293	node_id = mc->cpu_index_to_socket_id(i) % nb_numa_nodes;
	294	}
	295
	296	set_bit(i, numa_info[node_id].node_cpu);
	297	}
	298	}
	299
	300	validate_numa_cpus();
	301	}
	302	}
	303
	304	void numa_post_machine_init(void)
	305	{
	306	CPUState *cpu;
	307	int i;
	308
	309	CPU_FOREACH(cpu) {
	310	for (i = 0; i < nb_numa_nodes; i++) {
	311	if (test_bit(cpu->cpu_index, numa_info[i].node_cpu)) {
	312	cpu->numa_node = i;
	313	}
	314	}
	315	}
	316	}
	317
	318	static void allocate_system_memory_nonnuma(MemoryRegion mr, Object owner,
	319	const char *name,
	320	uint64_t ram_size)
	321	{
	322	if (mem_path) {
	323	#ifdef __linux__
	324	Error *err = NULL;
	325	memory_region_init_ram_from_file(mr, owner, name, ram_size, false,
	326	mem_path, &err);
	327
	328	/* Legacy behavior: if allocation failed, fall back to
	329	* regular RAM allocation.
	330	*/
	331	if (err) {
	332	error_report_err(err);
	333	memory_region_init_ram(mr, owner, name, ram_size, &error_abort);
	334	}
	335	#else
	336	fprintf(stderr, "-mem-path not supported on this host\n");
	337	exit(1);
	338	#endif
	339	} else {
	340	memory_region_init_ram(mr, owner, name, ram_size, &error_abort);
	341	}
	342	vmstate_register_ram_global(mr);
	343	}
	344
	345	void memory_region_allocate_system_memory(MemoryRegion mr, Object owner,
	346	const char *name,
	347	uint64_t ram_size)
	348	{
	349	uint64_t addr = 0;
	350	int i;
	351
	352	if (nb_numa_nodes == 0 \|\| !have_memdevs) {
	353	allocate_system_memory_nonnuma(mr, owner, name, ram_size);
	354	return;
	355	}
	356
	357	memory_region_init(mr, owner, name, ram_size);
	358	for (i = 0; i < MAX_NODES; i++) {
	359	Error *local_err = NULL;
	360	uint64_t size = numa_info[i].node_mem;
	361	HostMemoryBackend *backend = numa_info[i].node_memdev;
	362	if (!backend) {
	363	continue;
	364	}
	365	MemoryRegion *seg = host_memory_backend_get_memory(backend, &local_err);
	366	if (local_err) {
	367	error_report_err(local_err);
	368	exit(1);
	369	}
	370
	371	if (memory_region_is_mapped(seg)) {
	372	char *path = object_get_canonical_path_component(OBJECT(backend));
	373	error_report("memory backend %s is used multiple times. Each "
	374	"-numa option must use a different memdev value.",
	375	path);
	376	exit(1);
	377	}
	378
	379	memory_region_add_subregion(mr, addr, seg);
	380	vmstate_register_ram_global(seg);
	381	addr += size;
	382	}
	383	}
	384
	385	static void numa_stat_memory_devices(uint64_t node_mem[])
	386	{
	387	MemoryDeviceInfoList *info_list = NULL;
	388	MemoryDeviceInfoList **prev = &info_list;
	389	MemoryDeviceInfoList *info;
	390
	391	qmp_pc_dimm_device_list(qdev_get_machine(), &prev);
	392	for (info = info_list; info; info = info->next) {
	393	MemoryDeviceInfo *value = info->value;
	394
	395	if (value) {
	396	switch (value->kind) {
	397	case MEMORY_DEVICE_INFO_KIND_DIMM:
	398	node_mem[value->dimm->node] += value->dimm->size;
	399	break;
	400	default:
	401	break;
	402	}
	403	}
	404	}
	405	qapi_free_MemoryDeviceInfoList(info_list);
	406	}
	407
	408	void query_numa_node_mem(uint64_t node_mem[])
	409	{
	410	int i;
	411
	412	if (nb_numa_nodes <= 0) {
	413	return;
	414	}
	415
	416	numa_stat_memory_devices(node_mem);
	417	for (i = 0; i < nb_numa_nodes; i++) {
	418	node_mem[i] += numa_info[i].node_mem;
	419	}
	420	}
	421
	422	static int query_memdev(Object obj, void opaque)
	423	{
	424	MemdevList **list = opaque;
	425	MemdevList *m = NULL;
	426	Error *err = NULL;
	427
	428	if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
	429	m = g_malloc0(sizeof(*m));
	430
	431	m->value = g_malloc0(sizeof(*m->value));
	432
	433	m->value->size = object_property_get_int(obj, "size",
	434	&err);
	435	if (err) {
	436	goto error;
	437	}
	438
	439	m->value->merge = object_property_get_bool(obj, "merge",
	440	&err);
	441	if (err) {
	442	goto error;
	443	}
	444
	445	m->value->dump = object_property_get_bool(obj, "dump",
	446	&err);
	447	if (err) {
	448	goto error;
	449	}
	450
	451	m->value->prealloc = object_property_get_bool(obj,
	452	"prealloc", &err);
	453	if (err) {
	454	goto error;
	455	}
	456
	457	m->value->policy = object_property_get_enum(obj,
	458	"policy",
	459	"HostMemPolicy",
	460	&err);
	461	if (err) {
	462	goto error;
	463	}
	464
	465	object_property_get_uint16List(obj, "host-nodes",
	466	&m->value->host_nodes, &err);
	467	if (err) {
	468	goto error;
	469	}
	470
	471	m->next = *list;
	472	*list = m;
	473	}
	474
	475	return 0;
	476	error:
	477	g_free(m->value);
	478	g_free(m);
	479
	480	return -1;
	481	}
	482
	483	MemdevList qmp_query_memdev(Error *errp)
	484	{
	485	Object *obj;
	486	MemdevList *list = NULL;
	487
	488	obj = object_get_objects_root();
	489	if (obj == NULL) {
	490	return NULL;
	491	}
	492
	493	if (object_child_foreach(obj, query_memdev, &list) != 0) {
	494	goto error;
	495	}
	496
	497	return list;
	498
	499	error:
	500	qapi_free_MemdevList(list);
	501	return NULL;
	502	}