Git Repo - qemu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* QEMU Enhanced Disk Format
	3	*
	4	* Copyright IBM, Corp. 2010
	5	*
	6	* Authors:
	7	* Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
	8	* Anthony Liguori <aliguori@us.ibm.com>
	9	*
	10	* This work is licensed under the terms of the GNU LGPL, version 2 or later.
	11	* See the COPYING.LIB file in the top-level directory.
	12	*
	13	*/
	14
	15	#include "qemu/osdep.h"
	16	#include "block/qdict.h"
	17	#include "qapi/error.h"
	18	#include "qemu/timer.h"
	19	#include "qemu/bswap.h"
	20	#include "qemu/main-loop.h"
	21	#include "qemu/module.h"
	22	#include "qemu/option.h"
	23	#include "trace.h"
	24	#include "qed.h"
	25	#include "sysemu/block-backend.h"
	26	#include "qapi/qmp/qdict.h"
	27	#include "qapi/qobject-input-visitor.h"
	28	#include "qapi/qapi-visit-block-core.h"
	29
	30	static QemuOptsList qed_create_opts;
	31
	32	static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
	33	const char *filename)
	34	{
	35	const QEDHeader header = (const QEDHeader )buf;
	36
	37	if (buf_size < sizeof(*header)) {
	38	return 0;
	39	}
	40	if (le32_to_cpu(header->magic) != QED_MAGIC) {
	41	return 0;
	42	}
	43	return 100;
	44	}
	45
	/**
	 * Check whether an image format is raw
	 *
	 * @fmt: Backing file format, may be NULL
	 * @ret: true only if @fmt is non-NULL and exactly equal to "raw"
	 */
	static bool qed_fmt_is_raw(const char *fmt)
	{
	    return fmt && strcmp(fmt, "raw") == 0;
	}
	55
	56	static void qed_header_le_to_cpu(const QEDHeader le, QEDHeader cpu)
	57	{
	58	cpu->magic = le32_to_cpu(le->magic);
	59	cpu->cluster_size = le32_to_cpu(le->cluster_size);
	60	cpu->table_size = le32_to_cpu(le->table_size);
	61	cpu->header_size = le32_to_cpu(le->header_size);
	62	cpu->features = le64_to_cpu(le->features);
	63	cpu->compat_features = le64_to_cpu(le->compat_features);
	64	cpu->autoclear_features = le64_to_cpu(le->autoclear_features);
	65	cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset);
	66	cpu->image_size = le64_to_cpu(le->image_size);
	67	cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset);
	68	cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size);
	69	}
	70
	71	static void qed_header_cpu_to_le(const QEDHeader cpu, QEDHeader le)
	72	{
	73	le->magic = cpu_to_le32(cpu->magic);
	74	le->cluster_size = cpu_to_le32(cpu->cluster_size);
	75	le->table_size = cpu_to_le32(cpu->table_size);
	76	le->header_size = cpu_to_le32(cpu->header_size);
	77	le->features = cpu_to_le64(cpu->features);
	78	le->compat_features = cpu_to_le64(cpu->compat_features);
	79	le->autoclear_features = cpu_to_le64(cpu->autoclear_features);
	80	le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset);
	81	le->image_size = cpu_to_le64(cpu->image_size);
	82	le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset);
	83	le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size);
	84	}
	85
	86	int qed_write_header_sync(BDRVQEDState *s)
	87	{
	88	QEDHeader le;
	89	int ret;
	90
	91	qed_header_cpu_to_le(&s->header, &le);
	92	ret = bdrv_pwrite(s->bs->file, 0, &le, sizeof(le));
	93	if (ret != sizeof(le)) {
	94	return ret;
	95	}
	96	return 0;
	97	}
	98
	99	/**
	100	* Update header in-place (does not rewrite backing filename or other strings)
	101	*
	102	* This function only updates known header fields in-place and does not affect
	103	* extra data after the QED header.
	104	*
	105	* No new allocating reqs can start while this function runs.
	106	*/
	107	static int coroutine_fn qed_write_header(BDRVQEDState *s)
	108	{
	109	/* We must write full sectors for O_DIRECT but cannot necessarily generate
	110	* the data following the header if an unrecognized compat feature is
	111	* active. Therefore, first read the sectors containing the header, update
	112	* them, and write back.
	113	*/
	114
	115	int nsectors = DIV_ROUND_UP(sizeof(QEDHeader), BDRV_SECTOR_SIZE);
	116	size_t len = nsectors * BDRV_SECTOR_SIZE;
	117	uint8_t *buf;
	118	int ret;
	119
	120	assert(s->allocating_acb \|\| s->allocating_write_reqs_plugged);
	121
	122	buf = qemu_blockalign(s->bs, len);
	123
	124	ret = bdrv_co_pread(s->bs->file, 0, len, buf, 0);
	125	if (ret < 0) {
	126	goto out;
	127	}
	128
	129	/* Update header */
	130	qed_header_cpu_to_le(&s->header, (QEDHeader *) buf);
	131
	132	ret = bdrv_co_pwrite(s->bs->file, 0, len, buf, 0);
	133	if (ret < 0) {
	134	goto out;
	135	}
	136
	137	ret = 0;
	138	out:
	139	qemu_vfree(buf);
	140	return ret;
	141	}
	142
	/**
	 * Compute the largest image size addressable with a given geometry
	 *
	 * @cluster_size: Cluster size in bytes
	 * @table_size:   Table size in clusters
	 * @ret:          Number of table entries squared times the cluster size,
	 *                saturated at UINT64_MAX
	 *
	 * One L1 table with table_entries entries points at L2 tables that each map
	 * table_entries clusters.  For large geometries (for example 64 MiB clusters
	 * with 16-cluster tables) the exact product does not fit in 64 bits; it is
	 * clamped instead of being allowed to wrap around to a small value.
	 */
	static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
	{
	    uint64_t table_entries;
	    uint64_t l2_size;

	    /* Widen before multiplying so the byte size of a table cannot wrap */
	    table_entries = ((uint64_t)table_size * cluster_size) / sizeof(uint64_t);

	    /* Bytes mapped by one L2 table */
	    if (cluster_size != 0 && table_entries > UINT64_MAX / cluster_size) {
	        return UINT64_MAX;
	    }
	    l2_size = table_entries * cluster_size;

	    /* Bytes mapped by the whole L1 table */
	    if (table_entries != 0 && l2_size > UINT64_MAX / table_entries) {
	        return UINT64_MAX;
	    }
	    return l2_size * table_entries;
	}
	153
	154	static bool qed_is_cluster_size_valid(uint32_t cluster_size)
	155	{
	156	if (cluster_size < QED_MIN_CLUSTER_SIZE \|\|
	157	cluster_size > QED_MAX_CLUSTER_SIZE) {
	158	return false;
	159	}
	160	if (cluster_size & (cluster_size - 1)) {
	161	return false; /* not power of 2 */
	162	}
	163	return true;
	164	}
	165
	166	static bool qed_is_table_size_valid(uint32_t table_size)
	167	{
	168	if (table_size < QED_MIN_TABLE_SIZE \|\|
	169	table_size > QED_MAX_TABLE_SIZE) {
	170	return false;
	171	}
	172	if (table_size & (table_size - 1)) {
	173	return false; /* not power of 2 */
	174	}
	175	return true;
	176	}
	177
	178	static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size,
	179	uint32_t table_size)
	180	{
	181	if (image_size % BDRV_SECTOR_SIZE != 0) {
	182	return false; /* not multiple of sector size */
	183	}
	184	if (image_size > qed_max_image_size(cluster_size, table_size)) {
	185	return false; /* image is too large */
	186	}
	187	return true;
	188	}
	189
	190	/**
	191	* Read a string of known length from the image file
	192	*
	193	* @file: Image file
	194	* @offset: File offset to start of string, in bytes
	195	* @n: String length in bytes
	196	* @buf: Destination buffer
	197	* @buflen: Destination buffer length in bytes
	198	* @ret: 0 on success, -errno on failure
	199	*
	200	* The string is NUL-terminated.
	201	*/
	202	static int qed_read_string(BdrvChild *file, uint64_t offset, size_t n,
	203	char *buf, size_t buflen)
	204	{
	205	int ret;
	206	if (n >= buflen) {
	207	return -EINVAL;
	208	}
	209	ret = bdrv_pread(file, offset, buf, n);
	210	if (ret < 0) {
	211	return ret;
	212	}
	213	buf[n] = '\0';
	214	return 0;
	215	}
	216
	217	/**
	218	* Allocate new clusters
	219	*
	220	* @s: QED state
	221	* @n: Number of contiguous clusters to allocate
	222	* @ret: Offset of first allocated cluster
	223	*
	224	* This function only produces the offset where the new clusters should be
	225	* written. It updates BDRVQEDState but does not make any changes to the image
	226	* file.
	227	*
	228	* Called with table_lock held.
	229	*/
	230	static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n)
	231	{
	232	uint64_t offset = s->file_size;
	233	s->file_size += n * s->header.cluster_size;
	234	return offset;
	235	}
	236
	237	QEDTable qed_alloc_table(BDRVQEDState s)
	238	{
	239	/* Honor O_DIRECT memory alignment requirements */
	240	return qemu_blockalign(s->bs,
	241	s->header.cluster_size * s->header.table_size);
	242	}
	243
	244	/**
	245	* Allocate a new zeroed L2 table
	246	*
	247	* Called with table_lock held.
	248	*/
	249	static CachedL2Table qed_new_l2_table(BDRVQEDState s)
	250	{
	251	CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
	252
	253	l2_table->table = qed_alloc_table(s);
	254	l2_table->offset = qed_alloc_clusters(s, s->header.table_size);
	255
	256	memset(l2_table->table->offsets, 0,
	257	s->header.cluster_size * s->header.table_size);
	258	return l2_table;
	259	}
	260
	261	static bool qed_plug_allocating_write_reqs(BDRVQEDState *s)
	262	{
	263	qemu_co_mutex_lock(&s->table_lock);
	264
	265	/* No reentrancy is allowed. */
	266	assert(!s->allocating_write_reqs_plugged);
	267	if (s->allocating_acb != NULL) {
	268	/* Another allocating write came concurrently. This cannot happen
	269	* from bdrv_qed_co_drain_begin, but it can happen when the timer runs.
	270	*/
	271	qemu_co_mutex_unlock(&s->table_lock);
	272	return false;
	273	}
	274
	275	s->allocating_write_reqs_plugged = true;
	276	qemu_co_mutex_unlock(&s->table_lock);
	277	return true;
	278	}
	279
	280	static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
	281	{
	282	qemu_co_mutex_lock(&s->table_lock);
	283	assert(s->allocating_write_reqs_plugged);
	284	s->allocating_write_reqs_plugged = false;
	285	qemu_co_queue_next(&s->allocating_write_reqs);
	286	qemu_co_mutex_unlock(&s->table_lock);
	287	}
	288
	289	static void coroutine_fn qed_need_check_timer_entry(void *opaque)
	290	{
	291	BDRVQEDState *s = opaque;
	292	int ret;
	293
	294	trace_qed_need_check_timer_cb(s);
	295
	296	if (!qed_plug_allocating_write_reqs(s)) {
	297	return;
	298	}
	299
	300	/* Ensure writes are on disk before clearing flag */
	301	ret = bdrv_co_flush(s->bs->file->bs);
	302	if (ret < 0) {
	303	qed_unplug_allocating_write_reqs(s);
	304	return;
	305	}
	306
	307	s->header.features &= ~QED_F_NEED_CHECK;
	308	ret = qed_write_header(s);
	309	(void) ret;
	310
	311	qed_unplug_allocating_write_reqs(s);
	312
	313	ret = bdrv_co_flush(s->bs);
	314	(void) ret;
	315	}
	316
	317	static void qed_need_check_timer_cb(void *opaque)
	318	{
	319	Coroutine *co = qemu_coroutine_create(qed_need_check_timer_entry, opaque);
	320	qemu_coroutine_enter(co);
	321	}
	322
	323	static void qed_start_need_check_timer(BDRVQEDState *s)
	324	{
	325	trace_qed_start_need_check_timer(s);
	326
	327	/* Use QEMU_CLOCK_VIRTUAL so we don't alter the image file while suspended for
	328	* migration.
	329	*/
	330	timer_mod(s->need_check_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
	331	NANOSECONDS_PER_SECOND * QED_NEED_CHECK_TIMEOUT);
	332	}
	333
	334	/* It's okay to call this multiple times or when no timer is started */
	335	static void qed_cancel_need_check_timer(BDRVQEDState *s)
	336	{
	337	trace_qed_cancel_need_check_timer(s);
	338	timer_del(s->need_check_timer);
	339	}
	340
	341	static void bdrv_qed_detach_aio_context(BlockDriverState *bs)
	342	{
	343	BDRVQEDState *s = bs->opaque;
	344
	345	qed_cancel_need_check_timer(s);
	346	timer_free(s->need_check_timer);
	347	}
	348
	349	static void bdrv_qed_attach_aio_context(BlockDriverState *bs,
	350	AioContext *new_context)
	351	{
	352	BDRVQEDState *s = bs->opaque;
	353
	354	s->need_check_timer = aio_timer_new(new_context,
	355	QEMU_CLOCK_VIRTUAL, SCALE_NS,
	356	qed_need_check_timer_cb, s);
	357	if (s->header.features & QED_F_NEED_CHECK) {
	358	qed_start_need_check_timer(s);
	359	}
	360	}
	361
	362	static void coroutine_fn bdrv_qed_co_drain_begin(BlockDriverState *bs)
	363	{
	364	BDRVQEDState *s = bs->opaque;
	365
	366	/* Fire the timer immediately in order to start doing I/O as soon as the
	367	* header is flushed.
	368	*/
	369	if (s->need_check_timer && timer_pending(s->need_check_timer)) {
	370	qed_cancel_need_check_timer(s);
	371	qed_need_check_timer_entry(s);
	372	}
	373	}
	374
	375	static void bdrv_qed_init_state(BlockDriverState *bs)
	376	{
	377	BDRVQEDState *s = bs->opaque;
	378
	379	memset(s, 0, sizeof(BDRVQEDState));
	380	s->bs = bs;
	381	qemu_co_mutex_init(&s->table_lock);
	382	qemu_co_queue_init(&s->allocating_write_reqs);
	383	}
	384
	385	/* Called with table_lock held. */
	386	static int coroutine_fn bdrv_qed_do_open(BlockDriverState bs, QDict options,
	387	int flags, Error **errp)
	388	{
	389	BDRVQEDState *s = bs->opaque;
	390	QEDHeader le_header;
	391	int64_t file_size;
	392	int ret;
	393
	394	ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header));
	395	if (ret < 0) {
	396	return ret;
	397	}
	398	qed_header_le_to_cpu(&le_header, &s->header);
	399
	400	if (s->header.magic != QED_MAGIC) {
	401	error_setg(errp, "Image not in QED format");
	402	return -EINVAL;
	403	}
	404	if (s->header.features & ~QED_FEATURE_MASK) {
	405	/* image uses unsupported feature bits */
	406	error_setg(errp, "Unsupported QED features: %" PRIx64,
	407	s->header.features & ~QED_FEATURE_MASK);
	408	return -ENOTSUP;
	409	}
	410	if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
	411	return -EINVAL;
	412	}
	413
	414	/* Round down file size to the last cluster */
	415	file_size = bdrv_getlength(bs->file->bs);
	416	if (file_size < 0) {
	417	return file_size;
	418	}
	419	s->file_size = qed_start_of_cluster(s, file_size);
	420
	421	if (!qed_is_table_size_valid(s->header.table_size)) {
	422	return -EINVAL;
	423	}
	424	if (!qed_is_image_size_valid(s->header.image_size,
	425	s->header.cluster_size,
	426	s->header.table_size)) {
	427	return -EINVAL;
	428	}
	429	if (!qed_check_table_offset(s, s->header.l1_table_offset)) {
	430	return -EINVAL;
	431	}
	432
	433	s->table_nelems = (s->header.cluster_size * s->header.table_size) /
	434	sizeof(uint64_t);
	435	s->l2_shift = ctz32(s->header.cluster_size);
	436	s->l2_mask = s->table_nelems - 1;
	437	s->l1_shift = s->l2_shift + ctz32(s->table_nelems);
	438
	439	/* Header size calculation must not overflow uint32_t */
	440	if (s->header.header_size > UINT32_MAX / s->header.cluster_size) {
	441	return -EINVAL;
	442	}
	443
	444	if ((s->header.features & QED_F_BACKING_FILE)) {
	445	if ((uint64_t)s->header.backing_filename_offset +
	446	s->header.backing_filename_size >
	447	s->header.cluster_size * s->header.header_size) {
	448	return -EINVAL;
	449	}
	450
	451	ret = qed_read_string(bs->file, s->header.backing_filename_offset,
	452	s->header.backing_filename_size,
	453	bs->auto_backing_file,
	454	sizeof(bs->auto_backing_file));
	455	if (ret < 0) {
	456	return ret;
	457	}
	458	pstrcpy(bs->backing_file, sizeof(bs->backing_file),
	459	bs->auto_backing_file);
	460
	461	if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) {
	462	pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw");
	463	}
	464	}
	465
	466	/* Reset unknown autoclear feature bits. This is a backwards
	467	* compatibility mechanism that allows images to be opened by older
	468	* programs, which "knock out" unknown feature bits. When an image is
	469	* opened by a newer program again it can detect that the autoclear
	470	* feature is no longer valid.
	471	*/
	472	if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 &&
	473	!bdrv_is_read_only(bs->file->bs) && !(flags & BDRV_O_INACTIVE)) {
	474	s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK;
	475
	476	ret = qed_write_header_sync(s);
	477	if (ret) {
	478	return ret;
	479	}
	480
	481	/* From here on only known autoclear feature bits are valid */
	482	bdrv_flush(bs->file->bs);
	483	}
	484
	485	s->l1_table = qed_alloc_table(s);
	486	qed_init_l2_cache(&s->l2_cache);
	487
	488	ret = qed_read_l1_table_sync(s);
	489	if (ret) {
	490	goto out;
	491	}
	492
	493	/* If image was not closed cleanly, check consistency */
	494	if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) {
	495	/* Read-only images cannot be fixed. There is no risk of corruption
	496	* since write operations are not possible. Therefore, allow
	497	* potentially inconsistent images to be opened read-only. This can
	498	* aid data recovery from an otherwise inconsistent image.
	499	*/
	500	if (!bdrv_is_read_only(bs->file->bs) &&
	501	!(flags & BDRV_O_INACTIVE)) {
	502	BdrvCheckResult result = {0};
	503
	504	ret = qed_check(s, &result, true);
	505	if (ret) {
	506	goto out;
	507	}
	508	}
	509	}
	510
	511	bdrv_qed_attach_aio_context(bs, bdrv_get_aio_context(bs));
	512
	513	out:
	514	if (ret) {
	515	qed_free_l2_cache(&s->l2_cache);
	516	qemu_vfree(s->l1_table);
	517	}
	518	return ret;
	519	}
	520
	521	typedef struct QEDOpenCo {
	522	BlockDriverState *bs;
	523	QDict *options;
	524	int flags;
	525	Error **errp;
	526	int ret;
	527	} QEDOpenCo;
	528
	529	static void coroutine_fn bdrv_qed_open_entry(void *opaque)
	530	{
	531	QEDOpenCo *qoc = opaque;
	532	BDRVQEDState *s = qoc->bs->opaque;
	533
	534	qemu_co_mutex_lock(&s->table_lock);
	535	qoc->ret = bdrv_qed_do_open(qoc->bs, qoc->options, qoc->flags, qoc->errp);
	536	qemu_co_mutex_unlock(&s->table_lock);
	537	}
	538
	539	static int bdrv_qed_open(BlockDriverState bs, QDict options, int flags,
	540	Error **errp)
	541	{
	542	QEDOpenCo qoc = {
	543	.bs = bs,
	544	.options = options,
	545	.flags = flags,
	546	.errp = errp,
	547	.ret = -EINPROGRESS
	548	};
	549
	550	bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
	551	false, errp);
	552	if (!bs->file) {
	553	return -EINVAL;
	554	}
	555
	556	bdrv_qed_init_state(bs);
	557	if (qemu_in_coroutine()) {
	558	bdrv_qed_open_entry(&qoc);
	559	} else {
	560	assert(qemu_get_current_aio_context() == qemu_get_aio_context());
	561	qemu_coroutine_enter(qemu_coroutine_create(bdrv_qed_open_entry, &qoc));
	562	BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
	563	}
	564	BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
	565	return qoc.ret;
	566	}
	567
	568	static void bdrv_qed_refresh_limits(BlockDriverState bs, Error *errp)
	569	{
	570	BDRVQEDState *s = bs->opaque;
	571
	572	bs->bl.pwrite_zeroes_alignment = s->header.cluster_size;
	573	}
	574
	575	/* We have nothing to do for QED reopen, stubs just return
	576	* success */
	577	static int bdrv_qed_reopen_prepare(BDRVReopenState *state,
	578	BlockReopenQueue queue, Error *errp)
	579	{
	580	return 0;
	581	}
	582
	583	static void bdrv_qed_close(BlockDriverState *bs)
	584	{
	585	BDRVQEDState *s = bs->opaque;
	586
	587	bdrv_qed_detach_aio_context(bs);
	588
	589	/* Ensure writes reach stable storage */
	590	bdrv_flush(bs->file->bs);
	591
	592	/* Clean shutdown, no check required on next open */
	593	if (s->header.features & QED_F_NEED_CHECK) {
	594	s->header.features &= ~QED_F_NEED_CHECK;
	595	qed_write_header_sync(s);
	596	}
	597
	598	qed_free_l2_cache(&s->l2_cache);
	599	qemu_vfree(s->l1_table);
	600	}
	601
	602	static int coroutine_fn bdrv_qed_co_create(BlockdevCreateOptions *opts,
	603	Error **errp)
	604	{
	605	BlockdevCreateOptionsQed *qed_opts;
	606	BlockBackend *blk = NULL;
	607	BlockDriverState *bs = NULL;
	608
	609	QEDHeader header;
	610	QEDHeader le_header;
	611	uint8_t *l1_table = NULL;
	612	size_t l1_size;
	613	int ret = 0;
	614
	615	assert(opts->driver == BLOCKDEV_DRIVER_QED);
	616	qed_opts = &opts->u.qed;
	617
	618	/* Validate options and set default values */
	619	if (!qed_opts->has_cluster_size) {
	620	qed_opts->cluster_size = QED_DEFAULT_CLUSTER_SIZE;
	621	}
	622	if (!qed_opts->has_table_size) {
	623	qed_opts->table_size = QED_DEFAULT_TABLE_SIZE;
	624	}
	625
	626	if (!qed_is_cluster_size_valid(qed_opts->cluster_size)) {
	627	error_setg(errp, "QED cluster size must be within range [%u, %u] "
	628	"and power of 2",
	629	QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE);
	630	return -EINVAL;
	631	}
	632	if (!qed_is_table_size_valid(qed_opts->table_size)) {
	633	error_setg(errp, "QED table size must be within range [%u, %u] "
	634	"and power of 2",
	635	QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE);
	636	return -EINVAL;
	637	}
	638	if (!qed_is_image_size_valid(qed_opts->size, qed_opts->cluster_size,
	639	qed_opts->table_size))
	640	{
	641	error_setg(errp, "QED image size must be a non-zero multiple of "
	642	"cluster size and less than %" PRIu64 " bytes",
	643	qed_max_image_size(qed_opts->cluster_size,
	644	qed_opts->table_size));
	645	return -EINVAL;
	646	}
	647
	648	/* Create BlockBackend to write to the image */
	649	bs = bdrv_open_blockdev_ref(qed_opts->file, errp);
	650	if (bs == NULL) {
	651	return -EIO;
	652	}
	653
	654	blk = blk_new(bdrv_get_aio_context(bs),
	655	BLK_PERM_WRITE \| BLK_PERM_RESIZE, BLK_PERM_ALL);
	656	ret = blk_insert_bs(blk, bs, errp);
	657	if (ret < 0) {
	658	goto out;
	659	}
	660	blk_set_allow_write_beyond_eof(blk, true);
	661
	662	/* Prepare image format */
	663	header = (QEDHeader) {
	664	.magic = QED_MAGIC,
	665	.cluster_size = qed_opts->cluster_size,
	666	.table_size = qed_opts->table_size,
	667	.header_size = 1,
	668	.features = 0,
	669	.compat_features = 0,
	670	.l1_table_offset = qed_opts->cluster_size,
	671	.image_size = qed_opts->size,
	672	};
	673
	674	l1_size = header.cluster_size * header.table_size;
	675
	676	/* File must start empty and grow, check truncate is supported */
	677	ret = blk_truncate(blk, 0, PREALLOC_MODE_OFF, errp);
	678	if (ret < 0) {
	679	goto out;
	680	}
	681
	682	if (qed_opts->has_backing_file) {
	683	header.features \|= QED_F_BACKING_FILE;
	684	header.backing_filename_offset = sizeof(le_header);
	685	header.backing_filename_size = strlen(qed_opts->backing_file);
	686
	687	if (qed_opts->has_backing_fmt) {
	688	const char *backing_fmt = BlockdevDriver_str(qed_opts->backing_fmt);
	689	if (qed_fmt_is_raw(backing_fmt)) {
	690	header.features \|= QED_F_BACKING_FORMAT_NO_PROBE;
	691	}
	692	}
	693	}
	694
	695	qed_header_cpu_to_le(&header, &le_header);
	696	ret = blk_pwrite(blk, 0, &le_header, sizeof(le_header), 0);
	697	if (ret < 0) {
	698	goto out;
	699	}
	700	ret = blk_pwrite(blk, sizeof(le_header), qed_opts->backing_file,
	701	header.backing_filename_size, 0);
	702	if (ret < 0) {
	703	goto out;
	704	}
	705
	706	l1_table = g_malloc0(l1_size);
	707	ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size, 0);
	708	if (ret < 0) {
	709	goto out;
	710	}
	711
	712	ret = 0; /* success */
	713	out:
	714	g_free(l1_table);
	715	blk_unref(blk);
	716	bdrv_unref(bs);
	717	return ret;
	718	}
	719
	720	static int coroutine_fn bdrv_qed_co_create_opts(const char *filename,
	721	QemuOpts *opts,
	722	Error **errp)
	723	{
	724	BlockdevCreateOptions *create_options = NULL;
	725	QDict *qdict;
	726	Visitor *v;
	727	BlockDriverState *bs = NULL;
	728	Error *local_err = NULL;
	729	int ret;
	730
	731	static const QDictRenames opt_renames[] = {
	732	{ BLOCK_OPT_BACKING_FILE, "backing-file" },
	733	{ BLOCK_OPT_BACKING_FMT, "backing-fmt" },
	734	{ BLOCK_OPT_CLUSTER_SIZE, "cluster-size" },
	735	{ BLOCK_OPT_TABLE_SIZE, "table-size" },
	736	{ NULL, NULL },
	737	};
	738
	739	/* Parse options and convert legacy syntax */
	740	qdict = qemu_opts_to_qdict_filtered(opts, NULL, &qed_create_opts, true);
	741
	742	if (!qdict_rename_keys(qdict, opt_renames, errp)) {
	743	ret = -EINVAL;
	744	goto fail;
	745	}
	746
	747	/* Create and open the file (protocol layer) */
	748	ret = bdrv_create_file(filename, opts, &local_err);
	749	if (ret < 0) {
	750	error_propagate(errp, local_err);
	751	goto fail;
	752	}
	753
	754	bs = bdrv_open(filename, NULL, NULL,
	755	BDRV_O_RDWR \| BDRV_O_RESIZE \| BDRV_O_PROTOCOL, errp);
	756	if (bs == NULL) {
	757	ret = -EIO;
	758	goto fail;
	759	}
	760
	761	/* Now get the QAPI type BlockdevCreateOptions */
	762	qdict_put_str(qdict, "driver", "qed");
	763	qdict_put_str(qdict, "file", bs->node_name);
	764
	765	v = qobject_input_visitor_new_flat_confused(qdict, errp);
	766	if (!v) {
	767	ret = -EINVAL;
	768	goto fail;
	769	}
	770
	771	visit_type_BlockdevCreateOptions(v, NULL, &create_options, &local_err);
	772	visit_free(v);
	773
	774	if (local_err) {
	775	error_propagate(errp, local_err);
	776	ret = -EINVAL;
	777	goto fail;
	778	}
	779
	780	/* Silently round up size */
	781	assert(create_options->driver == BLOCKDEV_DRIVER_QED);
	782	create_options->u.qed.size =
	783	ROUND_UP(create_options->u.qed.size, BDRV_SECTOR_SIZE);
	784
	785	/* Create the qed image (format layer) */
	786	ret = bdrv_qed_co_create(create_options, errp);
	787
	788	fail:
	789	qobject_unref(qdict);
	790	bdrv_unref(bs);
	791	qapi_free_BlockdevCreateOptions(create_options);
	792	return ret;
	793	}
	794
	795	static int coroutine_fn bdrv_qed_co_block_status(BlockDriverState *bs,
	796	bool want_zero,
	797	int64_t pos, int64_t bytes,
	798	int64_t pnum, int64_t map,
	799	BlockDriverState **file)
	800	{
	801	BDRVQEDState *s = bs->opaque;
	802	size_t len = MIN(bytes, SIZE_MAX);
	803	int status;
	804	QEDRequest request = { .l2_table = NULL };
	805	uint64_t offset;
	806	int ret;
	807
	808	qemu_co_mutex_lock(&s->table_lock);
	809	ret = qed_find_cluster(s, &request, pos, &len, &offset);
	810
	811	*pnum = len;
	812	switch (ret) {
	813	case QED_CLUSTER_FOUND:
	814	*map = offset \| qed_offset_into_cluster(s, pos);
	815	status = BDRV_BLOCK_DATA \| BDRV_BLOCK_OFFSET_VALID;
	816	*file = bs->file->bs;
	817	break;
	818	case QED_CLUSTER_ZERO:
	819	status = BDRV_BLOCK_ZERO;
	820	break;
	821	case QED_CLUSTER_L2:
	822	case QED_CLUSTER_L1:
	823	status = 0;
	824	break;
	825	default:
	826	assert(ret < 0);
	827	status = ret;
	828	break;
	829	}
	830
	831	qed_unref_l2_cache_entry(request.l2_table);
	832	qemu_co_mutex_unlock(&s->table_lock);
	833
	834	return status;
	835	}
	836
	837	static BDRVQEDState acb_to_s(QEDAIOCB acb)
	838	{
	839	return acb->bs->opaque;
	840	}
	841
	842	/**
	843	* Read from the backing file or zero-fill if no backing file
	844	*
	845	* @s: QED state
	846	* @pos: Byte position in device
	847	* @qiov: Destination I/O vector
	848	* @backing_qiov: Possibly shortened copy of qiov, to be allocated here
	849	* @cb: Completion function
	850	* @opaque: User data for completion function
	851	*
	852	* This function reads qiov->size bytes starting at pos from the backing file.
	853	* If there is no backing file then zeroes are read.
	854	*/
	855	static int coroutine_fn qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
	856	QEMUIOVector *qiov,
	857	QEMUIOVector **backing_qiov)
	858	{
	859	uint64_t backing_length = 0;
	860	size_t size;
	861	int ret;
	862
	863	/* If there is a backing file, get its length. Treat the absence of a
	864	* backing file like a zero length backing file.
	865	*/
	866	if (s->bs->backing) {
	867	int64_t l = bdrv_getlength(s->bs->backing->bs);
	868	if (l < 0) {
	869	return l;
	870	}
	871	backing_length = l;
	872	}
	873
	874	/* Zero all sectors if reading beyond the end of the backing file */
	875	if (pos >= backing_length \|\|
	876	pos + qiov->size > backing_length) {
	877	qemu_iovec_memset(qiov, 0, 0, qiov->size);
	878	}
	879
	880	/* Complete now if there are no backing file sectors to read */
	881	if (pos >= backing_length) {
	882	return 0;
	883	}
	884
	885	/* If the read straddles the end of the backing file, shorten it */
	886	size = MIN((uint64_t)backing_length - pos, qiov->size);
	887
	888	assert(*backing_qiov == NULL);
	889	*backing_qiov = g_new(QEMUIOVector, 1);
	890	qemu_iovec_init(*backing_qiov, qiov->niov);
	891	qemu_iovec_concat(*backing_qiov, qiov, 0, size);
	892
	893	BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
	894	ret = bdrv_co_preadv(s->bs->backing, pos, size, *backing_qiov, 0);
	895	if (ret < 0) {
	896	return ret;
	897	}
	898	return 0;
	899	}
	900
	901	/**
	902	* Copy data from backing file into the image
	903	*
	904	* @s: QED state
	905	* @pos: Byte position in device
	906	* @len: Number of bytes
	907	* @offset: Byte offset in image file
	908	*/
	909	static int coroutine_fn qed_copy_from_backing_file(BDRVQEDState *s,
	910	uint64_t pos, uint64_t len,
	911	uint64_t offset)
	912	{
	913	QEMUIOVector qiov;
	914	QEMUIOVector *backing_qiov = NULL;
	915	int ret;
	916
	917	/* Skip copy entirely if there is no work to do */
	918	if (len == 0) {
	919	return 0;
	920	}
	921
	922	qemu_iovec_init_buf(&qiov, qemu_blockalign(s->bs, len), len);
	923
	924	ret = qed_read_backing_file(s, pos, &qiov, &backing_qiov);
	925
	926	if (backing_qiov) {
	927	qemu_iovec_destroy(backing_qiov);
	928	g_free(backing_qiov);
	929	backing_qiov = NULL;
	930	}
	931
	932	if (ret) {
	933	goto out;
	934	}
	935
	936	BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
	937	ret = bdrv_co_pwritev(s->bs->file, offset, qiov.size, &qiov, 0);
	938	if (ret < 0) {
	939	goto out;
	940	}
	941	ret = 0;
	942	out:
	943	qemu_vfree(qemu_iovec_buf(&qiov));
	944	return ret;
	945	}
	946
	947	/**
	948	* Link one or more contiguous clusters into a table
	949	*
	950	* @s: QED state
	951	* @table: L2 table
	952	* @index: First cluster index
	953	* @n: Number of contiguous clusters
	954	* @cluster: First cluster offset
	955	*
	956	* The cluster offset may be an allocated byte offset in the image file, the
	957	* zero cluster marker, or the unallocated cluster marker.
	958	*
	959	* Called with table_lock held.
	960	*/
	961	static void coroutine_fn qed_update_l2_table(BDRVQEDState s, QEDTable table,
	962	int index, unsigned int n,
	963	uint64_t cluster)
	964	{
	965	int i;
	966	for (i = index; i < index + n; i++) {
	967	table->offsets[i] = cluster;
	968	if (!qed_offset_is_unalloc_cluster(cluster) &&
	969	!qed_offset_is_zero_cluster(cluster)) {
	970	cluster += s->header.cluster_size;
	971	}
	972	}
	973	}
	974
	975	/* Called with table_lock held. */
	976	static void coroutine_fn qed_aio_complete(QEDAIOCB *acb)
	977	{
	978	BDRVQEDState *s = acb_to_s(acb);
	979
	980	/* Free resources */
	981	qemu_iovec_destroy(&acb->cur_qiov);
	982	qed_unref_l2_cache_entry(acb->request.l2_table);
	983
	984	/* Free the buffer we may have allocated for zero writes */
	985	if (acb->flags & QED_AIOCB_ZERO) {
	986	qemu_vfree(acb->qiov->iov[0].iov_base);
	987	acb->qiov->iov[0].iov_base = NULL;
	988	}
	989
	990	/* Start next allocating write request waiting behind this one. Note that
	991	* requests enqueue themselves when they first hit an unallocated cluster
	992	* but they wait until the entire request is finished before waking up the
	993	* next request in the queue. This ensures that we don't cycle through
	994	* requests multiple times but rather finish one at a time completely.
	995	*/
	996	if (acb == s->allocating_acb) {
	997	s->allocating_acb = NULL;
	998	if (!qemu_co_queue_empty(&s->allocating_write_reqs)) {
	999	qemu_co_queue_next(&s->allocating_write_reqs);
	1000	} else if (s->header.features & QED_F_NEED_CHECK) {
	1001	qed_start_need_check_timer(s);
	1002	}
	1003	}
	1004	}
	1005
	1006	/**
	1007	* Update L1 table with new L2 table offset and write it out
	1008	*
	1009	* Called with table_lock held.
	1010	*/
	1011	static int coroutine_fn qed_aio_write_l1_update(QEDAIOCB *acb)
	1012	{
	1013	BDRVQEDState *s = acb_to_s(acb);
	1014	CachedL2Table *l2_table = acb->request.l2_table;
	1015	uint64_t l2_offset = l2_table->offset;
	1016	int index, ret;
	1017
	1018	index = qed_l1_index(s, acb->cur_pos);
	1019	s->l1_table->offsets[index] = l2_table->offset;
	1020
	1021	ret = qed_write_l1_table(s, index, 1);
	1022
	1023	/* Commit the current L2 table to the cache */
	1024	qed_commit_l2_cache_entry(&s->l2_cache, l2_table);
	1025
	1026	/* This is guaranteed to succeed because we just committed the entry to the
	1027	* cache.
	1028	*/
	1029	acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
	1030	assert(acb->request.l2_table != NULL);
	1031
	1032	return ret;
	1033	}
	1034
	1035
	1036	/**
	1037	* Update L2 table with new cluster offsets and write them out
	1038	*
	1039	* Called with table_lock held.
	1040	*/
	1041	static int coroutine_fn qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset)
	1042	{
	1043	BDRVQEDState *s = acb_to_s(acb);
	1044	bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
	1045	int index, ret;
	1046
	1047	if (need_alloc) {
	1048	qed_unref_l2_cache_entry(acb->request.l2_table);
	1049	acb->request.l2_table = qed_new_l2_table(s);
	1050	}
	1051
	1052	index = qed_l2_index(s, acb->cur_pos);
	1053	qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters,
	1054	offset);
	1055
	1056	if (need_alloc) {
	1057	/* Write out the whole new L2 table */
	1058	ret = qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true);
	1059	if (ret) {
	1060	return ret;
	1061	}
	1062	return qed_aio_write_l1_update(acb);
	1063	} else {
	1064	/* Write out only the updated part of the L2 table */
	1065	ret = qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters,
	1066	false);
	1067	if (ret) {
	1068	return ret;
	1069	}
	1070	}
	1071	return 0;
	1072	}
	1073
	1074	/**
	1075	* Write data to the image file
	1076	*
	1077	* Called with table_lock not held.
	1078	*/
	1079	static int coroutine_fn qed_aio_write_main(QEDAIOCB *acb)
	1080	{
	1081	BDRVQEDState *s = acb_to_s(acb);
	1082	uint64_t offset = acb->cur_cluster +
	1083	qed_offset_into_cluster(s, acb->cur_pos);
	1084
	1085	trace_qed_aio_write_main(s, acb, 0, offset, acb->cur_qiov.size);
	1086
	1087	BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
	1088	return bdrv_co_pwritev(s->bs->file, offset, acb->cur_qiov.size,
	1089	&acb->cur_qiov, 0);
	1090	}
	1091
	1092	/**
	1093	* Populate untouched regions of new data cluster
	1094	*
	1095	* Called with table_lock held.
	1096	*/
	1097	static int coroutine_fn qed_aio_write_cow(QEDAIOCB *acb)
	1098	{
	1099	BDRVQEDState *s = acb_to_s(acb);
	1100	uint64_t start, len, offset;
	1101	int ret;
	1102
	1103	qemu_co_mutex_unlock(&s->table_lock);
	1104
	1105	/* Populate front untouched region of new data cluster */
	1106	start = qed_start_of_cluster(s, acb->cur_pos);
	1107	len = qed_offset_into_cluster(s, acb->cur_pos);
	1108
	1109	trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
	1110	ret = qed_copy_from_backing_file(s, start, len, acb->cur_cluster);
	1111	if (ret < 0) {
	1112	goto out;
	1113	}
	1114
	1115	/* Populate back untouched region of new data cluster */
	1116	start = acb->cur_pos + acb->cur_qiov.size;
	1117	len = qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
	1118	offset = acb->cur_cluster +
	1119	qed_offset_into_cluster(s, acb->cur_pos) +
	1120	acb->cur_qiov.size;
	1121
	1122	trace_qed_aio_write_postfill(s, acb, start, len, offset);
	1123	ret = qed_copy_from_backing_file(s, start, len, offset);
	1124	if (ret < 0) {
	1125	goto out;
	1126	}
	1127
	1128	ret = qed_aio_write_main(acb);
	1129	if (ret < 0) {
	1130	goto out;
	1131	}
	1132
	1133	if (s->bs->backing) {
	1134	/*
	1135	* Flush new data clusters before updating the L2 table
	1136	*
	1137	* This flush is necessary when a backing file is in use. A crash
	1138	* during an allocating write could result in empty clusters in the
	1139	* image. If the write only touched a subregion of the cluster,
	1140	* then backing image sectors have been lost in the untouched
	1141	* region. The solution is to flush after writing a new data
	1142	* cluster and before updating the L2 table.
	1143	*/
	1144	ret = bdrv_co_flush(s->bs->file->bs);
	1145	}
	1146
	1147	out:
	1148	qemu_co_mutex_lock(&s->table_lock);
	1149	return ret;
	1150	}
	1151
	1152	/**
	1153	* Check if the QED_F_NEED_CHECK bit should be set during allocating write
	1154	*/
	1155	static bool qed_should_set_need_check(BDRVQEDState *s)
	1156	{
	1157	/* The flush before L2 update path ensures consistency */
	1158	if (s->bs->backing) {
	1159	return false;
	1160	}
	1161
	1162	return !(s->header.features & QED_F_NEED_CHECK);
	1163	}
	1164
	1165	/**
	1166	* Write new data cluster
	1167	*
	1168	* @acb: Write request
	1169	* @len: Length in bytes
	1170	*
	1171	* This path is taken when writing to previously unallocated clusters.
	1172	*
	1173	* Called with table_lock held.
	1174	*/
	1175	static int coroutine_fn qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
	1176	{
	1177	BDRVQEDState *s = acb_to_s(acb);
	1178	int ret;
	1179
	1180	/* Cancel timer when the first allocating request comes in */
	1181	if (s->allocating_acb == NULL) {
	1182	qed_cancel_need_check_timer(s);
	1183	}
	1184
	1185	/* Freeze this request if another allocating write is in progress */
	1186	if (s->allocating_acb != acb \|\| s->allocating_write_reqs_plugged) {
	1187	if (s->allocating_acb != NULL) {
	1188	qemu_co_queue_wait(&s->allocating_write_reqs, &s->table_lock);
	1189	assert(s->allocating_acb == NULL);
	1190	}
	1191	s->allocating_acb = acb;
	1192	return -EAGAIN; /* start over with looking up table entries */
	1193	}
	1194
	1195	acb->cur_nclusters = qed_bytes_to_clusters(s,
	1196	qed_offset_into_cluster(s, acb->cur_pos) + len);
	1197	qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
	1198
	1199	if (acb->flags & QED_AIOCB_ZERO) {
	1200	/* Skip ahead if the clusters are already zero */
	1201	if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
	1202	return 0;
	1203	}
	1204	acb->cur_cluster = 1;
	1205	} else {
	1206	acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
	1207	}
	1208
	1209	if (qed_should_set_need_check(s)) {
	1210	s->header.features \|= QED_F_NEED_CHECK;
	1211	ret = qed_write_header(s);
	1212	if (ret < 0) {
	1213	return ret;
	1214	}
	1215	}
	1216
	1217	if (!(acb->flags & QED_AIOCB_ZERO)) {
	1218	ret = qed_aio_write_cow(acb);
	1219	if (ret < 0) {
	1220	return ret;
	1221	}
	1222	}
	1223
	1224	return qed_aio_write_l2_update(acb, acb->cur_cluster);
	1225	}
	1226
	1227	/**
	1228	* Write data cluster in place
	1229	*
	1230	* @acb: Write request
	1231	* @offset: Cluster offset in bytes
	1232	* @len: Length in bytes
	1233	*
	1234	* This path is taken when writing to already allocated clusters.
	1235	*
	1236	* Called with table_lock held.
	1237	*/
	1238	static int coroutine_fn qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset,
	1239	size_t len)
	1240	{
	1241	BDRVQEDState *s = acb_to_s(acb);
	1242	int r;
	1243
	1244	qemu_co_mutex_unlock(&s->table_lock);
	1245
	1246	/* Allocate buffer for zero writes */
	1247	if (acb->flags & QED_AIOCB_ZERO) {
	1248	struct iovec *iov = acb->qiov->iov;
	1249
	1250	if (!iov->iov_base) {
	1251	iov->iov_base = qemu_try_blockalign(acb->bs, iov->iov_len);
	1252	if (iov->iov_base == NULL) {
	1253	r = -ENOMEM;
	1254	goto out;
	1255	}
	1256	memset(iov->iov_base, 0, iov->iov_len);
	1257	}
	1258	}
	1259
	1260	/* Calculate the I/O vector */
	1261	acb->cur_cluster = offset;
	1262	qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
	1263
	1264	/* Do the actual write. */
	1265	r = qed_aio_write_main(acb);
	1266	out:
	1267	qemu_co_mutex_lock(&s->table_lock);
	1268	return r;
	1269	}
	1270
	1271	/**
	1272	* Write data cluster
	1273	*
	1274	* @opaque: Write request
	1275	* @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
	1276	* @offset: Cluster offset in bytes
	1277	* @len: Length in bytes
	1278	*
	1279	* Called with table_lock held.
	1280	*/
	1281	static int coroutine_fn qed_aio_write_data(void *opaque, int ret,
	1282	uint64_t offset, size_t len)
	1283	{
	1284	QEDAIOCB *acb = opaque;
	1285
	1286	trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len);
	1287
	1288	acb->find_cluster_ret = ret;
	1289
	1290	switch (ret) {
	1291	case QED_CLUSTER_FOUND:
	1292	return qed_aio_write_inplace(acb, offset, len);
	1293
	1294	case QED_CLUSTER_L2:
	1295	case QED_CLUSTER_L1:
	1296	case QED_CLUSTER_ZERO:
	1297	return qed_aio_write_alloc(acb, len);
	1298
	1299	default:
	1300	g_assert_not_reached();
	1301	}
	1302	}
	1303
	1304	/**
	1305	* Read data cluster
	1306	*
	1307	* @opaque: Read request
	1308	* @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
	1309	* @offset: Cluster offset in bytes
	1310	* @len: Length in bytes
	1311	*
	1312	* Called with table_lock held.
	1313	*/
	1314	static int coroutine_fn qed_aio_read_data(void *opaque, int ret,
	1315	uint64_t offset, size_t len)
	1316	{
	1317	QEDAIOCB *acb = opaque;
	1318	BDRVQEDState *s = acb_to_s(acb);
	1319	BlockDriverState *bs = acb->bs;
	1320	int r;
	1321
	1322	qemu_co_mutex_unlock(&s->table_lock);
	1323
	1324	/* Adjust offset into cluster */
	1325	offset += qed_offset_into_cluster(s, acb->cur_pos);
	1326
	1327	trace_qed_aio_read_data(s, acb, ret, offset, len);
	1328
	1329	qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
	1330
	1331	/* Handle zero cluster and backing file reads, otherwise read
	1332	* data cluster directly.
	1333	*/
	1334	if (ret == QED_CLUSTER_ZERO) {
	1335	qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
	1336	r = 0;
	1337	} else if (ret != QED_CLUSTER_FOUND) {
	1338	r = qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
	1339	&acb->backing_qiov);
	1340	} else {
	1341	BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
	1342	r = bdrv_co_preadv(bs->file, offset, acb->cur_qiov.size,
	1343	&acb->cur_qiov, 0);
	1344	}
	1345
	1346	qemu_co_mutex_lock(&s->table_lock);
	1347	return r;
	1348	}
	1349
	1350	/**
	1351	* Begin next I/O or complete the request
	1352	*/
	1353	static int coroutine_fn qed_aio_next_io(QEDAIOCB *acb)
	1354	{
	1355	BDRVQEDState *s = acb_to_s(acb);
	1356	uint64_t offset;
	1357	size_t len;
	1358	int ret;
	1359
	1360	qemu_co_mutex_lock(&s->table_lock);
	1361	while (1) {
	1362	trace_qed_aio_next_io(s, acb, 0, acb->cur_pos + acb->cur_qiov.size);
	1363
	1364	if (acb->backing_qiov) {
	1365	qemu_iovec_destroy(acb->backing_qiov);
	1366	g_free(acb->backing_qiov);
	1367	acb->backing_qiov = NULL;
	1368	}
	1369
	1370	acb->qiov_offset += acb->cur_qiov.size;
	1371	acb->cur_pos += acb->cur_qiov.size;
	1372	qemu_iovec_reset(&acb->cur_qiov);
	1373
	1374	/* Complete request */
	1375	if (acb->cur_pos >= acb->end_pos) {
	1376	ret = 0;
	1377	break;
	1378	}
	1379
	1380	/* Find next cluster and start I/O */
	1381	len = acb->end_pos - acb->cur_pos;
	1382	ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset);
	1383	if (ret < 0) {
	1384	break;
	1385	}
	1386
	1387	if (acb->flags & QED_AIOCB_WRITE) {
	1388	ret = qed_aio_write_data(acb, ret, offset, len);
	1389	} else {
	1390	ret = qed_aio_read_data(acb, ret, offset, len);
	1391	}
	1392
	1393	if (ret < 0 && ret != -EAGAIN) {
	1394	break;
	1395	}
	1396	}
	1397
	1398	trace_qed_aio_complete(s, acb, ret);
	1399	qed_aio_complete(acb);
	1400	qemu_co_mutex_unlock(&s->table_lock);
	1401	return ret;
	1402	}
	1403
	1404	static int coroutine_fn qed_co_request(BlockDriverState *bs, int64_t sector_num,
	1405	QEMUIOVector *qiov, int nb_sectors,
	1406	int flags)
	1407	{
	1408	QEDAIOCB acb = {
	1409	.bs = bs,
	1410	.cur_pos = (uint64_t) sector_num * BDRV_SECTOR_SIZE,
	1411	.end_pos = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE,
	1412	.qiov = qiov,
	1413	.flags = flags,
	1414	};
	1415	qemu_iovec_init(&acb.cur_qiov, qiov->niov);
	1416
	1417	trace_qed_aio_setup(bs->opaque, &acb, sector_num, nb_sectors, NULL, flags);
	1418
	1419	/* Start request */
	1420	return qed_aio_next_io(&acb);
	1421	}
	1422
	1423	static int coroutine_fn bdrv_qed_co_readv(BlockDriverState *bs,
	1424	int64_t sector_num, int nb_sectors,
	1425	QEMUIOVector *qiov)
	1426	{
	1427	return qed_co_request(bs, sector_num, qiov, nb_sectors, 0);
	1428	}
	1429
	1430	static int coroutine_fn bdrv_qed_co_writev(BlockDriverState *bs,
	1431	int64_t sector_num, int nb_sectors,
	1432	QEMUIOVector *qiov, int flags)
	1433	{
	1434	assert(!flags);
	1435	return qed_co_request(bs, sector_num, qiov, nb_sectors, QED_AIOCB_WRITE);
	1436	}
	1437
	1438	static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
	1439	int64_t offset,
	1440	int bytes,
	1441	BdrvRequestFlags flags)
	1442	{
	1443	BDRVQEDState *s = bs->opaque;
	1444
	1445	/*
	1446	* Zero writes start without an I/O buffer. If a buffer becomes necessary
	1447	* then it will be allocated during request processing.
	1448	*/
	1449	QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes);
	1450
	1451	/* Fall back if the request is not aligned */
	1452	if (qed_offset_into_cluster(s, offset) \|\|
	1453	qed_offset_into_cluster(s, bytes)) {
	1454	return -ENOTSUP;
	1455	}
	1456
	1457	return qed_co_request(bs, offset >> BDRV_SECTOR_BITS, &qiov,
	1458	bytes >> BDRV_SECTOR_BITS,
	1459	QED_AIOCB_WRITE \| QED_AIOCB_ZERO);
	1460	}
	1461
	1462	static int coroutine_fn bdrv_qed_co_truncate(BlockDriverState *bs,
	1463	int64_t offset,
	1464	PreallocMode prealloc,
	1465	Error **errp)
	1466	{
	1467	BDRVQEDState *s = bs->opaque;
	1468	uint64_t old_image_size;
	1469	int ret;
	1470
	1471	if (prealloc != PREALLOC_MODE_OFF) {
	1472	error_setg(errp, "Unsupported preallocation mode '%s'",
	1473	PreallocMode_str(prealloc));
	1474	return -ENOTSUP;
	1475	}
	1476
	1477	if (!qed_is_image_size_valid(offset, s->header.cluster_size,
	1478	s->header.table_size)) {
	1479	error_setg(errp, "Invalid image size specified");
	1480	return -EINVAL;
	1481	}
	1482
	1483	if ((uint64_t)offset < s->header.image_size) {
	1484	error_setg(errp, "Shrinking images is currently not supported");
	1485	return -ENOTSUP;
	1486	}
	1487
	1488	old_image_size = s->header.image_size;
	1489	s->header.image_size = offset;
	1490	ret = qed_write_header_sync(s);
	1491	if (ret < 0) {
	1492	s->header.image_size = old_image_size;
	1493	error_setg_errno(errp, -ret, "Failed to update the image size");
	1494	}
	1495	return ret;
	1496	}
	1497
	1498	static int64_t bdrv_qed_getlength(BlockDriverState *bs)
	1499	{
	1500	BDRVQEDState *s = bs->opaque;
	1501	return s->header.image_size;
	1502	}
	1503
	1504	static int bdrv_qed_get_info(BlockDriverState bs, BlockDriverInfo bdi)
	1505	{
	1506	BDRVQEDState *s = bs->opaque;
	1507
	1508	memset(bdi, 0, sizeof(*bdi));
	1509	bdi->cluster_size = s->header.cluster_size;
	1510	bdi->is_dirty = s->header.features & QED_F_NEED_CHECK;
	1511	bdi->unallocated_blocks_are_zero = true;
	1512	return 0;
	1513	}
	1514
	1515	static int bdrv_qed_change_backing_file(BlockDriverState *bs,
	1516	const char *backing_file,
	1517	const char *backing_fmt)
	1518	{
	1519	BDRVQEDState *s = bs->opaque;
	1520	QEDHeader new_header, le_header;
	1521	void *buffer;
	1522	size_t buffer_len, backing_file_len;
	1523	int ret;
	1524
	1525	/* Refuse to set backing filename if unknown compat feature bits are
	1526	* active. If the image uses an unknown compat feature then we may not
	1527	* know the layout of data following the header structure and cannot safely
	1528	* add a new string.
	1529	*/
	1530	if (backing_file && (s->header.compat_features &
	1531	~QED_COMPAT_FEATURE_MASK)) {
	1532	return -ENOTSUP;
	1533	}
	1534
	1535	memcpy(&new_header, &s->header, sizeof(new_header));
	1536
	1537	new_header.features &= ~(QED_F_BACKING_FILE \|
	1538	QED_F_BACKING_FORMAT_NO_PROBE);
	1539
	1540	/* Adjust feature flags */
	1541	if (backing_file) {
	1542	new_header.features \|= QED_F_BACKING_FILE;
	1543
	1544	if (qed_fmt_is_raw(backing_fmt)) {
	1545	new_header.features \|= QED_F_BACKING_FORMAT_NO_PROBE;
	1546	}
	1547	}
	1548
	1549	/* Calculate new header size */
	1550	backing_file_len = 0;
	1551
	1552	if (backing_file) {
	1553	backing_file_len = strlen(backing_file);
	1554	}
	1555
	1556	buffer_len = sizeof(new_header);
	1557	new_header.backing_filename_offset = buffer_len;
	1558	new_header.backing_filename_size = backing_file_len;
	1559	buffer_len += backing_file_len;
	1560
	1561	/* Make sure we can rewrite header without failing */
	1562	if (buffer_len > new_header.header_size * new_header.cluster_size) {
	1563	return -ENOSPC;
	1564	}
	1565
	1566	/* Prepare new header */
	1567	buffer = g_malloc(buffer_len);
	1568
	1569	qed_header_cpu_to_le(&new_header, &le_header);
	1570	memcpy(buffer, &le_header, sizeof(le_header));
	1571	buffer_len = sizeof(le_header);
	1572
	1573	if (backing_file) {
	1574	memcpy(buffer + buffer_len, backing_file, backing_file_len);
	1575	buffer_len += backing_file_len;
	1576	}
	1577
	1578	/* Write new header */
	1579	ret = bdrv_pwrite_sync(bs->file, 0, buffer, buffer_len);
	1580	g_free(buffer);
	1581	if (ret == 0) {
	1582	memcpy(&s->header, &new_header, sizeof(new_header));
	1583	}
	1584	return ret;
	1585	}
	1586
	1587	static void coroutine_fn bdrv_qed_co_invalidate_cache(BlockDriverState *bs,
	1588	Error **errp)
	1589	{
	1590	BDRVQEDState *s = bs->opaque;
	1591	Error *local_err = NULL;
	1592	int ret;
	1593
	1594	bdrv_qed_close(bs);
	1595
	1596	bdrv_qed_init_state(bs);
	1597	qemu_co_mutex_lock(&s->table_lock);
	1598	ret = bdrv_qed_do_open(bs, NULL, bs->open_flags, &local_err);
	1599	qemu_co_mutex_unlock(&s->table_lock);
	1600	if (local_err) {
	1601	error_propagate_prepend(errp, local_err,
	1602	"Could not reopen qed layer: ");
	1603	return;
	1604	} else if (ret < 0) {
	1605	error_setg_errno(errp, -ret, "Could not reopen qed layer");
	1606	return;
	1607	}
	1608	}
	1609
	1610	static int coroutine_fn bdrv_qed_co_check(BlockDriverState *bs,
	1611	BdrvCheckResult *result,
	1612	BdrvCheckMode fix)
	1613	{
	1614	BDRVQEDState *s = bs->opaque;
	1615	int ret;
	1616
	1617	qemu_co_mutex_lock(&s->table_lock);
	1618	ret = qed_check(s, result, !!fix);
	1619	qemu_co_mutex_unlock(&s->table_lock);
	1620
	1621	return ret;
	1622	}
	1623
	1624	static QemuOptsList qed_create_opts = {
	1625	.name = "qed-create-opts",
	1626	.head = QTAILQ_HEAD_INITIALIZER(qed_create_opts.head),
	1627	.desc = {
	1628	{
	1629	.name = BLOCK_OPT_SIZE,
	1630	.type = QEMU_OPT_SIZE,
	1631	.help = "Virtual disk size"
	1632	},
	1633	{
	1634	.name = BLOCK_OPT_BACKING_FILE,
	1635	.type = QEMU_OPT_STRING,
	1636	.help = "File name of a base image"
	1637	},
	1638	{
	1639	.name = BLOCK_OPT_BACKING_FMT,
	1640	.type = QEMU_OPT_STRING,
	1641	.help = "Image format of the base image"
	1642	},
	1643	{
	1644	.name = BLOCK_OPT_CLUSTER_SIZE,
	1645	.type = QEMU_OPT_SIZE,
	1646	.help = "Cluster size (in bytes)",
	1647	.def_value_str = stringify(QED_DEFAULT_CLUSTER_SIZE)
	1648	},
	1649	{
	1650	.name = BLOCK_OPT_TABLE_SIZE,
	1651	.type = QEMU_OPT_SIZE,
	1652	.help = "L1/L2 table size (in clusters)"
	1653	},
	1654	{ /* end of list */ }
	1655	}
	1656	};
	1657
	1658	static BlockDriver bdrv_qed = {
	1659	.format_name = "qed",
	1660	.instance_size = sizeof(BDRVQEDState),
	1661	.create_opts = &qed_create_opts,
	1662	.supports_backing = true,
	1663
	1664	.bdrv_probe = bdrv_qed_probe,
	1665	.bdrv_open = bdrv_qed_open,
	1666	.bdrv_close = bdrv_qed_close,
	1667	.bdrv_reopen_prepare = bdrv_qed_reopen_prepare,
	1668	.bdrv_child_perm = bdrv_format_default_perms,
	1669	.bdrv_co_create = bdrv_qed_co_create,
	1670	.bdrv_co_create_opts = bdrv_qed_co_create_opts,
	1671	.bdrv_has_zero_init = bdrv_has_zero_init_1,
	1672	.bdrv_co_block_status = bdrv_qed_co_block_status,
	1673	.bdrv_co_readv = bdrv_qed_co_readv,
	1674	.bdrv_co_writev = bdrv_qed_co_writev,
	1675	.bdrv_co_pwrite_zeroes = bdrv_qed_co_pwrite_zeroes,
	1676	.bdrv_co_truncate = bdrv_qed_co_truncate,
	1677	.bdrv_getlength = bdrv_qed_getlength,
	1678	.bdrv_get_info = bdrv_qed_get_info,
	1679	.bdrv_refresh_limits = bdrv_qed_refresh_limits,
	1680	.bdrv_change_backing_file = bdrv_qed_change_backing_file,
	1681	.bdrv_co_invalidate_cache = bdrv_qed_co_invalidate_cache,
	1682	.bdrv_co_check = bdrv_qed_co_check,
	1683	.bdrv_detach_aio_context = bdrv_qed_detach_aio_context,
	1684	.bdrv_attach_aio_context = bdrv_qed_attach_aio_context,
	1685	.bdrv_co_drain_begin = bdrv_qed_co_drain_begin,
	1686	};
	1687
	1688	static void bdrv_qed_init(void)
	1689	{
	1690	bdrv_register(&bdrv_qed);
	1691	}
	1692
	1693	block_init(bdrv_qed_init);