2 * Block driver for Hyper-V VHDX Images
4 * Copyright (c) 2013 Red Hat, Inc.,
9 * This is based on the "VHDX Format Specification v1.00", published 8/25/2012
11 * https://www.microsoft.com/en-us/download/details.aspx?id=34750
13 * This file covers the functionality of the metadata log writing, parsing, and
16 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
17 * See the COPYING.LIB file in the top-level directory.
20 #include "qemu-common.h"
21 #include "block/block_int.h"
22 #include "qemu/module.h"
23 #include "block/vhdx.h"
26 typedef struct VHDXLogSequence {
30 VHDXLogEntryHeader hdr;
33 typedef struct VHDXLogDescEntries {
34 VHDXLogEntryHeader hdr;
35 VHDXLogDescriptor desc[];
38 static const MSGUID zero_guid = { 0 };
40 /* The log located on the disk is a circular buffer containing
41 * sectors of 4096 bytes each.
43 * It is assumed for the read/write functions below that the
44 * circular buffer scheme uses a 'one sector open' to indicate
45 * the buffer is full. Given the validation methods used for each
46 * sector, this method should be compatible with other methods that
47 * do not waste a sector.
51 /* Allow peeking at the hdr entry at the beginning of the current
52 * read index, without advancing the read index */
53 static int vhdx_log_peek_hdr(BlockDriverState *bs, VHDXLogEntries *log,
54 VHDXLogEntryHeader *hdr)
62 /* peek is only supported on sector boundaries */
63 if (log->read % VHDX_LOG_SECTOR_SIZE) {
69 /* we are guaranteed that a) log sectors are 4096 bytes,
70 * and b) the log length is a multiple of 1MB. So, there
71 * is always a round number of sectors in the buffer */
72 if ((read + sizeof(VHDXLogEntryHeader)) > log->length) {
76 if (read == log->write) {
81 offset = log->offset + read;
83 ret = bdrv_pread(bs->file, offset, hdr, sizeof(VHDXLogEntryHeader));
92 /* Index increment for log, based on sector boundaries */
93 static int vhdx_log_inc_idx(uint32_t idx, uint64_t length)
95 idx += VHDX_LOG_SECTOR_SIZE;
96 /* we are guaranteed that a) log sectors are 4096 bytes,
97 * and b) the log length is a multiple of 1MB. So, there
98 * is always a round number of sectors in the buffer */
99 return idx >= length ? 0 : idx;
103 /* Reset the log to empty */
104 static void vhdx_log_reset(BlockDriverState *bs, BDRVVHDXState *s)
107 s->log.read = s->log.write = 0;
108 /* a log guid of 0 indicates an empty log to any parser of v0
110 vhdx_update_headers(bs, s, false, &guid);
113 /* Reads num_sectors from the log (all log sectors are 4096 bytes),
114 * into buffer 'buffer'. Upon return, *sectors_read will contain
115 * the number of sectors successfully read.
117 * It is assumed that 'buffer' is already allocated, and of sufficient
118 * size (i.e. >= 4096*num_sectors).
120 * If 'peek' is true, then the tail (read) pointer for the circular buffer is
123 * 0 is returned on success, -errno otherwise. */
124 static int vhdx_log_read_sectors(BlockDriverState *bs, VHDXLogEntries *log,
125 uint32_t *sectors_read, void *buffer,
126 uint32_t num_sectors, bool peek)
135 while (num_sectors) {
136 if (read == log->write) {
140 offset = log->offset + read;
142 ret = bdrv_pread(bs->file, offset, buffer, VHDX_LOG_SECTOR_SIZE);
146 read = vhdx_log_inc_idx(read, log->length);
148 *sectors_read = *sectors_read + 1;
159 /* Validates a log entry header */
160 static bool vhdx_log_hdr_is_valid(VHDXLogEntries *log, VHDXLogEntryHeader *hdr,
165 if (memcmp(&hdr->signature, "loge", 4)) {
169 /* if the individual entry length is larger than the whole log
170 * buffer, that is obviously invalid */
171 if (log->length < hdr->entry_length) {
175 /* length of entire entry must be in units of 4KB (log sector size) */
176 if (hdr->entry_length % (VHDX_LOG_SECTOR_SIZE)) {
180 /* per spec, sequence # must be > 0 */
181 if (hdr->sequence_number == 0) {
185 /* log entries are only valid if they match the file-wide log guid
186 * found in the active header */
187 if (!guid_eq(hdr->log_guid, s->headers[s->curr_header]->log_guid)) {
191 if (hdr->descriptor_count * sizeof(VHDXLogDescriptor) > hdr->entry_length) {
202 * Given a log header, this will validate the descriptors and the
203 * corresponding data sectors (if applicable)
205 * Validation consists of:
206 * 1. Making sure the sequence number matches the entry header
207 * 2. Verifying a valid signature ('zero' or 'desc' for descriptors)
208 * 3. File offset field is a multiple of 4KB
209 * 4. If a data descriptor, the corresponding data sector
210 * has its signature ('data') and matching sequence number
212 * @desc: the data buffer containing the descriptor
213 * @hdr: the log entry header
215 * Returns true if valid
217 static bool vhdx_log_desc_is_valid(VHDXLogDescriptor *desc,
218 VHDXLogEntryHeader *hdr)
222 if (desc->sequence_number != hdr->sequence_number) {
225 if (desc->file_offset % VHDX_LOG_SECTOR_SIZE) {
229 if (!memcmp(&desc->signature, "zero", 4)) {
230 if (desc->zero_length % VHDX_LOG_SECTOR_SIZE == 0) {
234 } else if (!memcmp(&desc->signature, "desc", 4)) {
244 /* Prior to sector data for a log entry, there is the header
245 * and the descriptors referenced in the header:
249 * [ hdr, desc ][ desc ][ ... ][ data ][ ... ]
251 * The first sector in a log entry has a 64 byte header, and
252 * up to 126 32-byte descriptors. If more descriptors than
253 * 126 are required, then subsequent sectors can have up to 128
254 * descriptors. Each sector is 4KB. Data follows the descriptor
257 * This will return the number of sectors needed to encompass
258 * the passed number of descriptors in desc_cnt.
260 * This will never return 0, even if desc_cnt is 0.
262 static int vhdx_compute_desc_sectors(uint32_t desc_cnt)
264 uint32_t desc_sectors;
266 desc_cnt += 2; /* account for header in first sector */
267 desc_sectors = desc_cnt / 128;
268 if (desc_cnt % 128) {
276 /* Reads the log header, and subsequent descriptors (if any). This
277 * will allocate all the space for buffer, which must be NULL when
278 * passed into this function. Each descriptor will also be validated,
279 * and error returned if any are invalid. */
280 static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s,
281 VHDXLogEntries *log, VHDXLogDescEntries **buffer)
284 uint32_t desc_sectors;
285 uint32_t sectors_read;
286 VHDXLogEntryHeader hdr;
287 VHDXLogDescEntries *desc_entries = NULL;
290 assert(*buffer == NULL);
292 ret = vhdx_log_peek_hdr(bs, log, &hdr);
296 vhdx_log_entry_hdr_le_import(&hdr);
297 if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) {
302 desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count);
303 desc_entries = qemu_blockalign(bs, desc_sectors * VHDX_LOG_SECTOR_SIZE);
305 ret = vhdx_log_read_sectors(bs, log, §ors_read, desc_entries,
306 desc_sectors, false);
310 if (sectors_read != desc_sectors) {
315 /* put in proper endianness, and validate each desc */
316 for (i = 0; i < hdr.descriptor_count; i++) {
317 vhdx_log_desc_le_import(&desc_entries->desc[i]);
318 if (vhdx_log_desc_is_valid(&desc_entries->desc[i], &hdr) == false) {
324 *buffer = desc_entries;
328 qemu_vfree(desc_entries);
334 /* Flushes the descriptor described by desc to the VHDX image file.
335 * If the descriptor is a data descriptor, then 'data' must be non-NULL,
336 * and >= 4096 bytes (VHDX_LOG_SECTOR_SIZE), containing the data to be
339 * Verification is performed to make sure the sequence numbers of a data
340 * descriptor match the sequence number in the desc.
342 * For a zero descriptor, it may describe multiple sectors to fill with zeroes.
343 * In this case, it should be noted that zeroes are written to disk, and the
344 * image file is not extended as a sparse file. */
345 static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc,
346 VHDXLogDataSector *data)
349 uint64_t seq, file_offset;
355 buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);
357 if (!memcmp(&desc->signature, "desc", 4)) {
364 /* The sequence number of the data sector must match that
365 * in the descriptor */
366 seq = data->sequence_high;
368 seq |= data->sequence_low & 0xffffffff;
370 if (seq != desc->sequence_number) {
375 /* Each data sector is in total 4096 bytes, however the first
376 * 8 bytes, and last 4 bytes, are located in the descriptor */
377 memcpy(buffer, &desc->leading_bytes, 8);
380 memcpy(buffer+offset, data->data, 4084);
383 memcpy(buffer+offset, &desc->trailing_bytes, 4);
385 } else if (!memcmp(&desc->signature, "zero", 4)) {
386 /* write 'count' sectors of zeroes */
387 memset(buffer, 0, VHDX_LOG_SECTOR_SIZE);
388 count = desc->zero_length / VHDX_LOG_SECTOR_SIZE;
391 file_offset = desc->file_offset;
393 /* count is only > 1 if we are writing zeroes */
394 for (i = 0; i < count; i++) {
395 ret = bdrv_pwrite_sync(bs->file, file_offset, buffer,
396 VHDX_LOG_SECTOR_SIZE);
400 file_offset += VHDX_LOG_SECTOR_SIZE;
408 /* Flush the entire log (as described by 'logs') to the VHDX image
409 * file, and then set the log to 'empty' status once complete.
411 * The log entries should be validated prior to flushing */
412 static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s,
413 VHDXLogSequence *logs)
417 uint32_t cnt, sectors_read;
418 uint64_t new_file_size;
420 VHDXLogDescEntries *desc_entries = NULL;
421 VHDXLogEntryHeader hdr_tmp = { 0 };
425 data = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);
427 ret = vhdx_user_visible_write(bs, s);
432 /* each iteration represents one log sequence, which may span multiple
435 ret = vhdx_log_peek_hdr(bs, &logs->log, &hdr_tmp);
439 /* if the log shows a FlushedFileOffset larger than our current file
440 * size, then that means the file has been truncated / corrupted, and
441 * we must refuse to open it / use it */
442 if (hdr_tmp.flushed_file_offset > bdrv_getlength(bs->file)) {
447 ret = vhdx_log_read_desc(bs, s, &logs->log, &desc_entries);
452 for (i = 0; i < desc_entries->hdr.descriptor_count; i++) {
453 if (!memcmp(&desc_entries->desc[i].signature, "desc", 4)) {
454 /* data sector, so read a sector to flush */
455 ret = vhdx_log_read_sectors(bs, &logs->log, §ors_read,
460 if (sectors_read != 1) {
466 ret = vhdx_log_flush_desc(bs, &desc_entries->desc[i], data);
471 if (bdrv_getlength(bs->file) < desc_entries->hdr.last_file_offset) {
472 new_file_size = desc_entries->hdr.last_file_offset;
473 if (new_file_size % (1024*1024)) {
474 /* round up to nearest 1MB boundary */
475 new_file_size = ((new_file_size >> 20) + 1) << 20;
476 bdrv_truncate(bs->file, new_file_size);
479 qemu_vfree(desc_entries);
484 /* once the log is fully flushed, indicate that we have an empty log
485 * now. This also sets the log guid to 0, to indicate an empty log */
486 vhdx_log_reset(bs, s);
490 qemu_vfree(desc_entries);
494 static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s,
495 VHDXLogEntries *log, uint64_t seq,
496 bool *valid, VHDXLogEntryHeader *entry)
499 VHDXLogEntryHeader hdr;
501 uint32_t i, desc_sectors, total_sectors, crc;
502 uint32_t sectors_read = 0;
503 VHDXLogDescEntries *desc_buffer = NULL;
507 ret = vhdx_log_peek_hdr(bs, log, &hdr);
512 vhdx_log_entry_hdr_le_import(&hdr);
515 if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) {
520 if (hdr.sequence_number != seq + 1) {
525 desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count);
527 /* Read desc sectors, and calculate log checksum */
529 total_sectors = hdr.entry_length / VHDX_LOG_SECTOR_SIZE;
532 /* read_desc() will increment the read idx */
533 ret = vhdx_log_read_desc(bs, s, log, &desc_buffer);
538 crc = vhdx_checksum_calc(0xffffffff, (void *)desc_buffer,
539 desc_sectors * VHDX_LOG_SECTOR_SIZE, 4);
542 buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);
543 if (total_sectors > desc_sectors) {
544 for (i = 0; i < total_sectors - desc_sectors; i++) {
546 ret = vhdx_log_read_sectors(bs, log, §ors_read, buffer,
548 if (ret < 0 || sectors_read != 1) {
551 crc = vhdx_checksum_calc(crc, buffer, VHDX_LOG_SECTOR_SIZE, -1);
556 if (crc != desc_buffer->hdr.checksum) {
565 log->read = vhdx_log_inc_idx(log->read, log->length);
569 qemu_vfree(desc_buffer);
573 /* Search through the log circular buffer, and find the valid, active
574 * log sequence, if any exists
576 static int vhdx_log_search(BlockDriverState *bs, BDRVVHDXState *s,
577 VHDXLogSequence *logs)
581 bool seq_valid = false;
582 VHDXLogSequence candidate = { 0 };
583 VHDXLogEntryHeader hdr = { 0 };
584 VHDXLogEntries curr_log;
586 memcpy(&curr_log, &s->log, sizeof(VHDXLogEntries));
587 curr_log.write = curr_log.length; /* assume log is full */
591 /* now we will go through the whole log sector by sector, until
592 * we find a valid, active log sequence, or reach the end of the
595 uint64_t curr_seq = 0;
596 VHDXLogSequence current = { 0 };
598 tail = curr_log.read;
600 ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq,
607 current.valid = true;
608 current.log = curr_log;
609 current.log.read = tail;
610 current.log.write = curr_log.read;
616 ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq,
621 if (seq_valid == false) {
624 current.log.write = curr_log.read;
627 curr_seq = hdr.sequence_number;
632 if (candidate.valid == false ||
633 current.hdr.sequence_number > candidate.hdr.sequence_number) {
638 if (curr_log.read < tail) {
645 if (candidate.valid) {
646 /* this is the next sequence number, for writes */
647 s->log.sequence = candidate.hdr.sequence_number + 1;
655 /* Parse the replay log. Per the VHDX spec, if the log is present
656 * it must be replayed prior to opening the file, even read-only.
658 * If read-only, we must replay the log in RAM (or refuse to open
659 * a dirty VHDX file read-only) */
660 int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed)
664 VHDXLogSequence logs = { 0 };
666 hdr = s->headers[s->curr_header];
670 /* s->log.hdr is freed in vhdx_close() */
671 if (s->log.hdr == NULL) {
672 s->log.hdr = qemu_blockalign(bs, sizeof(VHDXLogEntryHeader));
675 s->log.offset = hdr->log_offset;
676 s->log.length = hdr->log_length;
678 if (s->log.offset < VHDX_LOG_MIN_SIZE ||
679 s->log.offset % VHDX_LOG_MIN_SIZE) {
684 /* per spec, only log version of 0 is supported */
685 if (hdr->log_version != 0) {
690 /* If either the log guid, or log length is zero,
691 * then a replay log is not present */
692 if (guid_eq(hdr->log_guid, zero_guid)) {
696 if (hdr->log_length == 0) {
700 if (hdr->log_length % VHDX_LOG_MIN_SIZE) {
706 /* The log is present, we need to find if and where there is an active
707 * sequence of valid entries present in the log. */
709 ret = vhdx_log_search(bs, s, &logs);
715 /* now flush the log */
716 ret = vhdx_log_flush(bs, s, &logs);