]> Git Repo - qemu.git/blob - block/qcow2-refcount.c
qcow2: Check snapshot L1 table in qcow2_snapshot_delete()
[qemu.git] / block / qcow2-refcount.c
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24
25 #include "qemu/osdep.h"
26 #include "qapi/error.h"
27 #include "qemu-common.h"
28 #include "block/block_int.h"
29 #include "block/qcow2.h"
30 #include "qemu/range.h"
31 #include "qemu/bswap.h"
32 #include "qemu/cutils.h"
33
34 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
35 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
36                             int64_t offset, int64_t length, uint64_t addend,
37                             bool decrease, enum qcow2_discard_type type);
38
39 static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
40 static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
41 static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
42 static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
43 static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
44 static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
45 static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);
46
47 static void set_refcount_ro0(void *refcount_array, uint64_t index,
48                              uint64_t value);
49 static void set_refcount_ro1(void *refcount_array, uint64_t index,
50                              uint64_t value);
51 static void set_refcount_ro2(void *refcount_array, uint64_t index,
52                              uint64_t value);
53 static void set_refcount_ro3(void *refcount_array, uint64_t index,
54                              uint64_t value);
55 static void set_refcount_ro4(void *refcount_array, uint64_t index,
56                              uint64_t value);
57 static void set_refcount_ro5(void *refcount_array, uint64_t index,
58                              uint64_t value);
59 static void set_refcount_ro6(void *refcount_array, uint64_t index,
60                              uint64_t value);
61
62
63 static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
64     &get_refcount_ro0,
65     &get_refcount_ro1,
66     &get_refcount_ro2,
67     &get_refcount_ro3,
68     &get_refcount_ro4,
69     &get_refcount_ro5,
70     &get_refcount_ro6
71 };
72
73 static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
74     &set_refcount_ro0,
75     &set_refcount_ro1,
76     &set_refcount_ro2,
77     &set_refcount_ro3,
78     &set_refcount_ro4,
79     &set_refcount_ro5,
80     &set_refcount_ro6
81 };
82
83
84 /*********************************************************/
85 /* refcount handling */
86
87 static void update_max_refcount_table_index(BDRVQcow2State *s)
88 {
89     unsigned i = s->refcount_table_size - 1;
90     while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
91         i--;
92     }
93     /* Set s->max_refcount_table_index to the index of the last used entry */
94     s->max_refcount_table_index = i;
95 }
96
97 int qcow2_refcount_init(BlockDriverState *bs)
98 {
99     BDRVQcow2State *s = bs->opaque;
100     unsigned int refcount_table_size2, i;
101     int ret;
102
103     assert(s->refcount_order >= 0 && s->refcount_order <= 6);
104
105     s->get_refcount = get_refcount_funcs[s->refcount_order];
106     s->set_refcount = set_refcount_funcs[s->refcount_order];
107
108     assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
109     refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
110     s->refcount_table = g_try_malloc(refcount_table_size2);
111
112     if (s->refcount_table_size > 0) {
113         if (s->refcount_table == NULL) {
114             ret = -ENOMEM;
115             goto fail;
116         }
117         BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
118         ret = bdrv_pread(bs->file, s->refcount_table_offset,
119                          s->refcount_table, refcount_table_size2);
120         if (ret < 0) {
121             goto fail;
122         }
123         for(i = 0; i < s->refcount_table_size; i++)
124             be64_to_cpus(&s->refcount_table[i]);
125         update_max_refcount_table_index(s);
126     }
127     return 0;
128  fail:
129     return ret;
130 }
131
132 void qcow2_refcount_close(BlockDriverState *bs)
133 {
134     BDRVQcow2State *s = bs->opaque;
135     g_free(s->refcount_table);
136 }
137
138
/* Reads a 1-bit refcount: eight entries per byte, LSB first. */
static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
{
    const uint8_t *array = refcount_array;

    return (array[index / 8] >> (index % 8)) & 0x1;
}
143
/* Writes a 1-bit refcount: eight entries per byte, LSB first. */
static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *array = refcount_array;
    unsigned shift = index % 8;

    assert(!(value >> 1));
    array[index / 8] = (array[index / 8] & ~(0x1 << shift)) | (value << shift);
}
151
/* Reads a 2-bit refcount: four entries per byte, LSBs first. */
static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
{
    const uint8_t *array = refcount_array;

    return (array[index / 4] >> (2 * (index % 4))) & 0x3;
}
157
/* Writes a 2-bit refcount: four entries per byte, LSBs first. */
static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *array = refcount_array;
    unsigned shift = 2 * (index % 4);

    assert(!(value >> 2));
    array[index / 4] = (array[index / 4] & ~(0x3 << shift)) | (value << shift);
}
165
/* Reads a 4-bit refcount: two entries per byte, low nibble first. */
static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
{
    const uint8_t *array = refcount_array;

    return (array[index / 2] >> (4 * (index % 2))) & 0xf;
}
171
/* Writes a 4-bit refcount: two entries per byte, low nibble first. */
static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *array = refcount_array;
    unsigned shift = 4 * (index % 2);

    assert(!(value >> 4));
    array[index / 2] = (array[index / 2] & ~(0xf << shift)) | (value << shift);
}
179
/* Reads an 8-bit refcount: one entry per byte, no endianness concerns. */
static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
{
    const uint8_t *array = refcount_array;

    return array[index];
}
184
/* Writes an 8-bit refcount: one entry per byte, no endianness concerns. */
static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *array = refcount_array;

    assert(!(value >> 8));
    array[index] = value;
}
191
/* Reads a 16-bit refcount; entries are stored big-endian on disk. */
static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
{
    const uint16_t *array = refcount_array;

    return be16_to_cpu(array[index]);
}
196
/* Writes a 16-bit refcount; entries are stored big-endian on disk. */
static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint16_t *array = refcount_array;

    assert(!(value >> 16));
    array[index] = cpu_to_be16(value);
}
203
/* Reads a 32-bit refcount; entries are stored big-endian on disk. */
static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
{
    const uint32_t *array = refcount_array;

    return be32_to_cpu(array[index]);
}
208
/* Writes a 32-bit refcount; entries are stored big-endian on disk. */
static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint32_t *array = refcount_array;

    assert(!(value >> 32));
    array[index] = cpu_to_be32(value);
}
215
/* Reads a 64-bit refcount; entries are stored big-endian on disk. */
static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
{
    const uint64_t *array = refcount_array;

    return be64_to_cpu(array[index]);
}
220
/* Writes a 64-bit refcount; any value fits, so no range assertion needed. */
static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint64_t *array = refcount_array;

    array[index] = cpu_to_be64(value);
}
226
227
228 static int load_refcount_block(BlockDriverState *bs,
229                                int64_t refcount_block_offset,
230                                void **refcount_block)
231 {
232     BDRVQcow2State *s = bs->opaque;
233
234     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
235     return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
236                            refcount_block);
237 }
238
239 /*
240  * Retrieves the refcount of the cluster given by its index and stores it in
241  * *refcount. Returns 0 on success and -errno on failure.
242  */
243 int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
244                        uint64_t *refcount)
245 {
246     BDRVQcow2State *s = bs->opaque;
247     uint64_t refcount_table_index, block_index;
248     int64_t refcount_block_offset;
249     int ret;
250     void *refcount_block;
251
252     refcount_table_index = cluster_index >> s->refcount_block_bits;
253     if (refcount_table_index >= s->refcount_table_size) {
254         *refcount = 0;
255         return 0;
256     }
257     refcount_block_offset =
258         s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
259     if (!refcount_block_offset) {
260         *refcount = 0;
261         return 0;
262     }
263
264     if (offset_into_cluster(s, refcount_block_offset)) {
265         qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
266                                 " unaligned (reftable index: %#" PRIx64 ")",
267                                 refcount_block_offset, refcount_table_index);
268         return -EIO;
269     }
270
271     ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
272                           &refcount_block);
273     if (ret < 0) {
274         return ret;
275     }
276
277     block_index = cluster_index & (s->refcount_block_size - 1);
278     *refcount = s->get_refcount(refcount_block, block_index);
279
280     qcow2_cache_put(s->refcount_block_cache, &refcount_block);
281
282     return 0;
283 }
284
285 /* Checks if two offsets are described by the same refcount block */
286 static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
287     uint64_t offset_b)
288 {
289     uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
290     uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);
291
292     return (block_a == block_b);
293 }
294
/*
 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 * (including growing the refcount table if needed).
 *
 * Returns 0 on success or -errno in error case.
 *
 * May also return -EAGAIN after successfully allocating new refcount
 * metadata: in that case the caller must restart its search for free
 * clusters, because the clusters it had picked may just have been consumed
 * by the new refcount structures.
 */
static int alloc_refcount_block(BlockDriverState *bs,
                                int64_t cluster_index, void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_index;
    int64_t ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Find the refcount block for the given cluster */
    refcount_table_index = cluster_index >> s->refcount_block_bits;

    if (refcount_table_index < s->refcount_table_size) {

        uint64_t refcount_block_offset =
            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;

        /* If it's already there, we're done */
        if (refcount_block_offset) {
            if (offset_into_cluster(s, refcount_block_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: "
                                        "%#x)", refcount_block_offset,
                                        refcount_table_index);
                return -EIO;
            }

             return load_refcount_block(bs, refcount_block_offset,
                                        refcount_block);
        }
    }

    /*
     * If we came here, we need to allocate something. Something is at least
     * a cluster for the new refcount block. It may also include a new refcount
     * table if the old refcount table is too small.
     *
     * Note that allocating clusters here needs some special care:
     *
     * - We can't use the normal qcow2_alloc_clusters(), it would try to
     *   increase the refcount and very likely we would end up with an endless
     *   recursion. Instead we must place the refcount blocks in a way that
     *   they can describe them themselves.
     *
     * - We need to consider that at this point we are inside update_refcounts
     *   and potentially doing an initial refcount increase. This means that
     *   some clusters have already been allocated by the caller, but their
     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
     *   need to return -EAGAIN to signal the caller that it needs to restart
     *   the search for free clusters.
     *
     * - alloc_clusters_noref and qcow2_free_clusters may load a different
     *   refcount block into the cache
     */

    /* Reset the output now so the fail path below can tell whether we are
     * holding a cache reference that still needs to be put */
    *refcount_block = NULL;

    /* We write to the refcount table, so we might depend on L2 tables */
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
    if (new_block < 0) {
        return new_block;
    }

    /* If we're allocating the block at offset 0 then something is wrong */
    if (new_block == 0) {
        qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                "allocation of refcount block at offset 0");
        return -EIO;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
        " at %" PRIx64 "\n",
        refcount_table_index, cluster_index << s->cluster_bits, new_block);
#endif

    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
        /* Zero the new refcount block before updating it */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);

        /* The block describes itself, need to update the cache */
        int block_index = (new_block >> s->cluster_bits) &
            (s->refcount_block_size - 1);
        s->set_refcount(*refcount_block, block_index, 1);
    } else {
        /* Described somewhere else. This can recurse at most twice before we
         * arrive at a block that describes itself. */
        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
                              QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret < 0) {
            goto fail;
        }

        /* Initialize the new refcount block only after updating its refcount,
         * update_refcount uses the refcount cache itself */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);
    }

    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* If the refcount table is big enough, just hook the block up there */
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
        ret = bdrv_pwrite_sync(bs->file,
            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
            &data64, sizeof(data64));
        if (ret < 0) {
            goto fail;
        }

        s->refcount_table[refcount_table_index] = new_block;
        /* If there's a hole in s->refcount_table then it can happen
         * that refcount_table_index < s->max_refcount_table_index */
        s->max_refcount_table_index =
            MAX(s->max_refcount_table_index, refcount_table_index);

        /* The new refcount block may be where the caller intended to put its
         * data, so let it restart the search. */
        return -EAGAIN;
    }

    /* Release our reference; the block will be entered into the grown
     * reftable by qcow2_refcount_area() and re-obtained below */
    qcow2_cache_put(s->refcount_block_cache, refcount_block);

    /*
     * If we come here, we need to grow the refcount table. Again, a new
     * refcount table needs some space and we can't simply allocate to avoid
     * endless recursion.
     *
     * Therefore let's grab new refcount blocks at the end of the image, which
     * will describe themselves and the new refcount table. This way we can
     * reference them only in the new table and do the switch to the new
     * refcount table at once without producing an inconsistent state in
     * between.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);

    /* Calculate the number of refcount blocks needed so far; this will be the
     * basis for calculating the index of the first cluster used for the
     * self-describing refcount structures which we are about to create.
     *
     * Because we reached this point, there cannot be any refcount entries for
     * cluster_index or higher indices yet. However, because new_block has been
     * allocated to describe that cluster (and it will assume this role later
     * on), we cannot use that index; also, new_block may actually have a higher
     * cluster index than cluster_index, so it needs to be taken into account
     * here (and 1 needs to be added to its value because that cluster is used).
     */
    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
                                            (new_block >> s->cluster_bits) + 1),
                                        s->refcount_block_size);

    /* Create the new refcount table and blocks */
    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
        s->cluster_size;

    ret = qcow2_refcount_area(bs, meta_offset, 0, false,
                              refcount_table_index, new_block);
    if (ret < 0) {
        return ret;
    }

    ret = load_refcount_block(bs, new_block, refcount_block);
    if (ret < 0) {
        return ret;
    }

    /* If we were trying to do the initial refcount update for some cluster
     * allocation, we might have used the same clusters to store newly
     * allocated metadata. Make the caller search some new space. */
    return -EAGAIN;

fail:
    if (*refcount_block != NULL) {
        qcow2_cache_put(s->refcount_block_cache, refcount_block);
    }
    return ret;
}
508
/*
 * Starting at @start_offset, this function creates new self-covering refcount
 * structures: A new refcount table and refcount blocks which cover all of
 * themselves, and a number of @additional_clusters beyond their end.
 * @start_offset must be at the end of the image file, that is, there must be
 * only empty space beyond it.
 * If @exact_size is false, the refcount table will have 50 % more entries than
 * necessary so it will not need to grow again soon.
 * If @new_refblock_offset is not zero, it contains the offset of a refcount
 * block that should be entered into the new refcount table at index
 * @new_refblock_index.
 *
 * On success, the new reftable also replaces the old one in memory (the old
 * in-memory table is freed and its on-disk clusters are discarded).
 *
 * Returns: The offset after the new refcount structures (i.e. where the
 *          @additional_clusters may be placed) on success, -errno on error.
 */
int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
                            uint64_t additional_clusters, bool exact_size,
                            int new_refblock_index,
                            uint64_t new_refblock_offset)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t total_refblock_count_u64, additional_refblock_count;
    int total_refblock_count, table_size, area_reftable_index, table_clusters;
    int i;
    uint64_t table_offset, block_offset, end_offset;
    int ret;
    uint64_t *new_table;

    assert(!(start_offset % s->cluster_size));

    qcow2_refcount_metadata_size(start_offset / s->cluster_size +
                                 additional_clusters,
                                 s->cluster_size, s->refcount_order,
                                 !exact_size, &total_refblock_count_u64);
    if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }
    total_refblock_count = total_refblock_count_u64;

    /* Index in the refcount table of the first refcount block to cover the area
     * of refcount structures we are about to create; we know that
     * @total_refblock_count can cover @start_offset, so this will definitely
     * fit into an int. */
    area_reftable_index = (start_offset / s->cluster_size) /
                          s->refcount_block_size;

    if (exact_size) {
        table_size = total_refblock_count;
    } else {
        /* Grow by an extra 50 % to delay the next reftable resize */
        table_size = total_refblock_count +
                     DIV_ROUND_UP(total_refblock_count, 2);
    }
    /* The qcow2 file can only store the reftable size in number of clusters */
    table_size = ROUND_UP(table_size, s->cluster_size / sizeof(uint64_t));
    table_clusters = (table_size * sizeof(uint64_t)) / s->cluster_size;

    if (table_size > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }

    new_table = g_try_new0(uint64_t, table_size);

    assert(table_size > 0);
    if (new_table == NULL) {
        ret = -ENOMEM;
        goto fail;
    }

    /* Fill the new refcount table */
    if (table_size > s->max_refcount_table_index) {
        /* We're actually growing the reftable */
        memcpy(new_table, s->refcount_table,
               (s->max_refcount_table_index + 1) * sizeof(uint64_t));
    } else {
        /* Improbable case: We're shrinking the reftable. However, the caller
         * has assured us that there is only empty space beyond @start_offset,
         * so we can simply drop all of the refblocks that won't fit into the
         * new reftable. */
        memcpy(new_table, s->refcount_table, table_size * sizeof(uint64_t));
    }

    if (new_refblock_offset) {
        assert(new_refblock_index < total_refblock_count);
        new_table[new_refblock_index] = new_refblock_offset;
    }

    /* Count how many new refblocks we have to create */
    additional_refblock_count = 0;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        if (!new_table[i]) {
            additional_refblock_count++;
        }
    }

    /* New refblocks go first (starting at @start_offset), the reftable
     * itself is placed right after them */
    table_offset = start_offset + additional_refblock_count * s->cluster_size;
    end_offset = table_offset + table_clusters * s->cluster_size;

    /* Fill the refcount blocks, and create new ones, if necessary */
    block_offset = start_offset;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        void *refblock_data;
        uint64_t first_offset_covered;

        /* Reuse an existing refblock if possible, create a new one otherwise */
        if (new_table[i]) {
            ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i],
                                  &refblock_data);
            if (ret < 0) {
                goto fail;
            }
        } else {
            ret = qcow2_cache_get_empty(bs, s->refcount_block_cache,
                                        block_offset, &refblock_data);
            if (ret < 0) {
                goto fail;
            }
            memset(refblock_data, 0, s->cluster_size);
            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);

            new_table[i] = block_offset;
            block_offset += s->cluster_size;
        }

        /* First host offset covered by this refblock */
        first_offset_covered = (uint64_t)i * s->refcount_block_size *
                               s->cluster_size;
        if (first_offset_covered < end_offset) {
            int j, end_index;

            /* Set the refcount of all of the new refcount structures to 1 */

            if (first_offset_covered < start_offset) {
                assert(i == area_reftable_index);
                j = (start_offset - first_offset_covered) / s->cluster_size;
                assert(j < s->refcount_block_size);
            } else {
                j = 0;
            }

            end_index = MIN((end_offset - first_offset_covered) /
                            s->cluster_size,
                            s->refcount_block_size);

            for (; j < end_index; j++) {
                /* The caller guaranteed us this space would be empty */
                assert(s->get_refcount(refblock_data, j) == 0);
                s->set_refcount(refblock_data, j, 1);
            }

            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);
        }

        qcow2_cache_put(s->refcount_block_cache, &refblock_data);
    }

    assert(block_offset == table_offset);

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* Write refcount table to disk (on-disk format is big-endian) */
    for (i = 0; i < total_refblock_count; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
        table_size * sizeof(uint64_t));
    if (ret < 0) {
        goto fail;
    }

    /* Convert the table back to host byte order for in-memory use */
    for (i = 0; i < total_refblock_count; i++) {
        be64_to_cpus(&new_table[i]);
    }

    /* Hook up the new refcount table in the qcow2 header */
    struct QEMU_PACKED {
        uint64_t d64;
        uint32_t d32;
    } data;
    data.d64 = cpu_to_be64(table_offset);
    data.d32 = cpu_to_be32(table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           &data, sizeof(data));
    if (ret < 0) {
        goto fail;
    }

    /* And switch it in memory */
    uint64_t old_table_offset = s->refcount_table_offset;
    uint64_t old_table_size = s->refcount_table_size;

    g_free(s->refcount_table);
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;
    update_max_refcount_table_index(s);

    /* Free old table. */
    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
                        QCOW2_DISCARD_OTHER);

    return end_offset;

fail:
    g_free(new_table);
    return ret;
}
726
727 void qcow2_process_discards(BlockDriverState *bs, int ret)
728 {
729     BDRVQcow2State *s = bs->opaque;
730     Qcow2DiscardRegion *d, *next;
731
732     QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
733         QTAILQ_REMOVE(&s->discards, d, next);
734
735         /* Discard is optional, ignore the return value */
736         if (ret >= 0) {
737             bdrv_pdiscard(bs->file->bs, d->offset, d->bytes);
738         }
739
740         g_free(d);
741     }
742 }
743
/*
 * Queues the host byte range [offset, offset + length) for discarding (the
 * queue is drained later by qcow2_process_discards()). If the range touches
 * an already queued region, that region is extended instead of adding a new
 * entry, and any regions that become adjacent as a result are merged.
 */
static void update_refcount_discard(BlockDriverState *bs,
                                    uint64_t offset, uint64_t length)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *p, *next;

    QTAILQ_FOREACH(d, &s->discards, next) {
        uint64_t new_start = MIN(offset, d->offset);
        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);

        /* True iff the new range is adjacent to (or overlaps) this entry */
        if (new_end - new_start <= length + d->bytes) {
            /* There can't be any overlap, areas ending up here have no
             * references any more and therefore shouldn't get freed another
             * time. */
            assert(d->bytes + length == new_end - new_start);
            d->offset = new_start;
            d->bytes = new_end - new_start;
            goto found;
        }
    }

    /* No adjacent entry found; queue a fresh region */
    d = g_malloc(sizeof(*d));
    *d = (Qcow2DiscardRegion) {
        .bs     = bs,
        .offset = offset,
        .bytes  = length,
    };
    QTAILQ_INSERT_TAIL(&s->discards, d, next);

found:
    /* Merge discard requests if they are adjacent now */
    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
        if (p == d
            || p->offset > d->offset + d->bytes
            || d->offset > p->offset + p->bytes)
        {
            continue;
        }

        /* Still no overlap possible */
        assert(p->offset == d->offset + d->bytes
            || d->offset == p->offset + p->bytes);

        QTAILQ_REMOVE(&s->discards, p, next);
        d->offset = MIN(d->offset, p->offset);
        d->bytes += p->bytes;
        g_free(p);
    }
}
793
794 /* XXX: cache several refcount block clusters ? */
795 /* @addend is the absolute value of the addend; if @decrease is set, @addend
796  * will be subtracted from the current refcount, otherwise it will be added */
797 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
798                                                    int64_t offset,
799                                                    int64_t length,
800                                                    uint64_t addend,
801                                                    bool decrease,
802                                                    enum qcow2_discard_type type)
803 {
804     BDRVQcow2State *s = bs->opaque;
805     int64_t start, last, cluster_offset;
806     void *refcount_block = NULL;
807     int64_t old_table_index = -1;
808     int ret;
809
810 #ifdef DEBUG_ALLOC2
811     fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
812             " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
813             addend);
814 #endif
815     if (length < 0) {
816         return -EINVAL;
817     } else if (length == 0) {
818         return 0;
819     }
820
821     if (decrease) {
822         qcow2_cache_set_dependency(bs, s->refcount_block_cache,
823             s->l2_table_cache);
824     }
825
826     start = start_of_cluster(s, offset);
827     last = start_of_cluster(s, offset + length - 1);
828     for(cluster_offset = start; cluster_offset <= last;
829         cluster_offset += s->cluster_size)
830     {
831         int block_index;
832         uint64_t refcount;
833         int64_t cluster_index = cluster_offset >> s->cluster_bits;
834         int64_t table_index = cluster_index >> s->refcount_block_bits;
835
836         /* Load the refcount block and allocate it if needed */
837         if (table_index != old_table_index) {
838             if (refcount_block) {
839                 qcow2_cache_put(s->refcount_block_cache, &refcount_block);
840             }
841             ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
842             if (ret < 0) {
843                 goto fail;
844             }
845         }
846         old_table_index = table_index;
847
848         qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
849
850         /* we can update the count and save it */
851         block_index = cluster_index & (s->refcount_block_size - 1);
852
853         refcount = s->get_refcount(refcount_block, block_index);
854         if (decrease ? (refcount - addend > refcount)
855                      : (refcount + addend < refcount ||
856                         refcount + addend > s->refcount_max))
857         {
858             ret = -EINVAL;
859             goto fail;
860         }
861         if (decrease) {
862             refcount -= addend;
863         } else {
864             refcount += addend;
865         }
866         if (refcount == 0 && cluster_index < s->free_cluster_index) {
867             s->free_cluster_index = cluster_index;
868         }
869         s->set_refcount(refcount_block, block_index, refcount);
870
871         if (refcount == 0) {
872             void *table;
873
874             table = qcow2_cache_is_table_offset(s->refcount_block_cache,
875                                                 offset);
876             if (table != NULL) {
877                 qcow2_cache_put(s->refcount_block_cache, &refcount_block);
878                 qcow2_cache_discard(s->refcount_block_cache, table);
879             }
880
881             table = qcow2_cache_is_table_offset(s->l2_table_cache, offset);
882             if (table != NULL) {
883                 qcow2_cache_discard(s->l2_table_cache, table);
884             }
885
886             if (s->discard_passthrough[type]) {
887                 update_refcount_discard(bs, cluster_offset, s->cluster_size);
888             }
889         }
890     }
891
892     ret = 0;
893 fail:
894     if (!s->cache_discards) {
895         qcow2_process_discards(bs, ret);
896     }
897
898     /* Write last changed block to disk */
899     if (refcount_block) {
900         qcow2_cache_put(s->refcount_block_cache, &refcount_block);
901     }
902
903     /*
904      * Try do undo any updates if an error is returned (This may succeed in
905      * some cases like ENOSPC for allocating a new refcount block)
906      */
907     if (ret < 0) {
908         int dummy;
909         dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
910                                 !decrease, QCOW2_DISCARD_NEVER);
911         (void)dummy;
912     }
913
914     return ret;
915 }
916
917 /*
918  * Increases or decreases the refcount of a given cluster.
919  *
920  * @addend is the absolute value of the addend; if @decrease is set, @addend
921  * will be subtracted from the current refcount, otherwise it will be added.
922  *
923  * On success 0 is returned; on failure -errno is returned.
924  */
925 int qcow2_update_cluster_refcount(BlockDriverState *bs,
926                                   int64_t cluster_index,
927                                   uint64_t addend, bool decrease,
928                                   enum qcow2_discard_type type)
929 {
930     BDRVQcow2State *s = bs->opaque;
931     int ret;
932
933     ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
934                           decrease, type);
935     if (ret < 0) {
936         return ret;
937     }
938
939     return 0;
940 }
941
942
943
944 /*********************************************************/
945 /* cluster allocation functions */
946
947
948
949 /* return < 0 if error */
950 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
951 {
952     BDRVQcow2State *s = bs->opaque;
953     uint64_t i, nb_clusters, refcount;
954     int ret;
955
956     /* We can't allocate clusters if they may still be queued for discard. */
957     if (s->cache_discards) {
958         qcow2_process_discards(bs, 0);
959     }
960
961     nb_clusters = size_to_clusters(s, size);
962 retry:
963     for(i = 0; i < nb_clusters; i++) {
964         uint64_t next_cluster_index = s->free_cluster_index++;
965         ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);
966
967         if (ret < 0) {
968             return ret;
969         } else if (refcount != 0) {
970             goto retry;
971         }
972     }
973
974     /* Make sure that all offsets in the "allocated" range are representable
975      * in an int64_t */
976     if (s->free_cluster_index > 0 &&
977         s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
978     {
979         return -EFBIG;
980     }
981
982 #ifdef DEBUG_ALLOC2
983     fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
984             size,
985             (s->free_cluster_index - nb_clusters) << s->cluster_bits);
986 #endif
987     return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
988 }
989
990 int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
991 {
992     int64_t offset;
993     int ret;
994
995     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
996     do {
997         offset = alloc_clusters_noref(bs, size);
998         if (offset < 0) {
999             return offset;
1000         }
1001
1002         ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
1003     } while (ret == -EAGAIN);
1004
1005     if (ret < 0) {
1006         return ret;
1007     }
1008
1009     return offset;
1010 }
1011
1012 int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
1013                                 int64_t nb_clusters)
1014 {
1015     BDRVQcow2State *s = bs->opaque;
1016     uint64_t cluster_index, refcount;
1017     uint64_t i;
1018     int ret;
1019
1020     assert(nb_clusters >= 0);
1021     if (nb_clusters == 0) {
1022         return 0;
1023     }
1024
1025     do {
1026         /* Check how many clusters there are free */
1027         cluster_index = offset >> s->cluster_bits;
1028         for(i = 0; i < nb_clusters; i++) {
1029             ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
1030             if (ret < 0) {
1031                 return ret;
1032             } else if (refcount != 0) {
1033                 break;
1034             }
1035         }
1036
1037         /* And then allocate them */
1038         ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
1039                               QCOW2_DISCARD_NEVER);
1040     } while (ret == -EAGAIN);
1041
1042     if (ret < 0) {
1043         return ret;
1044     }
1045
1046     return i;
1047 }
1048
/* only used to allocate compressed sectors. We try to allocate
   contiguous sectors. size must be <= cluster_size.
   Sub-cluster allocations are tracked via s->free_byte_offset: each byte
   handed out adds one reference to its containing cluster, so a cluster
   shared by several compressed sectors has one refcount per user.
   Returns the host offset of the allocated bytes, or -errno on failure. */
int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t offset;
    size_t free_in_cluster;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    assert(size > 0 && size <= s->cluster_size);
    /* free_byte_offset == 0 means "no partially used cluster"; it is never
     * left pointing at a cluster boundary (see end of function) */
    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));

    offset = s->free_byte_offset;

    if (offset) {
        uint64_t refcount;
        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
        if (ret < 0) {
            return ret;
        }

        /* Cannot take another reference on this cluster; start a fresh one */
        if (refcount == s->refcount_max) {
            offset = 0;
        }
    }

    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
    do {
        /* Not enough room left in the current cluster: grab a new one */
        if (!offset || free_in_cluster < size) {
            int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size);
            if (new_cluster < 0) {
                return new_cluster;
            }

            if (new_cluster == 0) {
                qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                        "allocation of compressed cluster "
                                        "at offset 0");
                return -EIO;
            }

            /* If the new cluster directly follows the current tail, the
             * allocation can simply be extended across the boundary */
            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
                offset = new_cluster;
                free_in_cluster = s->cluster_size;
            } else {
                free_in_cluster += s->cluster_size;
            }
        }

        assert(offset);
        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            /* Force a retry with a freshly allocated cluster on -EAGAIN */
            offset = 0;
        }
    } while (ret == -EAGAIN);
    if (ret < 0) {
        return ret;
    }

    /* The cluster refcount was incremented; refcount blocks must be flushed
     * before the caller's L2 table updates. */
    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);

    s->free_byte_offset = offset + size;
    if (!offset_into_cluster(s, s->free_byte_offset)) {
        s->free_byte_offset = 0;
    }

    return offset;
}
1120
1121 void qcow2_free_clusters(BlockDriverState *bs,
1122                           int64_t offset, int64_t size,
1123                           enum qcow2_discard_type type)
1124 {
1125     int ret;
1126
1127     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
1128     ret = update_refcount(bs, offset, size, 1, true, type);
1129     if (ret < 0) {
1130         fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
1131         /* TODO Remember the clusters to free them later and avoid leaking */
1132     }
1133 }
1134
1135 /*
1136  * Free a cluster using its L2 entry (handles clusters of all types, e.g.
1137  * normal cluster, compressed cluster, etc.)
1138  */
1139 void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
1140                              int nb_clusters, enum qcow2_discard_type type)
1141 {
1142     BDRVQcow2State *s = bs->opaque;
1143
1144     switch (qcow2_get_cluster_type(l2_entry)) {
1145     case QCOW2_CLUSTER_COMPRESSED:
1146         {
1147             int nb_csectors;
1148             nb_csectors = ((l2_entry >> s->csize_shift) &
1149                            s->csize_mask) + 1;
1150             qcow2_free_clusters(bs,
1151                 (l2_entry & s->cluster_offset_mask) & ~511,
1152                 nb_csectors * 512, type);
1153         }
1154         break;
1155     case QCOW2_CLUSTER_NORMAL:
1156     case QCOW2_CLUSTER_ZERO_ALLOC:
1157         if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
1158             qcow2_signal_corruption(bs, false, -1, -1,
1159                                     "Cannot free unaligned cluster %#llx",
1160                                     l2_entry & L2E_OFFSET_MASK);
1161         } else {
1162             qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
1163                                 nb_clusters << s->cluster_bits, type);
1164         }
1165         break;
1166     case QCOW2_CLUSTER_ZERO_PLAIN:
1167     case QCOW2_CLUSTER_UNALLOCATED:
1168         break;
1169     default:
1170         abort();
1171     }
1172 }
1173
1174 int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
1175 {
1176     BDRVQcow2State *s = bs->opaque;
1177     int ret;
1178
1179     ret = qcow2_cache_write(bs, s->l2_table_cache);
1180     if (ret < 0) {
1181         return ret;
1182     }
1183
1184     if (qcow2_need_accurate_refcounts(s)) {
1185         ret = qcow2_cache_write(bs, s->refcount_block_cache);
1186         if (ret < 0) {
1187             return ret;
1188         }
1189     }
1190
1191     return 0;
1192 }
1193
1194 int coroutine_fn qcow2_flush_caches(BlockDriverState *bs)
1195 {
1196     int ret = qcow2_write_caches(bs);
1197     if (ret < 0) {
1198         return ret;
1199     }
1200
1201     return bdrv_flush(bs->file->bs);
1202 }
1203
1204 /*********************************************************/
1205 /* snapshots and image creation */
1206
1207
1208
/* update the refcounts of snapshots and the copied flag */
/*
 * Applies @addend (-1, 0 or +1) to the refcount of every cluster referenced
 * by the L1 table at @l1_table_offset (with @l1_size entries), including the
 * L2 tables themselves.  While doing so, QCOW_OFLAG_COPIED is recalculated
 * for every L1 and L2 entry (set iff the cluster's refcount is exactly 1),
 * and the on-disk L1 table is rewritten if any of its entries changed.
 * Calling this with addend == 0 therefore only repairs the COPIED flags.
 *
 * Returns 0 on success, -errno on failure.
 */
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
    int64_t l1_table_offset, int l1_size, int addend)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table, *l2_slice, l2_offset, entry, l1_size2, refcount;
    bool l1_allocated = false;
    int64_t old_entry, old_l2_offset;
    unsigned slice, slice_size2, n_slices;
    int i, j, l1_modified = 0, nb_csectors;
    int ret;

    assert(addend >= -1 && addend <= 1);

    l2_slice = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);
    slice_size2 = s->l2_slice_size * sizeof(uint64_t);
    n_slices = s->cluster_size / slice_size2;

    /* Batch all discard requests until the whole update is done */
    s->cache_discards = true;

    /* WARNING: qcow2_snapshot_goto relies on this function not using the
     * l1_table_offset when it is the current s->l1_table_offset! Be careful
     * when changing this! */
    if (l1_table_offset != s->l1_table_offset) {
        /* Inactive L1 table: read a copy from disk (padded to 512 bytes) */
        l1_table = g_try_malloc0(ROUND_UP(l1_size2, 512));
        if (l1_size2 && l1_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        l1_allocated = true;

        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
        if (ret < 0) {
            goto fail;
        }

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    } else {
        /* Active L1 table: operate directly on the in-memory copy */
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = false;
    }

    for (i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= L1E_OFFSET_MASK;

            if (offset_into_cluster(s, l2_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                        PRIx64 " unaligned (L1 index: %#x)",
                                        l2_offset, i);
                ret = -EIO;
                goto fail;
            }

            /* Walk the L2 table one cached slice at a time */
            for (slice = 0; slice < n_slices; slice++) {
                ret = qcow2_cache_get(bs, s->l2_table_cache,
                                      l2_offset + slice * slice_size2,
                                      (void **) &l2_slice);
                if (ret < 0) {
                    goto fail;
                }

                for (j = 0; j < s->l2_slice_size; j++) {
                    uint64_t cluster_index;
                    uint64_t offset;

                    entry = be64_to_cpu(l2_slice[j]);
                    old_entry = entry;
                    /* COPIED is recalculated from the refcount below */
                    entry &= ~QCOW_OFLAG_COPIED;
                    offset = entry & L2E_OFFSET_MASK;

                    switch (qcow2_get_cluster_type(entry)) {
                    case QCOW2_CLUSTER_COMPRESSED:
                        nb_csectors = ((entry >> s->csize_shift) &
                                       s->csize_mask) + 1;
                        if (addend != 0) {
                            ret = update_refcount(
                                bs, (entry & s->cluster_offset_mask) & ~511,
                                nb_csectors * 512, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }
                        /* compressed clusters are never modified */
                        refcount = 2;
                        break;

                    case QCOW2_CLUSTER_NORMAL:
                    case QCOW2_CLUSTER_ZERO_ALLOC:
                        if (offset_into_cluster(s, offset)) {
                            /* Here l2_index means table (not slice) index */
                            int l2_index = slice * s->l2_slice_size + j;
                            qcow2_signal_corruption(
                                bs, true, -1, -1, "Cluster "
                                "allocation offset %#" PRIx64
                                " unaligned (L2 offset: %#"
                                PRIx64 ", L2 index: %#x)",
                                offset, l2_offset, l2_index);
                            ret = -EIO;
                            goto fail;
                        }

                        cluster_index = offset >> s->cluster_bits;
                        assert(cluster_index);
                        if (addend != 0) {
                            ret = qcow2_update_cluster_refcount(
                                bs, cluster_index, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }

                        /* Read the (possibly just updated) refcount so the
                         * COPIED flag can be recalculated below */
                        ret = qcow2_get_refcount(bs, cluster_index, &refcount);
                        if (ret < 0) {
                            goto fail;
                        }
                        break;

                    case QCOW2_CLUSTER_ZERO_PLAIN:
                    case QCOW2_CLUSTER_UNALLOCATED:
                        refcount = 0;
                        break;

                    default:
                        abort();
                    }

                    if (refcount == 1) {
                        entry |= QCOW_OFLAG_COPIED;
                    }
                    if (entry != old_entry) {
                        if (addend > 0) {
                            /* The refcount blocks must reach the disk before
                             * the L2 entry that depends on them */
                            qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                                       s->refcount_block_cache);
                        }
                        l2_slice[j] = cpu_to_be64(entry);
                        qcow2_cache_entry_mark_dirty(s->l2_table_cache,
                                                     l2_slice);
                    }
                }

                qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
            }

            /* Update the refcount of the L2 table itself and recalculate
             * the COPIED flag of the L1 entry */
            if (addend != 0) {
                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
                                                        s->cluster_bits,
                                                    abs(addend), addend < 0,
                                                    QCOW2_DISCARD_SNAPSHOT);
                if (ret < 0) {
                    goto fail;
                }
            }
            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                     &refcount);
            if (ret < 0) {
                goto fail;
            } else if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }

    ret = bdrv_flush(bs);
fail:
    if (l2_slice) {
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    }

    /* Process the discards that were batched during the update */
    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    /* Update L1 only if it isn't deleted anyway (addend = -1) */
    if (ret == 0 && addend >= 0 && l1_modified) {
        /* Convert to big-endian for the write, then back for the caller */
        for (i = 0; i < l1_size; i++) {
            cpu_to_be64s(&l1_table[i]);
        }

        ret = bdrv_pwrite_sync(bs->file, l1_table_offset,
                               l1_table, l1_size2);

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    }
    /* Only free our own copy, never the active in-memory L1 table */
    if (l1_allocated)
        g_free(l1_table);
    return ret;
}
1411
1412
1413
1414
1415 /*********************************************************/
1416 /* refcount checking functions */
1417
1418
1419 static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
1420 {
1421     /* This assertion holds because there is no way we can address more than
1422      * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
1423      * offsets have to be representable in bytes); due to every cluster
1424      * corresponding to one refcount entry, we are well below that limit */
1425     assert(entries < (UINT64_C(1) << (64 - 9)));
1426
1427     /* Thanks to the assertion this will not overflow, because
1428      * s->refcount_order < 7.
1429      * (note: x << s->refcount_order == x * s->refcount_bits) */
1430     return DIV_ROUND_UP(entries << s->refcount_order, 8);
1431 }
1432
1433 /**
1434  * Reallocates *array so that it can hold new_size entries. *size must contain
1435  * the current number of entries in *array. If the reallocation fails, *array
1436  * and *size will not be modified and -errno will be returned. If the
1437  * reallocation is successful, *array will be set to the new buffer, *size
1438  * will be set to new_size and 0 will be returned. The size of the reallocated
1439  * refcount array buffer will be aligned to a cluster boundary, and the newly
1440  * allocated area will be zeroed.
1441  */
1442 static int realloc_refcount_array(BDRVQcow2State *s, void **array,
1443                                   int64_t *size, int64_t new_size)
1444 {
1445     int64_t old_byte_size, new_byte_size;
1446     void *new_ptr;
1447
1448     /* Round to clusters so the array can be directly written to disk */
1449     old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
1450                     * s->cluster_size;
1451     new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
1452                     * s->cluster_size;
1453
1454     if (new_byte_size == old_byte_size) {
1455         *size = new_size;
1456         return 0;
1457     }
1458
1459     assert(new_byte_size > 0);
1460
1461     if (new_byte_size > SIZE_MAX) {
1462         return -ENOMEM;
1463     }
1464
1465     new_ptr = g_try_realloc(*array, new_byte_size);
1466     if (!new_ptr) {
1467         return -ENOMEM;
1468     }
1469
1470     if (new_byte_size > old_byte_size) {
1471         memset((char *)new_ptr + old_byte_size, 0,
1472                new_byte_size - old_byte_size);
1473     }
1474
1475     *array = new_ptr;
1476     *size  = new_size;
1477
1478     return 0;
1479 }
1480
1481 /*
1482  * Increases the refcount for a range of clusters in a given refcount table.
1483  * This is used to construct a temporary refcount table out of L1 and L2 tables
1484  * which can be compared to the refcount table saved in the image.
1485  *
1486  * Modifies the number of errors in res.
1487  */
1488 int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
1489                              void **refcount_table,
1490                              int64_t *refcount_table_size,
1491                              int64_t offset, int64_t size)
1492 {
1493     BDRVQcow2State *s = bs->opaque;
1494     uint64_t start, last, cluster_offset, k, refcount;
1495     int ret;
1496
1497     if (size <= 0) {
1498         return 0;
1499     }
1500
1501     start = start_of_cluster(s, offset);
1502     last = start_of_cluster(s, offset + size - 1);
1503     for(cluster_offset = start; cluster_offset <= last;
1504         cluster_offset += s->cluster_size) {
1505         k = cluster_offset >> s->cluster_bits;
1506         if (k >= *refcount_table_size) {
1507             ret = realloc_refcount_array(s, refcount_table,
1508                                          refcount_table_size, k + 1);
1509             if (ret < 0) {
1510                 res->check_errors++;
1511                 return ret;
1512             }
1513         }
1514
1515         refcount = s->get_refcount(*refcount_table, k);
1516         if (refcount == s->refcount_max) {
1517             fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1518                     "\n", cluster_offset);
1519             fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
1520                     "width or qemu-img convert to create a clean copy if the "
1521                     "image cannot be opened for writing\n");
1522             res->corruptions++;
1523             continue;
1524         }
1525         s->set_refcount(*refcount_table, k, refcount + 1);
1526     }
1527
1528     return 0;
1529 }
1530
/* Flags for check_refcounts_l1() and check_refcounts_l2() */
enum {
    /* NOTE(review): value 0x2 looks like a leftover from a removed 0x1 flag
     * — confirm before renumbering */
    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
};
1535
1536 /*
1537  * Increases the refcount in the given refcount table for the all clusters
1538  * referenced in the L2 table. While doing so, performs some checks on L2
1539  * entries.
1540  *
1541  * Returns the number of errors found by the checks or -errno if an internal
1542  * error occurred.
1543  */
1544 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1545                               void **refcount_table,
1546                               int64_t *refcount_table_size, int64_t l2_offset,
1547                               int flags, BdrvCheckMode fix)
1548 {
1549     BDRVQcow2State *s = bs->opaque;
1550     uint64_t *l2_table, l2_entry;
1551     uint64_t next_contiguous_offset = 0;
1552     int i, l2_size, nb_csectors, ret;
1553
1554     /* Read L2 table from disk */
1555     l2_size = s->l2_size * sizeof(uint64_t);
1556     l2_table = g_malloc(l2_size);
1557
1558     ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size);
1559     if (ret < 0) {
1560         fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1561         res->check_errors++;
1562         goto fail;
1563     }
1564
1565     /* Do the actual checks */
1566     for(i = 0; i < s->l2_size; i++) {
1567         l2_entry = be64_to_cpu(l2_table[i]);
1568
1569         switch (qcow2_get_cluster_type(l2_entry)) {
1570         case QCOW2_CLUSTER_COMPRESSED:
1571             /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1572             if (l2_entry & QCOW_OFLAG_COPIED) {
1573                 fprintf(stderr, "ERROR: cluster %" PRId64 ": "
1574                     "copied flag must never be set for compressed "
1575                     "clusters\n", l2_entry >> s->cluster_bits);
1576                 l2_entry &= ~QCOW_OFLAG_COPIED;
1577                 res->corruptions++;
1578             }
1579
1580             /* Mark cluster as used */
1581             nb_csectors = ((l2_entry >> s->csize_shift) &
1582                            s->csize_mask) + 1;
1583             l2_entry &= s->cluster_offset_mask;
1584             ret = qcow2_inc_refcounts_imrt(bs, res,
1585                                            refcount_table, refcount_table_size,
1586                                            l2_entry & ~511, nb_csectors * 512);
1587             if (ret < 0) {
1588                 goto fail;
1589             }
1590
1591             if (flags & CHECK_FRAG_INFO) {
1592                 res->bfi.allocated_clusters++;
1593                 res->bfi.compressed_clusters++;
1594
1595                 /* Compressed clusters are fragmented by nature.  Since they
1596                  * take up sub-sector space but we only have sector granularity
1597                  * I/O we need to re-read the same sectors even for adjacent
1598                  * compressed clusters.
1599                  */
1600                 res->bfi.fragmented_clusters++;
1601             }
1602             break;
1603
1604         case QCOW2_CLUSTER_ZERO_ALLOC:
1605         case QCOW2_CLUSTER_NORMAL:
1606         {
1607             uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1608
1609             if (flags & CHECK_FRAG_INFO) {
1610                 res->bfi.allocated_clusters++;
1611                 if (next_contiguous_offset &&
1612                     offset != next_contiguous_offset) {
1613                     res->bfi.fragmented_clusters++;
1614                 }
1615                 next_contiguous_offset = offset + s->cluster_size;
1616             }
1617
1618             /* Correct offsets are cluster aligned */
1619             if (offset_into_cluster(s, offset)) {
1620                 if (qcow2_get_cluster_type(l2_entry) ==
1621                     QCOW2_CLUSTER_ZERO_ALLOC)
1622                 {
1623                     fprintf(stderr, "%s offset=%" PRIx64 ": Preallocated zero "
1624                             "cluster is not properly aligned; L2 entry "
1625                             "corrupted.\n",
1626                             fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
1627                             offset);
1628                     if (fix & BDRV_FIX_ERRORS) {
1629                         uint64_t l2e_offset =
1630                             l2_offset + (uint64_t)i * sizeof(uint64_t);
1631
1632                         l2_entry = QCOW_OFLAG_ZERO;
1633                         l2_table[i] = cpu_to_be64(l2_entry);
1634                         ret = qcow2_pre_write_overlap_check(bs,
1635                                 QCOW2_OL_ACTIVE_L2 | QCOW2_OL_INACTIVE_L2,
1636                                 l2e_offset, sizeof(uint64_t));
1637                         if (ret < 0) {
1638                             fprintf(stderr, "ERROR: Overlap check failed\n");
1639                             res->check_errors++;
1640                             /* Something is seriously wrong, so abort checking
1641                              * this L2 table */
1642                             goto fail;
1643                         }
1644
1645                         ret = bdrv_pwrite_sync(bs->file, l2e_offset,
1646                                                &l2_table[i], sizeof(uint64_t));
1647                         if (ret < 0) {
1648                             fprintf(stderr, "ERROR: Failed to overwrite L2 "
1649                                     "table entry: %s\n", strerror(-ret));
1650                             res->check_errors++;
1651                             /* Do not abort, continue checking the rest of this
1652                              * L2 table's entries */
1653                         } else {
1654                             res->corruptions_fixed++;
1655                             /* Skip marking the cluster as used
1656                              * (it is unused now) */
1657                             continue;
1658                         }
1659                     } else {
1660                         res->corruptions++;
1661                     }
1662                 } else {
1663                     fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is "
1664                         "not properly aligned; L2 entry corrupted.\n", offset);
1665                     res->corruptions++;
1666                 }
1667             }
1668
1669             /* Mark cluster as used */
1670             ret = qcow2_inc_refcounts_imrt(bs, res,
1671                                            refcount_table, refcount_table_size,
1672                                            offset, s->cluster_size);
1673             if (ret < 0) {
1674                 goto fail;
1675             }
1676             break;
1677         }
1678
1679         case QCOW2_CLUSTER_ZERO_PLAIN:
1680         case QCOW2_CLUSTER_UNALLOCATED:
1681             break;
1682
1683         default:
1684             abort();
1685         }
1686     }
1687
1688     g_free(l2_table);
1689     return 0;
1690
1691 fail:
1692     g_free(l2_table);
1693     return ret;
1694 }
1695
1696 /*
1697  * Increases the refcount for the L1 table, its L2 tables and all referenced
1698  * clusters in the given refcount table. While doing so, performs some checks
1699  * on L1 and L2 entries.
1700  *
1701  * Returns the number of errors found by the checks or -errno if an internal
1702  * error occurred.
1703  */
1704 static int check_refcounts_l1(BlockDriverState *bs,
1705                               BdrvCheckResult *res,
1706                               void **refcount_table,
1707                               int64_t *refcount_table_size,
1708                               int64_t l1_table_offset, int l1_size,
1709                               int flags, BdrvCheckMode fix)
1710 {
1711     BDRVQcow2State *s = bs->opaque;
1712     uint64_t *l1_table = NULL, l2_offset, l1_size2;
1713     int i, ret;
1714
1715     l1_size2 = l1_size * sizeof(uint64_t);
1716
1717     /* Mark L1 table as used */
1718     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size,
1719                                    l1_table_offset, l1_size2);
1720     if (ret < 0) {
1721         goto fail;
1722     }
1723
1724     /* Read L1 table entries from disk */
1725     if (l1_size2 > 0) {
1726         l1_table = g_try_malloc(l1_size2);
1727         if (l1_table == NULL) {
1728             ret = -ENOMEM;
1729             res->check_errors++;
1730             goto fail;
1731         }
1732         ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
1733         if (ret < 0) {
1734             fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1735             res->check_errors++;
1736             goto fail;
1737         }
1738         for(i = 0;i < l1_size; i++)
1739             be64_to_cpus(&l1_table[i]);
1740     }
1741
1742     /* Do the actual checks */
1743     for(i = 0; i < l1_size; i++) {
1744         l2_offset = l1_table[i];
1745         if (l2_offset) {
1746             /* Mark L2 table as used */
1747             l2_offset &= L1E_OFFSET_MASK;
1748             ret = qcow2_inc_refcounts_imrt(bs, res,
1749                                            refcount_table, refcount_table_size,
1750                                            l2_offset, s->cluster_size);
1751             if (ret < 0) {
1752                 goto fail;
1753             }
1754
1755             /* L2 tables are cluster aligned */
1756             if (offset_into_cluster(s, l2_offset)) {
1757                 fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1758                     "cluster aligned; L1 entry corrupted\n", l2_offset);
1759                 res->corruptions++;
1760             }
1761
1762             /* Process and check L2 entries */
1763             ret = check_refcounts_l2(bs, res, refcount_table,
1764                                      refcount_table_size, l2_offset, flags,
1765                                      fix);
1766             if (ret < 0) {
1767                 goto fail;
1768             }
1769         }
1770     }
1771     g_free(l1_table);
1772     return 0;
1773
1774 fail:
1775     g_free(l1_table);
1776     return ret;
1777 }
1778
1779 /*
1780  * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1781  *
1782  * This function does not print an error message nor does it increment
1783  * check_errors if qcow2_get_refcount fails (this is because such an error will
1784  * have been already detected and sufficiently signaled by the calling function
1785  * (qcow2_check_refcounts) by the time this function is called).
1786  */
static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
                              BdrvCheckMode fix)
{
    BDRVQcow2State *s = bs->opaque;
    /* Scratch buffer for one L2 table (one cluster), reused for every L1
     * entry; aligned allocation because it is read/written via bs->file */
    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
    int ret;
    uint64_t refcount;
    int i, j;

    for (i = 0; i < s->l1_size; i++) {
        uint64_t l1_entry = s->l1_table[i];
        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
        /* Set when an L2 entry in the scratch buffer has been repaired and
         * the whole table must be written back at the end of this iteration */
        bool l2_dirty = false;

        if (!l2_offset) {
            continue;
        }

        /* Invariant being checked: OFLAG_COPIED must be set iff the cluster's
         * refcount is exactly 1.  First check the L2 table cluster itself. */
        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                 &refcount);
        if (ret < 0) {
            /* don't print message nor increment check_errors */
            continue;
        }
        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
                    "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
                    fix & BDRV_FIX_ERRORS ? "Repairing" :
                                            "ERROR",
                    i, l1_entry, refcount);
            if (fix & BDRV_FIX_ERRORS) {
                /* Toggle OFLAG_COPIED in the in-memory L1 entry to match the
                 * refcount, then persist that single entry */
                s->l1_table[i] = refcount == 1
                               ? l1_entry |  QCOW_OFLAG_COPIED
                               : l1_entry & ~QCOW_OFLAG_COPIED;
                ret = qcow2_write_l1_entry(bs, i);
                if (ret < 0) {
                    res->check_errors++;
                    goto fail;
                }
                res->corruptions_fixed++;
            } else {
                res->corruptions++;
            }
        }

        /* Now check all data-cluster entries of this L2 table */
        ret = bdrv_pread(bs->file, l2_offset, l2_table,
                         s->l2_size * sizeof(uint64_t));
        if (ret < 0) {
            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
                    strerror(-ret));
            res->check_errors++;
            goto fail;
        }

        for (j = 0; j < s->l2_size; j++) {
            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
            QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);

            /* Only allocated clusters carry a meaningful OFLAG_COPIED;
             * unallocated/plain-zero entries are skipped */
            if (cluster_type == QCOW2_CLUSTER_NORMAL ||
                cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
                ret = qcow2_get_refcount(bs,
                                         data_offset >> s->cluster_bits,
                                         &refcount);
                if (ret < 0) {
                    /* don't print message nor increment check_errors */
                    continue;
                }
                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
                    fprintf(stderr, "%s OFLAG_COPIED data cluster: "
                            "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
                            fix & BDRV_FIX_ERRORS ? "Repairing" :
                                                    "ERROR",
                            l2_entry, refcount);
                    if (fix & BDRV_FIX_ERRORS) {
                        /* Fix the flag in the buffered (big-endian) entry;
                         * the write-back happens once for the whole table */
                        l2_table[j] = cpu_to_be64(refcount == 1
                                    ? l2_entry |  QCOW_OFLAG_COPIED
                                    : l2_entry & ~QCOW_OFLAG_COPIED);
                        l2_dirty = true;
                        res->corruptions_fixed++;
                    } else {
                        res->corruptions++;
                    }
                }
            }
        }

        if (l2_dirty) {
            /* Make sure the repaired L2 table does not overwrite other
             * metadata before writing it back in full */
            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
                                                l2_offset, s->cluster_size);
            if (ret < 0) {
                fprintf(stderr, "ERROR: Could not write L2 table; metadata "
                        "overlap check failed: %s\n", strerror(-ret));
                res->check_errors++;
                goto fail;
            }

            ret = bdrv_pwrite(bs->file, l2_offset, l2_table,
                              s->cluster_size);
            if (ret < 0) {
                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
                        strerror(-ret));
                res->check_errors++;
                goto fail;
            }
        }
    }

    ret = 0;

fail:
    qemu_vfree(l2_table);
    return ret;
}
1901
1902 /*
1903  * Checks consistency of refblocks and accounts for each refblock in
1904  * *refcount_table.
1905  */
static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
                           BdrvCheckMode fix, bool *rebuild,
                           void **refcount_table, int64_t *nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t i, size;
    int ret;

    for(i = 0; i < s->refcount_table_size; i++) {
        uint64_t offset, cluster;
        offset = s->refcount_table[i];
        cluster = offset >> s->cluster_bits;

        /* Refcount blocks are cluster aligned */
        if (offset_into_cluster(s, offset)) {
            fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
                "cluster aligned; refcount table entry corrupted\n", i);
            res->corruptions++;
            /* The entry is unusable; only a full rebuild can recover it */
            *rebuild = true;
            continue;
        }

        /* Refblock lies beyond the end of the image file */
        if (cluster >= *nb_clusters) {
            fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
                    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);

            if (fix & BDRV_FIX_ERRORS) {
                int64_t new_nb_clusters;
                Error *local_err = NULL;

                /* Guard against overflow before growing the file to cover
                 * the refblock's cluster */
                if (offset > INT64_MAX - s->cluster_size) {
                    ret = -EINVAL;
                    goto resize_fail;
                }

                ret = bdrv_truncate(bs->file, offset + s->cluster_size,
                                    PREALLOC_MODE_OFF, &local_err);
                if (ret < 0) {
                    error_report_err(local_err);
                    goto resize_fail;
                }
                size = bdrv_getlength(bs->file->bs);
                if (size < 0) {
                    ret = size;
                    goto resize_fail;
                }

                /* Grow the in-memory refcount table to match the new file
                 * size (truncate can only have grown the file here) */
                new_nb_clusters = size_to_clusters(s, size);
                assert(new_nb_clusters >= *nb_clusters);

                ret = realloc_refcount_array(s, refcount_table,
                                             nb_clusters, new_nb_clusters);
                if (ret < 0) {
                    res->check_errors++;
                    return ret;
                }

                /* If the refblock still lies outside the image, resizing
                 * did not help */
                if (cluster >= *nb_clusters) {
                    ret = -EINVAL;
                    goto resize_fail;
                }

                res->corruptions_fixed++;
                ret = qcow2_inc_refcounts_imrt(bs, res,
                                               refcount_table, nb_clusters,
                                               offset, s->cluster_size);
                if (ret < 0) {
                    return ret;
                }
                /* No need to check whether the refcount is now greater than 1:
                 * This area was just allocated and zeroed, so it can only be
                 * exactly 1 after qcow2_inc_refcounts_imrt() */
                continue;

resize_fail:
                res->corruptions++;
                *rebuild = true;
                fprintf(stderr, "ERROR could not resize image: %s\n",
                        strerror(-ret));
            } else {
                res->corruptions++;
            }
            continue;
        }

        if (offset != 0) {
            /* Account for the refblock cluster itself; its reference count
             * must be exactly 1 (only the reftable points at it) */
            ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
                                           offset, s->cluster_size);
            if (ret < 0) {
                return ret;
            }
            if (s->get_refcount(*refcount_table, cluster) != 1) {
                fprintf(stderr, "ERROR refcount block %" PRId64
                        " refcount=%" PRIu64 "\n", i,
                        s->get_refcount(*refcount_table, cluster));
                res->corruptions++;
                *rebuild = true;
            }
        }
    }

    return 0;
}
2009
2010 /*
2011  * Calculates an in-memory refcount table.
2012  */
2013 static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2014                                BdrvCheckMode fix, bool *rebuild,
2015                                void **refcount_table, int64_t *nb_clusters)
2016 {
2017     BDRVQcow2State *s = bs->opaque;
2018     int64_t i;
2019     QCowSnapshot *sn;
2020     int ret;
2021
2022     if (!*refcount_table) {
2023         int64_t old_size = 0;
2024         ret = realloc_refcount_array(s, refcount_table,
2025                                      &old_size, *nb_clusters);
2026         if (ret < 0) {
2027             res->check_errors++;
2028             return ret;
2029         }
2030     }
2031
2032     /* header */
2033     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2034                                    0, s->cluster_size);
2035     if (ret < 0) {
2036         return ret;
2037     }
2038
2039     /* current L1 table */
2040     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2041                              s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO,
2042                              fix);
2043     if (ret < 0) {
2044         return ret;
2045     }
2046
2047     /* snapshots */
2048     for (i = 0; i < s->nb_snapshots; i++) {
2049         sn = s->snapshots + i;
2050         ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2051                                  sn->l1_table_offset, sn->l1_size, 0, fix);
2052         if (ret < 0) {
2053             return ret;
2054         }
2055     }
2056     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2057                                    s->snapshots_offset, s->snapshots_size);
2058     if (ret < 0) {
2059         return ret;
2060     }
2061
2062     /* refcount data */
2063     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2064                                    s->refcount_table_offset,
2065                                    s->refcount_table_size * sizeof(uint64_t));
2066     if (ret < 0) {
2067         return ret;
2068     }
2069
2070     /* encryption */
2071     if (s->crypto_header.length) {
2072         ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2073                                        s->crypto_header.offset,
2074                                        s->crypto_header.length);
2075         if (ret < 0) {
2076             return ret;
2077         }
2078     }
2079
2080     /* bitmaps */
2081     ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters);
2082     if (ret < 0) {
2083         return ret;
2084     }
2085
2086     return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
2087 }
2088
2089 /*
2090  * Compares the actual reference count for each cluster in the image against the
2091  * refcount as reported by the refcount structures on-disk.
2092  */
2093 static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2094                               BdrvCheckMode fix, bool *rebuild,
2095                               int64_t *highest_cluster,
2096                               void *refcount_table, int64_t nb_clusters)
2097 {
2098     BDRVQcow2State *s = bs->opaque;
2099     int64_t i;
2100     uint64_t refcount1, refcount2;
2101     int ret;
2102
2103     for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
2104         ret = qcow2_get_refcount(bs, i, &refcount1);
2105         if (ret < 0) {
2106             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
2107                     i, strerror(-ret));
2108             res->check_errors++;
2109             continue;
2110         }
2111
2112         refcount2 = s->get_refcount(refcount_table, i);
2113
2114         if (refcount1 > 0 || refcount2 > 0) {
2115             *highest_cluster = i;
2116         }
2117
2118         if (refcount1 != refcount2) {
2119             /* Check if we're allowed to fix the mismatch */
2120             int *num_fixed = NULL;
2121             if (refcount1 == 0) {
2122                 *rebuild = true;
2123             } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
2124                 num_fixed = &res->leaks_fixed;
2125             } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
2126                 num_fixed = &res->corruptions_fixed;
2127             }
2128
2129             fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
2130                     " reference=%" PRIu64 "\n",
2131                    num_fixed != NULL     ? "Repairing" :
2132                    refcount1 < refcount2 ? "ERROR" :
2133                                            "Leaked",
2134                    i, refcount1, refcount2);
2135
2136             if (num_fixed) {
2137                 ret = update_refcount(bs, i << s->cluster_bits, 1,
2138                                       refcount_diff(refcount1, refcount2),
2139                                       refcount1 > refcount2,
2140                                       QCOW2_DISCARD_ALWAYS);
2141                 if (ret >= 0) {
2142                     (*num_fixed)++;
2143                     continue;
2144                 }
2145             }
2146
2147             /* And if we couldn't, print an error */
2148             if (refcount1 < refcount2) {
2149                 res->corruptions++;
2150             } else {
2151                 res->leaks++;
2152             }
2153         }
2154     }
2155 }
2156
2157 /*
2158  * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
2159  * the on-disk refcount structures.
2160  *
2161  * On input, *first_free_cluster tells where to start looking, and need not
2162  * actually be a free cluster; the returned offset will not be before that
2163  * cluster.  On output, *first_free_cluster points to the first gap found, even
2164  * if that gap was too small to be used as the returned offset.
2165  *
2166  * Note that *first_free_cluster is a cluster index whereas the return value is
2167  * an offset.
2168  */
2169 static int64_t alloc_clusters_imrt(BlockDriverState *bs,
2170                                    int cluster_count,
2171                                    void **refcount_table,
2172                                    int64_t *imrt_nb_clusters,
2173                                    int64_t *first_free_cluster)
2174 {
2175     BDRVQcow2State *s = bs->opaque;
2176     int64_t cluster = *first_free_cluster, i;
2177     bool first_gap = true;
2178     int contiguous_free_clusters;
2179     int ret;
2180
2181     /* Starting at *first_free_cluster, find a range of at least cluster_count
2182      * continuously free clusters */
2183     for (contiguous_free_clusters = 0;
2184          cluster < *imrt_nb_clusters &&
2185          contiguous_free_clusters < cluster_count;
2186          cluster++)
2187     {
2188         if (!s->get_refcount(*refcount_table, cluster)) {
2189             contiguous_free_clusters++;
2190             if (first_gap) {
2191                 /* If this is the first free cluster found, update
2192                  * *first_free_cluster accordingly */
2193                 *first_free_cluster = cluster;
2194                 first_gap = false;
2195             }
2196         } else if (contiguous_free_clusters) {
2197             contiguous_free_clusters = 0;
2198         }
2199     }
2200
2201     /* If contiguous_free_clusters is greater than zero, it contains the number
2202      * of continuously free clusters until the current cluster; the first free
2203      * cluster in the current "gap" is therefore
2204      * cluster - contiguous_free_clusters */
2205
2206     /* If no such range could be found, grow the in-memory refcount table
2207      * accordingly to append free clusters at the end of the image */
2208     if (contiguous_free_clusters < cluster_count) {
2209         /* contiguous_free_clusters clusters are already empty at the image end;
2210          * we need cluster_count clusters; therefore, we have to allocate
2211          * cluster_count - contiguous_free_clusters new clusters at the end of
2212          * the image (which is the current value of cluster; note that cluster
2213          * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
2214          * the image end) */
2215         ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
2216                                      cluster + cluster_count
2217                                      - contiguous_free_clusters);
2218         if (ret < 0) {
2219             return ret;
2220         }
2221     }
2222
2223     /* Go back to the first free cluster */
2224     cluster -= contiguous_free_clusters;
2225     for (i = 0; i < cluster_count; i++) {
2226         s->set_refcount(*refcount_table, cluster + i, 1);
2227     }
2228
2229     return cluster << s->cluster_bits;
2230 }
2231
2232 /*
2233  * Creates a new refcount structure based solely on the in-memory information
2234  * given through *refcount_table. All necessary allocations will be reflected
2235  * in that array.
2236  *
2237  * On success, the old refcount structure is leaked (it will be covered by the
2238  * new refcount structure).
2239  */
static int rebuild_refcount_structure(BlockDriverState *bs,
                                      BdrvCheckResult *res,
                                      void **refcount_table,
                                      int64_t *nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    /* reftable_offset < 0 means "no valid on-disk location for the new
     * reftable has been chosen yet" */
    int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
    int64_t refblock_offset, refblock_start, refblock_index;
    uint32_t reftable_size = 0;
    uint64_t *on_disk_reftable = NULL;
    void *on_disk_refblock;
    int ret = 0;
    /* On-disk image of the header fields updated at the end; packed so the
     * two fields are written contiguously in one request */
    struct {
        uint64_t reftable_offset;
        uint32_t reftable_clusters;
    } QEMU_PACKED reftable_offset_and_clusters;

    /* Drop cached refblocks; they describe the old structure being replaced */
    qcow2_cache_empty(bs, s->refcount_block_cache);

write_refblocks:
    for (; cluster < *nb_clusters; cluster++) {
        /* Skip refblocks that would contain no used cluster at all */
        if (!s->get_refcount(*refcount_table, cluster)) {
            continue;
        }

        refblock_index = cluster >> s->refcount_block_bits;
        refblock_start = refblock_index << s->refcount_block_bits;

        /* Don't allocate a cluster in a refblock already written to disk */
        if (first_free_cluster < refblock_start) {
            first_free_cluster = refblock_start;
        }
        refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
                                              nb_clusters, &first_free_cluster);
        if (refblock_offset < 0) {
            fprintf(stderr, "ERROR allocating refblock: %s\n",
                    strerror(-refblock_offset));
            res->check_errors++;
            ret = refblock_offset;
            goto fail;
        }

        /* Grow the in-memory reftable so it can hold this refblock's entry;
         * the size is rounded up to a whole number of clusters */
        if (reftable_size <= refblock_index) {
            uint32_t old_reftable_size = reftable_size;
            uint64_t *new_on_disk_reftable;

            reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t),
                                     s->cluster_size) / sizeof(uint64_t);
            new_on_disk_reftable = g_try_realloc(on_disk_reftable,
                                                 reftable_size *
                                                 sizeof(uint64_t));
            if (!new_on_disk_reftable) {
                res->check_errors++;
                ret = -ENOMEM;
                goto fail;
            }
            on_disk_reftable = new_on_disk_reftable;

            memset(on_disk_reftable + old_reftable_size, 0,
                   (reftable_size - old_reftable_size) * sizeof(uint64_t));

            /* The offset we have for the reftable is now no longer valid;
             * this will leak that range, but we can easily fix that by running
             * a leak-fixing check after this rebuild operation */
            reftable_offset = -1;
        } else {
            assert(on_disk_reftable);
        }
        on_disk_reftable[refblock_index] = refblock_offset;

        /* If this is apparently the last refblock (for now), try to squeeze the
         * reftable in */
        if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
            reftable_offset < 0)
        {
            uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
                                                          sizeof(uint64_t));
            reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
                                                  refcount_table, nb_clusters,
                                                  &first_free_cluster);
            if (reftable_offset < 0) {
                fprintf(stderr, "ERROR allocating reftable: %s\n",
                        strerror(-reftable_offset));
                res->check_errors++;
                ret = reftable_offset;
                goto fail;
            }
        }

        ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
                                            s->cluster_size);
        if (ret < 0) {
            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
            goto fail;
        }

        /* The size of *refcount_table is always cluster-aligned, therefore the
         * write operation will not overflow */
        on_disk_refblock = (void *)((char *) *refcount_table +
                                    refblock_index * s->cluster_size);

        /* Write the refblock directly from the IMRT slice that covers it */
        ret = bdrv_write(bs->file, refblock_offset / BDRV_SECTOR_SIZE,
                         on_disk_refblock, s->cluster_sectors);
        if (ret < 0) {
            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
            goto fail;
        }

        /* Go to the end of this refblock */
        cluster = refblock_start + s->refcount_block_size - 1;
    }

    /* If the reftable was never placed (or its placement was invalidated by a
     * reftable resize above), allocate it past all refblock-covered clusters
     * and re-run the loop so the allocation itself gets refcounted */
    if (reftable_offset < 0) {
        uint64_t post_refblock_start, reftable_clusters;

        post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
        reftable_clusters = size_to_clusters(s,
                                             reftable_size * sizeof(uint64_t));
        /* Not pretty but simple */
        if (first_free_cluster < post_refblock_start) {
            first_free_cluster = post_refblock_start;
        }
        reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
                                              refcount_table, nb_clusters,
                                              &first_free_cluster);
        if (reftable_offset < 0) {
            fprintf(stderr, "ERROR allocating reftable: %s\n",
                    strerror(-reftable_offset));
            res->check_errors++;
            ret = reftable_offset;
            goto fail;
        }

        goto write_refblocks;
    }

    /* Convert the reftable to big-endian for the on-disk write */
    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
        cpu_to_be64s(&on_disk_reftable[refblock_index]);
    }

    ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
                                        reftable_size * sizeof(uint64_t));
    if (ret < 0) {
        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
        goto fail;
    }

    assert(reftable_size < INT_MAX / sizeof(uint64_t));
    ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable,
                      reftable_size * sizeof(uint64_t));
    if (ret < 0) {
        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
        goto fail;
    }

    /* Enter new reftable into the image header */
    reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
    reftable_offset_and_clusters.reftable_clusters =
        cpu_to_be32(size_to_clusters(s, reftable_size * sizeof(uint64_t)));
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           &reftable_offset_and_clusters,
                           sizeof(reftable_offset_and_clusters));
    if (ret < 0) {
        fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret));
        goto fail;
    }

    /* Convert back to host byte order and adopt the new reftable as the
     * in-memory one; ownership of on_disk_reftable moves to s */
    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
        be64_to_cpus(&on_disk_reftable[refblock_index]);
    }
    s->refcount_table = on_disk_reftable;
    s->refcount_table_offset = reftable_offset;
    s->refcount_table_size = reftable_size;
    update_max_refcount_table_index(s);

    return 0;

fail:
    g_free(on_disk_reftable);
    return ret;
}
2422
2423 /*
2424  * Checks an image for refcount consistency.
2425  *
2426  * Returns 0 if no errors are found, the number of errors in case the image is
2427  * detected as corrupted, and -errno when an internal error occurred.
2428  */
int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                          BdrvCheckMode fix)
{
    BDRVQcow2State *s = bs->opaque;
    BdrvCheckResult pre_compare_res;
    int64_t size, highest_cluster, nb_clusters;
    void *refcount_table = NULL;
    bool rebuild = false;
    int ret;

    /* Size of the image file determines how many clusters to check */
    size = bdrv_getlength(bs->file->bs);
    if (size < 0) {
        res->check_errors++;
        return size;
    }

    nb_clusters = size_to_clusters(s, size);
    if (nb_clusters > INT_MAX) {
        /* Too many clusters for this check implementation to handle */
        res->check_errors++;
        return -EFBIG;
    }

    res->bfi.total_clusters =
        size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);

    /* First pass: build an in-memory refcount table from the image's
     * metadata (see calculate_refcounts()) */
    ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
                              &nb_clusters);
    if (ret < 0) {
        goto fail;
    }

    /* In case we don't need to rebuild the refcount structure (but want to fix
     * something), this function is immediately called again, in which case the
     * result should be ignored */
    pre_compare_res = *res;
    compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
                      nb_clusters);

    if (rebuild && (fix & BDRV_FIX_ERRORS)) {
        BdrvCheckResult old_res = *res;
        int fresh_leaks = 0;

        fprintf(stderr, "Rebuilding refcount structure\n");
        ret = rebuild_refcount_structure(bs, res, &refcount_table,
                                         &nb_clusters);
        if (ret < 0) {
            goto fail;
        }

        res->corruptions = 0;
        res->leaks = 0;

        /* Because the old reftable has been exchanged for a new one the
         * references have to be recalculated */
        rebuild = false;
        memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
        ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
                                  &nb_clusters);
        if (ret < 0) {
            goto fail;
        }

        if (fix & BDRV_FIX_LEAKS) {
            /* The old refcount structures are now leaked, fix it; the result
             * can be ignored, aside from leaks which were introduced by
             * rebuild_refcount_structure() that could not be fixed */
            BdrvCheckResult saved_res = *res;
            *res = (BdrvCheckResult){ 0 };

            compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
                              &highest_cluster, refcount_table, nb_clusters);
            if (rebuild) {
                fprintf(stderr, "ERROR rebuilt refcount structure is still "
                        "broken\n");
            }

            /* Any leaks accounted for here were introduced by
             * rebuild_refcount_structure() because that function has created a
             * new refcount structure from scratch */
            fresh_leaks = res->leaks;
            *res = saved_res;
        }

        /* Everything no longer reported broken after the rebuild counts as
         * fixed */
        if (res->corruptions < old_res.corruptions) {
            res->corruptions_fixed += old_res.corruptions - res->corruptions;
        }
        if (res->leaks < old_res.leaks) {
            res->leaks_fixed += old_res.leaks - res->leaks;
        }
        res->leaks += fresh_leaks;
    } else if (fix) {
        if (rebuild) {
            /* A rebuild is required, but BDRV_FIX_ERRORS was not given;
             * nothing else can be done here */
            fprintf(stderr, "ERROR need to rebuild refcount structures\n");
            res->check_errors++;
            ret = -EIO;
            goto fail;
        }

        if (res->leaks || res->corruptions) {
            /* Re-run the comparison, this time actually repairing what the
             * first (read-only) pass found; restore the pre-comparison
             * counters so nothing is counted twice */
            *res = pre_compare_res;
            compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
                              refcount_table, nb_clusters);
        }
    }

    /* check OFLAG_COPIED */
    ret = check_oflag_copied(bs, res, fix);
    if (ret < 0) {
        goto fail;
    }

    res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
    ret = 0;

fail:
    g_free(refcount_table);

    return ret;
}
2548
/* Shorthand used by qcow2_check_metadata_overlap(): tests whether the range
 * to check overlaps the metadata section at @ofs with length @sz. NOTE: it
 * relies on the local variables `offset` and `size` being in scope at the
 * expansion site. */
#define overlaps_with(ofs, sz) \
    ranges_overlap(offset, size, ofs, sz)
2551
2552 /*
2553  * Checks if the given offset into the image file is actually free to use by
2554  * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2555  * i.e. a sanity check without relying on the refcount tables.
2556  *
2557  * The ign parameter specifies what checks not to perform (being a bitmask of
2558  * QCow2MetadataOverlap values), i.e., what sections to ignore.
2559  *
2560  * Returns:
2561  * - 0 if writing to this offset will not affect the mentioned metadata
2562  * - a positive QCow2MetadataOverlap value indicating one overlapping section
2563  * - a negative value (-errno) indicating an error while performing a check,
2564  *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
2565  */
int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
                                 int64_t size)
{
    BDRVQcow2State *s = bs->opaque;
    /* Only perform checks that are enabled and not explicitly ignored */
    int chk = s->overlap_check & ~ign;
    int i, j;

    if (!size) {
        return 0;
    }

    /* The header check uses the unaligned offset: anything within the first
     * cluster counts as an overlap with the main header */
    if (chk & QCOW2_OL_MAIN_HEADER) {
        if (offset < s->cluster_size) {
            return QCOW2_OL_MAIN_HEADER;
        }
    }

    /* align range to test to cluster boundaries */
    size = ROUND_UP(offset_into_cluster(s, offset) + size, s->cluster_size);
    offset = start_of_cluster(s, offset);

    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
        if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
            return QCOW2_OL_ACTIVE_L1;
        }
    }

    if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
        if (overlaps_with(s->refcount_table_offset,
            s->refcount_table_size * sizeof(uint64_t))) {
            return QCOW2_OL_REFCOUNT_TABLE;
        }
    }

    if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
        if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
            return QCOW2_OL_SNAPSHOT_TABLE;
        }
    }

    /* The L1 tables of all snapshots (from the in-memory snapshot list) */
    if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
        for (i = 0; i < s->nb_snapshots; i++) {
            if (s->snapshots[i].l1_size &&
                overlaps_with(s->snapshots[i].l1_table_offset,
                s->snapshots[i].l1_size * sizeof(uint64_t))) {
                return QCOW2_OL_INACTIVE_L1;
            }
        }
    }

    /* Every L2 table referenced by the active L1 table */
    if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
        for (i = 0; i < s->l1_size; i++) {
            if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
                overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
                s->cluster_size)) {
                return QCOW2_OL_ACTIVE_L2;
            }
        }
    }

    /* Every refblock referenced by the reftable; entries past
     * max_refcount_table_index are asserted to be unused */
    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
        unsigned last_entry = s->max_refcount_table_index;
        assert(last_entry < s->refcount_table_size);
        assert(last_entry + 1 == s->refcount_table_size ||
               (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0);
        for (i = 0; i <= last_entry; i++) {
            if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
                s->cluster_size)) {
                return QCOW2_OL_REFCOUNT_BLOCK;
            }
        }
    }

    /* The most expensive check: the inactive L2 tables are found by reading
     * each snapshot's L1 table from disk (after validating its bounds) */
    if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
        for (i = 0; i < s->nb_snapshots; i++) {
            uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
            uint32_t l1_sz  = s->snapshots[i].l1_size;
            uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
            uint64_t *l1;
            int ret;

            ret = qcow2_validate_table(bs, l1_ofs, l1_sz, sizeof(uint64_t),
                                       QCOW_MAX_L1_SIZE, "", NULL);
            if (ret < 0) {
                return ret;
            }

            l1 = g_try_malloc(l1_sz2);

            /* g_try_malloc(0) may return NULL; that is not an error */
            if (l1_sz2 && l1 == NULL) {
                return -ENOMEM;
            }

            ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
            if (ret < 0) {
                g_free(l1);
                return ret;
            }

            /* On-disk L1 entries are big-endian */
            for (j = 0; j < l1_sz; j++) {
                uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
                if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
                    g_free(l1);
                    return QCOW2_OL_INACTIVE_L2;
                }
            }

            g_free(l1);
        }
    }

    return 0;
}
2680
/* Human-readable names for the QCOW2_OL_*_BITNR metadata sections, indexed by
 * bit number; used for the corruption message emitted by
 * qcow2_pre_write_overlap_check() */
static const char *metadata_ol_names[] = {
    [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
    [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
    [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
    [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
    [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
};
2691
2692 /*
2693  * First performs a check for metadata overlaps (through
2694  * qcow2_check_metadata_overlap); if that fails with a negative value (error
2695  * while performing a check), that value is returned. If an impending overlap
2696  * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
2697  * and -EIO returned.
2698  *
2699  * Returns 0 if there were neither overlaps nor errors while checking for
2700  * overlaps; or a negative value (-errno) on error.
2701  */
2702 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
2703                                   int64_t size)
2704 {
2705     int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
2706
2707     if (ret < 0) {
2708         return ret;
2709     } else if (ret > 0) {
2710         int metadata_ol_bitnr = ctz32(ret);
2711         assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
2712
2713         qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
2714                                 "write on metadata (overlaps with %s)",
2715                                 metadata_ol_names[metadata_ol_bitnr]);
2716         return -EIO;
2717     }
2718
2719     return 0;
2720 }
2721
/* A pointer to a function of this type is given to walk_over_reftable(). That
 * function will create refblocks and pass them to a RefblockFinishOp once they
 * are completed (@refblock). @refblock_empty is set if the refblock is
 * completely empty.
 *
 * Along with the refblock, a corresponding reftable entry is passed, in the
 * reftable @reftable (which may be reallocated) at @reftable_index.
 *
 * @allocated should be set to true if a new cluster has been allocated.
 * Implementations return 0 on success or a negative errno value (setting
 * @errp) on failure.
 */
typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
                               uint64_t reftable_index, uint64_t *reftable_size,
                               void *refblock, bool refblock_empty,
                               bool *allocated, Error **errp);
2736
2737 /**
2738  * This "operation" for walk_over_reftable() allocates the refblock on disk (if
2739  * it is not empty) and inserts its offset into the new reftable. The size of
2740  * this new reftable is increased as required.
2741  */
2742 static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
2743                           uint64_t reftable_index, uint64_t *reftable_size,
2744                           void *refblock, bool refblock_empty, bool *allocated,
2745                           Error **errp)
2746 {
2747     BDRVQcow2State *s = bs->opaque;
2748     int64_t offset;
2749
2750     if (!refblock_empty && reftable_index >= *reftable_size) {
2751         uint64_t *new_reftable;
2752         uint64_t new_reftable_size;
2753
2754         new_reftable_size = ROUND_UP(reftable_index + 1,
2755                                      s->cluster_size / sizeof(uint64_t));
2756         if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
2757             error_setg(errp,
2758                        "This operation would make the refcount table grow "
2759                        "beyond the maximum size supported by QEMU, aborting");
2760             return -ENOTSUP;
2761         }
2762
2763         new_reftable = g_try_realloc(*reftable, new_reftable_size *
2764                                                 sizeof(uint64_t));
2765         if (!new_reftable) {
2766             error_setg(errp, "Failed to increase reftable buffer size");
2767             return -ENOMEM;
2768         }
2769
2770         memset(new_reftable + *reftable_size, 0,
2771                (new_reftable_size - *reftable_size) * sizeof(uint64_t));
2772
2773         *reftable      = new_reftable;
2774         *reftable_size = new_reftable_size;
2775     }
2776
2777     if (!refblock_empty && !(*reftable)[reftable_index]) {
2778         offset = qcow2_alloc_clusters(bs, s->cluster_size);
2779         if (offset < 0) {
2780             error_setg_errno(errp, -offset, "Failed to allocate refblock");
2781             return offset;
2782         }
2783         (*reftable)[reftable_index] = offset;
2784         *allocated = true;
2785     }
2786
2787     return 0;
2788 }
2789
2790 /**
2791  * This "operation" for walk_over_reftable() writes the refblock to disk at the
2792  * offset specified by the new reftable's entry. It does not modify the new
2793  * reftable or change any refcounts.
2794  */
2795 static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
2796                           uint64_t reftable_index, uint64_t *reftable_size,
2797                           void *refblock, bool refblock_empty, bool *allocated,
2798                           Error **errp)
2799 {
2800     BDRVQcow2State *s = bs->opaque;
2801     int64_t offset;
2802     int ret;
2803
2804     if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
2805         offset = (*reftable)[reftable_index];
2806
2807         ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
2808         if (ret < 0) {
2809             error_setg_errno(errp, -ret, "Overlap check failed");
2810             return ret;
2811         }
2812
2813         ret = bdrv_pwrite(bs->file, offset, refblock, s->cluster_size);
2814         if (ret < 0) {
2815             error_setg_errno(errp, -ret, "Failed to write refblock");
2816             return ret;
2817         }
2818     } else {
2819         assert(refblock_empty);
2820     }
2821
2822     return 0;
2823 }
2824
2825 /**
2826  * This function walks over the existing reftable and every referenced refblock;
2827  * if @new_set_refcount is non-NULL, it is called for every refcount entry to
2828  * create an equal new entry in the passed @new_refblock. Once that
2829  * @new_refblock is completely filled, @operation will be called.
2830  *
2831  * @status_cb and @cb_opaque are used for the amend operation's status callback.
2832  * @index is the index of the walk_over_reftable() calls and @total is the total
2833  * number of walk_over_reftable() calls per amend operation. Both are used for
2834  * calculating the parameters for the status callback.
2835  *
2836  * @allocated is set to true if a new cluster has been allocated.
2837  */
static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
                              uint64_t *new_reftable_index,
                              uint64_t *new_reftable_size,
                              void *new_refblock, int new_refblock_size,
                              int new_refcount_bits,
                              RefblockFinishOp *operation, bool *allocated,
                              Qcow2SetRefcountFunc *new_set_refcount,
                              BlockDriverAmendStatusCB *status_cb,
                              void *cb_opaque, int index, int total,
                              Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t reftable_index;
    bool new_refblock_empty = true;
    int refblock_index;
    int new_refblock_index = 0;
    int ret;

    for (reftable_index = 0; reftable_index < s->refcount_table_size;
         reftable_index++)
    {
        uint64_t refblock_offset = s->refcount_table[reftable_index]
                                 & REFT_OFFSET_MASK;

        /* Report progress to the caller */
        status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
                  (uint64_t)total * s->refcount_table_size, cb_opaque);

        if (refblock_offset) {
            void *refblock;

            if (offset_into_cluster(s, refblock_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: %#"
                                        PRIx64 ")", refblock_offset,
                                        reftable_index);
                error_setg(errp,
                           "Image is corrupt (unaligned refblock offset)");
                return -EIO;
            }

            ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
                                  &refblock);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Failed to retrieve refblock");
                return ret;
            }

            for (refblock_index = 0; refblock_index < s->refcount_block_size;
                 refblock_index++)
            {
                uint64_t refcount;

                if (new_refblock_index >= new_refblock_size) {
                    /* new_refblock is now complete */
                    ret = operation(bs, new_reftable, *new_reftable_index,
                                    new_reftable_size, new_refblock,
                                    new_refblock_empty, allocated, errp);
                    if (ret < 0) {
                        /* Release the cache entry before bailing out */
                        qcow2_cache_put(s->refcount_block_cache, &refblock);
                        return ret;
                    }

                    (*new_reftable_index)++;
                    new_refblock_index = 0;
                    new_refblock_empty = true;
                }

                refcount = s->get_refcount(refblock, refblock_index);
                /* The new width must be able to represent this refcount;
                 * the < 64 guard avoids an undefined 64-bit shift by 64 */
                if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
                    uint64_t offset;

                    qcow2_cache_put(s->refcount_block_cache, &refblock);

                    offset = ((reftable_index << s->refcount_block_bits)
                              + refblock_index) << s->cluster_bits;

                    error_setg(errp, "Cannot decrease refcount entry width to "
                               "%i bits: Cluster at offset %#" PRIx64 " has a "
                               "refcount of %" PRIu64, new_refcount_bits,
                               offset, refcount);
                    return -EINVAL;
                }

                /* new_set_refcount may be NULL (e.g. for the allocation
                 * pass); then only the position is advanced */
                if (new_set_refcount) {
                    new_set_refcount(new_refblock, new_refblock_index++,
                                     refcount);
                } else {
                    new_refblock_index++;
                }
                new_refblock_empty = new_refblock_empty && refcount == 0;
            }

            qcow2_cache_put(s->refcount_block_cache, &refblock);
        } else {
            /* No refblock means every refcount is 0 */
            for (refblock_index = 0; refblock_index < s->refcount_block_size;
                 refblock_index++)
            {
                if (new_refblock_index >= new_refblock_size) {
                    /* new_refblock is now complete */
                    ret = operation(bs, new_reftable, *new_reftable_index,
                                    new_reftable_size, new_refblock,
                                    new_refblock_empty, allocated, errp);
                    if (ret < 0) {
                        return ret;
                    }

                    (*new_reftable_index)++;
                    new_refblock_index = 0;
                    new_refblock_empty = true;
                }

                if (new_set_refcount) {
                    new_set_refcount(new_refblock, new_refblock_index++, 0);
                } else {
                    new_refblock_index++;
                }
            }
        }
    }

    if (new_refblock_index > 0) {
        /* Complete the potentially existing partially filled final refblock */
        if (new_set_refcount) {
            for (; new_refblock_index < new_refblock_size;
                 new_refblock_index++)
            {
                new_set_refcount(new_refblock, new_refblock_index, 0);
            }
        }

        ret = operation(bs, new_reftable, *new_reftable_index,
                        new_reftable_size, new_refblock, new_refblock_empty,
                        allocated, errp);
        if (ret < 0) {
            return ret;
        }

        (*new_reftable_index)++;
    }

    /* Final progress report: this walk is done */
    status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
              (uint64_t)total * s->refcount_table_size, cb_opaque);

    return 0;
}
2984
/*
 * Changes the width of the image's refcount entries to 2^@refcount_order
 * bits (qcow2 v3 only; see the asserts below) by building a complete new
 * set of refblocks and a new reftable, then pointing the image header at
 * them. If updating the header fails, the previous in-memory refcount state
 * is restored and the newly allocated clusters are freed below the "done"
 * label.
 *
 * @status_cb/@cb_opaque are used to report progress during the reftable
 * walks.
 *
 * Returns 0 on success, a negative errno value (with @errp set) on failure.
 */
int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
                                BlockDriverAmendStatusCB *status_cb,
                                void *cb_opaque, Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2GetRefcountFunc *new_get_refcount;
    Qcow2SetRefcountFunc *new_set_refcount;
    void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
    uint64_t *new_reftable = NULL, new_reftable_size = 0;
    uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
    uint64_t new_reftable_index = 0;
    uint64_t i;
    int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
    int new_refblock_size, new_refcount_bits = 1 << refcount_order;
    int old_refcount_order;
    int walk_index = 0;
    int ret;
    bool new_allocation;

    assert(s->qcow_version >= 3);
    assert(refcount_order >= 0 && refcount_order <= 6);

    /* see qcow2_open() */
    new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));

    new_get_refcount = get_refcount_funcs[refcount_order];
    new_set_refcount = set_refcount_funcs[refcount_order];


    do {
        int total_walks;

        new_allocation = false;

        /* We have to do at least this walk and the one which writes the
         * refblocks; in addition, this loop normally runs at least twice:
         * first to do the allocations, and a second time to verify that
         * everything has been allocated correctly -- which makes three
         * walks in total */
        total_walks = MAX(walk_index + 2, 3);

        /* First, allocate the structures so they are present in the refcount
         * structures */
        ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
                                 &new_reftable_size, NULL, new_refblock_size,
                                 new_refcount_bits, &alloc_refblock,
                                 &new_allocation, NULL, status_cb, cb_opaque,
                                 walk_index++, total_walks, errp);
        if (ret < 0) {
            goto done;
        }

        new_reftable_index = 0;

        if (new_allocation) {
            /* The previously allocated reftable location (if any) may no
             * longer suffice; free it and allocate a new one of the current
             * size */
            if (new_reftable_offset) {
                qcow2_free_clusters(bs, new_reftable_offset,
                                    allocated_reftable_size * sizeof(uint64_t),
                                    QCOW2_DISCARD_NEVER);
            }

            new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
                                                           sizeof(uint64_t));
            if (new_reftable_offset < 0) {
                error_setg_errno(errp, -new_reftable_offset,
                                 "Failed to allocate the new reftable");
                ret = new_reftable_offset;
                goto done;
            }
            allocated_reftable_size = new_reftable_size;
        }
    } while (new_allocation);

    /* Second, write the new refblocks */
    ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
                             &new_reftable_size, new_refblock,
                             new_refblock_size, new_refcount_bits,
                             &flush_refblock, &new_allocation, new_set_refcount,
                             status_cb, cb_opaque, walk_index, walk_index + 1,
                             errp);
    if (ret < 0) {
        goto done;
    }
    assert(!new_allocation);


    /* Write the new reftable */
    ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
                                        new_reftable_size * sizeof(uint64_t));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Overlap check failed");
        goto done;
    }

    /* Convert to big-endian for the on-disk write, then back, so the
     * in-memory copy stays usable afterwards */
    for (i = 0; i < new_reftable_size; i++) {
        cpu_to_be64s(&new_reftable[i]);
    }

    ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable,
                      new_reftable_size * sizeof(uint64_t));

    for (i = 0; i < new_reftable_size; i++) {
        be64_to_cpus(&new_reftable[i]);
    }

    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to write the new reftable");
        goto done;
    }


    /* Empty the refcount cache */
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
        goto done;
    }

    /* Update the image header to point to the new reftable; this only updates
     * the fields which are relevant to qcow2_update_header(); other fields
     * such as s->refcount_table or s->refcount_bits stay stale for now
     * (because we have to restore everything if qcow2_update_header() fails) */
    old_refcount_order  = s->refcount_order;
    old_reftable_size   = s->refcount_table_size;
    old_reftable_offset = s->refcount_table_offset;

    s->refcount_order        = refcount_order;
    s->refcount_table_size   = new_reftable_size;
    s->refcount_table_offset = new_reftable_offset;

    ret = qcow2_update_header(bs);
    if (ret < 0) {
        /* Roll back the header-relevant fields to their previous values */
        s->refcount_order        = old_refcount_order;
        s->refcount_table_size   = old_reftable_size;
        s->refcount_table_offset = old_reftable_offset;
        error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
        goto done;
    }

    /* Now update the rest of the in-memory information */
    old_reftable = s->refcount_table;
    s->refcount_table = new_reftable;
    update_max_refcount_table_index(s);

    s->refcount_bits = 1 << refcount_order;
    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
    s->refcount_max += s->refcount_max - 1;

    s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
    s->refcount_block_size = 1 << s->refcount_block_bits;

    s->get_refcount = new_get_refcount;
    s->set_refcount = new_set_refcount;

    /* For cleaning up all old refblocks and the old reftable below the "done"
     * label */
    new_reftable        = old_reftable;
    new_reftable_size   = old_reftable_size;
    new_reftable_offset = old_reftable_offset;

done:
    if (new_reftable) {
        /* On success, new_reftable actually points to the old reftable (and
         * new_reftable_size is the old reftable's size); but that is just
         * fine */
        for (i = 0; i < new_reftable_size; i++) {
            uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
            if (offset) {
                qcow2_free_clusters(bs, offset, s->cluster_size,
                                    QCOW2_DISCARD_OTHER);
            }
        }
        g_free(new_reftable);

        if (new_reftable_offset > 0) {
            qcow2_free_clusters(bs, new_reftable_offset,
                                new_reftable_size * sizeof(uint64_t),
                                QCOW2_DISCARD_OTHER);
        }
    }

    qemu_vfree(new_refblock);
    return ret;
}
3169
3170 static int64_t get_refblock_offset(BlockDriverState *bs, uint64_t offset)
3171 {
3172     BDRVQcow2State *s = bs->opaque;
3173     uint32_t index = offset_to_reftable_index(s, offset);
3174     int64_t covering_refblock_offset = 0;
3175
3176     if (index < s->refcount_table_size) {
3177         covering_refblock_offset = s->refcount_table[index] & REFT_OFFSET_MASK;
3178     }
3179     if (!covering_refblock_offset) {
3180         qcow2_signal_corruption(bs, true, -1, -1, "Refblock at %#" PRIx64 " is "
3181                                 "not covered by the refcount structures",
3182                                 offset);
3183         return -EIO;
3184     }
3185
3186     return covering_refblock_offset;
3187 }
3188
/* Discards the refcount block at @discard_block_offs: its refcount (which
 * must be exactly 1) is set to 0 in the refblock covering it, the free
 * cluster search position is rewound if necessary, the block is dropped
 * from the refcount block cache if it is cached, and finally
 * update_refcount_discard() is called for the cluster.
 *
 * Returns 0 on success, a negative errno value on error. */
static int qcow2_discard_refcount_block(BlockDriverState *bs,
                                        uint64_t discard_block_offs)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t refblock_offs;
    uint64_t cluster_index = discard_block_offs >> s->cluster_bits;
    uint32_t block_index = cluster_index & (s->refcount_block_size - 1);
    void *refblock;
    int ret;

    /* Find the refblock that covers the block being discarded */
    refblock_offs = get_refblock_offset(bs, discard_block_offs);
    if (refblock_offs < 0) {
        return refblock_offs;
    }

    assert(discard_block_offs != 0);

    ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
                          &refblock);
    if (ret < 0) {
        return ret;
    }

    if (s->get_refcount(refblock, block_index) != 1) {
        /* A refblock may only be discarded while referenced exactly once
         * (by the reftable); anything else indicates corruption */
        qcow2_signal_corruption(bs, true, -1, -1, "Invalid refcount:"
                                " refblock offset %#" PRIx64
                                ", reftable index %u"
                                ", block offset %#" PRIx64
                                ", refcount %#" PRIx64,
                                refblock_offs,
                                offset_to_reftable_index(s, discard_block_offs),
                                discard_block_offs,
                                s->get_refcount(refblock, block_index));
        qcow2_cache_put(s->refcount_block_cache, &refblock);
        return -EINVAL;
    }
    s->set_refcount(refblock, block_index, 0);

    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock);

    qcow2_cache_put(s->refcount_block_cache, &refblock);

    /* The cluster is free now; let future allocations consider it again */
    if (cluster_index < s->free_cluster_index) {
        s->free_cluster_index = cluster_index;
    }

    refblock = qcow2_cache_is_table_offset(s->refcount_block_cache,
                                           discard_block_offs);
    if (refblock) {
        /* discard refblock from the cache if refblock is cached */
        qcow2_cache_discard(s->refcount_block_cache, refblock);
    }
    update_refcount_discard(bs, discard_block_offs, s->cluster_size);

    return 0;
}
3245
/*
 * Shrink the refcount table footprint: write out a copy of the reftable
 * in which every entry pointing to a completely unused refcount block is
 * zeroed, then free those now-unreferenced refcount blocks.
 *
 * Returns 0 on success, a negative errno on failure.  Even on a failed
 * reftable write, the in-memory entries for unused blocks are cleared so
 * they match the (possibly partially written) on-disk state.
 */
int qcow2_shrink_reftable(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    /* On-disk (big-endian) image of the new reftable */
    uint64_t *reftable_tmp =
        g_malloc(s->refcount_table_size * sizeof(uint64_t));
    int i, ret;

    for (i = 0; i < s->refcount_table_size; i++) {
        int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
        void *refblock;
        bool unused_block;

        if (refblock_offs == 0) {
            reftable_tmp[i] = 0;
            continue;
        }
        ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
                              &refblock);
        if (ret < 0) {
            goto out;
        }

        /* the refblock has own reference */
        if (i == offset_to_reftable_index(s, refblock_offs)) {
            /* This refblock covers its own cluster, so it always holds
             * at least its self-reference.  Temporarily zero that entry
             * so the emptiness check below ignores it, then restore it. */
            uint64_t block_index = (refblock_offs >> s->cluster_bits) &
                                   (s->refcount_block_size - 1);
            uint64_t refcount = s->get_refcount(refblock, block_index);

            s->set_refcount(refblock, block_index, 0);

            unused_block = buffer_is_zero(refblock, s->cluster_size);

            s->set_refcount(refblock, block_index, refcount);
        } else {
            unused_block = buffer_is_zero(refblock, s->cluster_size);
        }
        qcow2_cache_put(s->refcount_block_cache, &refblock);

        /* Keep only entries whose refblock still holds references */
        reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
    }

    ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset, reftable_tmp,
                           s->refcount_table_size * sizeof(uint64_t));
    /*
     * If the write in the reftable failed the image may contain a partially
     * overwritten reftable. In this case it would be better to clear the
     * reftable in memory to avoid possible image corruption.
     */
    for (i = 0; i < s->refcount_table_size; i++) {
        if (s->refcount_table[i] && !reftable_tmp[i]) {
            if (ret == 0) {
                /* Only free the refblock when the reftable write
                 * succeeded; otherwise just drop the in-memory entry */
                ret = qcow2_discard_refcount_block(bs, s->refcount_table[i] &
                                                       REFT_OFFSET_MASK);
            }
            s->refcount_table[i] = 0;
        }
    }

    if (!s->cache_discards) {
        /* Flush the discards queued by qcow2_discard_refcount_block() */
        qcow2_process_discards(bs, ret);
    }

out:
    g_free(reftable_tmp);
    return ret;
}
3312
3313 int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
3314 {
3315     BDRVQcow2State *s = bs->opaque;
3316     int64_t i;
3317
3318     for (i = size_to_clusters(s, size) - 1; i >= 0; i--) {
3319         uint64_t refcount;
3320         int ret = qcow2_get_refcount(bs, i, &refcount);
3321         if (ret < 0) {
3322             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
3323                     i, strerror(-ret));
3324             return ret;
3325         }
3326         if (refcount > 0) {
3327             return i;
3328         }
3329     }
3330     qcow2_signal_corruption(bs, true, -1, -1,
3331                             "There are no references in the refcount table.");
3332     return -EIO;
3333 }
This page took 0.212148 seconds and 4 git commands to generate.