]> Git Repo - qemu.git/blob - block/qcow2-refcount.c
qcow2: Remove BDS parameter from qcow2_cache_entry_mark_dirty()
[qemu.git] / block / qcow2-refcount.c
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24
25 #include "qemu/osdep.h"
26 #include "qapi/error.h"
27 #include "qemu-common.h"
28 #include "block/block_int.h"
29 #include "block/qcow2.h"
30 #include "qemu/range.h"
31 #include "qemu/bswap.h"
32 #include "qemu/cutils.h"
33
/* Forward declarations: refcount-free cluster allocation, the central
 * refcount updater, and the per-refcount-order accessor helpers
 * (ro0..ro6 correspond to refcount widths of 1, 2, 4, 8, 16, 32 and
 * 64 bits per entry). */
static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                            int64_t offset, int64_t length, uint64_t addend,
                            bool decrease, enum qcow2_discard_type type);

static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value);
61
62
/* Accessor tables indexed by the image's refcount_order (0..6); the
 * matching pair is selected once in qcow2_refcount_init(). */
static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
    &get_refcount_ro0,
    &get_refcount_ro1,
    &get_refcount_ro2,
    &get_refcount_ro3,
    &get_refcount_ro4,
    &get_refcount_ro5,
    &get_refcount_ro6
};

static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
    &set_refcount_ro0,
    &set_refcount_ro1,
    &set_refcount_ro2,
    &set_refcount_ro3,
    &set_refcount_ro4,
    &set_refcount_ro5,
    &set_refcount_ro6
};
82
83
84 /*********************************************************/
85 /* refcount handling */
86
87 static void update_max_refcount_table_index(BDRVQcow2State *s)
88 {
89     unsigned i = s->refcount_table_size - 1;
90     while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
91         i--;
92     }
93     /* Set s->max_refcount_table_index to the index of the last used entry */
94     s->max_refcount_table_index = i;
95 }
96
/*
 * Reads the refcount table from disk into s->refcount_table (converting
 * entries to host byte order) and installs the get/set accessor functions
 * matching the image's refcount order.
 *
 * Returns 0 on success, -errno on failure (allocation or read error).
 * NOTE(review): on failure a partially set-up s->refcount_table is kept;
 * presumably the caller frees it via qcow2_refcount_close() on its error
 * path — confirm against the qcow2 open code.
 */
int qcow2_refcount_init(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_size2, i;
    int ret;

    /* Valid refcount orders are 0..6 (1-bit to 64-bit refcounts) */
    assert(s->refcount_order >= 0 && s->refcount_order <= 6);

    s->get_refcount = get_refcount_funcs[s->refcount_order];
    s->set_refcount = set_refcount_funcs[s->refcount_order];

    /* Byte size of the in-memory reftable; the assertion rules out
     * multiplication overflow */
    assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
    s->refcount_table = g_try_malloc(refcount_table_size2);

    if (s->refcount_table_size > 0) {
        if (s->refcount_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
        ret = bdrv_pread(bs->file, s->refcount_table_offset,
                         s->refcount_table, refcount_table_size2);
        if (ret < 0) {
            goto fail;
        }
        /* Reftable entries are stored big-endian on disk */
        for(i = 0; i < s->refcount_table_size; i++)
            be64_to_cpus(&s->refcount_table[i]);
        update_max_refcount_table_index(s);
    }
    return 0;
 fail:
    return ret;
}
131
132 void qcow2_refcount_close(BlockDriverState *bs)
133 {
134     BDRVQcow2State *s = bs->opaque;
135     g_free(s->refcount_table);
136 }
137
138
/* refcount_order 0: one bit per refcount, eight refcounts per byte */
static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
{
    const uint8_t *bytes = refcount_array;

    return (bytes[index >> 3] >> (index & 7)) & 1;
}
143
/* refcount_order 0: store a one-bit refcount (value must be 0 or 1) */
static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *bytes = refcount_array;
    unsigned shift = index & 7;

    assert(!(value >> 1));
    bytes[index >> 3] = (uint8_t)((bytes[index >> 3] & ~(1u << shift)) |
                                  (value << shift));
}
151
/* refcount_order 1: two bits per refcount, four refcounts per byte */
static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
{
    const uint8_t *bytes = refcount_array;
    unsigned shift = 2 * (index & 3);

    return (bytes[index >> 2] >> shift) & 3;
}
157
/* refcount_order 1: store a two-bit refcount (value must fit in 2 bits) */
static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *bytes = refcount_array;
    unsigned shift = 2 * (index & 3);

    assert(!(value >> 2));
    bytes[index >> 2] = (uint8_t)((bytes[index >> 2] & ~(3u << shift)) |
                                  (value << shift));
}
165
/* refcount_order 2: four bits per refcount, two refcounts per byte */
static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
{
    const uint8_t *bytes = refcount_array;
    unsigned shift = 4 * (index & 1);

    return (bytes[index >> 1] >> shift) & 0xf;
}
171
/* refcount_order 2: store a four-bit refcount (value must fit in 4 bits) */
static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *bytes = refcount_array;
    unsigned shift = 4 * (index & 1);

    assert(!(value >> 4));
    bytes[index >> 1] = (uint8_t)((bytes[index >> 1] & ~(0xfu << shift)) |
                                  (value << shift));
}
179
/* refcount_order 3: one byte per refcount */
static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
{
    const uint8_t *bytes = refcount_array;

    return bytes[index];
}
184
/* refcount_order 3: store a one-byte refcount (value must fit in 8 bits) */
static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *bytes = refcount_array;

    assert(!(value >> 8));
    bytes[index] = (uint8_t)value;
}
191
/* refcount_order 4: 16-bit big-endian refcounts */
static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
{
    const uint16_t *entries = refcount_array;

    return be16_to_cpu(entries[index]);
}
196
/* refcount_order 4: store a 16-bit big-endian refcount */
static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint16_t *entries = refcount_array;

    assert(!(value >> 16));
    entries[index] = cpu_to_be16(value);
}
203
/* refcount_order 5: 32-bit big-endian refcounts */
static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
{
    const uint32_t *entries = refcount_array;

    return be32_to_cpu(entries[index]);
}
208
/* refcount_order 5: store a 32-bit big-endian refcount */
static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint32_t *entries = refcount_array;

    assert(!(value >> 32));
    entries[index] = cpu_to_be32(value);
}
215
/* refcount_order 6: 64-bit big-endian refcounts */
static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
{
    const uint64_t *entries = refcount_array;

    return be64_to_cpu(entries[index]);
}
220
/* refcount_order 6: store a 64-bit big-endian refcount (full width, so no
 * range assertion is needed) */
static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint64_t *entries = refcount_array;

    entries[index] = cpu_to_be64(value);
}
226
227
/*
 * Fetches the refcount block at @refcount_block_offset through the refcount
 * block cache and returns it in *refcount_block.
 *
 * Returns 0 on success, -errno on cache/read failure. The caller must
 * release the block with qcow2_cache_put() when done.
 */
static int load_refcount_block(BlockDriverState *bs,
                               int64_t refcount_block_offset,
                               void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
    return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                           refcount_block);
}
238
239 /*
240  * Retrieves the refcount of the cluster given by its index and stores it in
241  * *refcount. Returns 0 on success and -errno on failure.
242  */
243 int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
244                        uint64_t *refcount)
245 {
246     BDRVQcow2State *s = bs->opaque;
247     uint64_t refcount_table_index, block_index;
248     int64_t refcount_block_offset;
249     int ret;
250     void *refcount_block;
251
252     refcount_table_index = cluster_index >> s->refcount_block_bits;
253     if (refcount_table_index >= s->refcount_table_size) {
254         *refcount = 0;
255         return 0;
256     }
257     refcount_block_offset =
258         s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
259     if (!refcount_block_offset) {
260         *refcount = 0;
261         return 0;
262     }
263
264     if (offset_into_cluster(s, refcount_block_offset)) {
265         qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
266                                 " unaligned (reftable index: %#" PRIx64 ")",
267                                 refcount_block_offset, refcount_table_index);
268         return -EIO;
269     }
270
271     ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
272                           &refcount_block);
273     if (ret < 0) {
274         return ret;
275     }
276
277     block_index = cluster_index & (s->refcount_block_size - 1);
278     *refcount = s->get_refcount(refcount_block, block_index);
279
280     qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
281
282     return 0;
283 }
284
285 /* Checks if two offsets are described by the same refcount block */
286 static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
287     uint64_t offset_b)
288 {
289     uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
290     uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);
291
292     return (block_a == block_b);
293 }
294
/*
 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 * (including growing the refcount table if needed).
 *
 * Returns 0 on success or -errno in error case.
 * May return -EAGAIN after allocating new refcount metadata, in which case
 * the caller must restart its search for free clusters (see the comments
 * below for why).
 */
static int alloc_refcount_block(BlockDriverState *bs,
                                int64_t cluster_index, void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_index;
    int64_t ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Find the refcount block for the given cluster */
    refcount_table_index = cluster_index >> s->refcount_block_bits;

    if (refcount_table_index < s->refcount_table_size) {

        uint64_t refcount_block_offset =
            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;

        /* If it's already there, we're done */
        if (refcount_block_offset) {
            if (offset_into_cluster(s, refcount_block_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: "
                                        "%#x)", refcount_block_offset,
                                        refcount_table_index);
                return -EIO;
            }

             return load_refcount_block(bs, refcount_block_offset,
                                        refcount_block);
        }
    }

    /*
     * If we came here, we need to allocate something. Something is at least
     * a cluster for the new refcount block. It may also include a new refcount
     * table if the old refcount table is too small.
     *
     * Note that allocating clusters here needs some special care:
     *
     * - We can't use the normal qcow2_alloc_clusters(), it would try to
     *   increase the refcount and very likely we would end up with an endless
     *   recursion. Instead we must place the refcount blocks in a way that
     *   they can describe them themselves.
     *
     * - We need to consider that at this point we are inside update_refcounts
     *   and potentially doing an initial refcount increase. This means that
     *   some clusters have already been allocated by the caller, but their
     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
     *   need to return -EAGAIN to signal the caller that it needs to restart
     *   the search for free clusters.
     *
     * - alloc_clusters_noref and qcow2_free_clusters may load a different
     *   refcount block into the cache
     */

    /* NULL so the fail path below knows whether a cache entry must be
     * released */
    *refcount_block = NULL;

    /* We write to the refcount table, so we might depend on L2 tables */
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
    if (new_block < 0) {
        return new_block;
    }

    /* If we're allocating the block at offset 0 then something is wrong */
    if (new_block == 0) {
        qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                "allocation of refcount block at offset 0");
        return -EIO;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
        " at %" PRIx64 "\n",
        refcount_table_index, cluster_index << s->cluster_bits, new_block);
#endif

    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
        /* Zero the new refcount block before updating it */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);

        /* The block describes itself, need to update the cache */
        int block_index = (new_block >> s->cluster_bits) &
            (s->refcount_block_size - 1);
        s->set_refcount(*refcount_block, block_index, 1);
    } else {
        /* Described somewhere else. This can recurse at most twice before we
         * arrive at a block that describes itself. */
        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
                              QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret < 0) {
            goto fail;
        }

        /* Initialize the new refcount block only after updating its refcount,
         * update_refcount uses the refcount cache itself */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);
    }

    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* If the refcount table is big enough, just hook the block up there */
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
        ret = bdrv_pwrite_sync(bs->file,
            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
            &data64, sizeof(data64));
        if (ret < 0) {
            goto fail;
        }

        s->refcount_table[refcount_table_index] = new_block;
        /* If there's a hole in s->refcount_table then it can happen
         * that refcount_table_index < s->max_refcount_table_index */
        s->max_refcount_table_index =
            MAX(s->max_refcount_table_index, refcount_table_index);

        /* The new refcount block may be where the caller intended to put its
         * data, so let it restart the search. */
        return -EAGAIN;
    }

    qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);

    /*
     * If we come here, we need to grow the refcount table. Again, a new
     * refcount table needs some space and we can't simply allocate to avoid
     * endless recursion.
     *
     * Therefore let's grab new refcount blocks at the end of the image, which
     * will describe themselves and the new refcount table. This way we can
     * reference them only in the new table and do the switch to the new
     * refcount table at once without producing an inconsistent state in
     * between.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);

    /* Calculate the number of refcount blocks needed so far; this will be the
     * basis for calculating the index of the first cluster used for the
     * self-describing refcount structures which we are about to create.
     *
     * Because we reached this point, there cannot be any refcount entries for
     * cluster_index or higher indices yet. However, because new_block has been
     * allocated to describe that cluster (and it will assume this role later
     * on), we cannot use that index; also, new_block may actually have a higher
     * cluster index than cluster_index, so it needs to be taken into account
     * here (and 1 needs to be added to its value because that cluster is used).
     */
    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
                                            (new_block >> s->cluster_bits) + 1),
                                        s->refcount_block_size);

    /* Create the new refcount table and blocks */
    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
        s->cluster_size;

    ret = qcow2_refcount_area(bs, meta_offset, 0, false,
                              refcount_table_index, new_block);
    if (ret < 0) {
        return ret;
    }

    ret = load_refcount_block(bs, new_block, refcount_block);
    if (ret < 0) {
        return ret;
    }

    /* If we were trying to do the initial refcount update for some cluster
     * allocation, we might have used the same clusters to store newly
     * allocated metadata. Make the caller search some new space. */
    return -EAGAIN;

fail:
    if (*refcount_block != NULL) {
        qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
    }
    return ret;
}
508
/*
 * Starting at @start_offset, this function creates new self-covering refcount
 * structures: A new refcount table and refcount blocks which cover all of
 * themselves, and a number of @additional_clusters beyond their end.
 * @start_offset must be at the end of the image file, that is, there must be
 * only empty space beyond it.
 * If @exact_size is false, the refcount table will have 50 % more entries than
 * necessary so it will not need to grow again soon.
 * If @new_refblock_offset is not zero, it contains the offset of a refcount
 * block that should be entered into the new refcount table at index
 * @new_refblock_index.
 *
 * Returns: The offset after the new refcount structures (i.e. where the
 *          @additional_clusters may be placed) on success, -errno on error.
 */
int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
                            uint64_t additional_clusters, bool exact_size,
                            int new_refblock_index,
                            uint64_t new_refblock_offset)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t total_refblock_count_u64, additional_refblock_count;
    int total_refblock_count, table_size, area_reftable_index, table_clusters;
    int i;
    uint64_t table_offset, block_offset, end_offset;
    int ret;
    uint64_t *new_table;

    /* The caller must pass a cluster-aligned start offset */
    assert(!(start_offset % s->cluster_size));

    /* How many refblocks are needed to cover the image up to @start_offset
     * plus @additional_clusters, including the new metadata itself */
    qcow2_refcount_metadata_size(start_offset / s->cluster_size +
                                 additional_clusters,
                                 s->cluster_size, s->refcount_order,
                                 !exact_size, &total_refblock_count_u64);
    if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }
    total_refblock_count = total_refblock_count_u64;

    /* Index in the refcount table of the first refcount block to cover the area
     * of refcount structures we are about to create; we know that
     * @total_refblock_count can cover @start_offset, so this will definitely
     * fit into an int. */
    area_reftable_index = (start_offset / s->cluster_size) /
                          s->refcount_block_size;

    if (exact_size) {
        table_size = total_refblock_count;
    } else {
        /* Grow by 50 % to delay the next reftable resize */
        table_size = total_refblock_count +
                     DIV_ROUND_UP(total_refblock_count, 2);
    }
    /* The qcow2 file can only store the reftable size in number of clusters */
    table_size = ROUND_UP(table_size, s->cluster_size / sizeof(uint64_t));
    table_clusters = (table_size * sizeof(uint64_t)) / s->cluster_size;

    if (table_size > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }

    new_table = g_try_new0(uint64_t, table_size);

    assert(table_size > 0);
    if (new_table == NULL) {
        ret = -ENOMEM;
        goto fail;
    }

    /* Fill the new refcount table */
    if (table_size > s->max_refcount_table_index) {
        /* We're actually growing the reftable */
        memcpy(new_table, s->refcount_table,
               (s->max_refcount_table_index + 1) * sizeof(uint64_t));
    } else {
        /* Improbable case: We're shrinking the reftable. However, the caller
         * has assured us that there is only empty space beyond @start_offset,
         * so we can simply drop all of the refblocks that won't fit into the
         * new reftable. */
        memcpy(new_table, s->refcount_table, table_size * sizeof(uint64_t));
    }

    if (new_refblock_offset) {
        assert(new_refblock_index < total_refblock_count);
        new_table[new_refblock_index] = new_refblock_offset;
    }

    /* Count how many new refblocks we have to create */
    additional_refblock_count = 0;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        if (!new_table[i]) {
            additional_refblock_count++;
        }
    }

    /* Layout: the new refblocks go first, the reftable follows them */
    table_offset = start_offset + additional_refblock_count * s->cluster_size;
    end_offset = table_offset + table_clusters * s->cluster_size;

    /* Fill the refcount blocks, and create new ones, if necessary */
    block_offset = start_offset;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        void *refblock_data;
        uint64_t first_offset_covered;

        /* Reuse an existing refblock if possible, create a new one otherwise */
        if (new_table[i]) {
            ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i],
                                  &refblock_data);
            if (ret < 0) {
                goto fail;
            }
        } else {
            ret = qcow2_cache_get_empty(bs, s->refcount_block_cache,
                                        block_offset, &refblock_data);
            if (ret < 0) {
                goto fail;
            }
            memset(refblock_data, 0, s->cluster_size);
            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);

            new_table[i] = block_offset;
            block_offset += s->cluster_size;
        }

        /* First host offset covered by this refblock */
        first_offset_covered = (uint64_t)i * s->refcount_block_size *
                               s->cluster_size;
        if (first_offset_covered < end_offset) {
            int j, end_index;

            /* Set the refcount of all of the new refcount structures to 1 */

            if (first_offset_covered < start_offset) {
                assert(i == area_reftable_index);
                j = (start_offset - first_offset_covered) / s->cluster_size;
                assert(j < s->refcount_block_size);
            } else {
                j = 0;
            }

            end_index = MIN((end_offset - first_offset_covered) /
                            s->cluster_size,
                            s->refcount_block_size);

            for (; j < end_index; j++) {
                /* The caller guaranteed us this space would be empty */
                assert(s->get_refcount(refblock_data, j) == 0);
                s->set_refcount(refblock_data, j, 1);
            }

            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);
        }

        qcow2_cache_put(bs, s->refcount_block_cache, &refblock_data);
    }

    /* All new refblocks must have been placed exactly up to the reftable */
    assert(block_offset == table_offset);

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* Write refcount table to disk (entries are big-endian on disk) */
    for (i = 0; i < total_refblock_count; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
        table_size * sizeof(uint64_t));
    if (ret < 0) {
        goto fail;
    }

    /* Convert back to host byte order for the in-memory copy */
    for (i = 0; i < total_refblock_count; i++) {
        be64_to_cpus(&new_table[i]);
    }

    /* Hook up the new refcount table in the qcow2 header */
    struct QEMU_PACKED {
        uint64_t d64;
        uint32_t d32;
    } data;
    data.d64 = cpu_to_be64(table_offset);
    data.d32 = cpu_to_be32(table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           &data, sizeof(data));
    if (ret < 0) {
        goto fail;
    }

    /* And switch it in memory */
    uint64_t old_table_offset = s->refcount_table_offset;
    uint64_t old_table_size = s->refcount_table_size;

    g_free(s->refcount_table);
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;
    update_max_refcount_table_index(s);

    /* Free old table. */
    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
                        QCOW2_DISCARD_OTHER);

    return end_offset;

fail:
    g_free(new_table);
    return ret;
}
726
727 void qcow2_process_discards(BlockDriverState *bs, int ret)
728 {
729     BDRVQcow2State *s = bs->opaque;
730     Qcow2DiscardRegion *d, *next;
731
732     QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
733         QTAILQ_REMOVE(&s->discards, d, next);
734
735         /* Discard is optional, ignore the return value */
736         if (ret >= 0) {
737             bdrv_pdiscard(bs->file->bs, d->offset, d->bytes);
738         }
739
740         g_free(d);
741     }
742 }
743
/*
 * Queues the byte range [@offset, @offset + @length) for discarding once the
 * refcount update that freed it is processed (see qcow2_process_discards()).
 * Adjacent queued ranges are merged so fewer discard requests are issued.
 */
static void update_refcount_discard(BlockDriverState *bs,
                                    uint64_t offset, uint64_t length)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *p, *next;

    /* Try to extend an existing region that touches the new range */
    QTAILQ_FOREACH(d, &s->discards, next) {
        uint64_t new_start = MIN(offset, d->offset);
        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);

        if (new_end - new_start <= length + d->bytes) {
            /* There can't be any overlap, areas ending up here have no
             * references any more and therefore shouldn't get freed another
             * time. */
            assert(d->bytes + length == new_end - new_start);
            d->offset = new_start;
            d->bytes = new_end - new_start;
            goto found;
        }
    }

    /* No adjacent region: queue a fresh one */
    d = g_malloc(sizeof(*d));
    *d = (Qcow2DiscardRegion) {
        .bs     = bs,
        .offset = offset,
        .bytes  = length,
    };
    QTAILQ_INSERT_TAIL(&s->discards, d, next);

found:
    /* Merge discard requests if they are adjacent now */
    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
        if (p == d
            || p->offset > d->offset + d->bytes
            || d->offset > p->offset + p->bytes)
        {
            continue;
        }

        /* Still no overlap possible */
        assert(p->offset == d->offset + d->bytes
            || d->offset == p->offset + p->bytes);

        QTAILQ_REMOVE(&s->discards, p, next);
        d->offset = MIN(d->offset, p->offset);
        d->bytes += p->bytes;
        g_free(p);
    }
}
793
794 /* XXX: cache several refcount block clusters ? */
795 /* @addend is the absolute value of the addend; if @decrease is set, @addend
796  * will be subtracted from the current refcount, otherwise it will be added */
797 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
798                                                    int64_t offset,
799                                                    int64_t length,
800                                                    uint64_t addend,
801                                                    bool decrease,
802                                                    enum qcow2_discard_type type)
803 {
804     BDRVQcow2State *s = bs->opaque;
805     int64_t start, last, cluster_offset;
806     void *refcount_block = NULL;
807     int64_t old_table_index = -1;
808     int ret;
809
810 #ifdef DEBUG_ALLOC2
811     fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
812             " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
813             addend);
814 #endif
815     if (length < 0) {
816         return -EINVAL;
817     } else if (length == 0) {
818         return 0;
819     }
820
821     if (decrease) {
822         qcow2_cache_set_dependency(bs, s->refcount_block_cache,
823             s->l2_table_cache);
824     }
825
826     start = start_of_cluster(s, offset);
827     last = start_of_cluster(s, offset + length - 1);
828     for(cluster_offset = start; cluster_offset <= last;
829         cluster_offset += s->cluster_size)
830     {
831         int block_index;
832         uint64_t refcount;
833         int64_t cluster_index = cluster_offset >> s->cluster_bits;
834         int64_t table_index = cluster_index >> s->refcount_block_bits;
835
836         /* Load the refcount block and allocate it if needed */
837         if (table_index != old_table_index) {
838             if (refcount_block) {
839                 qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
840             }
841             ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
842             if (ret < 0) {
843                 goto fail;
844             }
845         }
846         old_table_index = table_index;
847
848         qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
849
850         /* we can update the count and save it */
851         block_index = cluster_index & (s->refcount_block_size - 1);
852
853         refcount = s->get_refcount(refcount_block, block_index);
854         if (decrease ? (refcount - addend > refcount)
855                      : (refcount + addend < refcount ||
856                         refcount + addend > s->refcount_max))
857         {
858             ret = -EINVAL;
859             goto fail;
860         }
861         if (decrease) {
862             refcount -= addend;
863         } else {
864             refcount += addend;
865         }
866         if (refcount == 0 && cluster_index < s->free_cluster_index) {
867             s->free_cluster_index = cluster_index;
868         }
869         s->set_refcount(refcount_block, block_index, refcount);
870
871         if (refcount == 0) {
872             void *table;
873
874             table = qcow2_cache_is_table_offset(bs, s->refcount_block_cache,
875                                                 offset);
876             if (table != NULL) {
877                 qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
878                 qcow2_cache_discard(bs, s->refcount_block_cache, table);
879             }
880
881             table = qcow2_cache_is_table_offset(bs, s->l2_table_cache, offset);
882             if (table != NULL) {
883                 qcow2_cache_discard(bs, s->l2_table_cache, table);
884             }
885
886             if (s->discard_passthrough[type]) {
887                 update_refcount_discard(bs, cluster_offset, s->cluster_size);
888             }
889         }
890     }
891
892     ret = 0;
893 fail:
894     if (!s->cache_discards) {
895         qcow2_process_discards(bs, ret);
896     }
897
898     /* Write last changed block to disk */
899     if (refcount_block) {
900         qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
901     }
902
903     /*
904      * Try do undo any updates if an error is returned (This may succeed in
905      * some cases like ENOSPC for allocating a new refcount block)
906      */
907     if (ret < 0) {
908         int dummy;
909         dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
910                                 !decrease, QCOW2_DISCARD_NEVER);
911         (void)dummy;
912     }
913
914     return ret;
915 }
916
917 /*
918  * Increases or decreases the refcount of a given cluster.
919  *
920  * @addend is the absolute value of the addend; if @decrease is set, @addend
921  * will be subtracted from the current refcount, otherwise it will be added.
922  *
923  * On success 0 is returned; on failure -errno is returned.
924  */
925 int qcow2_update_cluster_refcount(BlockDriverState *bs,
926                                   int64_t cluster_index,
927                                   uint64_t addend, bool decrease,
928                                   enum qcow2_discard_type type)
929 {
930     BDRVQcow2State *s = bs->opaque;
931     int ret;
932
933     ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
934                           decrease, type);
935     if (ret < 0) {
936         return ret;
937     }
938
939     return 0;
940 }
941
942
943
944 /*********************************************************/
945 /* cluster allocation functions */
946
947
948
949 /* return < 0 if error */
950 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
951 {
952     BDRVQcow2State *s = bs->opaque;
953     uint64_t i, nb_clusters, refcount;
954     int ret;
955
956     /* We can't allocate clusters if they may still be queued for discard. */
957     if (s->cache_discards) {
958         qcow2_process_discards(bs, 0);
959     }
960
961     nb_clusters = size_to_clusters(s, size);
962 retry:
963     for(i = 0; i < nb_clusters; i++) {
964         uint64_t next_cluster_index = s->free_cluster_index++;
965         ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);
966
967         if (ret < 0) {
968             return ret;
969         } else if (refcount != 0) {
970             goto retry;
971         }
972     }
973
974     /* Make sure that all offsets in the "allocated" range are representable
975      * in an int64_t */
976     if (s->free_cluster_index > 0 &&
977         s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
978     {
979         return -EFBIG;
980     }
981
982 #ifdef DEBUG_ALLOC2
983     fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
984             size,
985             (s->free_cluster_index - nb_clusters) << s->cluster_bits);
986 #endif
987     return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
988 }
989
990 int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
991 {
992     int64_t offset;
993     int ret;
994
995     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
996     do {
997         offset = alloc_clusters_noref(bs, size);
998         if (offset < 0) {
999             return offset;
1000         }
1001
1002         ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
1003     } while (ret == -EAGAIN);
1004
1005     if (ret < 0) {
1006         return ret;
1007     }
1008
1009     return offset;
1010 }
1011
1012 int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
1013                                 int64_t nb_clusters)
1014 {
1015     BDRVQcow2State *s = bs->opaque;
1016     uint64_t cluster_index, refcount;
1017     uint64_t i;
1018     int ret;
1019
1020     assert(nb_clusters >= 0);
1021     if (nb_clusters == 0) {
1022         return 0;
1023     }
1024
1025     do {
1026         /* Check how many clusters there are free */
1027         cluster_index = offset >> s->cluster_bits;
1028         for(i = 0; i < nb_clusters; i++) {
1029             ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
1030             if (ret < 0) {
1031                 return ret;
1032             } else if (refcount != 0) {
1033                 break;
1034             }
1035         }
1036
1037         /* And then allocate them */
1038         ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
1039                               QCOW2_DISCARD_NEVER);
1040     } while (ret == -EAGAIN);
1041
1042     if (ret < 0) {
1043         return ret;
1044     }
1045
1046     return i;
1047 }
1048
/* only used to allocate compressed sectors. We try to allocate
   contiguous sectors. size must be <= cluster_size
 *
 * Sub-cluster allocator: s->free_byte_offset remembers the first unused byte
 * of the cluster that was last used for compressed data, so consecutive
 * allocations can share a cluster.  Each allocation from a cluster takes one
 * additional reference on it.
 *
 * Returns the byte offset of the allocation, or -errno on failure. */
int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t offset;
    size_t free_in_cluster;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    assert(size > 0 && size <= s->cluster_size);
    /* free_byte_offset is either 0 (no partially used cluster) or points
     * strictly into the interior of a cluster, never at a boundary */
    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));

    offset = s->free_byte_offset;

    if (offset) {
        uint64_t refcount;
        /* Every allocation below bumps the cluster refcount; if it is
         * already at the maximum we cannot reuse this cluster */
        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
        if (ret < 0) {
            return ret;
        }

        if (refcount == s->refcount_max) {
            offset = 0;
        }
    }

    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
    do {
        /* Not enough room left in the current cluster: get a fresh one */
        if (!offset || free_in_cluster < size) {
            int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size);
            if (new_cluster < 0) {
                return new_cluster;
            }

            if (new_cluster == 0) {
                qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                        "allocation of compressed cluster "
                                        "at offset 0");
                return -EIO;
            }

            /* If the new cluster directly follows the current one, keep
             * appending across the boundary; otherwise restart in the new
             * cluster */
            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
                offset = new_cluster;
                free_in_cluster = s->cluster_size;
            } else {
                free_in_cluster += s->cluster_size;
            }
        }

        assert(offset);
        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            /* On -EAGAIN, clearing offset forces a fresh cluster allocation
             * in the next loop iteration */
            offset = 0;
        }
    } while (ret == -EAGAIN);
    if (ret < 0) {
        return ret;
    }

    /* The cluster refcount was incremented; refcount blocks must be flushed
     * before the caller's L2 table updates. */
    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);

    s->free_byte_offset = offset + size;
    if (!offset_into_cluster(s, s->free_byte_offset)) {
        /* Cluster completely filled: the next allocation starts fresh */
        s->free_byte_offset = 0;
    }

    return offset;
}
1120
1121 void qcow2_free_clusters(BlockDriverState *bs,
1122                           int64_t offset, int64_t size,
1123                           enum qcow2_discard_type type)
1124 {
1125     int ret;
1126
1127     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
1128     ret = update_refcount(bs, offset, size, 1, true, type);
1129     if (ret < 0) {
1130         fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
1131         /* TODO Remember the clusters to free them later and avoid leaking */
1132     }
1133 }
1134
1135 /*
1136  * Free a cluster using its L2 entry (handles clusters of all types, e.g.
1137  * normal cluster, compressed cluster, etc.)
1138  */
1139 void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
1140                              int nb_clusters, enum qcow2_discard_type type)
1141 {
1142     BDRVQcow2State *s = bs->opaque;
1143
1144     switch (qcow2_get_cluster_type(l2_entry)) {
1145     case QCOW2_CLUSTER_COMPRESSED:
1146         {
1147             int nb_csectors;
1148             nb_csectors = ((l2_entry >> s->csize_shift) &
1149                            s->csize_mask) + 1;
1150             qcow2_free_clusters(bs,
1151                 (l2_entry & s->cluster_offset_mask) & ~511,
1152                 nb_csectors * 512, type);
1153         }
1154         break;
1155     case QCOW2_CLUSTER_NORMAL:
1156     case QCOW2_CLUSTER_ZERO_ALLOC:
1157         if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
1158             qcow2_signal_corruption(bs, false, -1, -1,
1159                                     "Cannot free unaligned cluster %#llx",
1160                                     l2_entry & L2E_OFFSET_MASK);
1161         } else {
1162             qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
1163                                 nb_clusters << s->cluster_bits, type);
1164         }
1165         break;
1166     case QCOW2_CLUSTER_ZERO_PLAIN:
1167     case QCOW2_CLUSTER_UNALLOCATED:
1168         break;
1169     default:
1170         abort();
1171     }
1172 }
1173
1174
1175
1176 /*********************************************************/
1177 /* snapshots and image creation */
1178
1179
1180
1181 /* update the refcounts of snapshots and the copied flag */
1182 int qcow2_update_snapshot_refcount(BlockDriverState *bs,
1183     int64_t l1_table_offset, int l1_size, int addend)
1184 {
1185     BDRVQcow2State *s = bs->opaque;
1186     uint64_t *l1_table, *l2_table, l2_offset, entry, l1_size2, refcount;
1187     bool l1_allocated = false;
1188     int64_t old_entry, old_l2_offset;
1189     int i, j, l1_modified = 0, nb_csectors;
1190     int ret;
1191
1192     assert(addend >= -1 && addend <= 1);
1193
1194     l2_table = NULL;
1195     l1_table = NULL;
1196     l1_size2 = l1_size * sizeof(uint64_t);
1197
1198     s->cache_discards = true;
1199
1200     /* WARNING: qcow2_snapshot_goto relies on this function not using the
1201      * l1_table_offset when it is the current s->l1_table_offset! Be careful
1202      * when changing this! */
1203     if (l1_table_offset != s->l1_table_offset) {
1204         l1_table = g_try_malloc0(align_offset(l1_size2, 512));
1205         if (l1_size2 && l1_table == NULL) {
1206             ret = -ENOMEM;
1207             goto fail;
1208         }
1209         l1_allocated = true;
1210
1211         ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
1212         if (ret < 0) {
1213             goto fail;
1214         }
1215
1216         for (i = 0; i < l1_size; i++) {
1217             be64_to_cpus(&l1_table[i]);
1218         }
1219     } else {
1220         assert(l1_size == s->l1_size);
1221         l1_table = s->l1_table;
1222         l1_allocated = false;
1223     }
1224
1225     for (i = 0; i < l1_size; i++) {
1226         l2_offset = l1_table[i];
1227         if (l2_offset) {
1228             old_l2_offset = l2_offset;
1229             l2_offset &= L1E_OFFSET_MASK;
1230
1231             if (offset_into_cluster(s, l2_offset)) {
1232                 qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
1233                                         PRIx64 " unaligned (L1 index: %#x)",
1234                                         l2_offset, i);
1235                 ret = -EIO;
1236                 goto fail;
1237             }
1238
1239             ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
1240                 (void**) &l2_table);
1241             if (ret < 0) {
1242                 goto fail;
1243             }
1244
1245             for (j = 0; j < s->l2_size; j++) {
1246                 uint64_t cluster_index;
1247                 uint64_t offset;
1248
1249                 entry = be64_to_cpu(l2_table[j]);
1250                 old_entry = entry;
1251                 entry &= ~QCOW_OFLAG_COPIED;
1252                 offset = entry & L2E_OFFSET_MASK;
1253
1254                 switch (qcow2_get_cluster_type(entry)) {
1255                 case QCOW2_CLUSTER_COMPRESSED:
1256                     nb_csectors = ((entry >> s->csize_shift) &
1257                                    s->csize_mask) + 1;
1258                     if (addend != 0) {
1259                         ret = update_refcount(bs,
1260                                 (entry & s->cluster_offset_mask) & ~511,
1261                                 nb_csectors * 512, abs(addend), addend < 0,
1262                                 QCOW2_DISCARD_SNAPSHOT);
1263                         if (ret < 0) {
1264                             goto fail;
1265                         }
1266                     }
1267                     /* compressed clusters are never modified */
1268                     refcount = 2;
1269                     break;
1270
1271                 case QCOW2_CLUSTER_NORMAL:
1272                 case QCOW2_CLUSTER_ZERO_ALLOC:
1273                     if (offset_into_cluster(s, offset)) {
1274                         qcow2_signal_corruption(bs, true, -1, -1, "Cluster "
1275                                                 "allocation offset %#" PRIx64
1276                                                 " unaligned (L2 offset: %#"
1277                                                 PRIx64 ", L2 index: %#x)",
1278                                                 offset, l2_offset, j);
1279                         ret = -EIO;
1280                         goto fail;
1281                     }
1282
1283                     cluster_index = offset >> s->cluster_bits;
1284                     assert(cluster_index);
1285                     if (addend != 0) {
1286                         ret = qcow2_update_cluster_refcount(bs,
1287                                     cluster_index, abs(addend), addend < 0,
1288                                     QCOW2_DISCARD_SNAPSHOT);
1289                         if (ret < 0) {
1290                             goto fail;
1291                         }
1292                     }
1293
1294                     ret = qcow2_get_refcount(bs, cluster_index, &refcount);
1295                     if (ret < 0) {
1296                         goto fail;
1297                     }
1298                     break;
1299
1300                 case QCOW2_CLUSTER_ZERO_PLAIN:
1301                 case QCOW2_CLUSTER_UNALLOCATED:
1302                     refcount = 0;
1303                     break;
1304
1305                 default:
1306                     abort();
1307                 }
1308
1309                 if (refcount == 1) {
1310                     entry |= QCOW_OFLAG_COPIED;
1311                 }
1312                 if (entry != old_entry) {
1313                     if (addend > 0) {
1314                         qcow2_cache_set_dependency(bs, s->l2_table_cache,
1315                             s->refcount_block_cache);
1316                     }
1317                     l2_table[j] = cpu_to_be64(entry);
1318                     qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
1319                 }
1320             }
1321
1322             qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
1323
1324             if (addend != 0) {
1325                 ret = qcow2_update_cluster_refcount(bs, l2_offset >>
1326                                                         s->cluster_bits,
1327                                                     abs(addend), addend < 0,
1328                                                     QCOW2_DISCARD_SNAPSHOT);
1329                 if (ret < 0) {
1330                     goto fail;
1331                 }
1332             }
1333             ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1334                                      &refcount);
1335             if (ret < 0) {
1336                 goto fail;
1337             } else if (refcount == 1) {
1338                 l2_offset |= QCOW_OFLAG_COPIED;
1339             }
1340             if (l2_offset != old_l2_offset) {
1341                 l1_table[i] = l2_offset;
1342                 l1_modified = 1;
1343             }
1344         }
1345     }
1346
1347     ret = bdrv_flush(bs);
1348 fail:
1349     if (l2_table) {
1350         qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
1351     }
1352
1353     s->cache_discards = false;
1354     qcow2_process_discards(bs, ret);
1355
1356     /* Update L1 only if it isn't deleted anyway (addend = -1) */
1357     if (ret == 0 && addend >= 0 && l1_modified) {
1358         for (i = 0; i < l1_size; i++) {
1359             cpu_to_be64s(&l1_table[i]);
1360         }
1361
1362         ret = bdrv_pwrite_sync(bs->file, l1_table_offset,
1363                                l1_table, l1_size2);
1364
1365         for (i = 0; i < l1_size; i++) {
1366             be64_to_cpus(&l1_table[i]);
1367         }
1368     }
1369     if (l1_allocated)
1370         g_free(l1_table);
1371     return ret;
1372 }
1373
1374
1375
1376
1377 /*********************************************************/
1378 /* refcount checking functions */
1379
1380
1381 static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
1382 {
1383     /* This assertion holds because there is no way we can address more than
1384      * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
1385      * offsets have to be representable in bytes); due to every cluster
1386      * corresponding to one refcount entry, we are well below that limit */
1387     assert(entries < (UINT64_C(1) << (64 - 9)));
1388
1389     /* Thanks to the assertion this will not overflow, because
1390      * s->refcount_order < 7.
1391      * (note: x << s->refcount_order == x * s->refcount_bits) */
1392     return DIV_ROUND_UP(entries << s->refcount_order, 8);
1393 }
1394
1395 /**
1396  * Reallocates *array so that it can hold new_size entries. *size must contain
1397  * the current number of entries in *array. If the reallocation fails, *array
1398  * and *size will not be modified and -errno will be returned. If the
1399  * reallocation is successful, *array will be set to the new buffer, *size
1400  * will be set to new_size and 0 will be returned. The size of the reallocated
1401  * refcount array buffer will be aligned to a cluster boundary, and the newly
1402  * allocated area will be zeroed.
1403  */
1404 static int realloc_refcount_array(BDRVQcow2State *s, void **array,
1405                                   int64_t *size, int64_t new_size)
1406 {
1407     int64_t old_byte_size, new_byte_size;
1408     void *new_ptr;
1409
1410     /* Round to clusters so the array can be directly written to disk */
1411     old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
1412                     * s->cluster_size;
1413     new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
1414                     * s->cluster_size;
1415
1416     if (new_byte_size == old_byte_size) {
1417         *size = new_size;
1418         return 0;
1419     }
1420
1421     assert(new_byte_size > 0);
1422
1423     if (new_byte_size > SIZE_MAX) {
1424         return -ENOMEM;
1425     }
1426
1427     new_ptr = g_try_realloc(*array, new_byte_size);
1428     if (!new_ptr) {
1429         return -ENOMEM;
1430     }
1431
1432     if (new_byte_size > old_byte_size) {
1433         memset((char *)new_ptr + old_byte_size, 0,
1434                new_byte_size - old_byte_size);
1435     }
1436
1437     *array = new_ptr;
1438     *size  = new_size;
1439
1440     return 0;
1441 }
1442
1443 /*
1444  * Increases the refcount for a range of clusters in a given refcount table.
1445  * This is used to construct a temporary refcount table out of L1 and L2 tables
1446  * which can be compared to the refcount table saved in the image.
1447  *
1448  * Modifies the number of errors in res.
1449  */
1450 int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
1451                              void **refcount_table,
1452                              int64_t *refcount_table_size,
1453                              int64_t offset, int64_t size)
1454 {
1455     BDRVQcow2State *s = bs->opaque;
1456     uint64_t start, last, cluster_offset, k, refcount;
1457     int ret;
1458
1459     if (size <= 0) {
1460         return 0;
1461     }
1462
1463     start = start_of_cluster(s, offset);
1464     last = start_of_cluster(s, offset + size - 1);
1465     for(cluster_offset = start; cluster_offset <= last;
1466         cluster_offset += s->cluster_size) {
1467         k = cluster_offset >> s->cluster_bits;
1468         if (k >= *refcount_table_size) {
1469             ret = realloc_refcount_array(s, refcount_table,
1470                                          refcount_table_size, k + 1);
1471             if (ret < 0) {
1472                 res->check_errors++;
1473                 return ret;
1474             }
1475         }
1476
1477         refcount = s->get_refcount(*refcount_table, k);
1478         if (refcount == s->refcount_max) {
1479             fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1480                     "\n", cluster_offset);
1481             fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
1482                     "width or qemu-img convert to create a clean copy if the "
1483                     "image cannot be opened for writing\n");
1484             res->corruptions++;
1485             continue;
1486         }
1487         s->set_refcount(*refcount_table, k, refcount + 1);
1488     }
1489
1490     return 0;
1491 }
1492
1493 /* Flags for check_refcounts_l1() and check_refcounts_l2() */
1494 enum {
1495     CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
1496 };
1497
1498 /*
1499  * Increases the refcount in the given refcount table for the all clusters
1500  * referenced in the L2 table. While doing so, performs some checks on L2
1501  * entries.
1502  *
1503  * Returns the number of errors found by the checks or -errno if an internal
1504  * error occurred.
1505  */
1506 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1507                               void **refcount_table,
1508                               int64_t *refcount_table_size, int64_t l2_offset,
1509                               int flags, BdrvCheckMode fix)
1510 {
1511     BDRVQcow2State *s = bs->opaque;
1512     uint64_t *l2_table, l2_entry;
1513     uint64_t next_contiguous_offset = 0;
1514     int i, l2_size, nb_csectors, ret;
1515
1516     /* Read L2 table from disk */
1517     l2_size = s->l2_size * sizeof(uint64_t);
1518     l2_table = g_malloc(l2_size);
1519
1520     ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size);
1521     if (ret < 0) {
1522         fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1523         res->check_errors++;
1524         goto fail;
1525     }
1526
1527     /* Do the actual checks */
1528     for(i = 0; i < s->l2_size; i++) {
1529         l2_entry = be64_to_cpu(l2_table[i]);
1530
1531         switch (qcow2_get_cluster_type(l2_entry)) {
1532         case QCOW2_CLUSTER_COMPRESSED:
1533             /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1534             if (l2_entry & QCOW_OFLAG_COPIED) {
1535                 fprintf(stderr, "ERROR: cluster %" PRId64 ": "
1536                     "copied flag must never be set for compressed "
1537                     "clusters\n", l2_entry >> s->cluster_bits);
1538                 l2_entry &= ~QCOW_OFLAG_COPIED;
1539                 res->corruptions++;
1540             }
1541
1542             /* Mark cluster as used */
1543             nb_csectors = ((l2_entry >> s->csize_shift) &
1544                            s->csize_mask) + 1;
1545             l2_entry &= s->cluster_offset_mask;
1546             ret = qcow2_inc_refcounts_imrt(bs, res,
1547                                            refcount_table, refcount_table_size,
1548                                            l2_entry & ~511, nb_csectors * 512);
1549             if (ret < 0) {
1550                 goto fail;
1551             }
1552
1553             if (flags & CHECK_FRAG_INFO) {
1554                 res->bfi.allocated_clusters++;
1555                 res->bfi.compressed_clusters++;
1556
1557                 /* Compressed clusters are fragmented by nature.  Since they
1558                  * take up sub-sector space but we only have sector granularity
1559                  * I/O we need to re-read the same sectors even for adjacent
1560                  * compressed clusters.
1561                  */
1562                 res->bfi.fragmented_clusters++;
1563             }
1564             break;
1565
1566         case QCOW2_CLUSTER_ZERO_ALLOC:
1567         case QCOW2_CLUSTER_NORMAL:
1568         {
1569             uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1570
1571             if (flags & CHECK_FRAG_INFO) {
1572                 res->bfi.allocated_clusters++;
1573                 if (next_contiguous_offset &&
1574                     offset != next_contiguous_offset) {
1575                     res->bfi.fragmented_clusters++;
1576                 }
1577                 next_contiguous_offset = offset + s->cluster_size;
1578             }
1579
1580             /* Correct offsets are cluster aligned */
1581             if (offset_into_cluster(s, offset)) {
1582                 if (qcow2_get_cluster_type(l2_entry) ==
1583                     QCOW2_CLUSTER_ZERO_ALLOC)
1584                 {
1585                     fprintf(stderr, "%s offset=%" PRIx64 ": Preallocated zero "
1586                             "cluster is not properly aligned; L2 entry "
1587                             "corrupted.\n",
1588                             fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
1589                             offset);
1590                     if (fix & BDRV_FIX_ERRORS) {
1591                         uint64_t l2e_offset =
1592                             l2_offset + (uint64_t)i * sizeof(uint64_t);
1593
1594                         l2_entry = QCOW_OFLAG_ZERO;
1595                         l2_table[i] = cpu_to_be64(l2_entry);
1596                         ret = qcow2_pre_write_overlap_check(bs,
1597                                 QCOW2_OL_ACTIVE_L2 | QCOW2_OL_INACTIVE_L2,
1598                                 l2e_offset, sizeof(uint64_t));
1599                         if (ret < 0) {
1600                             fprintf(stderr, "ERROR: Overlap check failed\n");
1601                             res->check_errors++;
1602                             /* Something is seriously wrong, so abort checking
1603                              * this L2 table */
1604                             goto fail;
1605                         }
1606
1607                         ret = bdrv_pwrite_sync(bs->file, l2e_offset,
1608                                                &l2_table[i], sizeof(uint64_t));
1609                         if (ret < 0) {
1610                             fprintf(stderr, "ERROR: Failed to overwrite L2 "
1611                                     "table entry: %s\n", strerror(-ret));
1612                             res->check_errors++;
1613                             /* Do not abort, continue checking the rest of this
1614                              * L2 table's entries */
1615                         } else {
1616                             res->corruptions_fixed++;
1617                             /* Skip marking the cluster as used
1618                              * (it is unused now) */
1619                             continue;
1620                         }
1621                     } else {
1622                         res->corruptions++;
1623                     }
1624                 } else {
1625                     fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is "
1626                         "not properly aligned; L2 entry corrupted.\n", offset);
1627                     res->corruptions++;
1628                 }
1629             }
1630
1631             /* Mark cluster as used */
1632             ret = qcow2_inc_refcounts_imrt(bs, res,
1633                                            refcount_table, refcount_table_size,
1634                                            offset, s->cluster_size);
1635             if (ret < 0) {
1636                 goto fail;
1637             }
1638             break;
1639         }
1640
1641         case QCOW2_CLUSTER_ZERO_PLAIN:
1642         case QCOW2_CLUSTER_UNALLOCATED:
1643             break;
1644
1645         default:
1646             abort();
1647         }
1648     }
1649
1650     g_free(l2_table);
1651     return 0;
1652
1653 fail:
1654     g_free(l2_table);
1655     return ret;
1656 }
1657
1658 /*
1659  * Increases the refcount for the L1 table, its L2 tables and all referenced
1660  * clusters in the given refcount table. While doing so, performs some checks
1661  * on L1 and L2 entries.
1662  *
1663  * Returns the number of errors found by the checks or -errno if an internal
1664  * error occurred.
1665  */
1666 static int check_refcounts_l1(BlockDriverState *bs,
1667                               BdrvCheckResult *res,
1668                               void **refcount_table,
1669                               int64_t *refcount_table_size,
1670                               int64_t l1_table_offset, int l1_size,
1671                               int flags, BdrvCheckMode fix)
1672 {
1673     BDRVQcow2State *s = bs->opaque;
1674     uint64_t *l1_table = NULL, l2_offset, l1_size2;
1675     int i, ret;
1676
1677     l1_size2 = l1_size * sizeof(uint64_t);
1678
1679     /* Mark L1 table as used */
1680     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size,
1681                                    l1_table_offset, l1_size2);
1682     if (ret < 0) {
1683         goto fail;
1684     }
1685
1686     /* Read L1 table entries from disk */
1687     if (l1_size2 > 0) {
1688         l1_table = g_try_malloc(l1_size2);
1689         if (l1_table == NULL) {
1690             ret = -ENOMEM;
1691             res->check_errors++;
1692             goto fail;
1693         }
1694         ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
1695         if (ret < 0) {
1696             fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1697             res->check_errors++;
1698             goto fail;
1699         }
1700         for(i = 0;i < l1_size; i++)
1701             be64_to_cpus(&l1_table[i]);
1702     }
1703
1704     /* Do the actual checks */
1705     for(i = 0; i < l1_size; i++) {
1706         l2_offset = l1_table[i];
1707         if (l2_offset) {
1708             /* Mark L2 table as used */
1709             l2_offset &= L1E_OFFSET_MASK;
1710             ret = qcow2_inc_refcounts_imrt(bs, res,
1711                                            refcount_table, refcount_table_size,
1712                                            l2_offset, s->cluster_size);
1713             if (ret < 0) {
1714                 goto fail;
1715             }
1716
1717             /* L2 tables are cluster aligned */
1718             if (offset_into_cluster(s, l2_offset)) {
1719                 fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1720                     "cluster aligned; L1 entry corrupted\n", l2_offset);
1721                 res->corruptions++;
1722             }
1723
1724             /* Process and check L2 entries */
1725             ret = check_refcounts_l2(bs, res, refcount_table,
1726                                      refcount_table_size, l2_offset, flags,
1727                                      fix);
1728             if (ret < 0) {
1729                 goto fail;
1730             }
1731         }
1732     }
1733     g_free(l1_table);
1734     return 0;
1735
1736 fail:
1737     g_free(l1_table);
1738     return ret;
1739 }
1740
1741 /*
1742  * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1743  *
1744  * This function does not print an error message nor does it increment
1745  * check_errors if qcow2_get_refcount fails (this is because such an error will
1746  * have been already detected and sufficiently signaled by the calling function
1747  * (qcow2_check_refcounts) by the time this function is called).
1748  */
static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
                              BdrvCheckMode fix)
{
    BDRVQcow2State *s = bs->opaque;
    /* Scratch buffer holding one full L2 table (one cluster); released with
     * qemu_vfree() on exit */
    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
    int ret;
    uint64_t refcount;
    int i, j;

    /* Walk every entry of the active L1 table */
    for (i = 0; i < s->l1_size; i++) {
        uint64_t l1_entry = s->l1_table[i];
        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
        /* Set when entries in the local l2_table buffer have been patched and
         * must be written back to disk */
        bool l2_dirty = false;

        if (!l2_offset) {
            /* Unallocated L1 entry: nothing to check */
            continue;
        }

        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                 &refcount);
        if (ret < 0) {
            /* don't print message nor increment check_errors */
            continue;
        }
        /* OFLAG_COPIED must be set if and only if the L2 table's cluster has
         * refcount 1 */
        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
                    "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
                    fix & BDRV_FIX_ERRORS ? "Repairing" :
                                            "ERROR",
                    i, l1_entry, refcount);
            if (fix & BDRV_FIX_ERRORS) {
                /* Toggle the flag to match the refcount and persist the fixed
                 * L1 entry immediately */
                s->l1_table[i] = refcount == 1
                               ? l1_entry |  QCOW_OFLAG_COPIED
                               : l1_entry & ~QCOW_OFLAG_COPIED;
                ret = qcow2_write_l1_entry(bs, i);
                if (ret < 0) {
                    res->check_errors++;
                    goto fail;
                }
                res->corruptions_fixed++;
            } else {
                res->corruptions++;
            }
        }

        /* Read the whole L2 table into the scratch buffer (entries stay in
         * big-endian on-disk order; they are converted per entry below) */
        ret = bdrv_pread(bs->file, l2_offset, l2_table,
                         s->l2_size * sizeof(uint64_t));
        if (ret < 0) {
            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
                    strerror(-ret));
            res->check_errors++;
            goto fail;
        }

        for (j = 0; j < s->l2_size; j++) {
            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
            QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);

            /* Only these two cluster types reference an allocated data
             * cluster whose refcount can be compared against the flag */
            if (cluster_type == QCOW2_CLUSTER_NORMAL ||
                cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
                ret = qcow2_get_refcount(bs,
                                         data_offset >> s->cluster_bits,
                                         &refcount);
                if (ret < 0) {
                    /* don't print message nor increment check_errors */
                    continue;
                }
                /* Same invariant as for L1 entries: flag set iff refcount 1 */
                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
                    fprintf(stderr, "%s OFLAG_COPIED data cluster: "
                            "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
                            fix & BDRV_FIX_ERRORS ? "Repairing" :
                                                    "ERROR",
                            l2_entry, refcount);
                    if (fix & BDRV_FIX_ERRORS) {
                        /* Patch the entry in the buffer (back to big-endian)
                         * and defer the disk write until the whole table has
                         * been processed */
                        l2_table[j] = cpu_to_be64(refcount == 1
                                    ? l2_entry |  QCOW_OFLAG_COPIED
                                    : l2_entry & ~QCOW_OFLAG_COPIED);
                        l2_dirty = true;
                        res->corruptions_fixed++;
                    } else {
                        res->corruptions++;
                    }
                }
            }
        }

        /* Write the repaired L2 table back, guarded by the metadata overlap
         * check */
        if (l2_dirty) {
            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
                                                l2_offset, s->cluster_size);
            if (ret < 0) {
                fprintf(stderr, "ERROR: Could not write L2 table; metadata "
                        "overlap check failed: %s\n", strerror(-ret));
                res->check_errors++;
                goto fail;
            }

            ret = bdrv_pwrite(bs->file, l2_offset, l2_table,
                              s->cluster_size);
            if (ret < 0) {
                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
                        strerror(-ret));
                res->check_errors++;
                goto fail;
            }
        }
    }

    ret = 0;

fail:
    qemu_vfree(l2_table);
    return ret;
}
1863
1864 /*
1865  * Checks consistency of refblocks and accounts for each refblock in
1866  * *refcount_table.
1867  */
static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
                           BdrvCheckMode fix, bool *rebuild,
                           void **refcount_table, int64_t *nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t i, size;
    int ret;

    /* Walk every reftable entry; each non-zero entry points to a refblock
     * cluster that must be accounted for in the in-memory refcount table */
    for(i = 0; i < s->refcount_table_size; i++) {
        uint64_t offset, cluster;
        offset = s->refcount_table[i];
        cluster = offset >> s->cluster_bits;

        /* Refcount blocks are cluster aligned */
        if (offset_into_cluster(s, offset)) {
            fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
                "cluster aligned; refcount table entry corrupted\n", i);
            res->corruptions++;
            /* Cannot repair in place; the whole structure must be rebuilt */
            *rebuild = true;
            continue;
        }

        /* Refblock lies beyond the end of the image */
        if (cluster >= *nb_clusters) {
            fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
                    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);

            if (fix & BDRV_FIX_ERRORS) {
                int64_t new_nb_clusters;
                Error *local_err = NULL;

                /* Guard the truncation size computation against overflow */
                if (offset > INT64_MAX - s->cluster_size) {
                    ret = -EINVAL;
                    goto resize_fail;
                }

                /* Grow the image file so that the refblock is covered */
                ret = bdrv_truncate(bs->file, offset + s->cluster_size,
                                    PREALLOC_MODE_OFF, &local_err);
                if (ret < 0) {
                    error_report_err(local_err);
                    goto resize_fail;
                }
                size = bdrv_getlength(bs->file->bs);
                if (size < 0) {
                    ret = size;
                    goto resize_fail;
                }

                /* Grow the in-memory refcount table to the new image size */
                new_nb_clusters = size_to_clusters(s, size);
                assert(new_nb_clusters >= *nb_clusters);

                ret = realloc_refcount_array(s, refcount_table,
                                             nb_clusters, new_nb_clusters);
                if (ret < 0) {
                    res->check_errors++;
                    return ret;
                }

                /* If the refblock is still out of bounds, the truncation did
                 * not help; give up on repairing this entry */
                if (cluster >= *nb_clusters) {
                    ret = -EINVAL;
                    goto resize_fail;
                }

                res->corruptions_fixed++;
                ret = qcow2_inc_refcounts_imrt(bs, res,
                                               refcount_table, nb_clusters,
                                               offset, s->cluster_size);
                if (ret < 0) {
                    return ret;
                }
                /* No need to check whether the refcount is now greater than 1:
                 * This area was just allocated and zeroed, so it can only be
                 * exactly 1 after qcow2_inc_refcounts_imrt() */
                continue;

resize_fail:
                /* Repair attempt failed; fall back to requiring a rebuild */
                res->corruptions++;
                *rebuild = true;
                fprintf(stderr, "ERROR could not resize image: %s\n",
                        strerror(-ret));
            } else {
                res->corruptions++;
            }
            continue;
        }

        /* Normal case: account for the refblock cluster and verify that
         * nothing else claims it (its refcount must be exactly 1) */
        if (offset != 0) {
            ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
                                           offset, s->cluster_size);
            if (ret < 0) {
                return ret;
            }
            if (s->get_refcount(*refcount_table, cluster) != 1) {
                fprintf(stderr, "ERROR refcount block %" PRId64
                        " refcount=%" PRIu64 "\n", i,
                        s->get_refcount(*refcount_table, cluster));
                res->corruptions++;
                *rebuild = true;
            }
        }
    }

    return 0;
}
1971
1972 /*
1973  * Calculates an in-memory refcount table.
1974  */
1975 static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1976                                BdrvCheckMode fix, bool *rebuild,
1977                                void **refcount_table, int64_t *nb_clusters)
1978 {
1979     BDRVQcow2State *s = bs->opaque;
1980     int64_t i;
1981     QCowSnapshot *sn;
1982     int ret;
1983
1984     if (!*refcount_table) {
1985         int64_t old_size = 0;
1986         ret = realloc_refcount_array(s, refcount_table,
1987                                      &old_size, *nb_clusters);
1988         if (ret < 0) {
1989             res->check_errors++;
1990             return ret;
1991         }
1992     }
1993
1994     /* header */
1995     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
1996                                    0, s->cluster_size);
1997     if (ret < 0) {
1998         return ret;
1999     }
2000
2001     /* current L1 table */
2002     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2003                              s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO,
2004                              fix);
2005     if (ret < 0) {
2006         return ret;
2007     }
2008
2009     /* snapshots */
2010     for (i = 0; i < s->nb_snapshots; i++) {
2011         sn = s->snapshots + i;
2012         ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2013                                  sn->l1_table_offset, sn->l1_size, 0, fix);
2014         if (ret < 0) {
2015             return ret;
2016         }
2017     }
2018     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2019                                    s->snapshots_offset, s->snapshots_size);
2020     if (ret < 0) {
2021         return ret;
2022     }
2023
2024     /* refcount data */
2025     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2026                                    s->refcount_table_offset,
2027                                    s->refcount_table_size * sizeof(uint64_t));
2028     if (ret < 0) {
2029         return ret;
2030     }
2031
2032     /* encryption */
2033     if (s->crypto_header.length) {
2034         ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2035                                        s->crypto_header.offset,
2036                                        s->crypto_header.length);
2037         if (ret < 0) {
2038             return ret;
2039         }
2040     }
2041
2042     /* bitmaps */
2043     ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters);
2044     if (ret < 0) {
2045         return ret;
2046     }
2047
2048     return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
2049 }
2050
2051 /*
2052  * Compares the actual reference count for each cluster in the image against the
2053  * refcount as reported by the refcount structures on-disk.
2054  */
2055 static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2056                               BdrvCheckMode fix, bool *rebuild,
2057                               int64_t *highest_cluster,
2058                               void *refcount_table, int64_t nb_clusters)
2059 {
2060     BDRVQcow2State *s = bs->opaque;
2061     int64_t i;
2062     uint64_t refcount1, refcount2;
2063     int ret;
2064
2065     for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
2066         ret = qcow2_get_refcount(bs, i, &refcount1);
2067         if (ret < 0) {
2068             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
2069                     i, strerror(-ret));
2070             res->check_errors++;
2071             continue;
2072         }
2073
2074         refcount2 = s->get_refcount(refcount_table, i);
2075
2076         if (refcount1 > 0 || refcount2 > 0) {
2077             *highest_cluster = i;
2078         }
2079
2080         if (refcount1 != refcount2) {
2081             /* Check if we're allowed to fix the mismatch */
2082             int *num_fixed = NULL;
2083             if (refcount1 == 0) {
2084                 *rebuild = true;
2085             } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
2086                 num_fixed = &res->leaks_fixed;
2087             } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
2088                 num_fixed = &res->corruptions_fixed;
2089             }
2090
2091             fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
2092                     " reference=%" PRIu64 "\n",
2093                    num_fixed != NULL     ? "Repairing" :
2094                    refcount1 < refcount2 ? "ERROR" :
2095                                            "Leaked",
2096                    i, refcount1, refcount2);
2097
2098             if (num_fixed) {
2099                 ret = update_refcount(bs, i << s->cluster_bits, 1,
2100                                       refcount_diff(refcount1, refcount2),
2101                                       refcount1 > refcount2,
2102                                       QCOW2_DISCARD_ALWAYS);
2103                 if (ret >= 0) {
2104                     (*num_fixed)++;
2105                     continue;
2106                 }
2107             }
2108
2109             /* And if we couldn't, print an error */
2110             if (refcount1 < refcount2) {
2111                 res->corruptions++;
2112             } else {
2113                 res->leaks++;
2114             }
2115         }
2116     }
2117 }
2118
2119 /*
2120  * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
2121  * the on-disk refcount structures.
2122  *
2123  * On input, *first_free_cluster tells where to start looking, and need not
2124  * actually be a free cluster; the returned offset will not be before that
2125  * cluster.  On output, *first_free_cluster points to the first gap found, even
2126  * if that gap was too small to be used as the returned offset.
2127  *
2128  * Note that *first_free_cluster is a cluster index whereas the return value is
2129  * an offset.
2130  */
2131 static int64_t alloc_clusters_imrt(BlockDriverState *bs,
2132                                    int cluster_count,
2133                                    void **refcount_table,
2134                                    int64_t *imrt_nb_clusters,
2135                                    int64_t *first_free_cluster)
2136 {
2137     BDRVQcow2State *s = bs->opaque;
2138     int64_t cluster = *first_free_cluster, i;
2139     bool first_gap = true;
2140     int contiguous_free_clusters;
2141     int ret;
2142
2143     /* Starting at *first_free_cluster, find a range of at least cluster_count
2144      * continuously free clusters */
2145     for (contiguous_free_clusters = 0;
2146          cluster < *imrt_nb_clusters &&
2147          contiguous_free_clusters < cluster_count;
2148          cluster++)
2149     {
2150         if (!s->get_refcount(*refcount_table, cluster)) {
2151             contiguous_free_clusters++;
2152             if (first_gap) {
2153                 /* If this is the first free cluster found, update
2154                  * *first_free_cluster accordingly */
2155                 *first_free_cluster = cluster;
2156                 first_gap = false;
2157             }
2158         } else if (contiguous_free_clusters) {
2159             contiguous_free_clusters = 0;
2160         }
2161     }
2162
2163     /* If contiguous_free_clusters is greater than zero, it contains the number
2164      * of continuously free clusters until the current cluster; the first free
2165      * cluster in the current "gap" is therefore
2166      * cluster - contiguous_free_clusters */
2167
2168     /* If no such range could be found, grow the in-memory refcount table
2169      * accordingly to append free clusters at the end of the image */
2170     if (contiguous_free_clusters < cluster_count) {
2171         /* contiguous_free_clusters clusters are already empty at the image end;
2172          * we need cluster_count clusters; therefore, we have to allocate
2173          * cluster_count - contiguous_free_clusters new clusters at the end of
2174          * the image (which is the current value of cluster; note that cluster
2175          * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
2176          * the image end) */
2177         ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
2178                                      cluster + cluster_count
2179                                      - contiguous_free_clusters);
2180         if (ret < 0) {
2181             return ret;
2182         }
2183     }
2184
2185     /* Go back to the first free cluster */
2186     cluster -= contiguous_free_clusters;
2187     for (i = 0; i < cluster_count; i++) {
2188         s->set_refcount(*refcount_table, cluster + i, 1);
2189     }
2190
2191     return cluster << s->cluster_bits;
2192 }
2193
2194 /*
2195  * Creates a new refcount structure based solely on the in-memory information
2196  * given through *refcount_table. All necessary allocations will be reflected
2197  * in that array.
2198  *
2199  * On success, the old refcount structure is leaked (it will be covered by the
2200  * new refcount structure).
2201  */
static int rebuild_refcount_structure(BlockDriverState *bs,
                                      BdrvCheckResult *res,
                                      void **refcount_table,
                                      int64_t *nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    /* reftable_offset < 0 means "no valid reftable location chosen yet" */
    int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
    int64_t refblock_offset, refblock_start, refblock_index;
    uint32_t reftable_size = 0;
    /* New reftable built up in memory; written to disk at the end and then
     * installed as s->refcount_table on success */
    uint64_t *on_disk_reftable = NULL;
    void *on_disk_refblock;
    int ret = 0;
    /* Matches the refcount_table_offset/refcount_table_clusters pair in the
     * qcow2 header, for the final in-place header update */
    struct {
        uint64_t reftable_offset;
        uint32_t reftable_clusters;
    } QEMU_PACKED reftable_offset_and_clusters;

    /* The cache refers to the old refcount structure; drop it */
    qcow2_cache_empty(bs, s->refcount_block_cache);

    /* This loop may be re-entered via the goto below if allocating the
     * reftable added new (used) clusters that need refblocks of their own */
write_refblocks:
    for (; cluster < *nb_clusters; cluster++) {
        if (!s->get_refcount(*refcount_table, cluster)) {
            continue;
        }

        /* Index of the refblock covering this cluster, and the first cluster
         * that refblock covers */
        refblock_index = cluster >> s->refcount_block_bits;
        refblock_start = refblock_index << s->refcount_block_bits;

        /* Don't allocate a cluster in a refblock already written to disk */
        if (first_free_cluster < refblock_start) {
            first_free_cluster = refblock_start;
        }
        /* Allocate the refblock's own cluster from the IMRT */
        refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
                                              nb_clusters, &first_free_cluster);
        if (refblock_offset < 0) {
            fprintf(stderr, "ERROR allocating refblock: %s\n",
                    strerror(-refblock_offset));
            res->check_errors++;
            ret = refblock_offset;
            goto fail;
        }

        /* Grow the in-memory reftable (cluster-granular) if this refblock
         * index does not fit yet */
        if (reftable_size <= refblock_index) {
            uint32_t old_reftable_size = reftable_size;
            uint64_t *new_on_disk_reftable;

            reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t),
                                     s->cluster_size) / sizeof(uint64_t);
            new_on_disk_reftable = g_try_realloc(on_disk_reftable,
                                                 reftable_size *
                                                 sizeof(uint64_t));
            if (!new_on_disk_reftable) {
                res->check_errors++;
                ret = -ENOMEM;
                goto fail;
            }
            on_disk_reftable = new_on_disk_reftable;

            /* Zero the newly appended entries */
            memset(on_disk_reftable + old_reftable_size, 0,
                   (reftable_size - old_reftable_size) * sizeof(uint64_t));

            /* The offset we have for the reftable is now no longer valid;
             * this will leak that range, but we can easily fix that by running
             * a leak-fixing check after this rebuild operation */
            reftable_offset = -1;
        } else {
            assert(on_disk_reftable);
        }
        on_disk_reftable[refblock_index] = refblock_offset;

        /* If this is apparently the last refblock (for now), try to squeeze the
         * reftable in */
        if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
            reftable_offset < 0)
        {
            uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
                                                          sizeof(uint64_t));
            reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
                                                  refcount_table, nb_clusters,
                                                  &first_free_cluster);
            if (reftable_offset < 0) {
                fprintf(stderr, "ERROR allocating reftable: %s\n",
                        strerror(-reftable_offset));
                res->check_errors++;
                ret = reftable_offset;
                goto fail;
            }
        }

        /* Make sure the refblock write does not clobber other metadata */
        ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
                                            s->cluster_size);
        if (ret < 0) {
            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
            goto fail;
        }

        /* The size of *refcount_table is always cluster-aligned, therefore the
         * write operation will not overflow */
        on_disk_refblock = (void *)((char *) *refcount_table +
                                    refblock_index * s->cluster_size);

        /* Write the refblock contents straight out of the IMRT */
        ret = bdrv_write(bs->file, refblock_offset / BDRV_SECTOR_SIZE,
                         on_disk_refblock, s->cluster_sectors);
        if (ret < 0) {
            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
            goto fail;
        }

        /* Go to the end of this refblock */
        cluster = refblock_start + s->refcount_block_size - 1;
    }

    /* No reftable location yet (or it was invalidated above): place it past
     * the last refblock and restart the loop so any clusters this allocation
     * used get refblocks written for them too */
    if (reftable_offset < 0) {
        uint64_t post_refblock_start, reftable_clusters;

        post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
        reftable_clusters = size_to_clusters(s,
                                             reftable_size * sizeof(uint64_t));
        /* Not pretty but simple */
        if (first_free_cluster < post_refblock_start) {
            first_free_cluster = post_refblock_start;
        }
        reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
                                              refcount_table, nb_clusters,
                                              &first_free_cluster);
        if (reftable_offset < 0) {
            fprintf(stderr, "ERROR allocating reftable: %s\n",
                    strerror(-reftable_offset));
            res->check_errors++;
            ret = reftable_offset;
            goto fail;
        }

        goto write_refblocks;
    }

    /* Convert the reftable to big-endian for the disk write */
    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
        cpu_to_be64s(&on_disk_reftable[refblock_index]);
    }

    ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
                                        reftable_size * sizeof(uint64_t));
    if (ret < 0) {
        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
        goto fail;
    }

    assert(reftable_size < INT_MAX / sizeof(uint64_t));
    ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable,
                      reftable_size * sizeof(uint64_t));
    if (ret < 0) {
        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
        goto fail;
    }

    /* Enter new reftable into the image header */
    reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
    reftable_offset_and_clusters.reftable_clusters =
        cpu_to_be32(size_to_clusters(s, reftable_size * sizeof(uint64_t)));
    /* Synced write: the header must point at the new structure before the
     * in-memory state below is switched over */
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           &reftable_offset_and_clusters,
                           sizeof(reftable_offset_and_clusters));
    if (ret < 0) {
        fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret));
        goto fail;
    }

    /* Flip the reftable back to host byte order and install it as the
     * driver's active refcount table */
    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
        be64_to_cpus(&on_disk_reftable[refblock_index]);
    }
    s->refcount_table = on_disk_reftable;
    s->refcount_table_offset = reftable_offset;
    s->refcount_table_size = reftable_size;
    update_max_refcount_table_index(s);

    return 0;

fail:
    g_free(on_disk_reftable);
    return ret;
}
2384
2385 /*
2386  * Checks an image for refcount consistency.
2387  *
2388  * Returns 0 if no errors are found, the number of errors in case the image is
2389  * detected as corrupted, and -errno when an internal error occurred.
2390  */
2391 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2392                           BdrvCheckMode fix)
2393 {
2394     BDRVQcow2State *s = bs->opaque;
2395     BdrvCheckResult pre_compare_res;
2396     int64_t size, highest_cluster, nb_clusters;
2397     void *refcount_table = NULL;
2398     bool rebuild = false;
2399     int ret;
2400
2401     size = bdrv_getlength(bs->file->bs);
2402     if (size < 0) {
2403         res->check_errors++;
2404         return size;
2405     }
2406
2407     nb_clusters = size_to_clusters(s, size);
2408     if (nb_clusters > INT_MAX) {
2409         res->check_errors++;
2410         return -EFBIG;
2411     }
2412
2413     res->bfi.total_clusters =
2414         size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2415
2416     ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2417                               &nb_clusters);
2418     if (ret < 0) {
2419         goto fail;
2420     }
2421
2422     /* In case we don't need to rebuild the refcount structure (but want to fix
2423      * something), this function is immediately called again, in which case the
2424      * result should be ignored */
2425     pre_compare_res = *res;
2426     compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2427                       nb_clusters);
2428
2429     if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2430         BdrvCheckResult old_res = *res;
2431         int fresh_leaks = 0;
2432
2433         fprintf(stderr, "Rebuilding refcount structure\n");
2434         ret = rebuild_refcount_structure(bs, res, &refcount_table,
2435                                          &nb_clusters);
2436         if (ret < 0) {
2437             goto fail;
2438         }
2439
2440         res->corruptions = 0;
2441         res->leaks = 0;
2442
2443         /* Because the old reftable has been exchanged for a new one the
2444          * references have to be recalculated */
2445         rebuild = false;
2446         memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2447         ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2448                                   &nb_clusters);
2449         if (ret < 0) {
2450             goto fail;
2451         }
2452
2453         if (fix & BDRV_FIX_LEAKS) {
2454             /* The old refcount structures are now leaked, fix it; the result
2455              * can be ignored, aside from leaks which were introduced by
2456              * rebuild_refcount_structure() that could not be fixed */
2457             BdrvCheckResult saved_res = *res;
2458             *res = (BdrvCheckResult){ 0 };
2459
2460             compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2461                               &highest_cluster, refcount_table, nb_clusters);
2462             if (rebuild) {
2463                 fprintf(stderr, "ERROR rebuilt refcount structure is still "
2464                         "broken\n");
2465             }
2466
2467             /* Any leaks accounted for here were introduced by
2468              * rebuild_refcount_structure() because that function has created a
2469              * new refcount structure from scratch */
2470             fresh_leaks = res->leaks;
2471             *res = saved_res;
2472         }
2473
2474         if (res->corruptions < old_res.corruptions) {
2475             res->corruptions_fixed += old_res.corruptions - res->corruptions;
2476         }
2477         if (res->leaks < old_res.leaks) {
2478             res->leaks_fixed += old_res.leaks - res->leaks;
2479         }
2480         res->leaks += fresh_leaks;
2481     } else if (fix) {
2482         if (rebuild) {
2483             fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2484             res->check_errors++;
2485             ret = -EIO;
2486             goto fail;
2487         }
2488
2489         if (res->leaks || res->corruptions) {
2490             *res = pre_compare_res;
2491             compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2492                               refcount_table, nb_clusters);
2493         }
2494     }
2495
2496     /* check OFLAG_COPIED */
2497     ret = check_oflag_copied(bs, res, fix);
2498     if (ret < 0) {
2499         goto fail;
2500     }
2501
2502     res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
2503     ret = 0;
2504
2505 fail:
2506     g_free(refcount_table);
2507
2508     return ret;
2509 }
2510
/* Helper for qcow2_check_metadata_overlap(): tests whether the metadata
 * section [ofs, ofs + sz) intersects the candidate range described by the
 * enclosing function's local variables @offset and @size */
#define overlaps_with(ofs, sz) \
    ranges_overlap(offset, size, ofs, sz)
2513
2514 /*
2515  * Checks if the given offset into the image file is actually free to use by
2516  * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2517  * i.e. a sanity check without relying on the refcount tables.
2518  *
2519  * The ign parameter specifies what checks not to perform (being a bitmask of
2520  * QCow2MetadataOverlap values), i.e., what sections to ignore.
2521  *
2522  * Returns:
2523  * - 0 if writing to this offset will not affect the mentioned metadata
2524  * - a positive QCow2MetadataOverlap value indicating one overlapping section
2525  * - a negative value (-errno) indicating an error while performing a check,
2526  *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
2527  */
2528 int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
2529                                  int64_t size)
2530 {
2531     BDRVQcow2State *s = bs->opaque;
2532     int chk = s->overlap_check & ~ign;
2533     int i, j;
2534
2535     if (!size) {
2536         return 0;
2537     }
2538
2539     if (chk & QCOW2_OL_MAIN_HEADER) {
2540         if (offset < s->cluster_size) {
2541             return QCOW2_OL_MAIN_HEADER;
2542         }
2543     }
2544
2545     /* align range to test to cluster boundaries */
2546     size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
2547     offset = start_of_cluster(s, offset);
2548
2549     if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
2550         if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
2551             return QCOW2_OL_ACTIVE_L1;
2552         }
2553     }
2554
2555     if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
2556         if (overlaps_with(s->refcount_table_offset,
2557             s->refcount_table_size * sizeof(uint64_t))) {
2558             return QCOW2_OL_REFCOUNT_TABLE;
2559         }
2560     }
2561
2562     if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
2563         if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
2564             return QCOW2_OL_SNAPSHOT_TABLE;
2565         }
2566     }
2567
2568     if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
2569         for (i = 0; i < s->nb_snapshots; i++) {
2570             if (s->snapshots[i].l1_size &&
2571                 overlaps_with(s->snapshots[i].l1_table_offset,
2572                 s->snapshots[i].l1_size * sizeof(uint64_t))) {
2573                 return QCOW2_OL_INACTIVE_L1;
2574             }
2575         }
2576     }
2577
2578     if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
2579         for (i = 0; i < s->l1_size; i++) {
2580             if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
2581                 overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
2582                 s->cluster_size)) {
2583                 return QCOW2_OL_ACTIVE_L2;
2584             }
2585         }
2586     }
2587
2588     if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
2589         unsigned last_entry = s->max_refcount_table_index;
2590         assert(last_entry < s->refcount_table_size);
2591         assert(last_entry + 1 == s->refcount_table_size ||
2592                (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0);
2593         for (i = 0; i <= last_entry; i++) {
2594             if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
2595                 overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
2596                 s->cluster_size)) {
2597                 return QCOW2_OL_REFCOUNT_BLOCK;
2598             }
2599         }
2600     }
2601
2602     if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
2603         for (i = 0; i < s->nb_snapshots; i++) {
2604             uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
2605             uint32_t l1_sz  = s->snapshots[i].l1_size;
2606             uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
2607             uint64_t *l1 = g_try_malloc(l1_sz2);
2608             int ret;
2609
2610             if (l1_sz2 && l1 == NULL) {
2611                 return -ENOMEM;
2612             }
2613
2614             ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
2615             if (ret < 0) {
2616                 g_free(l1);
2617                 return ret;
2618             }
2619
2620             for (j = 0; j < l1_sz; j++) {
2621                 uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
2622                 if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
2623                     g_free(l1);
2624                     return QCOW2_OL_INACTIVE_L2;
2625                 }
2626             }
2627
2628             g_free(l1);
2629         }
2630     }
2631
2632     return 0;
2633 }
2634
/* Human-readable names for the QCOW2_OL_* overlap sections, indexed by bit
 * number (QCOW2_OL_*_BITNR); used for corruption error messages */
static const char *metadata_ol_names[] = {
    [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
    [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
    [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
    [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
    [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
};
2645
2646 /*
2647  * First performs a check for metadata overlaps (through
2648  * qcow2_check_metadata_overlap); if that fails with a negative value (error
2649  * while performing a check), that value is returned. If an impending overlap
2650  * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
2651  * and -EIO returned.
2652  *
2653  * Returns 0 if there were neither overlaps nor errors while checking for
2654  * overlaps; or a negative value (-errno) on error.
2655  */
2656 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
2657                                   int64_t size)
2658 {
2659     int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
2660
2661     if (ret < 0) {
2662         return ret;
2663     } else if (ret > 0) {
2664         int metadata_ol_bitnr = ctz32(ret);
2665         assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
2666
2667         qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
2668                                 "write on metadata (overlaps with %s)",
2669                                 metadata_ol_names[metadata_ol_bitnr]);
2670         return -EIO;
2671     }
2672
2673     return 0;
2674 }
2675
/* A pointer to a function of this type is given to walk_over_reftable(). That
 * function will create refblocks and pass them to a RefblockFinishOp once they
 * are completed (@refblock). @refblock_empty is set if the refblock is
 * completely empty.
 *
 * Along with the refblock, a corresponding reftable entry is passed, in the
 * reftable @reftable (which may be reallocated) at @reftable_index.
 *
 * @allocated should be set to true if a new cluster has been allocated.
 *
 * Returns 0 on success, or -errno on error (setting @errp).
 */
typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
                               uint64_t reftable_index, uint64_t *reftable_size,
                               void *refblock, bool refblock_empty,
                               bool *allocated, Error **errp);
2690
2691 /**
2692  * This "operation" for walk_over_reftable() allocates the refblock on disk (if
2693  * it is not empty) and inserts its offset into the new reftable. The size of
2694  * this new reftable is increased as required.
2695  */
2696 static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
2697                           uint64_t reftable_index, uint64_t *reftable_size,
2698                           void *refblock, bool refblock_empty, bool *allocated,
2699                           Error **errp)
2700 {
2701     BDRVQcow2State *s = bs->opaque;
2702     int64_t offset;
2703
2704     if (!refblock_empty && reftable_index >= *reftable_size) {
2705         uint64_t *new_reftable;
2706         uint64_t new_reftable_size;
2707
2708         new_reftable_size = ROUND_UP(reftable_index + 1,
2709                                      s->cluster_size / sizeof(uint64_t));
2710         if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
2711             error_setg(errp,
2712                        "This operation would make the refcount table grow "
2713                        "beyond the maximum size supported by QEMU, aborting");
2714             return -ENOTSUP;
2715         }
2716
2717         new_reftable = g_try_realloc(*reftable, new_reftable_size *
2718                                                 sizeof(uint64_t));
2719         if (!new_reftable) {
2720             error_setg(errp, "Failed to increase reftable buffer size");
2721             return -ENOMEM;
2722         }
2723
2724         memset(new_reftable + *reftable_size, 0,
2725                (new_reftable_size - *reftable_size) * sizeof(uint64_t));
2726
2727         *reftable      = new_reftable;
2728         *reftable_size = new_reftable_size;
2729     }
2730
2731     if (!refblock_empty && !(*reftable)[reftable_index]) {
2732         offset = qcow2_alloc_clusters(bs, s->cluster_size);
2733         if (offset < 0) {
2734             error_setg_errno(errp, -offset, "Failed to allocate refblock");
2735             return offset;
2736         }
2737         (*reftable)[reftable_index] = offset;
2738         *allocated = true;
2739     }
2740
2741     return 0;
2742 }
2743
2744 /**
2745  * This "operation" for walk_over_reftable() writes the refblock to disk at the
2746  * offset specified by the new reftable's entry. It does not modify the new
2747  * reftable or change any refcounts.
2748  */
2749 static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
2750                           uint64_t reftable_index, uint64_t *reftable_size,
2751                           void *refblock, bool refblock_empty, bool *allocated,
2752                           Error **errp)
2753 {
2754     BDRVQcow2State *s = bs->opaque;
2755     int64_t offset;
2756     int ret;
2757
2758     if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
2759         offset = (*reftable)[reftable_index];
2760
2761         ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
2762         if (ret < 0) {
2763             error_setg_errno(errp, -ret, "Overlap check failed");
2764             return ret;
2765         }
2766
2767         ret = bdrv_pwrite(bs->file, offset, refblock, s->cluster_size);
2768         if (ret < 0) {
2769             error_setg_errno(errp, -ret, "Failed to write refblock");
2770             return ret;
2771         }
2772     } else {
2773         assert(refblock_empty);
2774     }
2775
2776     return 0;
2777 }
2778
2779 /**
2780  * This function walks over the existing reftable and every referenced refblock;
2781  * if @new_set_refcount is non-NULL, it is called for every refcount entry to
2782  * create an equal new entry in the passed @new_refblock. Once that
2783  * @new_refblock is completely filled, @operation will be called.
2784  *
2785  * @status_cb and @cb_opaque are used for the amend operation's status callback.
2786  * @index is the index of the walk_over_reftable() calls and @total is the total
2787  * number of walk_over_reftable() calls per amend operation. Both are used for
2788  * calculating the parameters for the status callback.
2789  *
2790  * @allocated is set to true if a new cluster has been allocated.
2791  */
2792 static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
2793                               uint64_t *new_reftable_index,
2794                               uint64_t *new_reftable_size,
2795                               void *new_refblock, int new_refblock_size,
2796                               int new_refcount_bits,
2797                               RefblockFinishOp *operation, bool *allocated,
2798                               Qcow2SetRefcountFunc *new_set_refcount,
2799                               BlockDriverAmendStatusCB *status_cb,
2800                               void *cb_opaque, int index, int total,
2801                               Error **errp)
2802 {
2803     BDRVQcow2State *s = bs->opaque;
2804     uint64_t reftable_index;
2805     bool new_refblock_empty = true;
2806     int refblock_index;
2807     int new_refblock_index = 0;
2808     int ret;
2809
2810     for (reftable_index = 0; reftable_index < s->refcount_table_size;
2811          reftable_index++)
2812     {
2813         uint64_t refblock_offset = s->refcount_table[reftable_index]
2814                                  & REFT_OFFSET_MASK;
2815
2816         status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
2817                   (uint64_t)total * s->refcount_table_size, cb_opaque);
2818
2819         if (refblock_offset) {
2820             void *refblock;
2821
2822             if (offset_into_cluster(s, refblock_offset)) {
2823                 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
2824                                         PRIx64 " unaligned (reftable index: %#"
2825                                         PRIx64 ")", refblock_offset,
2826                                         reftable_index);
2827                 error_setg(errp,
2828                            "Image is corrupt (unaligned refblock offset)");
2829                 return -EIO;
2830             }
2831
2832             ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
2833                                   &refblock);
2834             if (ret < 0) {
2835                 error_setg_errno(errp, -ret, "Failed to retrieve refblock");
2836                 return ret;
2837             }
2838
2839             for (refblock_index = 0; refblock_index < s->refcount_block_size;
2840                  refblock_index++)
2841             {
2842                 uint64_t refcount;
2843
2844                 if (new_refblock_index >= new_refblock_size) {
2845                     /* new_refblock is now complete */
2846                     ret = operation(bs, new_reftable, *new_reftable_index,
2847                                     new_reftable_size, new_refblock,
2848                                     new_refblock_empty, allocated, errp);
2849                     if (ret < 0) {
2850                         qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2851                         return ret;
2852                     }
2853
2854                     (*new_reftable_index)++;
2855                     new_refblock_index = 0;
2856                     new_refblock_empty = true;
2857                 }
2858
2859                 refcount = s->get_refcount(refblock, refblock_index);
2860                 if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
2861                     uint64_t offset;
2862
2863                     qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2864
2865                     offset = ((reftable_index << s->refcount_block_bits)
2866                               + refblock_index) << s->cluster_bits;
2867
2868                     error_setg(errp, "Cannot decrease refcount entry width to "
2869                                "%i bits: Cluster at offset %#" PRIx64 " has a "
2870                                "refcount of %" PRIu64, new_refcount_bits,
2871                                offset, refcount);
2872                     return -EINVAL;
2873                 }
2874
2875                 if (new_set_refcount) {
2876                     new_set_refcount(new_refblock, new_refblock_index++,
2877                                      refcount);
2878                 } else {
2879                     new_refblock_index++;
2880                 }
2881                 new_refblock_empty = new_refblock_empty && refcount == 0;
2882             }
2883
2884             qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2885         } else {
2886             /* No refblock means every refcount is 0 */
2887             for (refblock_index = 0; refblock_index < s->refcount_block_size;
2888                  refblock_index++)
2889             {
2890                 if (new_refblock_index >= new_refblock_size) {
2891                     /* new_refblock is now complete */
2892                     ret = operation(bs, new_reftable, *new_reftable_index,
2893                                     new_reftable_size, new_refblock,
2894                                     new_refblock_empty, allocated, errp);
2895                     if (ret < 0) {
2896                         return ret;
2897                     }
2898
2899                     (*new_reftable_index)++;
2900                     new_refblock_index = 0;
2901                     new_refblock_empty = true;
2902                 }
2903
2904                 if (new_set_refcount) {
2905                     new_set_refcount(new_refblock, new_refblock_index++, 0);
2906                 } else {
2907                     new_refblock_index++;
2908                 }
2909             }
2910         }
2911     }
2912
2913     if (new_refblock_index > 0) {
2914         /* Complete the potentially existing partially filled final refblock */
2915         if (new_set_refcount) {
2916             for (; new_refblock_index < new_refblock_size;
2917                  new_refblock_index++)
2918             {
2919                 new_set_refcount(new_refblock, new_refblock_index, 0);
2920             }
2921         }
2922
2923         ret = operation(bs, new_reftable, *new_reftable_index,
2924                         new_reftable_size, new_refblock, new_refblock_empty,
2925                         allocated, errp);
2926         if (ret < 0) {
2927             return ret;
2928         }
2929
2930         (*new_reftable_index)++;
2931     }
2932
2933     status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
2934               (uint64_t)total * s->refcount_table_size, cb_opaque);
2935
2936     return 0;
2937 }
2938
/*
 * Changes the width of the image's refcount entries to
 * 1 << @refcount_order bits per entry (only valid for v3 images;
 * @refcount_order must be between 0 and 6).
 *
 * A complete new refcount structure (reftable and refblocks) is built,
 * the header is updated to point at it, and only then is the in-memory
 * state switched over; the old structure is freed afterwards. Progress
 * is reported through @status_cb/@cb_opaque.
 *
 * Returns 0 on success, -errno on error (setting @errp).
 */
int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
                                BlockDriverAmendStatusCB *status_cb,
                                void *cb_opaque, Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2GetRefcountFunc *new_get_refcount;
    Qcow2SetRefcountFunc *new_set_refcount;
    void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
    uint64_t *new_reftable = NULL, new_reftable_size = 0;
    uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
    uint64_t new_reftable_index = 0;
    uint64_t i;
    int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
    int new_refblock_size, new_refcount_bits = 1 << refcount_order;
    int old_refcount_order;
    int walk_index = 0;
    int ret;
    bool new_allocation;

    assert(s->qcow_version >= 3);
    assert(refcount_order >= 0 && refcount_order <= 6);

    /* see qcow2_open() */
    new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));

    new_get_refcount = get_refcount_funcs[refcount_order];
    new_set_refcount = set_refcount_funcs[refcount_order];


    do {
        int total_walks;

        new_allocation = false;

        /* At least we have to do this walk and the one which writes the
         * refblocks; also, at least we have to do this loop here at least
         * twice (normally), first to do the allocations, and second to
         * determine that everything is correctly allocated, this then makes
         * three walks in total */
        total_walks = MAX(walk_index + 2, 3);

        /* First, allocate the structures so they are present in the refcount
         * structures */
        ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
                                 &new_reftable_size, NULL, new_refblock_size,
                                 new_refcount_bits, &alloc_refblock,
                                 &new_allocation, NULL, status_cb, cb_opaque,
                                 walk_index++, total_walks, errp);
        if (ret < 0) {
            goto done;
        }

        new_reftable_index = 0;

        if (new_allocation) {
            /* The allocations above changed the refcounts, so the new
             * reftable may need to grow; discard any previously allocated
             * reftable clusters and allocate a fresh area of the right size
             * (QCOW2_DISCARD_NEVER: these clusters were never valid data) */
            if (new_reftable_offset) {
                qcow2_free_clusters(bs, new_reftable_offset,
                                    allocated_reftable_size * sizeof(uint64_t),
                                    QCOW2_DISCARD_NEVER);
            }

            new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
                                                           sizeof(uint64_t));
            if (new_reftable_offset < 0) {
                error_setg_errno(errp, -new_reftable_offset,
                                 "Failed to allocate the new reftable");
                ret = new_reftable_offset;
                goto done;
            }
            allocated_reftable_size = new_reftable_size;
        }
    } while (new_allocation);

    /* Second, write the new refblocks */
    ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
                             &new_reftable_size, new_refblock,
                             new_refblock_size, new_refcount_bits,
                             &flush_refblock, &new_allocation, new_set_refcount,
                             status_cb, cb_opaque, walk_index, walk_index + 1,
                             errp);
    if (ret < 0) {
        goto done;
    }
    /* The previous loop only terminates once a walk allocates nothing new */
    assert(!new_allocation);


    /* Write the new reftable */
    ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
                                        new_reftable_size * sizeof(uint64_t));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Overlap check failed");
        goto done;
    }

    /* Convert to big-endian for the on-disk write, then back, so the
     * in-memory table stays usable */
    for (i = 0; i < new_reftable_size; i++) {
        cpu_to_be64s(&new_reftable[i]);
    }

    ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable,
                      new_reftable_size * sizeof(uint64_t));

    for (i = 0; i < new_reftable_size; i++) {
        be64_to_cpus(&new_reftable[i]);
    }

    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to write the new reftable");
        goto done;
    }


    /* Empty the refcount cache */
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
        goto done;
    }

    /* Update the image header to point to the new reftable; this only updates
     * the fields which are relevant to qcow2_update_header(); other fields
     * such as s->refcount_table or s->refcount_bits stay stale for now
     * (because we have to restore everything if qcow2_update_header() fails) */
    old_refcount_order  = s->refcount_order;
    old_reftable_size   = s->refcount_table_size;
    old_reftable_offset = s->refcount_table_offset;

    s->refcount_order        = refcount_order;
    s->refcount_table_size   = new_reftable_size;
    s->refcount_table_offset = new_reftable_offset;

    ret = qcow2_update_header(bs);
    if (ret < 0) {
        /* Roll back the header-relevant fields to the old structure */
        s->refcount_order        = old_refcount_order;
        s->refcount_table_size   = old_reftable_size;
        s->refcount_table_offset = old_reftable_offset;
        error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
        goto done;
    }

    /* Now update the rest of the in-memory information */
    old_reftable = s->refcount_table;
    s->refcount_table = new_reftable;
    update_max_refcount_table_index(s);

    s->refcount_bits = 1 << refcount_order;
    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
    s->refcount_max += s->refcount_max - 1;

    s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
    s->refcount_block_size = 1 << s->refcount_block_bits;

    s->get_refcount = new_get_refcount;
    s->set_refcount = new_set_refcount;

    /* For cleaning up all old refblocks and the old reftable below the "done"
     * label */
    new_reftable        = old_reftable;
    new_reftable_size   = old_reftable_size;
    new_reftable_offset = old_reftable_offset;

done:
    if (new_reftable) {
        /* On success, new_reftable actually points to the old reftable (and
         * new_reftable_size is the old reftable's size); but that is just
         * fine */
        for (i = 0; i < new_reftable_size; i++) {
            uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
            if (offset) {
                qcow2_free_clusters(bs, offset, s->cluster_size,
                                    QCOW2_DISCARD_OTHER);
            }
        }
        g_free(new_reftable);

        if (new_reftable_offset > 0) {
            qcow2_free_clusters(bs, new_reftable_offset,
                                new_reftable_size * sizeof(uint64_t),
                                QCOW2_DISCARD_OTHER);
        }
    }

    qemu_vfree(new_refblock);
    return ret;
}
3123
3124 static int64_t get_refblock_offset(BlockDriverState *bs, uint64_t offset)
3125 {
3126     BDRVQcow2State *s = bs->opaque;
3127     uint32_t index = offset_to_reftable_index(s, offset);
3128     int64_t covering_refblock_offset = 0;
3129
3130     if (index < s->refcount_table_size) {
3131         covering_refblock_offset = s->refcount_table[index] & REFT_OFFSET_MASK;
3132     }
3133     if (!covering_refblock_offset) {
3134         qcow2_signal_corruption(bs, true, -1, -1, "Refblock at %#" PRIx64 " is "
3135                                 "not covered by the refcount structures",
3136                                 offset);
3137         return -EIO;
3138     }
3139
3140     return covering_refblock_offset;
3141 }
3142
/*
 * Releases the refcount block cluster at @discard_block_offs: its own
 * refcount entry is set to zero, any cached copy of the block is dropped,
 * and the cluster is queued for discard on the underlying file.
 *
 * The block's current refcount must be exactly 1; otherwise corruption is
 * signalled and -EINVAL is returned.
 *
 * Returns 0 on success, -errno on failure.
 */
static int qcow2_discard_refcount_block(BlockDriverState *bs,
                                        uint64_t discard_block_offs)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t refblock_offs;
    uint64_t cluster_index = discard_block_offs >> s->cluster_bits;
    uint32_t block_index = cluster_index & (s->refcount_block_size - 1);
    void *refblock;
    int ret;

    /* Find the refblock that holds the refcount of the block being freed */
    refblock_offs = get_refblock_offset(bs, discard_block_offs);
    if (refblock_offs < 0) {
        return refblock_offs;
    }

    assert(discard_block_offs != 0);

    ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
                          &refblock);
    if (ret < 0) {
        return ret;
    }

    if (s->get_refcount(refblock, block_index) != 1) {
        /* A refblock about to be freed must only be referenced once */
        qcow2_signal_corruption(bs, true, -1, -1, "Invalid refcount:"
                                " refblock offset %#" PRIx64
                                ", reftable index %u"
                                ", block offset %#" PRIx64
                                ", refcount %#" PRIx64,
                                refblock_offs,
                                offset_to_reftable_index(s, discard_block_offs),
                                discard_block_offs,
                                s->get_refcount(refblock, block_index));
        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
        return -EINVAL;
    }
    s->set_refcount(refblock, block_index, 0);

    /* The refcount entry was modified; mark the cache entry for writeback */
    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock);

    qcow2_cache_put(bs, s->refcount_block_cache, &refblock);

    /* The freed cluster may be reused by later allocations */
    if (cluster_index < s->free_cluster_index) {
        s->free_cluster_index = cluster_index;
    }

    refblock = qcow2_cache_is_table_offset(bs, s->refcount_block_cache,
                                           discard_block_offs);
    if (refblock) {
        /* discard refblock from the cache if refblock is cached */
        qcow2_cache_discard(bs, s->refcount_block_cache, refblock);
    }
    update_refcount_discard(bs, discard_block_offs, s->cluster_size);

    return 0;
}
3199
/*
 * Shrink the reftable by dropping every refcount block that covers no
 * allocated clusters (i.e. whose refcounts are all zero, ignoring the
 * block's own self-reference).  The pruned reftable is written to disk
 * first; only then are the in-memory entries cleared and the now-unused
 * refcount blocks freed and discarded.
 *
 * Returns 0 on success, -errno on failure.
 */
int qcow2_shrink_reftable(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    /* Big-endian on-disk copy of the reftable, with entries for unused
     * refblocks zeroed out; assembled in memory and written in one go. */
    uint64_t *reftable_tmp =
        g_malloc(s->refcount_table_size * sizeof(uint64_t));
    int i, ret;

    for (i = 0; i < s->refcount_table_size; i++) {
        int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
        void *refblock;
        bool unused_block;

        if (refblock_offs == 0) {
            /* Entry already empty; keep it empty in the new reftable */
            reftable_tmp[i] = 0;
            continue;
        }
        ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
                              &refblock);
        if (ret < 0) {
            goto out;
        }

        /* the refblock has own reference */
        if (i == offset_to_reftable_index(s, refblock_offs)) {
            /* This refblock covers its own cluster, so it always holds at
             * least its self-reference.  Temporarily clear that refcount so
             * buffer_is_zero() tells us whether any *other* cluster is
             * referenced, then restore it. */
            uint64_t block_index = (refblock_offs >> s->cluster_bits) &
                                   (s->refcount_block_size - 1);
            uint64_t refcount = s->get_refcount(refblock, block_index);

            s->set_refcount(refblock, block_index, 0);

            unused_block = buffer_is_zero(refblock, s->cluster_size);

            s->set_refcount(refblock, block_index, refcount);
        } else {
            unused_block = buffer_is_zero(refblock, s->cluster_size);
        }
        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);

        reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
    }

    ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset, reftable_tmp,
                           s->refcount_table_size * sizeof(uint64_t));
    /*
     * If the write in the reftable failed the image may contain a partially
     * overwritten reftable. In this case it would be better to clear the
     * reftable in memory to avoid possible image corruption.
     */
    for (i = 0; i < s->refcount_table_size; i++) {
        if (s->refcount_table[i] && !reftable_tmp[i]) {
            /* Only free the refblock clusters if the write succeeded; on
             * failure we still clear the in-memory entries (see above). */
            if (ret == 0) {
                ret = qcow2_discard_refcount_block(bs, s->refcount_table[i] &
                                                       REFT_OFFSET_MASK);
            }
            s->refcount_table[i] = 0;
        }
    }

    /* Flush queued cluster discards unless a caller is batching them */
    if (!s->cache_discards) {
        qcow2_process_discards(bs, ret);
    }

out:
    g_free(reftable_tmp);
    return ret;
}
3266
3267 int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
3268 {
3269     BDRVQcow2State *s = bs->opaque;
3270     int64_t i;
3271
3272     for (i = size_to_clusters(s, size) - 1; i >= 0; i--) {
3273         uint64_t refcount;
3274         int ret = qcow2_get_refcount(bs, i, &refcount);
3275         if (ret < 0) {
3276             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
3277                     i, strerror(-ret));
3278             return ret;
3279         }
3280         if (refcount > 0) {
3281             return i;
3282         }
3283     }
3284     qcow2_signal_corruption(bs, true, -1, -1,
3285                             "There are no references in the refcount table.");
3286     return -EIO;
3287 }
This page took 0.210608 seconds and 4 git commands to generate.