// SPDX-License-Identifier: GPL-2.0

#include "backref.h"
#include "btrfs_inode.h"
#include "fiemap.h"
#include "file.h"
#include "file-item.h"

struct btrfs_fiemap_entry {
	u64 offset;
	u64 phys;
	u64 len;
	u32 flags;
};

/*
 * Indicate to the caller of emit_fiemap_extent() that it needs to unlock the
 * file range from the inode's io tree, release the subvolume tree search path,
 * flush the fiemap cache, relock the file range and re-search the subvolume
 * tree. The value here is something negative that can't be confused with a
 * valid errno value and different from 1, because 1 is also a return value
 * from fiemap_fill_next_extent() and is often used to mean a btree search did
 * not find a key, so make it some distinct negative value.
 */
#define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1))
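/*
 * Added note (illustrative, based on the usual kernel definition): MAX_ERRNO
 * is 4095, so this evaluates to -4096, which lies outside the range of valid
 * negative errno values ([-MAX_ERRNO, -1]) and is also distinct from 0 and 1,
 * the other values emit_fiemap_extent() can return.
 */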

/*
 * Used to:
 *
 * - Cache the next entry to be emitted to the fiemap buffer, so that we can
 *   merge extents that are contiguous and can be grouped as a single one;
 *
 * - Store extents ready to be written to the fiemap buffer in an intermediary
 *   buffer. This intermediary buffer is to ensure that in case the fiemap
 *   buffer is memory mapped to the fiemap target file, we don't deadlock
 *   during btrfs_page_mkwrite(). This is because during fiemap we are locking
 *   an extent range in order to prevent races with delalloc flushing and
 *   ordered extent completion, which is needed in order to reliably detect
 *   delalloc in holes and prealloc extents. And this can lead to a deadlock
 *   if the fiemap buffer is memory mapped to the file we are running fiemap
 *   against (a silly, useless in practice scenario, but possible) because
 *   btrfs_page_mkwrite() will try to lock the same extent range.
 */
struct fiemap_cache {
	/* An array of ready fiemap entries. */
	struct btrfs_fiemap_entry *entries;
	/* Number of entries in the entries array. */
	int entries_size;
	/* Index of the next entry in the entries array to write to. */
	int entries_pos;
	/*
	 * Once the entries array is full, this indicates the offset of the
	 * next file extent item we must search for in the inode's subvolume
	 * tree after unlocking the extent range in the inode's io tree and
	 * releasing the search path.
	 */
	u64 next_search_offset;
	/*
	 * This matches struct fiemap_extent_info::fi_mapped_extents. We use it
	 * to count the emitted extents ourselves and stop, instead of relying
	 * on fiemap_fill_next_extent(), because we buffer ready fiemap entries
	 * in the @entries array and we want to stop as soon as we hit the max
	 * amount of extents to map, not just to save time but also to make the
	 * logic at extent_fiemap() simpler.
	 */
	unsigned int extents_mapped;
	/* Fields for the cached extent (the unsubmitted, not yet ready, extent). */
	u64 offset;
	u64 phys;
	u64 len;
	u32 flags;
	bool cached;
};
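
/*
 * Added sketch of the deadlock the intermediary buffer avoids (illustrative,
 * not from the original comments): fiemap locks a file range in the io tree
 * and then copies an entry to the user buffer; if that buffer is a writable
 * mmap of the same file and the destination page is not yet writable, the
 * copy faults into btrfs_page_mkwrite(), which tries to lock the same extent
 * range and the task ends up waiting on itself. Buffering entries and copying
 * them out only after the range is unlocked breaks that cycle.
 */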

static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo,
			      struct fiemap_cache *cache)
{
	for (int i = 0; i < cache->entries_pos; i++) {
		struct btrfs_fiemap_entry *entry = &cache->entries[i];
		int ret;

		ret = fiemap_fill_next_extent(fieinfo, entry->offset,
					      entry->phys, entry->len,
					      entry->flags);
		/*
		 * Ignore 1 (reached max entries) because we keep track of that
		 * ourselves in emit_fiemap_extent().
		 */
		if (ret < 0)
			return ret;
	}
	cache->entries_pos = 0;

	return 0;
}

/*
 * Helper to submit a fiemap extent.
 *
 * Will try to merge the current fiemap extent, specified by @offset, @phys,
 * @len and @flags, with the cached one.
 * Only when the merge fails is the cached extent submitted as a fiemap extent.
 *
 * Return value is the same as fiemap_fill_next_extent().
 */
static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
			      struct fiemap_cache *cache,
			      u64 offset, u64 phys, u64 len, u32 flags)
{
	struct btrfs_fiemap_entry *entry;
	u64 cache_end;

	/* Set at the end of extent_fiemap(). */
	ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);

	if (!cache->cached)
		goto assign;

	/*
	 * When iterating the extents of the inode, at extent_fiemap(), we may
	 * find an extent that starts at an offset behind the end offset of the
	 * previous extent we processed. This happens if fiemap is called
	 * without FIEMAP_FLAG_SYNC and there are ordered extents completing
	 * after we had to unlock the file range, release the search path, emit
	 * the fiemap extents stored in the buffer (cache->entries array) and
	 * then lock the remainder of the range and re-search the btree.
	 *
	 * For example, we are in leaf X processing its last item, which is the
	 * file extent item for file range [512K, 1M[, and after
	 * btrfs_next_leaf() releases the path, there's an ordered extent that
	 * completes for the file range [768K, 2M[, and that results in trimming
	 * the file extent item so that it now corresponds to the file range
	 * [512K, 768K[ and a new file extent item is inserted for the file
	 * range [768K, 2M[, which may end up as the last item of leaf X or as
	 * the first item of the next leaf - in either case btrfs_next_leaf()
	 * will leave us with a path pointing to the new extent item, for the
	 * file range [768K, 2M[, since that's the first key that follows the
	 * last one we processed. So in order not to report overlapping extents
	 * to user space, we trim the length of the previously cached extent and
	 * emit it.
	 *
	 * Upon calling btrfs_next_leaf() we may also find an extent with an
	 * offset smaller than or equal to cache->offset, and this happens
	 * when we had a hole or prealloc extent with several delalloc ranges in
	 * it, but after btrfs_next_leaf() released the path, delalloc was
	 * flushed and the resulting ordered extents were completed, so we can
	 * now have found a file extent item for an offset that is smaller than
	 * or equal to what we have in cache->offset. We deal with this as
	 * described below.
	 */
	cache_end = cache->offset + cache->len;
	if (cache_end > offset) {
		if (offset == cache->offset) {
			/*
			 * We cached a delalloc range (found in the io tree) for
			 * a hole or prealloc extent and we have now found a
			 * file extent item for the same offset. What we have
			 * now is more recent and up to date, so discard what
			 * we had in the cache and use what we have just found.
			 */
			goto assign;
		} else if (offset > cache->offset) {
			/*
			 * The extent range we previously found ends after the
			 * offset of the file extent item we found and that
			 * offset falls somewhere in the middle of that previous
			 * extent range. So adjust the range we previously found
			 * to end at the offset of the file extent item we have
			 * just found, since this extent is more up to date.
			 * Emit that adjusted range and cache the file extent
			 * item we have just found. This corresponds to the case
			 * where a previously found file extent item was split
			 * due to an ordered extent completing.
			 */
			cache->len = offset - cache->offset;
			goto emit;
		} else {
			const u64 range_end = offset + len;

			/*
			 * The offset of the file extent item we have just found
			 * is behind the cached offset. This means we were
			 * processing a hole or prealloc extent for which we
			 * have found delalloc ranges (in the io tree), so what
			 * we have in the cache is the last delalloc range we
			 * found while the file extent item we found can be
			 * either for a whole delalloc range we previously
			 * emitted or only a part of that range.
			 *
			 * We have two cases here:
			 *
			 * 1) The file extent item's range ends at or behind the
			 *    cached extent's end. In this case just ignore the
			 *    current file extent item because we don't want to
			 *    overlap with previous ranges that may have been
			 *    emitted already;
			 *
			 * 2) The file extent item starts behind the currently
			 *    cached extent but its end offset goes beyond the
			 *    end offset of the cached extent. We don't want to
			 *    overlap with a previous range that may have been
			 *    emitted already, so we emit the currently cached
			 *    extent and then partially store the current file
			 *    extent item's range in the cache, for the subrange
			 *    going from the cached extent's end to the end of
			 *    the file extent item.
			 */
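			/*
			 * Added worked example for case 2 (illustrative, not
			 * from the original comments): the cache holds a
			 * delalloc range [4M, 5M[ and we now find a file
			 * extent item for [3M, 6M[. We emit the cached
			 * [4M, 5M[ and then cache [5M, 6M[, advancing @phys by
			 * 2M (cache_end - offset) so it points to the matching
			 * part of the extent, unless the extent is encoded or
			 * delalloc.
			 */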
			if (range_end <= cache_end)
				return 0;

			if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC)))
				phys += cache_end - offset;

			offset = cache_end;
			len = range_end - cache_end;
			goto emit;
		}
	}

	/*
	 * Only merge fiemap extents if:
	 *
	 * 1) Their logical addresses are contiguous;
	 *
	 * 2) Their physical addresses are contiguous, so truly compressed
	 *    (physical size smaller than logical size) extents won't get
	 *    merged with each other;
	 *
	 * 3) They share the same flags.
	 */
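	/*
	 * Added example (illustrative, not from the original comments): a
	 * cached extent for [0, 128K[ at physical address X merges with a new
	 * extent for [128K, 192K[ at physical address X + 128K and the same
	 * flags, growing the cached extent to cover [0, 192K[.
	 */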
	if (cache->offset + cache->len == offset &&
	    cache->phys + cache->len == phys &&
	    cache->flags == flags) {
		cache->len += len;
		return 0;
	}

emit:
	/* Not mergeable, need to submit cached one */

	if (cache->entries_pos == cache->entries_size) {
		/*
		 * We will need to re-search starting from the end offset of the
		 * last stored extent and not from the current offset, because
		 * after unlocking the range and releasing the path, if there's
		 * a hole between that end offset and this current offset, a new
		 * extent may have been inserted due to a new write, so we don't
		 * want to miss it.
		 */
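		/*
		 * Added example (illustrative, not from the original
		 * comments): the last stored entry covers [8M, 9M[ and the
		 * extent being emitted now starts at 12M. While the range is
		 * unlocked, a write may allocate a new extent inside the hole
		 * [9M, 12M[, so the re-search must start at 9M, not at 12M.
		 */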
		entry = &cache->entries[cache->entries_size - 1];
		cache->next_search_offset = entry->offset + entry->len;
		cache->cached = false;

		return BTRFS_FIEMAP_FLUSH_CACHE;
	}

	entry = &cache->entries[cache->entries_pos];
	entry->offset = cache->offset;
	entry->phys = cache->phys;
	entry->len = cache->len;
	entry->flags = cache->flags;
	cache->entries_pos++;
	cache->extents_mapped++;

	if (cache->extents_mapped == fieinfo->fi_extents_max) {
		cache->cached = false;
		return 1;
	}
assign:
	cache->cached = true;
	cache->offset = offset;
	cache->phys = phys;
	cache->len = len;
	cache->flags = flags;

	return 0;
}

/*
 * Emit last fiemap cache
 *
 * The last fiemap cache may still be cached in the following case:
 * 0                  4k                    8k
 * |<- Fiemap range ->|
 * |<------------  First extent ----------->|
 *
 * In this case, the first extent range will be cached but not emitted.
 * So we must emit it before ending extent_fiemap().
 */
static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
				  struct fiemap_cache *cache)
{
	int ret;

	if (!cache->cached)
		return 0;

	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
				      cache->len, cache->flags);
	cache->cached = false;
	if (ret > 0)
		ret = 0;
	return ret;
}

static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
{
	struct extent_buffer *clone = path->nodes[0];
	struct btrfs_key key;
	int slot;
	int ret;

	path->slots[0]++;
	if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
		return 0;

	/*
	 * Add a temporary extra ref to an already cloned extent buffer to
	 * prevent btrfs_next_leaf() from freeing it, we want to reuse it to
	 * avoid the cost of allocating a new one.
	 */
	ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
	atomic_inc(&clone->refs);

	ret = btrfs_next_leaf(inode->root, path);
	if (ret != 0)
		goto out;

	/*
	 * Don't bother with cloning if there are no more file extent items for
	 * our inode.
	 */
	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) {
		ret = 1;
		goto out;
	}

	/*
	 * Important to preserve the start field, for the optimizations when
	 * checking if extents are shared (see extent_fiemap()).
	 *
	 * We must set ->start before calling copy_extent_buffer_full(). If we
	 * are on a sub-pagesize blocksize, we use ->start to determine the
	 * offset into the folio where our eb exists, and if we update ->start
	 * after the fact then any subsequent reads of the eb may read from a
	 * different offset in the folio than where we originally copied into.
	 */
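	/*
	 * Added example (illustrative, not from the original comments): with
	 * a 16K nodesize and 64K pages, an eb with ->start == 48K has its data
	 * at offset 48K within its folio. If ->start were only updated after
	 * the copy, the copy would land at the old offset while later reads
	 * would use the new one and see stale bytes.
	 */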
	clone->start = path->nodes[0]->start;
	/* See the comment at fiemap_search_slot() about why we clone. */
	copy_extent_buffer_full(clone, path->nodes[0]);

	slot = path->slots[0];
	btrfs_release_path(path);
	path->nodes[0] = clone;
	path->slots[0] = slot;
out:
	if (ret)
		free_extent_buffer(clone);

	return ret;
}

/*
 * Search for the first file extent item that starts at a given file offset or
 * the one that starts immediately before that offset.
 * Returns: 0 on success, < 0 on error, 1 if not found.
 */
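/*
 * Added example (illustrative, not from the original comments): if the inode
 * has file extent items starting at offsets 0 and 256K and @file_offset is
 * 300K, btrfs_search_slot() leaves us just after the 256K item and we step
 * back one slot, since the item at 256K is the one covering offset 300K.
 */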
static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
			      u64 file_offset)
{
	const u64 ino = btrfs_ino(inode);
	struct btrfs_root *root = inode->root;
	struct extent_buffer *clone;
	struct btrfs_key key;
	int slot;
	int ret;

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = file_offset;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret > 0 && path->slots[0] > 0) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path->slots[0]--;
	}

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret != 0)
			return ret;

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
			return 1;
	}

	/*
	 * We clone the leaf and use it during fiemap. This is because while
	 * using the leaf we do expensive things like checking if an extent is
	 * shared, which can take a long time. In order to prevent blocking
	 * other tasks for too long, we use a clone of the leaf. We have locked
	 * the file range in the inode's io tree, so we know none of our file
	 * extent items can change. This way we avoid blocking other tasks that
	 * want to insert items for other inodes in the same leaf or b+tree
	 * rebalance operations (triggered for example when someone is trying
	 * to push items into this leaf while inserting an item in a neighbour
	 * leaf).
	 * We also need the private clone because holding a read lock on an
	 * extent buffer of the subvolume's b+tree will make lockdep unhappy
	 * when we check if extents are shared, as backref walking may need to
	 * lock the same leaf we are processing.
	 */
	clone = btrfs_clone_extent_buffer(path->nodes[0]);
	if (!clone)
		return -ENOMEM;

	slot = path->slots[0];
	btrfs_release_path(path);
	path->nodes[0] = clone;
	path->slots[0] = slot;

	return 0;
}

/*
 * Process a range which is a hole or a prealloc extent in the inode's subvolume
 * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
 * extent. The end offset (@end) is inclusive.
 */
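/*
 * Added example of the resulting output (illustrative, not from the original
 * comments): for a prealloc extent covering [0, 1M[ with a single delalloc
 * range at [256K, 320K[, three fiemap extents are emitted: the prealloc
 * subrange [0, 256K[ (unwritten), the delalloc range [256K, 320K[
 * (delalloc/unknown) and, after the loop, the remaining prealloc subrange
 * [320K, 1M[ (unwritten).
 */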
static int fiemap_process_hole(struct btrfs_inode *inode,
			       struct fiemap_extent_info *fieinfo,
			       struct fiemap_cache *cache,
			       struct extent_state **delalloc_cached_state,
			       struct btrfs_backref_share_check_ctx *backref_ctx,
			       u64 disk_bytenr, u64 extent_offset,
			       u64 extent_gen,
			       u64 start, u64 end)
{
	const u64 i_size = i_size_read(&inode->vfs_inode);
	u64 cur_offset = start;
	u64 last_delalloc_end = 0;
	u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
	bool checked_extent_shared = false;
	int ret;

	/*
	 * There can be no delalloc past i_size, so don't waste time looking for
	 * it beyond i_size.
	 */
	while (cur_offset < end && cur_offset < i_size) {
		u64 delalloc_start;
		u64 delalloc_end;
		u64 prealloc_start;
		u64 prealloc_len = 0;
		bool delalloc;

		delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
							delalloc_cached_state,
							&delalloc_start,
							&delalloc_end);
		if (!delalloc)
			break;

		/*
		 * If this is a prealloc extent we have to report every section
		 * of it that has no delalloc.
		 */
		if (disk_bytenr != 0) {
			if (last_delalloc_end == 0) {
				prealloc_start = start;
				prealloc_len = delalloc_start - start;
			} else {
				prealloc_start = last_delalloc_end + 1;
				prealloc_len = delalloc_start - prealloc_start;
			}
		}

		if (prealloc_len > 0) {
			if (!checked_extent_shared && fieinfo->fi_extents_max) {
				ret = btrfs_is_data_extent_shared(inode,
								  disk_bytenr,
								  extent_gen,
								  backref_ctx);
				if (ret < 0)
					return ret;
				else if (ret > 0)
					prealloc_flags |= FIEMAP_EXTENT_SHARED;

				checked_extent_shared = true;
			}
			ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
						 disk_bytenr + extent_offset,
						 prealloc_len, prealloc_flags);
			if (ret)
				return ret;
			extent_offset += prealloc_len;
		}

		ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
					 delalloc_end + 1 - delalloc_start,
					 FIEMAP_EXTENT_DELALLOC |
					 FIEMAP_EXTENT_UNKNOWN);
		if (ret)
			return ret;

		last_delalloc_end = delalloc_end;
		cur_offset = delalloc_end + 1;
		extent_offset += cur_offset - delalloc_start;
		cond_resched();
	}

	/*
	 * Either we found no delalloc for the whole prealloc extent or we have
	 * a prealloc extent that spans i_size or starts at or after i_size.
	 */
	if (disk_bytenr != 0 && last_delalloc_end < end) {
		u64 prealloc_start;
		u64 prealloc_len;

		if (last_delalloc_end == 0) {
			prealloc_start = start;
			prealloc_len = end + 1 - start;
		} else {
			prealloc_start = last_delalloc_end + 1;
			prealloc_len = end + 1 - prealloc_start;
		}

		if (!checked_extent_shared && fieinfo->fi_extents_max) {
			ret = btrfs_is_data_extent_shared(inode,
							  disk_bytenr,
							  extent_gen,
							  backref_ctx);
			if (ret < 0)
				return ret;
			else if (ret > 0)
				prealloc_flags |= FIEMAP_EXTENT_SHARED;
		}
		ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
					 disk_bytenr + extent_offset,
					 prealloc_len, prealloc_flags);
		if (ret)
			return ret;
	}

	return 0;
}

static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
					  struct btrfs_path *path,
					  u64 *last_extent_end_ret)
{
	const u64 ino = btrfs_ino(inode);
	struct btrfs_root *root = inode->root;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *ei;
	struct btrfs_key key;
	u64 disk_bytenr;
	int ret;

	/*
	 * Lookup the last file extent. We're not using i_size here because
	 * there might be preallocation past i_size.
	 */
	ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
	/* There can't be a file extent item at offset (u64)-1 */
	ASSERT(ret != 0);
	if (ret < 0)
		return ret;

	/*
	 * For a non-existing key, btrfs_search_slot() always leaves us at a
	 * slot > 0, except if the btree is empty, which is impossible because
	 * at least it has the inode item for this inode and all the items for
	 * the root inode 256.
	 */
	ASSERT(path->slots[0] > 0);
	path->slots[0]--;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
		/* No file extent items in the subvolume tree. */
		*last_extent_end_ret = 0;
		return 0;
	}

	/*
	 * For an inline extent, the disk_bytenr is where the inline data
	 * starts, so first check if we have an inline extent item before
	 * checking if we have an implicit hole (disk_bytenr == 0).
	 */
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
		*last_extent_end_ret = btrfs_file_extent_end(path);
		return 0;
	}

	/*
	 * Find the last file extent item that is not a hole (when NO_HOLES is
	 * not enabled). This should take at most 2 iterations in the worst
	 * case: we have one hole file extent item at slot 0 of a leaf and
	 * another hole file extent item as the last item in the previous leaf.
	 * This is because we merge file extent items that represent holes.
	 */
	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
	while (disk_bytenr == 0) {
		ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
		if (ret < 0) {
			return ret;
		} else if (ret > 0) {
			/* No file extent items that are not holes. */
			*last_extent_end_ret = 0;
			return 0;
		}
		leaf = path->nodes[0];
		ei = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
	}

	*last_extent_end_ret = btrfs_file_extent_end(path);
	return 0;
}

static int extent_fiemap(struct btrfs_inode *inode,
			 struct fiemap_extent_info *fieinfo,
			 u64 start, u64 len)
{
	const u64 ino = btrfs_ino(inode);
	struct extent_state *cached_state = NULL;
	struct extent_state *delalloc_cached_state = NULL;
	struct btrfs_path *path;
	struct fiemap_cache cache = { 0 };
	struct btrfs_backref_share_check_ctx *backref_ctx;
	u64 last_extent_end;
	u64 prev_extent_end;
	u64 range_start;
	u64 range_end;
	const u64 sectorsize = inode->root->fs_info->sectorsize;
	bool stopped = false;
	int ret;

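	/*
	 * Added sizing note (illustrative, not from the original comments):
	 * with three u64 fields and one u32 field, struct btrfs_fiemap_entry
	 * is typically 32 bytes on 64-bit builds, so a 4K page gives room for
	 * 128 entries, i.e. up to 128 fiemap extents batched between flushes.
	 */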
	cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry);
	cache.entries = kmalloc_array(cache.entries_size,
				      sizeof(struct btrfs_fiemap_entry),
				      GFP_KERNEL);
	backref_ctx = btrfs_alloc_backref_share_check_ctx();
	path = btrfs_alloc_path();
	if (!cache.entries || !backref_ctx || !path) {
		ret = -ENOMEM;
		goto out;
	}

restart:
	range_start = round_down(start, sectorsize);
	range_end = round_up(start + len, sectorsize);
	prev_extent_end = range_start;

	lock_extent(&inode->io_tree, range_start, range_end, &cached_state);

	ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
	if (ret < 0)
		goto out_unlock;
	btrfs_release_path(path);

	path->reada = READA_FORWARD;
	ret = fiemap_search_slot(inode, path, range_start);
	if (ret < 0) {
		goto out_unlock;
	} else if (ret > 0) {
		/*
		 * No file extent item found, but we may have delalloc between
		 * the current offset and i_size. So check for that.
		 */
		ret = 0;
		goto check_eof_delalloc;
	}

	while (prev_extent_end < range_end) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_file_extent_item *ei;
		struct btrfs_key key;
		u64 extent_end;
		u64 extent_len;
		u64 extent_offset = 0;
		u64 extent_gen;
		u64 disk_bytenr = 0;
		u64 flags = 0;
		int extent_type;
		u8 compression;

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
			break;

		extent_end = btrfs_file_extent_end(path);

		/*
		 * The first iteration can leave us at an extent item that ends
		 * before our range's start. Move to the next item.
		 */
		if (extent_end <= range_start)
			goto next_item;

		backref_ctx->curr_leaf_bytenr = leaf->start;

		/* We have an implicit hole (NO_HOLES feature enabled). */
		if (prev_extent_end < key.offset) {
			const u64 hole_end = min(key.offset, range_end) - 1;

			ret = fiemap_process_hole(inode, fieinfo, &cache,
						  &delalloc_cached_state,
						  backref_ctx, 0, 0, 0,
						  prev_extent_end, hole_end);
			if (ret < 0) {
				goto out_unlock;
			} else if (ret > 0) {
				/* fiemap_fill_next_extent() told us to stop. */
				stopped = true;
				break;
			}

			/* We've reached the end of the fiemap range, stop. */
			if (key.offset >= range_end) {
				stopped = true;
				break;
			}
		}

		extent_len = extent_end - key.offset;
		ei = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		compression = btrfs_file_extent_compression(leaf, ei);
		extent_type = btrfs_file_extent_type(leaf, ei);
		extent_gen = btrfs_file_extent_generation(leaf, ei);

		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
			if (compression == BTRFS_COMPRESS_NONE)
				extent_offset = btrfs_file_extent_offset(leaf, ei);
		}

		if (compression != BTRFS_COMPRESS_NONE)
			flags |= FIEMAP_EXTENT_ENCODED;

		if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			flags |= FIEMAP_EXTENT_DATA_INLINE;
			flags |= FIEMAP_EXTENT_NOT_ALIGNED;
			ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
						 extent_len, flags);
		} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			ret = fiemap_process_hole(inode, fieinfo, &cache,
						  &delalloc_cached_state,
						  backref_ctx,
						  disk_bytenr, extent_offset,
						  extent_gen, key.offset,
						  extent_end - 1);
		} else if (disk_bytenr == 0) {
			/* We have an explicit hole. */
			ret = fiemap_process_hole(inode, fieinfo, &cache,
						  &delalloc_cached_state,
						  backref_ctx, 0, 0, 0,
						  key.offset, extent_end - 1);
		} else {
			/* We have a regular extent. */
			if (fieinfo->fi_extents_max) {
				ret = btrfs_is_data_extent_shared(inode,
								  disk_bytenr,
								  extent_gen,
								  backref_ctx);
				if (ret < 0)
					goto out_unlock;
				else if (ret > 0)
					flags |= FIEMAP_EXTENT_SHARED;
			}

			ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
						 disk_bytenr + extent_offset,
						 extent_len, flags);
		}

		if (ret < 0) {
			goto out_unlock;
		} else if (ret > 0) {
			/* emit_fiemap_extent() told us to stop. */
			stopped = true;
			break;
		}

		prev_extent_end = extent_end;
next_item:
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out_unlock;
		}

		ret = fiemap_next_leaf_item(inode, path);
		if (ret < 0) {
			goto out_unlock;
		} else if (ret > 0) {
			/* No more file extent items for this inode. */
			break;
		}
		cond_resched();
	}

check_eof_delalloc:
	if (!stopped && prev_extent_end < range_end) {
		ret = fiemap_process_hole(inode, fieinfo, &cache,
					  &delalloc_cached_state, backref_ctx,
					  0, 0, 0, prev_extent_end, range_end - 1);
		if (ret < 0)
			goto out_unlock;
		prev_extent_end = range_end;
	}

	if (cache.cached && cache.offset + cache.len >= last_extent_end) {
		const u64 i_size = i_size_read(&inode->vfs_inode);

		if (prev_extent_end < i_size) {
			u64 delalloc_start;
			u64 delalloc_end;
			bool delalloc;

			delalloc = btrfs_find_delalloc_in_range(inode,
								prev_extent_end,
								i_size - 1,
								&delalloc_cached_state,
								&delalloc_start,
								&delalloc_end);
			if (!delalloc)
				cache.flags |= FIEMAP_EXTENT_LAST;
		} else {
			cache.flags |= FIEMAP_EXTENT_LAST;
		}
	}

out_unlock:
	unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);

	if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
		btrfs_release_path(path);
		ret = flush_fiemap_cache(fieinfo, &cache);
		if (ret)
			goto out;
		len -= cache.next_search_offset - start;
		start = cache.next_search_offset;
		goto restart;
	} else if (ret < 0) {
		goto out;
	}

	/*
	 * Must free the path before emitting to the fiemap buffer because we
	 * may have a non-cloned leaf and if the fiemap buffer is memory mapped
	 * to a file, a write into it (through btrfs_page_mkwrite()) may trigger
	 * waiting for an ordered extent that, in order to complete, needs to
	 * modify that leaf, leading to a deadlock.
	 */
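	/*
	 * Added sketch of that chain (illustrative, not from the original
	 * comments): we hold a non-cloned leaf -> copying to the mmapped
	 * fiemap buffer faults into btrfs_page_mkwrite() -> that waits for an
	 * ordered extent -> completing the ordered extent needs to modify the
	 * very leaf we are still holding.
	 */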
	btrfs_free_path(path);
	path = NULL;

	ret = flush_fiemap_cache(fieinfo, &cache);
	if (ret)
		goto out;

	ret = emit_last_fiemap_cache(fieinfo, &cache);
out:
	free_extent_state(delalloc_cached_state);
	kfree(cache.entries);
	btrfs_free_backref_share_ctx(backref_ctx);
	btrfs_free_path(path);
	return ret;
}

int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		 u64 start, u64 len)
{
	struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
	int ret;

	ret = fiemap_prep(inode, fieinfo, start, &len, 0);
	if (ret)
		return ret;

	/*
	 * fiemap_prep() called filemap_write_and_wait() for the whole possible
	 * file range (0 to LLONG_MAX), but that is not enough if we have
	 * compression enabled. The first filemap_fdatawrite_range() only kicks
	 * off the compression of data (in an async thread) and will return
	 * before the compression is done and writeback is started. A second
	 * filemap_fdatawrite_range() is needed to wait for the compression to
	 * complete and writeback to start. We also need to wait for ordered
	 * extents to complete, because our fiemap implementation uses mainly
	 * file extent items to list the extents, searching for extent maps
	 * only for file ranges with holes or prealloc extents to figure out
	 * if we have delalloc in those ranges.
	 */
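	/*
	 * Added example of the window this closes (illustrative, not from the
	 * original comments): a buffered write of compressible data is flushed
	 * by fiemap_prep(), but that flush returns while a worker is still
	 * compressing, so the subvolume tree does not yet contain the final
	 * file extent items. Waiting for the ordered range below ensures the
	 * items we are about to search are up to date.
	 */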
	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
		ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
		if (ret)
			return ret;
	}

	btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);

	/*
	 * We did an initial flush to avoid holding the inode's lock while
	 * triggering writeback and waiting for the completion of IO and ordered
	 * extents. Now after we locked the inode we do it again, because it's
	 * possible a new write may have happened in between those two steps.
	 */
	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
		ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
		if (ret) {
			btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
			return ret;
		}
	}

	ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
	btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);

	return ret;
}