btrfs: convert btrfs_mark_ordered_io_finished() to take a folio
fs/btrfs/inode.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/blk-cgroup.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
#include <linux/iversion.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/sched/mm.h>
#include <linux/iomap.h>
#include <asm/unaligned.h>
#include <linux/fsverity.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "bio.h"
#include "compression.h"
#include "locking.h"
#include "props.h"
#include "qgroup.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
#include "subpage.h"
#include "inode-item.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "defrag.h"
#include "dir-item.h"
#include "file-item.h"
#include "uuid-tree.h"
#include "ioctl.h"
#include "file.h"
#include "acl.h"
#include "relocation.h"
#include "verity.h"
#include "super.h"
#include "orphan.h"
#include "backref.h"
#include "raid-stripe-tree.h"
#include "fiemap.h"

struct btrfs_iget_args {
	u64 ino;
	struct btrfs_root *root;
};

struct btrfs_rename_ctx {
	/* Output field. Stores the index number of the old directory entry. */
	u64 index;
};

/*
 * Used by data_reloc_print_warning_inode() to pass needed info for filename
 * resolution and output of error message.
 */
struct data_reloc_warn {
	struct btrfs_path path;
	struct btrfs_fs_info *fs_info;
	u64 extent_item_size;
	u64 logical;
	int mirror_num;
};

/*
 * For the file_extent_tree, we want to hold the inode lock when we look up and
 * update the disk_i_size, but lockdep will complain because, for our io_tree,
 * we hold the tree lock while getting the inode lock when setting delalloc.
 * These two things are unrelated, so make a class for the file_extent_tree so
 * we don't get the two locking patterns mixed up.
 */
static struct lock_class_key file_extent_tree_class;

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;

static struct kmem_cache *btrfs_inode_cachep;

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);

static noinline int run_delalloc_cow(struct btrfs_inode *inode,
				     struct page *locked_page, u64 start,
				     u64 end, struct writeback_control *wbc,
				     bool pages_dirty);

static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
					  u64 root, void *warn_ctx)
{
	struct data_reloc_warn *warn = warn_ctx;
	struct btrfs_fs_info *fs_info = warn->fs_info;
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_root *local_root;
	struct btrfs_key key;
	unsigned int nofs_flag;
	u32 nlink;
	int ret;

	local_root = btrfs_get_fs_root(fs_info, root, true);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	/* This makes the path point to (inum INODE_ITEM ioff). */
	key.objectid = inum;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0);
	if (ret) {
		btrfs_put_root(local_root);
		btrfs_release_path(&warn->path);
		goto err;
	}

	eb = warn->path.nodes[0];
	inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(&warn->path);

	nofs_flag = memalloc_nofs_save();
	ipath = init_ipath(4096, local_root, &warn->path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(ipath)) {
		btrfs_put_root(local_root);
		ret = PTR_ERR(ipath);
		ipath = NULL;
		/*
		 * -ENOMEM, not a critical error, just output a generic error
		 * without the filename.
		 */
		btrfs_warn(fs_info,
"checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu",
			   warn->logical, warn->mirror_num, root, inum, offset);
		return ret;
	}
	ret = paths_from_inode(inum, ipath);
	if (ret < 0)
		goto err;

	/*
	 * We deliberately ignore the fact that ipath might have been too small
	 * to hold all of the paths here.
	 */
	for (int i = 0; i < ipath->fspath->elem_cnt; i++) {
		btrfs_warn(fs_info,
"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)",
			   warn->logical, warn->mirror_num, root, inum, offset,
			   fs_info->sectorsize, nlink,
			   (char *)(unsigned long)ipath->fspath->val[i]);
	}

	btrfs_put_root(local_root);
	free_ipath(ipath);
	return 0;

err:
	btrfs_warn(fs_info,
"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d",
		   warn->logical, warn->mirror_num, root, inum, offset, ret);

	free_ipath(ipath);
	return ret;
}

/*
 * Do extra user-friendly error output (e.g. look up all the affected files).
 *
 * This does a backref lookup and prints one message per affected file. If the
 * lookup fails, it falls back to the old, less detailed error message.
 */
static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off,
				   const u8 *csum, const u8 *csum_expected,
				   int mirror_num)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_path path = { 0 };
	struct btrfs_key found_key = { 0 };
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	const u32 csum_size = fs_info->csum_size;
	u64 logical;
	u64 flags;
	u32 item_size;
	int ret;

	mutex_lock(&fs_info->reloc_mutex);
	logical = btrfs_get_reloc_bg_bytenr(fs_info);
	mutex_unlock(&fs_info->reloc_mutex);

	if (logical == U64_MAX) {
		btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
		btrfs_warn_rl(fs_info,
"csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
			btrfs_root_id(inode->root), btrfs_ino(inode), file_off,
			CSUM_FMT_VALUE(csum_size, csum),
			CSUM_FMT_VALUE(csum_size, csum_expected),
			mirror_num);
		return;
	}

	logical += file_off;
	btrfs_warn_rl(fs_info,
"csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
			btrfs_root_id(inode->root),
			btrfs_ino(inode), file_off, logical,
			CSUM_FMT_VALUE(csum_size, csum),
			CSUM_FMT_VALUE(csum_size, csum_expected),
			mirror_num);

	ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags);
	if (ret < 0) {
		btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d",
			     logical, ret);
		return;
	}
	eb = path.nodes[0];
	ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item);
	item_size = btrfs_item_size(eb, path.slots[0]);
	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		unsigned long ptr = 0;
		u64 ref_root;
		u8 ref_level;

		while (true) {
			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
						      item_size, &ref_root,
						      &ref_level);
			if (ret < 0) {
				btrfs_warn_rl(fs_info,
				"failed to resolve tree backref for logical %llu: %d",
					      logical, ret);
				break;
			}
			if (ret > 0)
				break;

			btrfs_warn_rl(fs_info,
"csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu",
				logical, mirror_num,
				(ref_level ? "node" : "leaf"),
				ref_level, ref_root);
		}
		btrfs_release_path(&path);
	} else {
		struct btrfs_backref_walk_ctx ctx = { 0 };
		struct data_reloc_warn reloc_warn = { 0 };

		btrfs_release_path(&path);

		ctx.bytenr = found_key.objectid;
		ctx.extent_item_pos = logical - found_key.objectid;
		ctx.fs_info = fs_info;

		reloc_warn.logical = logical;
		reloc_warn.extent_item_size = found_key.offset;
		reloc_warn.mirror_num = mirror_num;
		reloc_warn.fs_info = fs_info;

		iterate_extent_inodes(&ctx, true,
				      data_reloc_print_warning_inode, &reloc_warn);
	}
}
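
/*
 * Print a rate-limited warning for a data checksum mismatch. For the data
 * reloc tree this is routed to the more detailed backref based report above.
 */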
static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
		u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
{
	struct btrfs_root *root = inode->root;
	const u32 csum_size = root->fs_info->csum_size;

	/* For data reloc tree, it's better to do a backref lookup instead. */
	if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID)
		return print_data_reloc_error(inode, logical_start, csum,
					      csum_expected, mirror_num);

	/* Output without objectid, which is more meaningful */
	if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) {
		btrfs_warn_rl(root->fs_info,
"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
			btrfs_root_id(root), btrfs_ino(inode),
			logical_start,
			CSUM_FMT_VALUE(csum_size, csum),
			CSUM_FMT_VALUE(csum_size, csum_expected),
			mirror_num);
	} else {
		btrfs_warn_rl(root->fs_info,
"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
			btrfs_root_id(root), btrfs_ino(inode),
			logical_start,
			CSUM_FMT_VALUE(csum_size, csum),
			CSUM_FMT_VALUE(csum_size, csum_expected),
			mirror_num);
	}
}

/*
 * Lock inode i_rwsem based on arguments passed.
 *
 * ilock_flags can have the following bit set:
 *
 * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
 * BTRFS_ILOCK_TRY - try to acquire the lock, if it fails on the first attempt
 *		     return -EAGAIN
 * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
 */
int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
{
	if (ilock_flags & BTRFS_ILOCK_SHARED) {
		if (ilock_flags & BTRFS_ILOCK_TRY) {
			if (!inode_trylock_shared(&inode->vfs_inode))
				return -EAGAIN;
			else
				return 0;
		}
		inode_lock_shared(&inode->vfs_inode);
	} else {
		if (ilock_flags & BTRFS_ILOCK_TRY) {
			if (!inode_trylock(&inode->vfs_inode))
				return -EAGAIN;
			else
				return 0;
		}
		inode_lock(&inode->vfs_inode);
	}
	if (ilock_flags & BTRFS_ILOCK_MMAP)
		down_write(&inode->i_mmap_lock);
	return 0;
}

/*
 * Unlock inode i_rwsem.
 *
 * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
 * to decide whether the lock acquired is shared or exclusive.
 */
void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
{
	if (ilock_flags & BTRFS_ILOCK_MMAP)
		up_write(&inode->i_mmap_lock);
	if (ilock_flags & BTRFS_ILOCK_SHARED)
		inode_unlock_shared(&inode->vfs_inode);
	else
		inode_unlock(&inode->vfs_inode);
}
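
/*
 * Example pairing (hypothetical caller, for illustration only):
 *
 *	ret = btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED | BTRFS_ILOCK_TRY);
 *	if (ret)
 *		return ret;	(-EAGAIN when the trylock failed)
 *	...
 *	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
 */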

/*
 * Cleanup all submitted ordered extents in specified range to handle errors
 * from the btrfs_run_delalloc_range() callback.
 *
 * NOTE: caller must ensure that when an error happens, it can not call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 * to be released, which we want to happen only when finishing the ordered
 * extent (btrfs_finish_ordered_io()).
 */
static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
						 struct page *locked_page,
						 u64 offset, u64 bytes)
{
	unsigned long index = offset >> PAGE_SHIFT;
	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
	u64 page_start = 0, page_end = 0;
	struct page *page;

	if (locked_page) {
		page_start = page_offset(locked_page);
		page_end = page_start + PAGE_SIZE - 1;
	}

	while (index <= end_index) {
		/*
		 * For the locked page, btrfs_mark_ordered_io_finished() will
		 * be called on it in run_delalloc_range() for the error
		 * handling, which will clear page Ordered and run the ordered
		 * extent accounting.
		 *
		 * Here we can't just clear the Ordered bit, or
		 * btrfs_mark_ordered_io_finished() would skip the accounting
		 * for the page range, and the ordered extent will never finish.
		 */
		if (locked_page && index == (page_start >> PAGE_SHIFT)) {
			index++;
			continue;
		}
		page = find_get_page(inode->vfs_inode.i_mapping, index);
		index++;
		if (!page)
			continue;

		/*
		 * Here we just clear all Ordered bits for every page in the
		 * range, then btrfs_mark_ordered_io_finished() will handle
		 * the ordered extent accounting for the range.
		 */
		btrfs_folio_clamp_clear_ordered(inode->root->fs_info,
						page_folio(page), offset, bytes);
		put_page(page);
	}

	if (locked_page) {
		/* The locked page covers the full range, nothing needs to be done */
		if (bytes + offset <= page_start + PAGE_SIZE)
			return;
		/*
		 * In case this page belongs to the delalloc range being
		 * instantiated then skip it, since the first page of a range is
		 * going to be properly cleaned up by the caller of
		 * run_delalloc_range
		 */
		if (page_start >= offset && page_end <= (offset + bytes - 1)) {
			bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
			offset = page_offset(locked_page) + PAGE_SIZE;
		}
	}

	return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
}

static int btrfs_dirty_inode(struct btrfs_inode *inode);
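
/*
 * Apply the default/access ACLs and initialize the security xattrs for a
 * newly created inode described by @args.
 */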
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct btrfs_new_inode_args *args)
{
	int err;

	if (args->default_acl) {
		err = __btrfs_set_acl(trans, args->inode, args->default_acl,
				      ACL_TYPE_DEFAULT);
		if (err)
			return err;
	}
	if (args->acl) {
		err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
		if (err)
			return err;
	}
	if (!args->default_acl && !args->acl)
		cache_no_acl(args->inode);
	return btrfs_xattr_security_init(trans, args->inode, args->dir,
					 &args->dentry->d_name);
}

/*
 * This does all the hard work for inserting an inline extent into the btree.
 * The caller should have done a btrfs_drop_extents() so that no overlapping
 * inline items exist in the btree.
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_path *path,
				struct btrfs_inode *inode, bool extent_inserted,
				size_t size, size_t compressed_size,
				int compress_type,
				struct folio *compressed_folio,
				bool update_i_size)
{
	struct btrfs_root *root = inode->root;
	struct extent_buffer *leaf;
	struct page *page = NULL;
	const u32 sectorsize = trans->fs_info->sectorsize;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int ret;
	size_t cur_size = size;
	u64 i_size;

	/*
	 * The decompressed size must still be no larger than a sector. Under
	 * heavy race, we can have size == 0 passed in, but that shouldn't be a
	 * big deal and we can continue the insertion.
	 */
	ASSERT(size <= sectorsize);

	/*
	 * The compressed size also needs to be no larger than a sector.
	 * That's also why we only need one folio as the parameter.
	 */
	if (compressed_folio)
		ASSERT(compressed_size <= sectorsize);
	else
		ASSERT(compressed_size == 0);

	if (compressed_size && compressed_folio)
		cur_size = compressed_size;

	if (!extent_inserted) {
		struct btrfs_key key;
		size_t datasize;

		key.objectid = btrfs_ino(inode);
		key.offset = 0;
		key.type = BTRFS_EXTENT_DATA_KEY;

		datasize = btrfs_file_extent_calc_inline_size(cur_size);
		ret = btrfs_insert_empty_item(trans, root, path, &key,
					      datasize);
		if (ret)
			goto fail;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (compress_type != BTRFS_COMPRESS_NONE) {
		kaddr = kmap_local_folio(compressed_folio, 0);
		write_extent_buffer(leaf, kaddr, ptr, compressed_size);
		kunmap_local(kaddr);

		btrfs_set_file_extent_compression(leaf, ei,
						  compress_type);
	} else {
		page = find_get_page(inode->vfs_inode.i_mapping, 0);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_local_page(page);
		write_extent_buffer(leaf, kaddr, ptr, size);
		kunmap_local(kaddr);
		put_page(page);
	}
	btrfs_mark_buffer_dirty(trans, leaf);
	btrfs_release_path(path);

	/*
	 * We align size to sectorsize for inline extents just for simplicity's
	 * sake.
	 */
	ret = btrfs_inode_set_file_extent_range(inode, 0,
					ALIGN(size, root->fs_info->sectorsize));
	if (ret)
		goto fail;

	/*
	 * We're an inline extent, so nobody can extend the file past i_size
	 * without locking a page we already have locked.
	 *
	 * We must do any i_size and inode updates before we unlock the pages.
	 * Otherwise we could end up racing with unlink.
	 */
	i_size = i_size_read(&inode->vfs_inode);
	if (update_i_size && size > i_size) {
		i_size_write(&inode->vfs_inode, size);
		i_size = size;
	}
	inode->disk_i_size = i_size;

fail:
	return ret;
}

static bool can_cow_file_range_inline(struct btrfs_inode *inode,
				      u64 offset, u64 size,
				      size_t compressed_size)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 data_len = (compressed_size ?: size);

	/* Inline extents must start at offset 0. */
	if (offset != 0)
		return false;

	/*
	 * Due to the page size limit, for subpage we can only trigger
	 * writeback for the dirty sectors of a page, which means data
	 * writeback is doing more writeback than what we want.
	 *
	 * This is especially unexpected for some call sites like fallocate,
	 * where we only increase i_size after everything is done.
	 * This means we can trigger inline extent even if we didn't want to.
	 * So here we skip inline extent creation completely.
	 */
	if (fs_info->sectorsize != PAGE_SIZE)
		return false;

	/* Inline extents are limited to sectorsize. */
	if (size > fs_info->sectorsize)
		return false;

	/* We cannot exceed the maximum inline data size. */
	if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
		return false;

	/* We cannot exceed the user specified max_inline size. */
	if (data_len > fs_info->max_inline)
		return false;

	/* Inline extents must be the entirety of the file. */
	if (size < i_size_read(&inode->vfs_inode))
		return false;

	return true;
}

/*
 * Conditionally insert an inline extent into the file. This does the checks
 * required to make sure the data is small enough to fit as an inline extent.
 *
 * If being used directly, you must have already checked that we're allowed to
 * cow the range by getting true from can_cow_file_range_inline().
 */
static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offset,
					    u64 size, size_t compressed_size,
					    int compress_type,
					    struct folio *compressed_folio,
					    bool update_i_size)
{
	struct btrfs_drop_extents_args drop_args = { 0 };
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	u64 data_len = (compressed_size ?: size);
	int ret;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	trans->block_rsv = &inode->block_rsv;

	drop_args.path = path;
	drop_args.start = 0;
	drop_args.end = fs_info->sectorsize;
	drop_args.drop_cache = true;
	drop_args.replace_extent = true;
	drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
				   size, compressed_size, compress_type,
				   compressed_folio, update_i_size);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else if (ret == -ENOSPC) {
		ret = 1;
		goto out;
	}

	btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
	ret = btrfs_update_inode(trans, inode);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else if (ret == -ENOSPC) {
		ret = 1;
		goto out;
	}

	btrfs_set_inode_full_sync(inode);
out:
	/*
	 * Don't forget to free the reserved space, as an inline extent won't
	 * count as a data extent; free it directly here.
	 * And at reserve time, it's always aligned to page size, so
	 * just free one page here.
	 */
	btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL);
	btrfs_free_path(path);
	btrfs_end_transaction(trans);
	return ret;
}

static noinline int cow_file_range_inline(struct btrfs_inode *inode,
					  struct page *locked_page,
					  u64 offset, u64 end,
					  size_t compressed_size,
					  int compress_type,
					  struct folio *compressed_folio,
					  bool update_i_size)
{
	struct extent_state *cached = NULL;
	unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
		EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED;
	u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1);
	int ret;

	if (!can_cow_file_range_inline(inode, offset, size, compressed_size))
		return 1;

	lock_extent(&inode->io_tree, offset, end, &cached);
	ret = __cow_file_range_inline(inode, offset, size, compressed_size,
				      compress_type, compressed_folio,
				      update_i_size);
	if (ret > 0) {
		unlock_extent(&inode->io_tree, offset, end, &cached);
		return ret;
	}

	if (ret == 0)
		locked_page = NULL;

	extent_clear_unlock_delalloc(inode, offset, end, locked_page, &cached,
				     clear_flags,
				     PAGE_UNLOCK | PAGE_START_WRITEBACK |
				     PAGE_END_WRITEBACK);
	return ret;
}
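
/*
 * State for async compression: each async_chunk covers part of a delalloc
 * range and carries a list of async_extents (the compressed or uncompressed
 * pieces produced for it); the containing async_cow groups the chunks and
 * tracks how many remain via num_chunks.
 */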
struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct folio **folios;
	unsigned long nr_folios;
	int compress_type;
	struct list_head list;
};

struct async_chunk {
	struct btrfs_inode *inode;
	struct page *locked_page;
	u64 start;
	u64 end;
	blk_opf_t write_flags;
	struct list_head extents;
	struct cgroup_subsys_state *blkcg_css;
	struct btrfs_work work;
	struct async_cow *async_cow;
};

struct async_cow {
	atomic_t num_chunks;
	struct async_chunk chunks[];
};
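
/*
 * Queue a new async_extent on the chunk's list. Ownership of @folios is
 * transferred to the new entry. Returns -ENOMEM if the allocation fails.
 */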
static noinline int add_async_extent(struct async_chunk *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct folio **folios,
				     unsigned long nr_folios,
				     int compress_type)
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	if (!async_extent)
		return -ENOMEM;
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->folios = folios;
	async_extent->nr_folios = nr_folios;
	async_extent->compress_type = compress_type;
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}

/*
 * Check if the inode needs to be submitted to compression, based on mount
 * options, defragmentation, properties or heuristics.
 */
static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
				      u64 end)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	if (!btrfs_inode_can_compress(inode)) {
		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
			KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
			btrfs_ino(inode));
		return 0;
	}
	/*
	 * Special check for subpage.
	 *
	 * We lock the full page then run each delalloc range in the page, thus
	 * for the following case, we will hit some subpage specific corner case:
	 *
	 * 0		32K		64K
	 * |	|///////|	|///////|
	 *		\- A		\- B
	 *
	 * In the above case, both range A and range B will try to unlock the
	 * full page [0, 64K), causing the one finished later to have the page
	 * unlocked already, triggering various page lock requirement BUG_ON()s.
	 *
	 * So here we add an artificial limit that subpage compression can only
	 * be enabled if the range is fully page aligned.
	 *
	 * In theory we only need to ensure the first page is fully covered, but
	 * the tailing partial page will be locked until the full compression
	 * finishes, delaying the write of the other range.
	 *
	 * TODO: Make btrfs_run_delalloc_range() lock all delalloc ranges first
	 * to prevent any submitted async extent from unlocking the full page.
	 * By this, we can ensure for the subpage case that only the last
	 * async_cow will unlock the full page.
	 */
	if (fs_info->sectorsize < PAGE_SIZE) {
		if (!PAGE_ALIGNED(start) ||
		    !PAGE_ALIGNED(end + 1))
			return 0;
	}

	/* force compress */
	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
		return 1;
	/* defrag ioctl */
	if (inode->defrag_compress)
		return 1;
	/* bad compression ratios */
	if (inode->flags & BTRFS_INODE_NOCOMPRESS)
		return 0;
	if (btrfs_test_opt(fs_info, COMPRESS) ||
	    inode->flags & BTRFS_INODE_COMPRESS ||
	    inode->prop_compress)
		return btrfs_compress_heuristic(inode, start, end);
	return 0;
}

static inline void inode_should_defrag(struct btrfs_inode *inode,
		u64 start, u64 end, u64 num_bytes, u32 small_write)
{
	/* If this is a small write inside eof, kick off a defrag */
	if (num_bytes < small_write &&
	    (start > 0 || end + 1 < inode->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode, small_write);
}
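
/*
 * Clear the dirty bit for IO on every page in [start, end] so that mmap
 * writers can't change the contents while they are being compressed.
 * Returns -ENOENT if any page in the range was missing, 0 otherwise.
 */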
static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
{
	unsigned long end_index = end >> PAGE_SHIFT;
	struct page *page;
	int ret = 0;

	for (unsigned long index = start >> PAGE_SHIFT;
	     index <= end_index; index++) {
		page = find_get_page(inode->i_mapping, index);
		if (unlikely(!page)) {
			if (!ret)
				ret = -ENOENT;
			continue;
		}
		clear_page_dirty_for_io(page);
		put_page(page);
	}
	return ret;
}

/*
 * Work queue callback to start compression on a file and pages.
 *
 * This is done inside an ordered work queue, and the compression is spread
 * across many cpus. The actual IO submission is step two, and the ordered work
 * queue takes care of making sure that happens in the same order things were
 * put onto the queue by writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an entry onto the
 * work queue to write the uncompressed bytes. This makes sure that both
 * compressed inodes and uncompressed inodes are written in the same order that
 * the flusher thread sent them down.
 */
static void compress_file_range(struct btrfs_work *work)
{
	struct async_chunk *async_chunk =
		container_of(work, struct async_chunk, work);
	struct btrfs_inode *inode = async_chunk->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	u64 blocksize = fs_info->sectorsize;
	u64 start = async_chunk->start;
	u64 end = async_chunk->end;
	u64 actual_end;
	u64 i_size;
	int ret = 0;
	struct folio **folios;
	unsigned long nr_folios;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	unsigned int poff;
	int i;
	int compress_type = fs_info->compress_type;

	inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);

	/*
	 * We need to call clear_page_dirty_for_io on each page in the range.
	 * Otherwise applications with the file mmap'd can wander in and change
	 * the page contents while we are compressing them.
	 */
	ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);

	/*
	 * All the folios should have been locked thus no failure.
	 *
	 * And even if some folios are missing, btrfs_compress_folios()
	 * would handle them correctly, so here just do an ASSERT() check for
	 * early logic errors.
	 */
	ASSERT(ret == 0);

	/*
	 * We need to save i_size before now because it could change in between
	 * us evaluating the size and assigning it. This is because we lock and
	 * unlock the page in truncate and fallocate, and then modify the i_size
	 * later on.
	 *
	 * The barriers are to emulate READ_ONCE, remove that once i_size_read
	 * does that for us.
	 */
	barrier();
	i_size = i_size_read(&inode->vfs_inode);
	barrier();
	actual_end = min_t(u64, i_size, end + 1);
again:
	folios = NULL;
	nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
	nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES);

	/*
	 * We don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time. So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/*
	 * Skip compression for a small file range (<= blocksize) that
	 * isn't an inline extent, since it doesn't save disk space at all.
	 */
	if (total_compressed <= blocksize &&
	    (start > 0 || end + 1 < inode->disk_i_size))
		goto cleanup_and_bail_uncompressed;

	/*
	 * For subpage case, we require full page alignment for the sector
	 * aligned range.
	 * Thus we must also check against @actual_end, not just @end.
	 */
	if (blocksize < PAGE_SIZE) {
		if (!PAGE_ALIGNED(start) ||
		    !PAGE_ALIGNED(round_up(actual_end, blocksize)))
			goto cleanup_and_bail_uncompressed;
	}

	total_compressed = min_t(unsigned long, total_compressed,
				 BTRFS_MAX_UNCOMPRESSED);
	total_in = 0;
	ret = 0;

	/*
	 * We do compression for mount -o compress and when the inode has not
	 * been flagged as NOCOMPRESS. This flag can change at any time if we
	 * discover bad compression ratios.
	 */
	if (!inode_need_compress(inode, start, end))
		goto cleanup_and_bail_uncompressed;

	folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS);
	if (!folios) {
		/*
		 * Memory allocation failure is not a fatal error, we can fall
		 * back to uncompressed code.
		 */
		goto cleanup_and_bail_uncompressed;
	}

	if (inode->defrag_compress)
		compress_type = inode->defrag_compress;
	else if (inode->prop_compress)
		compress_type = inode->prop_compress;

	/* Compression level is applied here. */
	ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4),
				    mapping, start, folios, &nr_folios, &total_in,
				    &total_compressed);
	if (ret)
		goto mark_incompressible;

	/*
	 * Zero the tail end of the last page, as we might be sending it down
	 * to disk.
	 */
	poff = offset_in_page(total_compressed);
	if (poff)
		folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff);

	/*
	 * Try to create an inline extent.
	 *
	 * If we didn't compress the entire range, try to create an uncompressed
	 * inline extent, else a compressed one.
	 *
	 * Check cow_file_range() for why we don't even try to create inline
	 * extent for the subpage case.
	 */
	if (total_in < actual_end)
		ret = cow_file_range_inline(inode, NULL, start, end, 0,
					    BTRFS_COMPRESS_NONE, NULL, false);
	else
		ret = cow_file_range_inline(inode, NULL, start, end, total_compressed,
					    compress_type, folios[0], false);
	if (ret <= 0) {
		if (ret < 0)
			mapping_set_error(mapping, -EIO);
		goto free_pages;
	}

	/*
	 * We aren't doing an inline extent. Round the compressed size up to a
	 * block size boundary so the allocator does sane things.
	 */
	total_compressed = ALIGN(total_compressed, blocksize);

	/*
	 * One last check to make sure the compression is really a win, compare
	 * the page count read with the blocks on disk, compression must free at
	 * least one sector.
	 */
	total_in = round_up(total_in, fs_info->sectorsize);
	if (total_compressed + blocksize > total_in)
		goto mark_incompressible;

	/*
	 * The async work queues will take care of doing actual allocation on
	 * disk for these compressed pages, and will submit the bios.
	 */
	ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios,
			       nr_folios, compress_type);
	BUG_ON(ret);
	if (start + total_in < end) {
		start += total_in;
		cond_resched();
		goto again;
	}
	return;

mark_incompressible:
	if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress)
		inode->flags |= BTRFS_INODE_NOCOMPRESS;
cleanup_and_bail_uncompressed:
	ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
			       BTRFS_COMPRESS_NONE);
	BUG_ON(ret);
free_pages:
	if (folios) {
		for (i = 0; i < nr_folios; i++) {
			WARN_ON(folios[i]->mapping);
			btrfs_free_compr_folio(folios[i]);
		}
		kfree(folios);
	}
}
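
/* Release all compressed folios of an async_extent, then the array itself. */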
static void free_async_extent_pages(struct async_extent *async_extent)
{
	int i;

	if (!async_extent->folios)
		return;

	for (i = 0; i < async_extent->nr_folios; i++) {
		WARN_ON(async_extent->folios[i]->mapping);
		btrfs_free_compr_folio(async_extent->folios[i]);
	}
	kfree(async_extent->folios);
	async_extent->nr_folios = 0;
	async_extent->folios = NULL;
}
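
/*
 * Fall back to plain COW writeback for one async_extent: run the range
 * through run_delalloc_cow() and, on error, clean up the ordered extents and
 * finish the ordered IO on the locked page.
 */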
static void submit_uncompressed_range(struct btrfs_inode *inode,
				      struct async_extent *async_extent,
				      struct page *locked_page)
{
	u64 start = async_extent->start;
	u64 end = async_extent->start + async_extent->ram_size - 1;
	int ret;
	struct writeback_control wbc = {
		.sync_mode		= WB_SYNC_ALL,
		.range_start		= start,
		.range_end		= end,
		.no_cgroup_owner	= 1,
	};

	wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
	ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false);
	wbc_detach_inode(&wbc);
	if (ret < 0) {
		btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
		if (locked_page) {
			const u64 page_start = page_offset(locked_page);

			set_page_writeback(locked_page);
			end_page_writeback(locked_page);
			btrfs_mark_ordered_io_finished(inode,
						       page_folio(locked_page),
						       page_start, PAGE_SIZE,
						       !ret);
			mapping_set_error(locked_page->mapping, ret);
			unlock_page(locked_page);
		}
	}
}
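
/*
 * Allocate disk space for one compressed async_extent, create the extent map
 * and the ordered extent, then submit the compressed write. Falls back to
 * submit_uncompressed_range() if the extent isn't compressed or if contiguous
 * space can't be reserved for the compressed size.
 */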
static void submit_one_async_extent(struct async_chunk *async_chunk,
				    struct async_extent *async_extent,
				    u64 *alloc_hint)
{
	struct btrfs_inode *inode = async_chunk->inode;
	struct extent_io_tree *io_tree = &inode->io_tree;
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_ordered_extent *ordered;
	struct btrfs_file_extent file_extent;
	struct btrfs_key ins;
	struct page *locked_page = NULL;
	struct extent_state *cached = NULL;
	struct extent_map *em;
	int ret = 0;
	u64 start = async_extent->start;
	u64 end = async_extent->start + async_extent->ram_size - 1;

	if (async_chunk->blkcg_css)
		kthread_associate_blkcg(async_chunk->blkcg_css);

	/*
	 * If async_chunk->locked_page is in the async_extent range, we need to
	 * handle it.
	 */
	if (async_chunk->locked_page) {
		u64 locked_page_start = page_offset(async_chunk->locked_page);
		u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;

		if (!(start >= locked_page_end || end <= locked_page_start))
			locked_page = async_chunk->locked_page;
	}

	if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
		submit_uncompressed_range(inode, async_extent, locked_page);
		goto done;
	}

	ret = btrfs_reserve_extent(root, async_extent->ram_size,
				   async_extent->compressed_size,
				   async_extent->compressed_size,
				   0, *alloc_hint, &ins, 1, 1);
	if (ret) {
		/*
		 * We can't reserve contiguous space for the compressed size.
		 * Unlikely, but it's possible that we could have enough
		 * non-contiguous space for the uncompressed size instead. So
		 * fall back to uncompressed.
		 */
		submit_uncompressed_range(inode, async_extent, locked_page);
		goto done;
	}

	lock_extent(io_tree, start, end, &cached);

	/* Here we're doing allocation and writeback of the compressed pages */
	file_extent.disk_bytenr = ins.objectid;
	file_extent.disk_num_bytes = ins.offset;
	file_extent.ram_bytes = async_extent->ram_size;
	file_extent.num_bytes = async_extent->ram_size;
	file_extent.offset = 0;
	file_extent.compression = async_extent->compress_type;

	em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out_free_reserve;
	}
	free_extent_map(em);

	ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
					     1 << BTRFS_ORDERED_COMPRESSED);
	if (IS_ERR(ordered)) {
		btrfs_drop_extent_map_range(inode, start, end, false);
		ret = PTR_ERR(ordered);
		goto out_free_reserve;
	}
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);

	/* Clear dirty, set writeback and unlock the pages. */
	extent_clear_unlock_delalloc(inode, start, end,
			NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC,
			PAGE_UNLOCK | PAGE_START_WRITEBACK);
	btrfs_submit_compressed_write(ordered,
			    async_extent->folios,	/* compressed_folios */
			    async_extent->nr_folios,
			    async_chunk->write_flags, true);
	*alloc_hint = ins.objectid + ins.offset;
done:
	if (async_chunk->blkcg_css)
		kthread_associate_blkcg(NULL);
	kfree(async_extent);
	return;

out_free_reserve:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
	mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
	extent_clear_unlock_delalloc(inode, start, end,
				     NULL, &cached,
				     EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DELALLOC_NEW |
				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
				     PAGE_UNLOCK | PAGE_START_WRITEBACK |
				     PAGE_END_WRITEBACK);
	free_async_extent_pages(async_extent);
	if (async_chunk->blkcg_css)
		kthread_associate_blkcg(NULL);
	btrfs_debug(fs_info,
"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
		    btrfs_root_id(root), btrfs_ino(inode), start,
		    async_extent->ram_size, ret);
	kfree(async_extent);
}
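
/*
 * Pick an allocation hint for [start, start + num_bytes): prefer the disk
 * start of an extent map overlapping the range, else the inode's first
 * mapped extent.
 */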
u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
				     u64 num_bytes)
{
	struct extent_map_tree *em_tree = &inode->extent_tree;
	struct extent_map *em;
	u64 alloc_hint = 0;

	read_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, start, num_bytes);
	if (em) {
		/*
		 * if block start isn't an actual block number then find the
		 * first block in this inode and use that as a hint. If that
		 * block is also bogus then just don't worry about it.
		 */
		if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
			free_extent_map(em);
			em = search_extent_mapping(em_tree, 0, 0);
			if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
				alloc_hint = extent_map_block_start(em);
			if (em)
				free_extent_map(em);
		} else {
			alloc_hint = extent_map_block_start(em);
			free_extent_map(em);
		}
	}
	read_unlock(&em_tree->lock);

	return alloc_hint;
}

/*
 * When extent_io.c finds a delayed allocation range in the file,
 * the callbacks end up in this code. The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already. We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * When this function fails, it unlocks all pages except @locked_page.
 *
 * When this function successfully creates an inline extent, it returns 1 and
 * unlocks all pages including locked_page and starts I/O on them.
 * (In reality inline extents are limited to a single page, so locked_page is
 * the only page handled anyway).
 *
 * When this function succeeds and creates a normal extent, the page locking
 * status depends on the passed in flags:
 *
 * - If @keep_locked is set, all pages are kept locked.
 * - Else all pages except for @locked_page are unlocked.
 *
 * When a failure happens in the second or later iteration of the
 * while-loop, the ordered extents created in previous iterations are kept
 * intact. So, the caller must clean them up by calling
 * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
 * example.
 */
6e26c442 1332static noinline int cow_file_range(struct btrfs_inode *inode,
c56cbe90
CH
1333 struct page *locked_page, u64 start, u64 end,
1334 u64 *done_offset,
53ffb30a 1335 bool keep_locked, bool no_inline)
771ed689 1336{
6e26c442
NB
1337 struct btrfs_root *root = inode->root;
1338 struct btrfs_fs_info *fs_info = root->fs_info;
6b0a63a4 1339 struct extent_state *cached = NULL;
771ed689 1340 u64 alloc_hint = 0;
9ce7466f 1341 u64 orig_start = start;
771ed689
CM
1342 u64 num_bytes;
1343 unsigned long ram_size;
a315e68f 1344 u64 cur_alloc_size = 0;
432cd2a1 1345 u64 min_alloc_size;
0b246afa 1346 u64 blocksize = fs_info->sectorsize;
771ed689
CM
1347 struct btrfs_key ins;
1348 struct extent_map *em;
a315e68f
FM
1349 unsigned clear_bits;
1350 unsigned long page_ops;
1351 bool extent_reserved = false;
771ed689
CM
1352 int ret = 0;
1353
6e26c442 1354 if (btrfs_is_free_space_inode(inode)) {
29bce2f3
JB
1355 ret = -EINVAL;
1356 goto out_unlock;
02ecd2c2 1357 }
771ed689 1358
fda2832f 1359 num_bytes = ALIGN(end - start + 1, blocksize);
771ed689 1360 num_bytes = max(blocksize, num_bytes);
566b1760 1361 ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
771ed689 1362
6e26c442 1363 inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
4cb5300b 1364
6eecfa22 1365 if (!no_inline) {
771ed689 1366 /* lets try to make an inline extent */
47857437 1367 ret = cow_file_range_inline(inode, locked_page, start, end, 0,
d9496e8a 1368 BTRFS_COMPRESS_NONE, NULL, false);
0586d0a8 1369 if (ret <= 0) {
4750af3b 1370 /*
0586d0a8
JB
1371 * We succeeded, return 1 so the caller knows we're done
1372 * with this page and already handled the IO.
4750af3b 1373 *
0586d0a8
JB
1374 * If there was an error then cow_file_range_inline() has
1375 * already done the cleanup.
4750af3b 1376 */
0586d0a8
JB
1377 if (ret == 0)
1378 ret = 1;
6e144bf1 1379 goto done;
771ed689
CM
1380 }
1381 }
1382
9aa29a20 1383 alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes);
771ed689 1384
432cd2a1
FM
1385 /*
1386 * Relocation relies on the relocated extents to have exactly the same
1387 * size as the original extents. Normally writeback for relocation data
1388 * extents follows a NOCOW path because relocation preallocates the
1389 * extents. However, due to an operation such as scrub turning a block
1390 * group to RO mode, it may fallback to COW mode, so we must make sure
1391 * an extent allocated during COW has exactly the requested size and can
1392 * not be split into smaller extents, otherwise relocation breaks and
1393 * fails during the stage where it updates the bytenr of file extent
1394 * items.
1395 */
37f00a6d 1396 if (btrfs_is_data_reloc_root(root))
432cd2a1
FM
1397 min_alloc_size = num_bytes;
1398 else
1399 min_alloc_size = fs_info->sectorsize;
1400
3752d22f 1401 while (num_bytes > 0) {
34bfaf15 1402 struct btrfs_ordered_extent *ordered;
3d2ac992 1403 struct btrfs_file_extent file_extent;
34bfaf15 1404
3752d22f 1405 cur_alloc_size = num_bytes;
18513091 1406 ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
432cd2a1 1407 min_alloc_size, 0, alloc_hint,
e570fd27 1408 &ins, 1, 1);
6e144bf1
CH
1409 if (ret == -EAGAIN) {
1410 /*
1411 * btrfs_reserve_extent only returns -EAGAIN for zoned
1412 * file systems, which is an indication that there are
1413 * no active zones to allocate from at the moment.
1414 *
1415 * If this is the first loop iteration, wait for at
1416 * least one zone to finish before retrying the
1417 * allocation. Otherwise ask the caller to write out
1418 * the already allocated blocks before coming back to
1419 * us, or return -ENOSPC if it can't handle retries.
1420 */
1421 ASSERT(btrfs_is_zoned(fs_info));
1422 if (start == orig_start) {
1423 wait_on_bit_io(&inode->root->fs_info->flags,
1424 BTRFS_FS_NEED_ZONE_FINISH,
1425 TASK_UNINTERRUPTIBLE);
1426 continue;
1427 }
1428 if (done_offset) {
1429 *done_offset = start - 1;
1430 return 0;
1431 }
1432 ret = -ENOSPC;
1433 }
00361589 1434 if (ret < 0)
79787eaa 1435 goto out_unlock;
a315e68f
FM
1436 cur_alloc_size = ins.offset;
1437 extent_reserved = true;
d397712b 1438
771ed689 1439 ram_size = ins.offset;
3d2ac992
QW
1440 file_extent.disk_bytenr = ins.objectid;
1441 file_extent.disk_num_bytes = ins.offset;
1442 file_extent.num_bytes = ins.offset;
1443 file_extent.ram_bytes = ins.offset;
1444 file_extent.offset = 0;
1445 file_extent.compression = BTRFS_COMPRESS_NONE;
d456c25d 1446
6b0a63a4
JB
1447 lock_extent(&inode->io_tree, start, start + ram_size - 1,
1448 &cached);
d456c25d 1449
9aa29a20
FM
1450 em = btrfs_create_io_em(inode, start, &file_extent,
1451 BTRFS_ORDERED_REGULAR);
090a127a 1452 if (IS_ERR(em)) {
d456c25d 1453 unlock_extent(&inode->io_tree, start,
6b0a63a4 1454 start + ram_size - 1, &cached);
090a127a 1455 ret = PTR_ERR(em);
ace68bac 1456 goto out_reserve;
090a127a 1457 }
6f9994db 1458 free_extent_map(em);
e6dcd2dc 1459
e9ea31fb
QW
1460 ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
1461 1 << BTRFS_ORDERED_REGULAR);
34bfaf15 1462 if (IS_ERR(ordered)) {
d456c25d 1463 unlock_extent(&inode->io_tree, start,
6b0a63a4 1464 start + ram_size - 1, &cached);
34bfaf15 1465 ret = PTR_ERR(ordered);
d9f85963 1466 goto out_drop_extent_cache;
34bfaf15 1467 }
c8b97818 1468
37f00a6d 1469 if (btrfs_is_data_reloc_root(root)) {
34bfaf15
CH
1470 ret = btrfs_reloc_clone_csums(ordered);
1471
4dbd80fb
QW
1472 /*
1473 * Only drop cache here, and process as normal.
1474 *
1475 * We must not allow extent_clear_unlock_delalloc()
1476 * at out_unlock label to free meta of this ordered
1477 * extent, as its meta should be freed by
1478 * btrfs_finish_ordered_io().
1479 *
1480 * So we must continue until @start is increased to
1481 * skip current ordered extent.
1482 */
00361589 1483 if (ret)
4c0c8cfc
FM
1484 btrfs_drop_extent_map_range(inode, start,
1485 start + ram_size - 1,
1486 false);
17d217fe 1487 }
34bfaf15 1488 btrfs_put_ordered_extent(ordered);
17d217fe 1489
0b246afa 1490 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9cfa3e34 1491
f57ad937
QW
1492 /*
1493 * We're not doing compressed IO, don't unlock the first page
1494 * (which the caller expects to stay locked), don't clear any
1495 * dirty bits and don't set any writeback bits
8b62b72b 1496 *
f57ad937
QW
1497 * Do set the Ordered (Private2) bit so we know this page was
1498 * properly setup for writepage.
c8b97818 1499 */
ba9145ad 1500 page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
f57ad937 1501 page_ops |= PAGE_SET_ORDERED;
a791e35e 1502
6e26c442 1503 extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
6b0a63a4 1504 locked_page, &cached,
c2790a2e 1505 EXTENT_LOCKED | EXTENT_DELALLOC,
a315e68f 1506 page_ops);
3752d22f
AJ
1507 if (num_bytes < cur_alloc_size)
1508 num_bytes = 0;
4dbd80fb 1509 else
3752d22f 1510 num_bytes -= cur_alloc_size;
c59f8951
CM
1511 alloc_hint = ins.objectid + ins.offset;
1512 start += cur_alloc_size;
a315e68f 1513 extent_reserved = false;
4dbd80fb
QW
1514
1515 /*
1516 * btrfs_reloc_clone_csums() error, since start is increased
1517 * extent_clear_unlock_delalloc() at out_unlock label won't
1518 * free metadata of current ordered extent, we're OK to exit.
1519 */
1520 if (ret)
1521 goto out_unlock;
b888db2b 1522 }
done:
	if (done_offset)
		*done_offset = end;
	return ret;

out_drop_extent_cache:
	btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
out_reserve:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
	/*
	 * Now, we have three regions to clean up:
	 *
	 * |-------(1)----|---(2)---|-------------(3)----------|
	 * `- orig_start  `- start  `- start + cur_alloc_size  `- end
	 *
	 * We process each region below.
	 */

	clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
		EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
	page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;

	/*
	 * For the range (1). We have already instantiated the ordered extents
	 * for this region. They are cleaned up by
	 * btrfs_cleanup_ordered_extents() in e.g.
	 * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
	 * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
	 * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
	 * function.
	 *
	 * However, in case of @keep_locked, we still need to unlock the pages
	 * (except @locked_page) to ensure all the pages are unlocked.
	 */
	if (keep_locked && orig_start < start) {
		if (!locked_page)
			mapping_set_error(inode->vfs_inode.i_mapping, ret);
		extent_clear_unlock_delalloc(inode, orig_start, start - 1,
					     locked_page, NULL, 0, page_ops);
	}

	/*
	 * At this point we're unlocked, we want to make sure we're only
	 * clearing these flags under the extent lock, so lock the rest of the
	 * range and clear everything up.
	 */
	lock_extent(&inode->io_tree, start, end, NULL);

	/*
	 * For the range (2). If we reserved an extent for our delalloc range
	 * (or a subrange) and failed to create the respective ordered extent,
	 * then it means that when we reserved the extent we decremented the
	 * extent's size from the data space_info's bytes_may_use counter and
	 * incremented the space_info's bytes_reserved counter by the same
	 * amount. We must make sure extent_clear_unlock_delalloc() does not try
	 * to decrement again the data space_info's bytes_may_use counter,
	 * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
	 */
	if (extent_reserved) {
		extent_clear_unlock_delalloc(inode, start,
					     start + cur_alloc_size - 1,
					     locked_page, &cached,
					     clear_bits,
					     page_ops);
		btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
		start += cur_alloc_size;
	}

	/*
	 * For the range (3). We never touched the region. In addition to the
	 * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
	 * space_info's bytes_may_use counter, reserved in
	 * btrfs_check_data_free_space().
	 */
	if (start < end) {
		clear_bits |= EXTENT_CLEAR_DATA_RESV;
		extent_clear_unlock_delalloc(inode, start, end, locked_page,
					     &cached, clear_bits, page_ops);
		btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
	}
	return ret;
}
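
/*
 * Illustrative note (added commentary, not part of the original source): a
 * hypothetical walk through the three cleanup regions above. Suppose
 * orig_start == 0 and end == 2M - 1, the first 1M was fully processed
 * (start == 1M) and a 256K extent had just been reserved when creating the
 * ordered extent failed. Then:
 *
 *   (1) [0, 1M - 1]:         ordered extents exist and are cleaned up via
 *                            btrfs_cleanup_ordered_extents()
 *   (2) [1M, 1M + 256K - 1]: reserved but without an ordered extent, so
 *                            EXTENT_CLEAR_DATA_RESV must NOT be passed
 *   (3) [1M + 256K, 2M - 1]: untouched, so EXTENT_CLEAR_DATA_RESV is added
 *                            to release bytes_may_use
 */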

/*
 * Phase two of compressed writeback. This is the ordered portion of the code,
 * which only gets called in the order the work was queued. We walk all the
 * async extents created by compress_file_range and send them down to the disk.
 *
 * If called with @do_free == true then it'll try to finish the work and free
 * the work struct eventually.
 */
static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free)
{
	struct async_chunk *async_chunk = container_of(work, struct async_chunk,
						       work);
	struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
	struct async_extent *async_extent;
	unsigned long nr_pages;
	u64 alloc_hint = 0;

	if (do_free) {
		struct async_cow *async_cow;

		btrfs_add_delayed_iput(async_chunk->inode);
		if (async_chunk->blkcg_css)
			css_put(async_chunk->blkcg_css);

		async_cow = async_chunk->async_cow;
		if (atomic_dec_and_test(&async_cow->num_chunks))
			kvfree(async_cow);
		return;
	}

	nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
		PAGE_SHIFT;

	while (!list_empty(&async_chunk->extents)) {
		async_extent = list_entry(async_chunk->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);
		submit_one_async_extent(async_chunk, async_extent, &alloc_hint);
	}

	/* atomic_sub_return implies a barrier */
	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
	    5 * SZ_1M)
		cond_wake_up_nomb(&fs_info->async_submit_wait);
}
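
/*
 * Illustrative note (added commentary, not part of the original source): for
 * a full 512K chunk with 4K pages, nr_pages above evaluates to
 * (512K - 1 + 4K) >> PAGE_SHIFT == 128, matching the amount that
 * run_delalloc_compressed() added to async_delalloc_pages when queueing the
 * chunk, so the counter balances out once all chunks complete.
 */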

static bool run_delalloc_compressed(struct btrfs_inode *inode,
				    struct page *locked_page, u64 start,
				    u64 end, struct writeback_control *wbc)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
	struct async_cow *ctx;
	struct async_chunk *async_chunk;
	unsigned long nr_pages;
	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
	int i;
	unsigned nofs_flag;
	const blk_opf_t write_flags = wbc_to_write_flags(wbc);

	nofs_flag = memalloc_nofs_save();
	ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);
	if (!ctx)
		return false;

	set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);

	async_chunk = ctx->chunks;
	atomic_set(&ctx->num_chunks, num_chunks);

	for (i = 0; i < num_chunks; i++) {
		u64 cur_end = min(end, start + SZ_512K - 1);

		/*
		 * igrab is called higher up in the call chain, take only the
		 * lightweight reference for the callback lifetime.
		 */
		ihold(&inode->vfs_inode);
		async_chunk[i].async_cow = ctx;
		async_chunk[i].inode = inode;
		async_chunk[i].start = start;
		async_chunk[i].end = cur_end;
		async_chunk[i].write_flags = write_flags;
		INIT_LIST_HEAD(&async_chunk[i].extents);

		/*
		 * The locked_page comes all the way from writepage and it's
		 * the original page we were actually given. As we spread
		 * this large delalloc region across multiple async_chunk
		 * structs, only the first struct needs a pointer to
		 * locked_page.
		 *
		 * This way we don't need racy decisions about who is supposed
		 * to unlock it.
		 */
		if (locked_page) {
			/*
			 * Depending on the compressibility, the pages might or
			 * might not go through async. We want all of them to
			 * be accounted against wbc once. Let's do it here
			 * before the paths diverge. wbc accounting is used
			 * only for foreign writeback detection and doesn't
			 * need full accuracy. Just account the whole thing
			 * against the first page.
			 */
			wbc_account_cgroup_owner(wbc, locked_page,
						 cur_end - start);
			async_chunk[i].locked_page = locked_page;
			locked_page = NULL;
		} else {
			async_chunk[i].locked_page = NULL;
		}

		if (blkcg_css != blkcg_root_css) {
			css_get(blkcg_css);
			async_chunk[i].blkcg_css = blkcg_css;
			async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT;
		} else {
			async_chunk[i].blkcg_css = NULL;
		}

		btrfs_init_work(&async_chunk[i].work, compress_file_range,
				submit_compressed_extents);

		nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
		atomic_add(nr_pages, &fs_info->async_delalloc_pages);

		btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);

		start = cur_end + 1;
	}
	return true;
}
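
/*
 * Illustrative note (added commentary, not part of the original source): with
 * SZ_512K sized chunks, a 3M dirty range is split into
 * DIV_ROUND_UP(3M, 512K) == 6 async_chunk entries here; only the first chunk
 * carries the locked_page pointer, as explained in the comment above.
 */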

/*
 * Run the delalloc range from start to end, and write back any dirty pages
 * covered by the range.
 */
static noinline int run_delalloc_cow(struct btrfs_inode *inode,
				     struct page *locked_page, u64 start,
				     u64 end, struct writeback_control *wbc,
				     bool pages_dirty)
{
	u64 done_offset = end;
	int ret;

	while (start <= end) {
		ret = cow_file_range(inode, locked_page, start, end, &done_offset,
				     true, false);
		if (ret)
			return ret;
		extent_write_locked_range(&inode->vfs_inode, locked_page, start,
					  done_offset, wbc, pages_dirty);
		start = done_offset + 1;
	}

	return 1;
}
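
/*
 * Illustrative note (added commentary, not part of the original source):
 * cow_file_range() may make partial progress and report it through
 * @done_offset (for example when it cannot process the whole range at once),
 * in which case the loop above writes back the finished part and retries
 * from done_offset + 1 until the whole range has been submitted.
 */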

static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
			   const u64 start, const u64 end)
{
	const bool is_space_ino = btrfs_is_free_space_inode(inode);
	const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
	const u64 range_bytes = end + 1 - start;
	struct extent_io_tree *io_tree = &inode->io_tree;
	struct extent_state *cached_state = NULL;
	u64 range_start = start;
	u64 count;
	int ret;

	/*
	 * If EXTENT_NORESERVE is set it means that when the buffered write was
	 * made we did not have enough available data space and therefore did
	 * not reserve data space for it, since we thought we could do NOCOW
	 * for the respective file range (either there is a prealloc extent or
	 * the inode has the NOCOW bit set).
	 *
	 * However when we need to fall back to COW mode (because for example
	 * the block group for the corresponding extent was turned to RO mode
	 * by a scrub or relocation) we need to do the following:
	 *
	 * 1) We increment the bytes_may_use counter of the data space info.
	 *    If COW succeeds, it allocates a new data extent and after doing
	 *    that it decrements the space info's bytes_may_use counter and
	 *    increments its bytes_reserved counter by the same amount (we do
	 *    this at btrfs_add_reserved_bytes()). So we need to increment the
	 *    bytes_may_use counter to compensate (when space is reserved at
	 *    buffered write time, the bytes_may_use counter is incremented);
	 *
	 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
	 *    that if the COW path fails for any reason, it decrements (through
	 *    extent_clear_unlock_delalloc()) the bytes_may_use counter of the
	 *    data space info, which we incremented in the step above.
	 *
	 * If we need to fall back to COW and the inode corresponds to a free
	 * space cache inode or an inode of the data relocation tree, we must
	 * also increment bytes_may_use of the data space_info for the same
	 * reason. Space caches and relocated data extents always get a prealloc
	 * extent for them, however scrub or balance may have set the block
	 * group that contains that extent to RO mode and therefore force COW
	 * when starting writeback.
	 */
	lock_extent(io_tree, start, end, &cached_state);
	count = count_range_bits(io_tree, &range_start, end, range_bytes,
				 EXTENT_NORESERVE, 0, NULL);
	if (count > 0 || is_space_ino || is_reloc_ino) {
		u64 bytes = count;
		struct btrfs_fs_info *fs_info = inode->root->fs_info;
		struct btrfs_space_info *sinfo = fs_info->data_sinfo;

		if (is_space_ino || is_reloc_ino)
			bytes = range_bytes;

		spin_lock(&sinfo->lock);
		btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
		spin_unlock(&sinfo->lock);

		if (count > 0)
			clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
					 NULL);
	}
	unlock_extent(io_tree, start, end, &cached_state);

	/*
	 * Don't try to create inline extents, as a mix of inline extent that
	 * is written out and unlocked directly and a normal NOCOW extent
	 * doesn't work.
	 */
	ret = cow_file_range(inode, locked_page, start, end, NULL, false, true);
	ASSERT(ret != 1);
	return ret;
}
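
/*
 * Illustrative note (added commentary, not part of the original source): if a
 * 1M range with EXTENT_NORESERVE set on all of it falls back to COW, count
 * above is 1M, so bytes_may_use is bumped by 1M before cow_file_range()
 * runs; the COW allocation then moves that 1M from bytes_may_use to
 * bytes_reserved in btrfs_add_reserved_bytes(), keeping the data space_info
 * counters balanced.
 */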

struct can_nocow_file_extent_args {
	/* Input fields. */

	/* Start file offset of the range we want to NOCOW. */
	u64 start;
	/* End file offset (inclusive) of the range we want to NOCOW. */
	u64 end;
	bool writeback_path;
	bool strict;
	/*
	 * Free the path passed to can_nocow_file_extent() once it's not needed
	 * anymore.
	 */
	bool free_path;

	/*
	 * Output fields. Only set when can_nocow_file_extent() returns 1.
	 * The expected file extent for the NOCOW write.
	 */
	struct btrfs_file_extent file_extent;
};

/*
 * Check if we can NOCOW the file extent that the path points to.
 * This function may return with the path released, so the caller should check
 * if path->nodes[0] is NULL or not if it needs to use the path afterwards.
 *
 * Returns: < 0 on error
 *            0 if we cannot NOCOW
 *            1 if we can NOCOW
 */
static int can_nocow_file_extent(struct btrfs_path *path,
				 struct btrfs_key *key,
				 struct btrfs_inode *inode,
				 struct can_nocow_file_extent_args *args)
{
	const bool is_freespace_inode = btrfs_is_free_space_inode(inode);
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_root *root = inode->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_root *csum_root;
	u64 io_start;
	u64 extent_end;
	u8 extent_type;
	int can_nocow = 0;
	int ret = 0;
	bool nowait = path->nowait;

	fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
	extent_type = btrfs_file_extent_type(leaf, fi);

	if (extent_type == BTRFS_FILE_EXTENT_INLINE)
		goto out;

	if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
	    extent_type == BTRFS_FILE_EXTENT_REG)
		goto out;

	/*
	 * If the extent was created before the generation where the last snapshot
	 * for its subvolume was created, then this implies the extent is shared,
	 * hence we must COW.
	 */
	if (!args->strict &&
	    btrfs_file_extent_generation(leaf, fi) <=
	    btrfs_root_last_snapshot(&root->root_item))
		goto out;

	/* An explicit hole, must COW. */
	if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
		goto out;

	/* Compressed/encrypted/encoded extents must be COWed. */
	if (btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		goto out;

	extent_end = btrfs_file_extent_end(path);

	args->file_extent.disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	args->file_extent.disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
	args->file_extent.ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
	args->file_extent.offset = btrfs_file_extent_offset(leaf, fi);
	args->file_extent.compression = btrfs_file_extent_compression(leaf, fi);

	/*
	 * The following checks can be expensive, as they need to take other
	 * locks and do btree or rbtree searches, so release the path to avoid
	 * blocking other tasks for too long.
	 */
	btrfs_release_path(path);

	ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
				    key->offset - args->file_extent.offset,
				    args->file_extent.disk_bytenr, args->strict, path);
	WARN_ON_ONCE(ret > 0 && is_freespace_inode);
	if (ret != 0)
		goto out;

	if (args->free_path) {
		/*
		 * We don't need the path anymore, plus through the
		 * btrfs_lookup_csums_list() call below we will end up allocating
		 * another path. So free the path to avoid unnecessary extra
		 * memory usage.
		 */
		btrfs_free_path(path);
		path = NULL;
	}

	/* If there are pending snapshots for this root, we must COW. */
	if (args->writeback_path && !is_freespace_inode &&
	    atomic_read(&root->snapshot_force_cow))
		goto out;

	args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start;
	args->file_extent.offset += args->start - key->offset;
	io_start = args->file_extent.disk_bytenr + args->file_extent.offset;

	/*
	 * Force COW if csums exist in the range. This ensures that csums for a
	 * given extent are either valid or do not exist.
	 */
	csum_root = btrfs_csum_root(root->fs_info, io_start);
	ret = btrfs_lookup_csums_list(csum_root, io_start,
				      io_start + args->file_extent.num_bytes - 1,
				      NULL, nowait);
	WARN_ON_ONCE(ret > 0 && is_freespace_inode);
	if (ret != 0)
		goto out;

	can_nocow = 1;
out:
	if (args->free_path && path)
		btrfs_free_path(path);

	return ret < 0 ? ret : can_nocow;
}
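
/*
 * Illustrative note (added commentary, not part of the original source): a
 * hypothetical example of the trimming done above. For a file extent item at
 * key->offset == 0 covering 1M, with args->start == 4K and
 * args->end == 64K - 1, num_bytes becomes min(64K, 1M) - 4K == 60K and
 * file_extent.offset is advanced by 4K, so io_start points at the exact disk
 * byte where the NOCOW write would land.
 */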

/*
 * Called for NOCOW writeback. This checks for snapshots or COW copies of the
 * extents that exist in the file, and COWs the file as required.
 *
 * If no COW copies or snapshots exist, we write directly to the existing
 * blocks on disk.
 */
static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
				       struct page *locked_page,
				       const u64 start, const u64 end)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_root *root = inode->root;
	struct btrfs_path *path;
	u64 cow_start = (u64)-1;
	u64 cur_offset = start;
	int ret;
	bool check_prev = true;
	u64 ino = btrfs_ino(inode);
	struct can_nocow_file_extent_args nocow_args = { 0 };

	/*
	 * Normally on a zoned device we're only doing COW writes, but
	 * relocation on a zoned filesystem serializes I/O so that we're only
	 * writing sequentially and can end up here as well.
	 */
	ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root));

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}

	nocow_args.end = end;
	nocow_args.writeback_path = true;

	while (cur_offset <= end) {
		struct btrfs_block_group *nocow_bg = NULL;
		struct btrfs_ordered_extent *ordered;
		struct btrfs_key found_key;
		struct btrfs_file_extent_item *fi;
		struct extent_buffer *leaf;
		struct extent_state *cached_state = NULL;
		u64 extent_end;
		u64 nocow_end;
		int extent_type;
		bool is_prealloc;

		ret = btrfs_lookup_file_extent(NULL, root, path, ino,
					       cur_offset, 0);
		if (ret < 0)
			goto error;

		/*
		 * If there is no extent for our range when doing the initial
		 * search, then go back to the previous slot as it will be the
		 * one containing the search offset.
		 */
		if (ret > 0 && path->slots[0] > 0 && check_prev) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0] - 1);
			if (found_key.objectid == ino &&
			    found_key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		check_prev = false;
next_slot:
		/* Go to next leaf if we have exhausted the current one. */
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto error;
			if (ret > 0)
				break;
			leaf = path->nodes[0];
		}

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		/* Didn't find anything for our INO. */
		if (found_key.objectid > ino)
			break;
		/*
		 * Keep searching until we find an EXTENT_ITEM or there are no
		 * more extents for this inode.
		 */
		if (WARN_ON_ONCE(found_key.objectid < ino) ||
		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
			path->slots[0]++;
			goto next_slot;
		}

		/* Found key is not EXTENT_DATA_KEY or starts after req range. */
		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
		    found_key.offset > end)
			break;

		/*
		 * If the found extent starts after requested offset, then
		 * adjust extent_end to be right before this extent begins.
		 */
		if (found_key.offset > cur_offset) {
			extent_end = found_key.offset;
			extent_type = 0;
			goto must_cow;
		}

		/*
		 * Found extent which begins before our range and potentially
		 * intersects it.
		 */
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);
		/* If this is triggered then we have a memory corruption. */
		ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
		if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
			ret = -EUCLEAN;
			goto error;
		}
		extent_end = btrfs_file_extent_end(path);

		/*
		 * If the extent we got ends before our current offset, skip to
		 * the next extent.
		 */
		if (extent_end <= cur_offset) {
			path->slots[0]++;
			goto next_slot;
		}

		nocow_args.start = cur_offset;
		ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
		if (ret < 0)
			goto error;
		if (ret == 0)
			goto must_cow;

		ret = 0;
		nocow_bg = btrfs_inc_nocow_writers(fs_info,
				nocow_args.file_extent.disk_bytenr +
				nocow_args.file_extent.offset);
		if (!nocow_bg) {
must_cow:
			/*
			 * If we can't perform NOCOW writeback for the range,
			 * then record the beginning of the range that needs to
			 * be COWed. It will be written out before the next
			 * NOCOW range if we find one, or when exiting this
			 * loop.
			 */
			if (cow_start == (u64)-1)
				cow_start = cur_offset;
			cur_offset = extent_end;
			if (cur_offset > end)
				break;
			if (!path->nodes[0])
				continue;
			path->slots[0]++;
			goto next_slot;
		}

		/*
		 * COW the range from cow_start to found_key.offset - 1. The key
		 * contains the beginning of the first extent that can be
		 * NOCOW, following one which needs to be COWed.
		 */
		if (cow_start != (u64)-1) {
			ret = fallback_to_cow(inode, locked_page,
					      cow_start, found_key.offset - 1);
			cow_start = (u64)-1;
			if (ret) {
				btrfs_dec_nocow_writers(nocow_bg);
				goto error;
			}
		}

		nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1;
		lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state);

		is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
		if (is_prealloc) {
			struct extent_map *em;

			em = btrfs_create_io_em(inode, cur_offset,
						&nocow_args.file_extent,
						BTRFS_ORDERED_PREALLOC);
			if (IS_ERR(em)) {
				unlock_extent(&inode->io_tree, cur_offset,
					      nocow_end, &cached_state);
				btrfs_dec_nocow_writers(nocow_bg);
				ret = PTR_ERR(em);
				goto error;
			}
			free_extent_map(em);
		}

		ordered = btrfs_alloc_ordered_extent(inode, cur_offset,
				&nocow_args.file_extent,
				is_prealloc
				? (1 << BTRFS_ORDERED_PREALLOC)
				: (1 << BTRFS_ORDERED_NOCOW));
		btrfs_dec_nocow_writers(nocow_bg);
		if (IS_ERR(ordered)) {
			if (is_prealloc) {
				btrfs_drop_extent_map_range(inode, cur_offset,
							    nocow_end, false);
			}
			unlock_extent(&inode->io_tree, cur_offset,
				      nocow_end, &cached_state);
			ret = PTR_ERR(ordered);
			goto error;
		}

		if (btrfs_is_data_reloc_root(root))
			/*
			 * Error handled later, as we must prevent
			 * extent_clear_unlock_delalloc() in the error handler
			 * from freeing metadata of the created ordered extent.
			 */
			ret = btrfs_reloc_clone_csums(ordered);
		btrfs_put_ordered_extent(ordered);

		extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
					     locked_page, &cached_state,
					     EXTENT_LOCKED | EXTENT_DELALLOC |
					     EXTENT_CLEAR_DATA_RESV,
					     PAGE_UNLOCK | PAGE_SET_ORDERED);

		cur_offset = extent_end;

		/*
		 * On a btrfs_reloc_clone_csums() error, we're now OK to call
		 * the error handler, as metadata for the created ordered
		 * extent will only be freed by btrfs_finish_ordered_io().
		 */
		if (ret)
			goto error;
	}
	btrfs_release_path(path);

	if (cur_offset <= end && cow_start == (u64)-1)
		cow_start = cur_offset;

	if (cow_start != (u64)-1) {
		cur_offset = end;
		ret = fallback_to_cow(inode, locked_page, cow_start, end);
		cow_start = (u64)-1;
		if (ret)
			goto error;
	}

	btrfs_free_path(path);
	return 0;

error:
	/*
	 * If an error happened while a COW region is outstanding, cur_offset
	 * needs to be reset to cow_start to ensure the COW region is unlocked
	 * as well.
	 */
	if (cow_start != (u64)-1)
		cur_offset = cow_start;

	/*
	 * We need to lock the extent here because we're clearing DELALLOC and
	 * we're not locked at this point.
	 */
	if (cur_offset < end) {
		struct extent_state *cached = NULL;

		lock_extent(&inode->io_tree, cur_offset, end, &cached);
		extent_clear_unlock_delalloc(inode, cur_offset, end,
					     locked_page, &cached,
					     EXTENT_LOCKED | EXTENT_DELALLOC |
					     EXTENT_DEFRAG |
					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
					     PAGE_START_WRITEBACK |
					     PAGE_END_WRITEBACK);
		btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL);
	}
	btrfs_free_path(path);
	return ret;
}
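
/*
 * Illustrative note (added commentary, not part of the original source): the
 * cow_start bookkeeping above batches consecutive regions that cannot be
 * NOCOWed: cow_start is recorded at the first such extent and
 * fallback_to_cow() only runs once the next NOCOW-able extent (or the end of
 * the range) is reached, covering everything in between with a single COW
 * pass.
 */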

static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
{
	if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
		if (inode->defrag_bytes &&
		    test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
			return false;
		return true;
	}
	return false;
}

/*
 * Function to process delayed allocation (create CoW) for ranges which are
 * being touched for the first time.
 */
int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
			     u64 start, u64 end, struct writeback_control *wbc)
{
	const bool zoned = btrfs_is_zoned(inode->root->fs_info);
	int ret;

	/*
	 * The range must cover part of the @locked_page, or a return of 1
	 * can confuse the caller.
	 */
	ASSERT(!(end <= page_offset(locked_page) ||
		 start >= page_offset(locked_page) + PAGE_SIZE));

	if (should_nocow(inode, start, end)) {
		ret = run_delalloc_nocow(inode, locked_page, start, end);
		goto out;
	}

	if (btrfs_inode_can_compress(inode) &&
	    inode_need_compress(inode, start, end) &&
	    run_delalloc_compressed(inode, locked_page, start, end, wbc))
		return 1;

	if (zoned)
		ret = run_delalloc_cow(inode, locked_page, start, end, wbc,
				       true);
	else
		ret = cow_file_range(inode, locked_page, start, end, NULL,
				     false, false);

out:
	if (ret < 0)
		btrfs_cleanup_ordered_extents(inode, locked_page, start,
					      end - start + 1);
	return ret;
}

void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
				 struct extent_state *orig, u64 split)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 size;

	lockdep_assert_held(&inode->io_tree.lock);

	/* Not delalloc, ignore it. */
	if (!(orig->state & EXTENT_DELALLOC))
		return;

	size = orig->end - orig->start + 1;
	if (size > fs_info->max_extent_size) {
		u32 num_extents;
		u64 new_size;

		/*
		 * See the explanation in btrfs_merge_delalloc_extent, the same
		 * applies here, just in reverse.
		 */
		new_size = orig->end - split + 1;
		num_extents = count_max_extents(fs_info, new_size);
		new_size = split - orig->start;
		num_extents += count_max_extents(fs_info, new_size);
		if (count_max_extents(fs_info, size) >= num_extents)
			return;
	}

	spin_lock(&inode->lock);
	btrfs_mod_outstanding_extents(inode, 1);
	spin_unlock(&inode->lock);
}
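
/*
 * Illustrative note (added commentary, not part of the original source):
 * assuming a 128M max_extent_size, splitting a 128M + 8K delalloc extent
 * (2 outstanding extents) at 4K from its start yields pieces of 4K and
 * 128M + 4K, which need 1 + 2 == 3 extents in total, so one extra
 * outstanding extent is added above.
 */
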
/*
 * Handle merged delayed allocation extents so we can keep track of new extents
 * that are just merged onto old extents, such as when we are doing sequential
 * writes, so we can properly account for the metadata space we'll need.
 */
void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
				 struct extent_state *other)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 new_size, old_size;
	u32 num_extents;

	lockdep_assert_held(&inode->io_tree.lock);

	/* Not delalloc, ignore it. */
	if (!(other->state & EXTENT_DELALLOC))
		return;

	if (new->start > other->start)
		new_size = new->end - other->start + 1;
	else
		new_size = other->end - new->start + 1;

	/* We're not bigger than the max, unreserve the space and go. */
	if (new_size <= fs_info->max_extent_size) {
		spin_lock(&inode->lock);
		btrfs_mod_outstanding_extents(inode, -1);
		spin_unlock(&inode->lock);
		return;
	}

	/*
	 * We have to add up either side to figure out how many extents were
	 * accounted for before we merged into one big extent. If the number of
	 * extents we accounted for is <= the amount we need for the new range
	 * then we can return, otherwise drop. Think of it like this
	 *
	 * [ 4k][MAX_SIZE]
	 *
	 * So we've grown the extent by a MAX_SIZE extent, this would mean we
	 * need 2 outstanding extents, on one side we have 1 and the other side
	 * we have 1 so they are == and we can return. But in this case
	 *
	 * [MAX_SIZE+4k][MAX_SIZE+4k]
	 *
	 * Each range on its own accounts for 2 extents, but merged together
	 * they are only 3 extents worth of accounting, so we need to drop in
	 * this case.
	 */
	old_size = other->end - other->start + 1;
	num_extents = count_max_extents(fs_info, old_size);
	old_size = new->end - new->start + 1;
	num_extents += count_max_extents(fs_info, old_size);
	if (count_max_extents(fs_info, new_size) >= num_extents)
		return;

	spin_lock(&inode->lock);
	btrfs_mod_outstanding_extents(inode, -1);
	spin_unlock(&inode->lock);
}
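
/*
 * Illustrative note (added commentary, not part of the original source):
 * plugging a 128M max_extent_size into the [MAX_SIZE+4k][MAX_SIZE+4k] case
 * from the comment above: each side accounted for 2 outstanding extents
 * (4 in total), while count_max_extents() of the merged 256M + 8K range is
 * only 3, so one outstanding extent is dropped here.
 */
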
static void btrfs_add_delalloc_inode(struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;

	spin_lock(&root->delalloc_lock);
	ASSERT(list_empty(&inode->delalloc_inodes));
	list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
	root->nr_delalloc_inodes++;
	if (root->nr_delalloc_inodes == 1) {
		spin_lock(&fs_info->delalloc_root_lock);
		ASSERT(list_empty(&root->delalloc_root));
		list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots);
		spin_unlock(&fs_info->delalloc_root_lock);
	}
	spin_unlock(&root->delalloc_lock);
}

void btrfs_del_delalloc_inode(struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;

	lockdep_assert_held(&root->delalloc_lock);

	/*
	 * We may be called after the inode was already deleted from the list,
	 * namely in the transaction abort path btrfs_destroy_delalloc_inodes(),
	 * and then later through btrfs_clear_delalloc_extent() while the inode
	 * still has ->delalloc_bytes > 0.
	 */
	if (!list_empty(&inode->delalloc_inodes)) {
		list_del_init(&inode->delalloc_inodes);
		root->nr_delalloc_inodes--;
		if (!root->nr_delalloc_inodes) {
			ASSERT(list_empty(&root->delalloc_inodes));
			spin_lock(&fs_info->delalloc_root_lock);
			ASSERT(!list_empty(&root->delalloc_root));
			list_del_init(&root->delalloc_root);
			spin_unlock(&fs_info->delalloc_root_lock);
		}
	}
}

/*
 * Properly track delayed allocation bytes in the inode and maintain the list
 * of inodes that have pending delalloc work to be done.
 */
void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
			       u32 bits)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	lockdep_assert_held(&inode->io_tree.lock);

	if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
		WARN_ON(1);
	/*
	 * set_bit and clear bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on.
	 */
	if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
		u64 len = state->end + 1 - state->start;
		u64 prev_delalloc_bytes;
		u32 num_extents = count_max_extents(fs_info, len);

		spin_lock(&inode->lock);
		btrfs_mod_outstanding_extents(inode, num_extents);
		spin_unlock(&inode->lock);

		/* For sanity tests. */
		if (btrfs_is_testing(fs_info))
			return;

		percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
					 fs_info->delalloc_batch);
		spin_lock(&inode->lock);
		prev_delalloc_bytes = inode->delalloc_bytes;
		inode->delalloc_bytes += len;
		if (bits & EXTENT_DEFRAG)
			inode->defrag_bytes += len;
		spin_unlock(&inode->lock);

		/*
		 * We don't need to be under the protection of the inode's lock,
		 * because we are called while holding the inode's io_tree lock
		 * and are therefore protected against concurrent calls of this
		 * function and btrfs_clear_delalloc_extent().
		 */
		if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0)
			btrfs_add_delalloc_inode(inode);
	}

	if (!(state->state & EXTENT_DELALLOC_NEW) &&
	    (bits & EXTENT_DELALLOC_NEW)) {
		spin_lock(&inode->lock);
		inode->new_delalloc_bytes += state->end + 1 - state->start;
		spin_unlock(&inode->lock);
	}
}

/*
 * Once a range is no longer delalloc this function ensures that proper
 * accounting happens.
 */
void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
				 struct extent_state *state, u32 bits)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 len = state->end + 1 - state->start;
	u32 num_extents = count_max_extents(fs_info, len);

	lockdep_assert_held(&inode->io_tree.lock);

	if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
		spin_lock(&inode->lock);
		inode->defrag_bytes -= len;
		spin_unlock(&inode->lock);
	}

	/*
	 * set_bit and clear bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on.
	 */
	if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = inode->root;
		u64 new_delalloc_bytes;

		spin_lock(&inode->lock);
		btrfs_mod_outstanding_extents(inode, -num_extents);
		spin_unlock(&inode->lock);

		/*
		 * We don't reserve metadata space for space cache inodes so we
		 * don't need to call delalloc_release_metadata if there is an
		 * error.
		 */
		if (bits & EXTENT_CLEAR_META_RESV &&
		    root != fs_info->tree_root)
			btrfs_delalloc_release_metadata(inode, len, true);

		/* For sanity tests. */
		if (btrfs_is_testing(fs_info))
			return;

		if (!btrfs_is_data_reloc_root(root) &&
		    !btrfs_is_free_space_inode(inode) &&
		    !(state->state & EXTENT_NORESERVE) &&
		    (bits & EXTENT_CLEAR_DATA_RESV))
			btrfs_free_reserved_data_space_noquota(fs_info, len);

		percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
					 fs_info->delalloc_batch);
		spin_lock(&inode->lock);
		inode->delalloc_bytes -= len;
		new_delalloc_bytes = inode->delalloc_bytes;
		spin_unlock(&inode->lock);

		/*
		 * We don't need to be under the protection of the inode's lock,
		 * because we are called while holding the inode's io_tree lock
		 * and are therefore protected against concurrent calls of this
		 * function and btrfs_set_delalloc_extent().
		 */
		if (!btrfs_is_free_space_inode(inode) && new_delalloc_bytes == 0) {
			spin_lock(&root->delalloc_lock);
			btrfs_del_delalloc_inode(inode);
			spin_unlock(&root->delalloc_lock);
		}
	}

	if ((state->state & EXTENT_DELALLOC_NEW) &&
	    (bits & EXTENT_DELALLOC_NEW)) {
		spin_lock(&inode->lock);
		ASSERT(inode->new_delalloc_bytes >= len);
		inode->new_delalloc_bytes -= len;
		if (bits & EXTENT_ADD_INODE_BYTES)
			inode_add_bytes(&inode->vfs_inode, len);
		spin_unlock(&inode->lock);
	}
}

/*
 * Given a list of ordered sums, record them in the inode. This happens at IO
 * completion time based on sums calculated at bio submission time.
 */
static int add_pending_csums(struct btrfs_trans_handle *trans,
			     struct list_head *list)
{
	struct btrfs_ordered_sum *sum;
	struct btrfs_root *csum_root = NULL;
	int ret;

	list_for_each_entry(sum, list, list) {
		trans->adding_csums = true;
		if (!csum_root)
			csum_root = btrfs_csum_root(trans->fs_info,
						    sum->logical);
		ret = btrfs_csum_file_blocks(trans, csum_root, sum);
		trans->adding_csums = false;
		if (ret)
			return ret;
	}
	return 0;
}

static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
					 const u64 start,
					 const u64 len,
					 struct extent_state **cached_state)
{
	u64 search_start = start;
	const u64 end = start + len - 1;

	while (search_start < end) {
		const u64 search_len = end - search_start + 1;
		struct extent_map *em;
		u64 em_len;
		int ret = 0;

		em = btrfs_get_extent(inode, NULL, search_start, search_len);
		if (IS_ERR(em))
			return PTR_ERR(em);

		if (em->disk_bytenr != EXTENT_MAP_HOLE)
			goto next;

		em_len = em->len;
		if (em->start < search_start)
			em_len -= search_start - em->start;
		if (em_len > search_len)
			em_len = search_len;

		ret = set_extent_bit(&inode->io_tree, search_start,
				     search_start + em_len - 1,
				     EXTENT_DELALLOC_NEW, cached_state);
next:
		search_start = extent_map_end(em);
		free_extent_map(em);
		if (ret)
			return ret;
	}
	return 0;
}
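
/*
 * Illustrative note (added commentary, not part of the original source):
 * given a 16K range whose first 8K is backed by an existing extent and whose
 * last 8K is a hole, the loop above skips the mapped 8K and sets
 * EXTENT_DELALLOC_NEW only on the 8K hole, the part that will add new inode
 * bytes once the write completes.
 */
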
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
			      unsigned int extra_bits,
			      struct extent_state **cached_state)
{
	WARN_ON(PAGE_ALIGNED(end));

	if (start >= i_size_read(&inode->vfs_inode) &&
	    !(inode->flags & BTRFS_INODE_PREALLOC)) {
		/*
		 * There can't be any extents following eof in this case so just
		 * set the delalloc new bit for the range directly.
		 */
		extra_bits |= EXTENT_DELALLOC_NEW;
	} else {
		int ret;

		ret = btrfs_find_new_delalloc_bytes(inode, start,
						    end + 1 - start,
						    cached_state);
		if (ret)
			return ret;
	}

	return set_extent_bit(&inode->io_tree, start, end,
			      EXTENT_DELALLOC | extra_bits, cached_state);
}

/* See btrfs_writepage_start_hook for details on why this is required. */
struct btrfs_writepage_fixup {
	struct page *page;
	struct btrfs_inode *inode;
	struct btrfs_work work;
};

static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
{
	struct btrfs_writepage_fixup *fixup =
		container_of(work, struct btrfs_writepage_fixup, work);
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	struct extent_changeset *data_reserved = NULL;
	struct page *page = fixup->page;
	struct btrfs_inode *inode = fixup->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 page_start = page_offset(page);
	u64 page_end = page_offset(page) + PAGE_SIZE - 1;
	int ret = 0;
	bool free_delalloc_space = true;

	/*
	 * This is similar to page_mkwrite, we need to reserve the space before
	 * we take the page lock.
	 */
	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
					   PAGE_SIZE);
again:
	lock_page(page);

	/*
	 * Before we queued this fixup, we took a reference on the page.
	 * page->mapping may go NULL, but it shouldn't be moved to a different
	 * address space.
	 */
	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
		/*
		 * Unfortunately this is a little tricky, either
		 *
		 * 1) We got here and our page had already been dealt with and
		 *    we reserved our space, thus ret == 0, so we need to just
		 *    drop our space reservation and bail. This can happen the
		 *    first time we come into the fixup worker, or could happen
		 *    while waiting for the ordered extent.
		 * 2) Our page was already dealt with, but we happened to get an
		 *    ENOSPC above from the btrfs_delalloc_reserve_space. In
		 *    this case we obviously don't have anything to release, but
		 *    because the page was already dealt with we don't want to
		 *    mark the page with an error, so make sure we're resetting
		 *    ret to 0. This is why we have this check _before_ the ret
		 *    check, because we do not want to have a surprise ENOSPC
		 *    when the page was already properly dealt with.
		 */
		if (!ret) {
			btrfs_delalloc_release_extents(inode, PAGE_SIZE);
			btrfs_delalloc_release_space(inode, data_reserved,
						     page_start, PAGE_SIZE,
						     true);
		}
		ret = 0;
		goto out_page;
	}

	/*
	 * We can't mess with the page state unless it is locked, so now that
	 * it is locked bail if we failed to make our space reservation.
	 */
	if (ret)
		goto out_page;

	lock_extent(&inode->io_tree, page_start, page_end, &cached_state);

	/* Already ordered? We're done. */
	if (PageOrdered(page))
		goto out_reserved;

	ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
	if (ordered) {
		unlock_extent(&inode->io_tree, page_start, page_end,
			      &cached_state);
		unlock_page(page);
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
		goto again;
	}

	ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
					&cached_state);
	if (ret)
		goto out_reserved;

	/*
	 * Everything went as planned, we're now the owner of a dirty page with
	 * delayed allocation bits set and space reserved for our COW
	 * destination.
	 *
	 * The page was dirty when we started, nothing should have cleaned it.
	 */
	BUG_ON(!PageDirty(page));
	free_delalloc_space = false;
out_reserved:
	btrfs_delalloc_release_extents(inode, PAGE_SIZE);
	if (free_delalloc_space)
		btrfs_delalloc_release_space(inode, data_reserved, page_start,
					     PAGE_SIZE, true);
	unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
out_page:
	if (ret) {
		/*
		 * We hit ENOSPC or other errors. Update the mapping and page
		 * to reflect the errors and clean the page.
		 */
		mapping_set_error(page->mapping, ret);
		btrfs_mark_ordered_io_finished(inode, page_folio(page),
					       page_start, PAGE_SIZE, !ret);
		clear_page_dirty_for_io(page);
	}
	btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE);
	unlock_page(page);
	put_page(page);
	kfree(fixup);
	extent_changeset_free(data_reserved);
	/*
	 * As a precaution, do a delayed iput in case it would be the last iput
	 * that could need flushing space. Recursing back to the fixup worker
	 * would deadlock.
	 */
	btrfs_add_delayed_iput(inode);
}
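
/*
 * Illustrative note (added commentary, not part of the original source): the
 * overall fixup flow is: btrfs_writepage_cow_fixup() below spots a dirty
 * page without the Ordered bit, marks it Checked, takes page and inode
 * references and queues btrfs_writepage_fixup_worker() above, which
 * re-reserves delalloc space, waits out any ordered extent covering the page
 * and re-marks the range delalloc so that a later writepage pass can handle
 * it normally.
 */
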
/*
 * There are a few paths in the higher layers of the kernel that directly
 * set the page dirty bit without asking the filesystem if it is a
 * good idea. This causes problems because we want to make sure COW
 * properly happens and the data=ordered rules are followed.
 *
 * In our case any range that doesn't have the ORDERED bit set
 * hasn't been properly set up for IO. We kick off an async process
 * to fix it up. The async helper will wait for ordered extents, set
 * the delalloc bit and make it safe to write the page.
 */
int btrfs_writepage_cow_fixup(struct page *page)
{
	struct inode *inode = page->mapping->host;
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct btrfs_writepage_fixup *fixup;

	/* This page already has an ordered extent covering it. */
	if (PageOrdered(page))
		return 0;

	/*
	 * PageChecked is set below when we create a fixup worker for this
	 * page, don't try to create another one if we're already
	 * PageChecked().
	 *
	 * The extent_io writepage code will redirty the page if we send back
	 * EAGAIN.
	 */
	if (PageChecked(page))
		return -EAGAIN;

	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
	if (!fixup)
		return -EAGAIN;

	/*
	 * We are already holding a reference to this inode from
	 * write_cache_pages. We need to hold it because the space reservation
	 * takes place outside of the page lock, and we can't trust
	 * page->mapping outside of the page lock.
	 */
	ihold(inode);
	btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE);
	get_page(page);
	btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
	fixup->page = page;
	fixup->inode = BTRFS_I(inode);
	btrfs_queue_work(fs_info->fixup_workers, &fixup->work);

	return -EAGAIN;
}

static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
				       struct btrfs_inode *inode, u64 file_pos,
				       struct btrfs_file_extent_item *stack_fi,
				       const bool update_inode_bytes,
				       u64 qgroup_reserved)
{
	struct btrfs_root *root = inode->root;
	const u64 sectorsize = root->fs_info->sectorsize;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key ins;
	u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
	u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
	u64 offset = btrfs_stack_file_extent_offset(stack_fi);
	u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
	u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
	struct btrfs_drop_extents_args drop_args = { 0 };
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * We may be replacing one extent in the tree with another.
	 * The new extent is pinned in the extent map, and we don't want
	 * to drop it from the cache until it is completely in the btree.
	 *
	 * So, tell btrfs_drop_extents to leave this extent in the cache.
	 * The caller is expected to unpin it and allow it to be merged
	 * with the others.
	 */
	drop_args.path = path;
	drop_args.start = file_pos;
	drop_args.end = file_pos + num_bytes;
	drop_args.replace_extent = true;
	drop_args.extent_item_size = sizeof(*stack_fi);
	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
	if (ret)
		goto out;

	if (!drop_args.extent_inserted) {
		ins.objectid = btrfs_ino(inode);
		ins.offset = file_pos;
		ins.type = BTRFS_EXTENT_DATA_KEY;

		ret = btrfs_insert_empty_item(trans, root, path, &ins,
					      sizeof(*stack_fi));
		if (ret)
			goto out;
	}
	leaf = path->nodes[0];
	btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
	write_extent_buffer(leaf, stack_fi,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    sizeof(struct btrfs_file_extent_item));

	btrfs_mark_buffer_dirty(trans, leaf);
	btrfs_release_path(path);

	/*
	 * If we dropped an inline extent here, we know the range where it was
	 * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
	 * number of bytes only for the range containing the inline extent.
	 * The remainder of the range will be processed when clearing the
	 * EXTENT_DELALLOC bit through the ordered extent completion.
	 */
	if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
		u64 inline_size = round_down(drop_args.bytes_found, sectorsize);

		inline_size = drop_args.bytes_found - inline_size;
		btrfs_update_inode_bytes(inode, sectorsize, inline_size);
		drop_args.bytes_found -= inline_size;
		num_bytes -= sectorsize;
	}

	if (update_inode_bytes)
		btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);

	ins.objectid = disk_bytenr;
	ins.offset = disk_num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
	if (ret)
		goto out;

	ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
					       file_pos - offset,
					       qgroup_reserved, &ins);
out:
	btrfs_free_path(path);

	return ret;
}
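
/*
 * Illustrative note (added commentary, not part of the original source): a
 * hypothetical run of the inline-extent branch above. If a 500 byte inline
 * extent was dropped at file_pos == 0 with a 4K sectorsize, then
 * round_down(500, 4K) == 0 and inline_size ends up as the 500 unaligned
 * bytes: the inode's byte count is updated by +4K/-500 for that first block,
 * while the rest of the range is left to the delalloc accounting done at
 * ordered extent completion.
 */
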
static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
					 u64 start, u64 len)
{
	struct btrfs_block_group *cache;

	cache = btrfs_lookup_block_group(fs_info, start);
	ASSERT(cache);

	spin_lock(&cache->lock);
	cache->delalloc_bytes -= len;
	spin_unlock(&cache->lock);

	btrfs_put_block_group(cache);
}

static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
					     struct btrfs_ordered_extent *oe)
{
	struct btrfs_file_extent_item stack_fi;
	bool update_inode_bytes;
	u64 num_bytes = oe->num_bytes;
	u64 ram_bytes = oe->ram_bytes;

	memset(&stack_fi, 0, sizeof(stack_fi));
	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
						   oe->disk_num_bytes);
	btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
	if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
		num_bytes = oe->truncated_len;
	btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
	btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
	/* Encryption and other encoding is reserved and all 0. */

	/*
	 * For delalloc, when completing an ordered extent we update the inode's
	 * bytes when clearing the range in the inode's io tree, so pass false
	 * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
	 * except if the ordered extent was truncated.
	 */
	update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
			     test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
			     test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);

	return insert_reserved_file_extent(trans, oe->inode,
					   oe->file_offset, &stack_fi,
					   update_inode_bytes, oe->qgroup_rsv);
}
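
/*
 * Explanatory note (not from the original comments): the file extent item is
 * first assembled in the on-stack copy 'stack_fi' using the
 * btrfs_set_stack_*() accessors, which operate on plain memory, and only then
 * copied into the leaf in one go by write_extent_buffer() inside
 * insert_reserved_file_extent(). This avoids doing per-field writes directly
 * into the extent buffer pages.
 */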

/*
 * As ordered data IO finishes, this gets called so we can finish
 * an ordered extent if the range of bytes in the file it covers is
 * fully written.
 */
int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
{
	struct btrfs_inode *inode = ordered_extent->inode;
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans = NULL;
	struct extent_io_tree *io_tree = &inode->io_tree;
	struct extent_state *cached_state = NULL;
	u64 start, end;
	int compress_type = 0;
	int ret = 0;
	u64 logical_len = ordered_extent->num_bytes;
	bool freespace_inode;
	bool truncated = false;
	bool clear_reserved_extent = true;
	unsigned int clear_bits = EXTENT_DEFRAG;

	start = ordered_extent->file_offset;
	end = start + ordered_extent->num_bytes - 1;

	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
	    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
	    !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
	    !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
		clear_bits |= EXTENT_DELALLOC_NEW;

	freespace_inode = btrfs_is_free_space_inode(inode);
	if (!freespace_inode)
		btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);

	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
		ret = -EIO;
		goto out;
	}

	if (btrfs_is_zoned(fs_info))
		btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
					ordered_extent->disk_num_bytes);

	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
		truncated = true;
		logical_len = ordered_extent->truncated_len;
		/* Truncated the entire extent, don't bother adding. */
		if (!logical_len)
			goto out;
	}

	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */

		btrfs_inode_safe_disk_i_size_write(inode, 0);
		if (freespace_inode)
			trans = btrfs_join_transaction_spacecache(root);
		else
			trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			goto out;
		}
		trans->block_rsv = &inode->block_rsv;
		ret = btrfs_update_inode_fallback(trans, inode);
		if (ret) /* -ENOMEM or corruption */
			btrfs_abort_transaction(trans, ret);
		goto out;
	}

	clear_bits |= EXTENT_LOCKED;
	lock_extent(io_tree, start, end, &cached_state);

	if (freespace_inode)
		trans = btrfs_join_transaction_spacecache(root);
	else
		trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		goto out;
	}

	trans->block_rsv = &inode->block_rsv;

	ret = btrfs_insert_raid_extent(trans, ordered_extent);
	if (ret)
		goto out;

	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
		compress_type = ordered_extent->compress_type;
	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
		BUG_ON(compress_type);
		ret = btrfs_mark_extent_written(trans, inode,
						ordered_extent->file_offset,
						ordered_extent->file_offset +
						logical_len);
		btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
						  ordered_extent->disk_num_bytes);
	} else {
		BUG_ON(root == fs_info->tree_root);
		ret = insert_ordered_extent_file_extent(trans, ordered_extent);
		if (!ret) {
			clear_reserved_extent = false;
			btrfs_release_delalloc_bytes(fs_info,
						     ordered_extent->disk_bytenr,
						     ordered_extent->disk_num_bytes);
		}
	}
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	ret = unpin_extent_cache(inode, ordered_extent->file_offset,
				 ordered_extent->num_bytes, trans->transid);
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	ret = add_pending_csums(trans, &ordered_extent->list);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	/*
	 * If this is a new delalloc range, clear its new delalloc flag to
	 * update the inode's number of bytes. This needs to be done first
	 * before updating the inode item.
	 */
	if ((clear_bits & EXTENT_DELALLOC_NEW) &&
	    !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
		clear_extent_bit(&inode->io_tree, start, end,
				 EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
				 &cached_state);

	btrfs_inode_safe_disk_i_size_write(inode, 0);
	ret = btrfs_update_inode_fallback(trans, inode);
	if (ret) { /* -ENOMEM or corruption */
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
out:
	clear_extent_bit(&inode->io_tree, start, end, clear_bits,
			 &cached_state);

	if (trans)
		btrfs_end_transaction(trans);

	if (ret || truncated) {
		u64 unwritten_start = start;

		/*
		 * If we failed to finish this ordered extent for any reason we
		 * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
		 * extent, and mark the inode with the error if it wasn't
		 * already set. Any error during writeback would have already
		 * set the mapping error, so we need to set it if we're the ones
		 * marking this ordered extent as failed.
		 */
		if (ret)
			btrfs_mark_ordered_extent_error(ordered_extent);

		if (truncated)
			unwritten_start += logical_len;
		clear_extent_uptodate(io_tree, unwritten_start, end, NULL);

		/*
		 * Drop extent maps for the part of the extent we didn't write.
		 *
		 * We have an exception here for the free_space_inode, this is
		 * because when we do btrfs_get_extent() on the free space inode
		 * we will search the commit root. If this is a new block group
		 * we won't find anything, and we will trip over the assert in
		 * writepage where we do ASSERT(em->block_start !=
		 * EXTENT_MAP_HOLE).
		 *
		 * Theoretically we could also skip this for any NOCOW extent as
		 * we don't mess with the extent map tree in the NOCOW case, but
		 * for now simply skip this if we are the free space inode.
		 */
		if (!btrfs_is_free_space_inode(inode))
			btrfs_drop_extent_map_range(inode, unwritten_start,
						    end, false);

		/*
		 * If the ordered extent had an IOERR or something else went
		 * wrong we need to return the space for this ordered extent
		 * back to the allocator. We only free the extent in the
		 * truncated case if we didn't write out the extent at all.
		 *
		 * If we made it past insert_reserved_file_extent before we
		 * errored out then we don't need to do this as the accounting
		 * has already been done.
		 */
		if ((ret || !logical_len) &&
		    clear_reserved_extent &&
		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
			/*
			 * Discard the range before returning it back to the
			 * free space pool.
			 */
			if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
				btrfs_discard_extent(fs_info,
						ordered_extent->disk_bytenr,
						ordered_extent->disk_num_bytes,
						NULL);
			btrfs_free_reserved_extent(fs_info,
					ordered_extent->disk_bytenr,
					ordered_extent->disk_num_bytes, 1);
			/*
			 * Actually free the qgroup rsv which was released when
			 * the ordered extent was created.
			 */
			btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root),
						  ordered_extent->qgroup_rsv,
						  BTRFS_QGROUP_RSV_DATA);
		}
	}

	/*
	 * This needs to be done to make sure anybody waiting knows we are done
	 * updating everything for this ordered extent.
	 */
	btrfs_remove_ordered_extent(inode, ordered_extent);

	/* once for us */
	btrfs_put_ordered_extent(ordered_extent);
	/* once for the tree */
	btrfs_put_ordered_extent(ordered_extent);

	return ret;
}

int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
{
	if (btrfs_is_zoned(ordered->inode->root->fs_info) &&
	    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
	    list_empty(&ordered->bioc_list))
		btrfs_finish_ordered_zoned(ordered);
	return btrfs_finish_one_ordered(ordered);
}

/*
 * Verify the checksum for a single sector without any extra action that
 * depends on the type of I/O.
 */
int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
			    u32 pgoff, u8 *csum, const u8 * const csum_expected)
{
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	char *kaddr;

	ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);

	shash->tfm = fs_info->csum_shash;

	kaddr = kmap_local_page(page) + pgoff;
	crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
	kunmap_local(kaddr);

	if (memcmp(csum, csum_expected, fs_info->csum_size))
		return -EIO;
	return 0;
}
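
/*
 * Illustrative usage sketch (hypothetical helper, not part of the kernel
 * tree): verify every sector of one page against an array of expected
 * checksums, assuming 'csums' holds one csum_size entry per sector in page
 * order.
 */
static int check_page_csums_sketch(struct btrfs_fs_info *fs_info,
				   struct page *page, const u8 *csums)
{
	u8 csum[BTRFS_CSUM_SIZE];
	u32 pgoff;

	for (pgoff = 0; pgoff < PAGE_SIZE; pgoff += fs_info->sectorsize) {
		const u8 *expected = csums +
			(pgoff >> fs_info->sectorsize_bits) * fs_info->csum_size;

		/* Returns -EIO on mismatch, 0 when the sector checksum matches. */
		if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, expected))
			return -EIO;
	}
	return 0;
}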

/*
 * Verify the checksum of a single data sector.
 *
 * @bbio:       btrfs_bio which contains the csum
 * @dev:        device the sector is on
 * @bio_offset: offset to the beginning of the bio (in bytes)
 * @bv:         bio_vec to check
 *
 * Check if the checksum on a data block is valid. When a checksum mismatch is
 * detected, report the error and fill the corrupted range with zero.
 *
 * Return %true if the sector is ok or had no checksum to start with, else %false.
 */
bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
			u32 bio_offset, struct bio_vec *bv)
{
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 file_offset = bbio->file_offset + bio_offset;
	u64 end = file_offset + bv->bv_len - 1;
	u8 *csum_expected;
	u8 csum[BTRFS_CSUM_SIZE];

	ASSERT(bv->bv_len == fs_info->sectorsize);

	if (!bbio->csum)
		return true;

	if (btrfs_is_data_reloc_root(inode->root) &&
	    test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
			   NULL)) {
		/* Skip the range without csum for data reloc inode */
		clear_extent_bits(&inode->io_tree, file_offset, end,
				  EXTENT_NODATASUM);
		return true;
	}

	csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
		fs_info->csum_size;
	if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum,
				    csum_expected))
		goto zeroit;
	return true;

zeroit:
	btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected,
				    bbio->mirror_num);
	if (dev)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
	memzero_bvec(bv);
	return false;
}
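
/*
 * Illustrative caller sketch (hypothetical helper, not part of the kernel
 * tree; assumes struct btrfs_bio embeds its struct bio as the 'bio' member):
 * walk every sector-sized segment of a read bio and count checksum failures.
 */
static u32 count_csum_failures_sketch(struct btrfs_bio *bbio,
				      struct btrfs_device *dev)
{
	struct bvec_iter iter;
	struct bio_vec bv;
	u32 bio_offset = 0;
	u32 failures = 0;

	bio_for_each_segment(bv, &bbio->bio, iter) {
		/* btrfs_data_csum_ok() zeroes the bvec on a bad checksum. */
		if (!btrfs_data_csum_ok(bbio, dev, bio_offset, &bv))
			failures++;
		bio_offset += bv.bv_len;
	}
	return failures;
}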

/*
 * Perform a delayed iput on @inode.
 *
 * @inode: The inode we want to perform iput on
 *
 * This function uses the generic vfs_inode::i_count to track whether we should
 * just decrement it (in case it's > 1) or if this is the last iput then link
 * the inode to the delayed iput machinery. Delayed iputs are processed at
 * transaction commit time/superblock commit/cleaner kthread.
 */
void btrfs_add_delayed_iput(struct btrfs_inode *inode)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	unsigned long flags;

	if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
		return;

	atomic_inc(&fs_info->nr_delayed_iputs);
	/*
	 * Need to be irq safe here because we can be called from either an irq
	 * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq
	 * context.
	 */
	spin_lock_irqsave(&fs_info->delayed_iput_lock, flags);
	ASSERT(list_empty(&inode->delayed_iput));
	list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
	spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags);
	if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
		wake_up_process(fs_info->cleaner_kthread);
}
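
/*
 * Note on the fast path above (explanatory, values illustrative):
 * atomic_add_unless(v, -1, 1) decrements v and returns non-zero unless v
 * equals 1. So with i_count == 3 we simply drop to 2 and return; only when
 * i_count == 1 (ours is the final reference) do we fall through and queue
 * the inode on the delayed iput list instead of calling iput() directly.
 */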

static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
				    struct btrfs_inode *inode)
{
	list_del_init(&inode->delayed_iput);
	spin_unlock_irq(&fs_info->delayed_iput_lock);
	iput(&inode->vfs_inode);
	if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
		wake_up(&fs_info->delayed_iputs_wait);
	spin_lock_irq(&fs_info->delayed_iput_lock);
}

static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
				   struct btrfs_inode *inode)
{
	if (!list_empty(&inode->delayed_iput)) {
		spin_lock_irq(&fs_info->delayed_iput_lock);
		if (!list_empty(&inode->delayed_iput))
			run_delayed_iput_locked(fs_info, inode);
		spin_unlock_irq(&fs_info->delayed_iput_lock);
	}
}

void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
{
	/*
	 * btrfs_put_ordered_extent() can run in irq context (see bio.c), which
	 * calls btrfs_add_delayed_iput() and that needs to lock
	 * fs_info->delayed_iput_lock. So we need to disable irqs here to
	 * prevent a deadlock.
	 */
	spin_lock_irq(&fs_info->delayed_iput_lock);
	while (!list_empty(&fs_info->delayed_iputs)) {
		struct btrfs_inode *inode;

		inode = list_first_entry(&fs_info->delayed_iputs,
					 struct btrfs_inode, delayed_iput);
		run_delayed_iput_locked(fs_info, inode);
		if (need_resched()) {
			spin_unlock_irq(&fs_info->delayed_iput_lock);
			cond_resched();
			spin_lock_irq(&fs_info->delayed_iput_lock);
		}
	}
	spin_unlock_irq(&fs_info->delayed_iput_lock);
}

/*
 * Wait for all delayed iputs to complete.
 *
 * @fs_info: the filesystem
 *
 * This will wait on any delayed iputs that are currently running with KILLABLE
 * set. Once they are all done running we will return, unless we are killed in
 * which case we return -EINTR. This helps in user operations like fallocate
 * etc. that might get blocked on the iputs.
 *
 * Return -EINTR if we were killed, 0 if nothing's pending.
 */
int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
{
	int ret = wait_event_killable(fs_info->delayed_iputs_wait,
			atomic_read(&fs_info->nr_delayed_iputs) == 0);
	if (ret)
		return -EINTR;
	return 0;
}
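
/*
 * Illustrative caller pattern (hypothetical helper, not part of the kernel
 * tree): run any queued delayed iputs ourselves, then wait killable until
 * the in-flight ones are finished.
 */
static int flush_delayed_iputs_sketch(struct btrfs_fs_info *fs_info)
{
	btrfs_run_delayed_iputs(fs_info);
	return btrfs_wait_on_delayed_iputs(fs_info);
}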

/*
 * This creates an orphan entry for the given inode in case something goes
 * wrong in the middle of an unlink.
 */
int btrfs_orphan_add(struct btrfs_trans_handle *trans,
		     struct btrfs_inode *inode)
{
	int ret;

	ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
	if (ret && ret != -EEXIST) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	return 0;
}

/*
 * We have done the delete so we can go ahead and remove the orphan item for
 * this particular inode.
 */
static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
			    struct btrfs_inode *inode)
{
	return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
}
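
/*
 * Explanatory note (not from the source): btrfs_orphan_add() and
 * btrfs_orphan_del() bracket operations that may leave an inode unreachable.
 * For example, unlink adds an orphan item once i_nlink drops to zero, and
 * eviction of that inode later removes both the inode items and the orphan
 * item; if we crash in between, btrfs_orphan_cleanup() below finishes the
 * job on the next mount.
 */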

/*
 * This cleans up any orphans that may be left on the list from the last use
 * of this root.
 */
int btrfs_orphan_cleanup(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key, found_key;
	struct btrfs_trans_handle *trans;
	struct inode *inode;
	u64 last_objectid = 0;
	int ret = 0, nr_unlink = 0;

	if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
		return 0;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}
	path->reada = READA_BACK;

	key.objectid = BTRFS_ORPHAN_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out;

		/*
		 * If ret == 0 we found what we were searching for, which is
		 * weird, but possible, so only adjust the path if we didn't
		 * find the key and see if we have stuff that matches.
		 */
		if (ret > 0) {
			ret = 0;
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		/* pull out the item */
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		/* make sure the item matches what we want */
		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
			break;
		if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		/* release the path since we're done with it */
		btrfs_release_path(path);

		/*
		 * This is where we are basically btrfs_lookup, without the
		 * crossing root thing. We store the inode number in the
		 * offset of the orphan item.
		 */

		if (found_key.offset == last_objectid) {
			/*
			 * We found the same inode as before. This means we were
			 * not able to remove its items via eviction triggered
			 * by an iput(). A transaction abort may have happened,
			 * due to -ENOSPC for example, so try to grab the error
			 * that led to a transaction abort, if any.
			 */
			btrfs_err(fs_info,
				  "Error removing orphan entry, stopping orphan cleanup");
			ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL;
			goto out;
		}

		last_objectid = found_key.offset;

		found_key.objectid = found_key.offset;
		found_key.type = BTRFS_INODE_ITEM_KEY;
		found_key.offset = 0;
		inode = btrfs_iget(last_objectid, root);
		if (IS_ERR(inode)) {
			ret = PTR_ERR(inode);
			inode = NULL;
			if (ret != -ENOENT)
				goto out;
		}

		if (!inode && root == fs_info->tree_root) {
			struct btrfs_root *dead_root;
			int is_dead_root = 0;

			/*
			 * This is an orphan in the tree root. Currently these
			 * could come from 2 sources:
			 * a) a root (snapshot/subvolume) deletion in progress
			 * b) a free space cache inode
			 * We need to distinguish those two, as the orphan item
			 * for a root must not get deleted before the deletion
			 * of the snapshot/subvolume's tree completes.
			 *
			 * btrfs_find_orphan_roots() ran before us, which has
			 * found all deleted roots and loaded them into
			 * fs_info->fs_roots_radix. So here we can find if an
			 * orphan item corresponds to a deleted root by looking
			 * up the root from that radix tree.
			 */

			spin_lock(&fs_info->fs_roots_radix_lock);
			dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
					(unsigned long)found_key.objectid);
			if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
				is_dead_root = 1;
			spin_unlock(&fs_info->fs_roots_radix_lock);

			if (is_dead_root) {
				/* prevent this orphan from being found again */
				key.offset = found_key.objectid - 1;
				continue;
			}
		}

		/*
		 * If we have an inode with links, there are a couple of
		 * possibilities:
		 *
		 * 1. We were halfway through creating fsverity metadata for the
		 *    file. In that case, the orphan item represents incomplete
		 *    fsverity metadata which must be cleaned up with
		 *    btrfs_drop_verity_items and deleting the orphan item.
		 *
		 * 2. Old kernels (before v3.12) used to create an
		 *    orphan item for truncate indicating that there were possibly
		 *    extent items past i_size that needed to be deleted. In v3.12,
		 *    truncate was changed to update i_size in sync with the extent
		 *    items, but the (useless) orphan item was still created. Since
		 *    v4.18, we don't create the orphan item for truncate at all.
		 *
		 * So, this item could mean that we need to do a truncate, but
		 * only if this filesystem was last used on a pre-v3.12 kernel
		 * and was not cleanly unmounted. The odds of that are quite
		 * slim, and it's a pain to do the truncate now, so just delete
		 * the orphan item.
		 *
		 * It's also possible that this orphan item was supposed to be
		 * deleted but wasn't. The inode number may have been reused,
		 * but either way, we can delete the orphan item.
		 */
		if (!inode || inode->i_nlink) {
			if (inode) {
				ret = btrfs_drop_verity_items(BTRFS_I(inode));
				iput(inode);
				inode = NULL;
				if (ret)
					goto out;
			}
			trans = btrfs_start_transaction(root, 1);
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				goto out;
			}
			btrfs_debug(fs_info, "auto deleting %Lu",
				    found_key.objectid);
			ret = btrfs_del_orphan_item(trans, root,
						    found_key.objectid);
			btrfs_end_transaction(trans);
			if (ret)
				goto out;
			continue;
		}

		nr_unlink++;

		/* this will do delete_inode and everything for us */
		iput(inode);
	}
	/* release the path since we're done with it */
	btrfs_release_path(path);

	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
		trans = btrfs_join_transaction(root);
		if (!IS_ERR(trans))
			btrfs_end_transaction(trans);
	}

	if (nr_unlink)
		btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);

out:
	if (ret)
		btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
	btrfs_free_path(path);
	return ret;
}
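
/*
 * Explanatory note (not from the source): the loop above iterates orphan
 * items from the highest key downwards: each btrfs_search_slot() with
 * offset (u64)-1 lands just past the last remaining orphan item and the
 * slot is stepped back by one. For a deleted root the orphan item must
 * stay, so "key.offset = found_key.objectid - 1" simply resumes the scan
 * below it instead of re-finding the same item forever.
 */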

/*
 * Very simple check to peek ahead in the leaf looking for xattrs. If we
 * don't find any xattrs, we know there can't be any acls.
 *
 * slot is the slot the inode is in, objectid is the objectid of the inode
 */
static noinline int acls_after_inode_item(struct extent_buffer *leaf,
					  int slot, u64 objectid,
					  int *first_xattr_slot)
{
	u32 nritems = btrfs_header_nritems(leaf);
	struct btrfs_key found_key;
	static u64 xattr_access = 0;
	static u64 xattr_default = 0;
	int scanned = 0;

	if (!xattr_access) {
		xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
					strlen(XATTR_NAME_POSIX_ACL_ACCESS));
		xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
					strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
	}

	slot++;
	*first_xattr_slot = -1;
	while (slot < nritems) {
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		/* we found a different objectid, there must not be acls */
		if (found_key.objectid != objectid)
			return 0;

		/* we found an xattr, assume we've got an acl */
		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
			if (*first_xattr_slot == -1)
				*first_xattr_slot = slot;
			if (found_key.offset == xattr_access ||
			    found_key.offset == xattr_default)
				return 1;
		}

		/*
		 * we found a key greater than an xattr key, there can't
		 * be any acls later on
		 */
		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
			return 0;

		slot++;
		scanned++;

		/*
		 * it goes inode, inode backrefs, xattrs, extents,
		 * so if there are a ton of hard links to an inode there can
		 * be a lot of backrefs. Don't waste time searching too hard,
		 * this is just an optimization
		 */
		if (scanned >= 8)
			break;
	}
	/*
	 * we hit the end of the leaf before we found an xattr or
	 * something larger than an xattr. We have to assume the inode
	 * has acls
	 */
	if (*first_xattr_slot == -1)
		*first_xattr_slot = slot;
	return 1;
}

static int btrfs_init_file_extent_tree(struct btrfs_inode *inode)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	if (WARN_ON_ONCE(inode->file_extent_tree))
		return 0;
	if (btrfs_fs_incompat(fs_info, NO_HOLES))
		return 0;
	if (!S_ISREG(inode->vfs_inode.i_mode))
		return 0;
	if (btrfs_is_free_space_inode(inode))
		return 0;

	inode->file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
	if (!inode->file_extent_tree)
		return -ENOMEM;

	extent_io_tree_init(fs_info, inode->file_extent_tree, IO_TREE_INODE_FILE_EXTENT);
	/* Lockdep class is set only for the file extent tree. */
	lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class);

	return 0;
}

/*
 * Read an inode from the btree into the in-memory inode.
 */
static int btrfs_read_locked_inode(struct inode *inode,
				   struct btrfs_path *in_path)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct btrfs_path *path = in_path;
	struct extent_buffer *leaf;
	struct btrfs_inode_item *inode_item;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_key location;
	unsigned long ptr;
	int maybe_acls;
	u32 rdev;
	int ret;
	bool filled = false;
	int first_xattr_slot;

	ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
	if (ret)
		return ret;

	ret = btrfs_fill_inode(inode, &rdev);
	if (!ret)
		filled = true;

	if (!path) {
		path = btrfs_alloc_path();
		if (!path)
			return -ENOMEM;
	}

	btrfs_get_inode_key(BTRFS_I(inode), &location);

	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
	if (ret) {
		if (path != in_path)
			btrfs_free_path(path);
		return ret;
	}

	leaf = path->nodes[0];

	if (filled)
		goto cache_index;

	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
	btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
			round_up(i_size_read(inode), fs_info->sectorsize));

	inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime),
			btrfs_timespec_nsec(leaf, &inode_item->atime));

	inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
			btrfs_timespec_nsec(leaf, &inode_item->mtime));

	inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
			btrfs_timespec_nsec(leaf, &inode_item->ctime));

	BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
	BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);

	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);

	inode_set_iversion_queried(inode,
				   btrfs_inode_sequence(leaf, inode_item));
	inode->i_generation = BTRFS_I(inode)->generation;
	inode->i_rdev = 0;
	rdev = btrfs_inode_rdev(leaf, inode_item);

	if (S_ISDIR(inode->i_mode))
		BTRFS_I(inode)->index_cnt = (u64)-1;

	btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
				&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);

cache_index:
	/*
	 * If we were modified in the current generation and evicted from memory
	 * and then re-read we need to do a full sync since we don't have any
	 * idea about which extents were modified before we were evicted from
	 * cache.
	 *
	 * This is required for both inode re-read from disk and delayed inode
	 * in the delayed_nodes xarray.
	 */
	if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			&BTRFS_I(inode)->runtime_flags);

	/*
	 * We don't persist the id of the transaction where an unlink operation
	 * against the inode was last made. So here we assume the inode might
	 * have been evicted, and therefore the exact value of last_unlink_trans
	 * lost, and set it to last_trans to avoid metadata inconsistencies
	 * between the inode and its parent if the inode is fsync'ed and the log
	 * replayed. For example, in the scenario:
	 *
	 * touch mydir/foo
	 * ln mydir/foo mydir/bar
	 * sync
	 * unlink mydir/bar
	 * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
	 * xfs_io -c fsync mydir/foo
	 * <power failure>
	 * mount fs, triggers fsync log replay
	 *
	 * We must make sure that when we fsync our inode foo we also log its
	 * parent inode, otherwise after log replay the parent still has the
	 * dentry with the "bar" name but our inode foo has a link count of 1
	 * and doesn't have an inode ref with the name "bar" anymore.
	 *
	 * Setting last_unlink_trans to last_trans is a pessimistic approach,
	 * but it guarantees correctness at the expense of occasional full
	 * transaction commits on fsync if our inode is a directory, or if our
	 * inode is not a directory, logging its parent unnecessarily.
	 */
	BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;

	/*
	 * Same logic as for last_unlink_trans. We don't persist the generation
	 * of the last transaction where this inode was used for a reflink
	 * operation, so after eviction and reloading the inode we must be
	 * pessimistic and assume the last transaction that modified the inode.
	 */
	BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;

	path->slots[0]++;
	if (inode->i_nlink != 1 ||
	    path->slots[0] >= btrfs_header_nritems(leaf))
		goto cache_acl;

	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
	if (location.objectid != btrfs_ino(BTRFS_I(inode)))
		goto cache_acl;

	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
	if (location.type == BTRFS_INODE_REF_KEY) {
		struct btrfs_inode_ref *ref;

		ref = (struct btrfs_inode_ref *)ptr;
		BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
	} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *extref;

		extref = (struct btrfs_inode_extref *)ptr;
		BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
								     extref);
	}
cache_acl:
	/*
	 * try to precache a NULL acl entry for files that don't have
	 * any xattrs or acls
	 */
	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
				btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
	if (first_xattr_slot != -1) {
		path->slots[0] = first_xattr_slot;
		ret = btrfs_load_inode_props(inode, path);
		if (ret)
			btrfs_err(fs_info,
				  "error loading props for ino %llu (root %llu): %d",
				  btrfs_ino(BTRFS_I(inode)),
				  btrfs_root_id(root), ret);
	}
	if (path != in_path)
		btrfs_free_path(path);

	if (!maybe_acls)
		cache_no_acl(inode);

	switch (inode->i_mode & S_IFMT) {
	case S_IFREG:
		inode->i_mapping->a_ops = &btrfs_aops;
		inode->i_fop = &btrfs_file_operations;
		inode->i_op = &btrfs_file_inode_operations;
		break;
	case S_IFDIR:
		inode->i_fop = &btrfs_dir_file_operations;
		inode->i_op = &btrfs_dir_inode_operations;
		break;
	case S_IFLNK:
		inode->i_op = &btrfs_symlink_inode_operations;
		inode_nohighmem(inode);
		inode->i_mapping->a_ops = &btrfs_aops;
		break;
	default:
		inode->i_op = &btrfs_special_inode_operations;
		init_special_inode(inode, inode->i_mode, rdev);
		break;
	}

	btrfs_sync_inode_flags_to_i_flags(inode);
	return 0;
}

/*
 * Given a leaf and an inode, copy the inode fields into the leaf.
 */
static void fill_inode_item(struct btrfs_trans_handle *trans,
			    struct extent_buffer *leaf,
			    struct btrfs_inode_item *item,
			    struct inode *inode)
{
	struct btrfs_map_token token;
	u64 flags;

	btrfs_init_map_token(&token, leaf);

	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
	btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);

	btrfs_set_token_timespec_sec(&token, &item->atime,
				     inode_get_atime_sec(inode));
	btrfs_set_token_timespec_nsec(&token, &item->atime,
				      inode_get_atime_nsec(inode));

	btrfs_set_token_timespec_sec(&token, &item->mtime,
				     inode_get_mtime_sec(inode));
	btrfs_set_token_timespec_nsec(&token, &item->mtime,
				      inode_get_mtime_nsec(inode));

	btrfs_set_token_timespec_sec(&token, &item->ctime,
				     inode_get_ctime_sec(inode));
	btrfs_set_token_timespec_nsec(&token, &item->ctime,
				      inode_get_ctime_nsec(inode));

	btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec);
	btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec);

	btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
	btrfs_set_token_inode_generation(&token, item,
					 BTRFS_I(inode)->generation);
	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
	btrfs_set_token_inode_transid(&token, item, trans->transid);
	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
					  BTRFS_I(inode)->ro_flags);
	btrfs_set_token_inode_flags(&token, item, flags);
	btrfs_set_token_inode_block_group(&token, item, 0);
}

/*
 * Copy everything in the in-memory inode into the btree.
 */
static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
					    struct btrfs_inode *inode)
{
	struct btrfs_inode_item *inode_item;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	btrfs_get_inode_key(inode, &key);
	ret = btrfs_lookup_inode(trans, inode->root, path, &key, 1);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		goto failed;
	}

	leaf = path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);

	fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
	btrfs_mark_buffer_dirty(trans, leaf);
	btrfs_set_inode_last_trans(trans, inode);
	ret = 0;
failed:
	btrfs_free_path(path);
	return ret;
}

/*
 * Copy everything in the in-memory inode into the btree.
 */
int btrfs_update_inode(struct btrfs_trans_handle *trans,
		       struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	/*
	 * If the inode is a free space inode, we can deadlock during commit
	 * if we put it into the delayed code.
	 *
	 * The data relocation inode should also be directly updated
	 * without delay.
	 */
	if (!btrfs_is_free_space_inode(inode)
	    && !btrfs_is_data_reloc_root(root)
	    && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
		btrfs_update_root_times(trans, root);

		ret = btrfs_delayed_update_inode(trans, inode);
		if (!ret)
			btrfs_set_inode_last_trans(trans, inode);
		return ret;
	}

	return btrfs_update_inode_item(trans, inode);
}

int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
				struct btrfs_inode *inode)
{
	int ret;

	ret = btrfs_update_inode(trans, inode);
	if (ret == -ENOSPC)
		return btrfs_update_inode_item(trans, inode);
	return ret;
}
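
/*
 * Minimal usage sketch (hypothetical helper, error handling reduced): dirty
 * an in-memory timestamp and persist it through btrfs_update_inode(), which
 * picks either the delayed-inode path or a direct item update as above.
 */
static int touch_ctime_sketch(struct btrfs_inode *inode)
{
	struct btrfs_trans_handle *trans;
	int ret;

	trans = btrfs_start_transaction(inode->root, 1);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	inode_set_ctime_current(&inode->vfs_inode);
	ret = btrfs_update_inode(trans, inode);
	btrfs_end_transaction(trans);
	return ret;
}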

/*
 * Unlink helper that gets used here in inode.c and in the tree logging
 * recovery code. It removes a link in a directory with a given name, and
 * also drops the back refs in the inode to the directory.
 */
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
				struct btrfs_inode *dir,
				struct btrfs_inode *inode,
				const struct fscrypt_str *name,
				struct btrfs_rename_ctx *rename_ctx)
{
	struct btrfs_root *root = dir->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	int ret = 0;
	struct btrfs_dir_item *di;
	u64 index;
	u64 ino = btrfs_ino(inode);
	u64 dir_ino = btrfs_ino(dir);

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
	if (IS_ERR_OR_NULL(di)) {
		ret = di ? PTR_ERR(di) : -ENOENT;
		goto err;
	}
	ret = btrfs_delete_one_dir_name(trans, root, path, di);
	if (ret)
		goto err;
	btrfs_release_path(path);

	/*
	 * If we don't have the dir index, we have to get it by looking up
	 * the inode ref. Since we get the inode ref anyway, remove it
	 * directly; there is no need for delayed deletion.
	 *
	 * But if we do have the dir index, there is no need to search the
	 * inode ref to get it. Since the inode ref is close to the inode
	 * item, it is better to delay its deletion and do it when we update
	 * the inode item.
	 */
	if (inode->dir_index) {
		ret = btrfs_delayed_delete_inode_ref(inode);
		if (!ret) {
			index = inode->dir_index;
			goto skip_backref;
		}
	}

	ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
	if (ret) {
		btrfs_info(fs_info,
			   "failed to delete reference to %.*s, inode %llu parent %llu",
			   name->len, name->name, ino, dir_ino);
		btrfs_abort_transaction(trans, ret);
		goto err;
	}
skip_backref:
	if (rename_ctx)
		rename_ctx->index = index;

	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto err;
	}

	/*
	 * If we are in a rename context, we don't need to update anything in the
	 * log. That will be done later during the rename by btrfs_log_new_name().
	 * Besides that, doing it here would only cause extra unnecessary btree
	 * operations on the log tree, increasing latency for applications.
	 */
	if (!rename_ctx) {
		btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino);
		btrfs_del_dir_entries_in_log(trans, root, name, dir, index);
	}

	/*
	 * If we have a pending delayed iput we could end up with the final iput
	 * being run in btrfs-cleaner context. If we have enough of these built
	 * up we can end up burning a lot of time in btrfs-cleaner without any
	 * way to throttle the unlinks. Since we're currently holding a ref on
	 * the inode we can run the delayed iput here without any issues as the
	 * final iput won't be done until after we drop the ref we're currently
	 * holding.
	 */
	btrfs_run_delayed_iput(fs_info, inode);
err:
	btrfs_free_path(path);
	if (ret)
		goto out;

	btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
	inode_inc_iversion(&inode->vfs_inode);
	inode_set_ctime_current(&inode->vfs_inode);
	inode_inc_iversion(&dir->vfs_inode);
	inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
	ret = btrfs_update_inode(trans, dir);
out:
	return ret;
}
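
/*
 * Explanatory note (not from the source): the "name->len * 2" above reflects
 * that a btrfs directory's i_size counts each entry name twice, once for the
 * dir item and once for the dir index item, so unlinking releases both.
 */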

int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
		       struct btrfs_inode *dir, struct btrfs_inode *inode,
		       const struct fscrypt_str *name)
{
	int ret;

	ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
	if (!ret) {
		drop_nlink(&inode->vfs_inode);
		ret = btrfs_update_inode(trans, inode);
	}
	return ret;
}

/*
 * Helper to start a transaction for unlink and rmdir.
 *
 * Unlink and rmdir are special in btrfs: they do not always free space, so
 * if we cannot make our reservations the normal way, try to see if there is
 * plenty of slack room in the global reserve to migrate; otherwise we cannot
 * allow the unlink to occur.
 */
static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir)
{
	struct btrfs_root *root = dir->root;

	return btrfs_start_transaction_fallback_global_rsv(root,
						BTRFS_UNLINK_METADATA_UNITS);
}

static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
{
	struct btrfs_trans_handle *trans;
	struct inode *inode = d_inode(dentry);
	int ret;
	struct fscrypt_name fname;

	ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
	if (ret)
		return ret;

	/* This needs to handle no-key deletions later on */

	trans = __unlink_start_trans(BTRFS_I(dir));
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto fscrypt_free;
	}

	btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
				false);

	ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
				 &fname.disk_name);
	if (ret)
		goto end_trans;

	if (inode->i_nlink == 0) {
		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
		if (ret)
			goto end_trans;
	}

end_trans:
	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
fscrypt_free:
	fscrypt_free_filename(&fname);
	return ret;
}

static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
			       struct btrfs_inode *dir, struct dentry *dentry)
{
	struct btrfs_root *root = dir->root;
	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
	u64 index;
	int ret;
	u64 objectid;
	u64 dir_ino = btrfs_ino(dir);
	struct fscrypt_name fname;

	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
	if (ret)
		return ret;

	/* This needs to handle no-key deletions later on */

	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
		objectid = btrfs_root_id(inode->root);
	} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
		objectid = inode->ref_root_id;
	} else {
		WARN_ON(1);
		fscrypt_free_filename(&fname);
		return -EINVAL;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
				   &fname.disk_name, -1);
	if (IS_ERR_OR_NULL(di)) {
		ret = di ? PTR_ERR(di) : -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	btrfs_dir_item_key_to_cpu(leaf, di, &key);
	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
	ret = btrfs_delete_one_dir_name(trans, root, path, di);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	btrfs_release_path(path);

	/*
	 * This is a placeholder inode for a subvolume we didn't have a
	 * reference to at the time of the snapshot creation. In the meantime
	 * we could have renamed the real subvol link into our snapshot, so
	 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
	 * Instead simply lookup the dir_index_item for this entry so we can
	 * remove it. Otherwise we know we have a ref to the root and we can
	 * call btrfs_del_root_ref, and it _shouldn't_ fail.
	 */
	if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
		di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
		if (IS_ERR_OR_NULL(di)) {
			if (!di)
				ret = -ENOENT;
			else
				ret = PTR_ERR(di);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		index = key.offset;
		btrfs_release_path(path);
	} else {
		ret = btrfs_del_root_ref(trans, objectid,
					 btrfs_root_id(root), dir_ino,
					 &index, &fname.disk_name);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}

	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
	inode_inc_iversion(&dir->vfs_inode);
	inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
	ret = btrfs_update_inode_fallback(trans, dir);
	if (ret)
		btrfs_abort_transaction(trans, ret);
out:
	btrfs_free_path(path);
	fscrypt_free_filename(&fname);
	return ret;
}

/*
 * Helper to check if the subvolume references other subvolumes or if it's
 * default.
 */
static noinline int may_destroy_subvol(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
	struct fscrypt_str name = FSTR_INIT("default", 7);
	u64 dir_id;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Make sure this root isn't set as the default subvol */
	dir_id = btrfs_super_root_dir(fs_info->super_copy);
	di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
				   dir_id, &name, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
		if (key.objectid == btrfs_root_id(root)) {
			ret = -EPERM;
			btrfs_err(fs_info,
				  "deleting default subvolume %llu is not allowed",
				  key.objectid);
			goto out;
		}
		btrfs_release_path(path);
	}

	key.objectid = btrfs_root_id(root);
	key.type = BTRFS_ROOT_REF_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret == 0) {
		/*
		 * Key with offset -1 found, there would have to exist a root
		 * with such id, but this is out of valid range.
		 */
		ret = -EUCLEAN;
		goto out;
	}

	ret = 0;
	if (path->slots[0] > 0) {
		path->slots[0]--;
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY)
			ret = -ENOTEMPTY;
	}
out:
	btrfs_free_path(path);
	return ret;
}
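
/*
 * Explanatory note (not from the source): the search above uses key offset
 * (u64)-1 so btrfs_search_slot() lands just past the last possible
 * BTRFS_ROOT_REF_KEY for this root; stepping back one slot then reveals
 * whether any root ref exists, i.e. whether this subvolume still contains
 * other subvolumes, in which case deletion must fail with -ENOTEMPTY.
 */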

/* Delete all dentries for inodes belonging to the root */
static void btrfs_prune_dentries(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_inode *inode;
	u64 min_ino = 0;

	if (!BTRFS_FS_ERROR(fs_info))
		WARN_ON(btrfs_root_refs(&root->root_item) != 0);

	inode = btrfs_find_first_inode(root, min_ino);
	while (inode) {
		if (atomic_read(&inode->vfs_inode.i_count) > 1)
			d_prune_aliases(&inode->vfs_inode);

		min_ino = btrfs_ino(inode) + 1;
		/*
		 * btrfs_drop_inode() will have it removed from the inode
		 * cache when its usage count hits zero.
		 */
		iput(&inode->vfs_inode);
		cond_resched();
		inode = btrfs_find_first_inode(root, min_ino);
	}
}
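
/*
 * Explanatory note (not from the source): the walk above restarts the inode
 * lookup from "min_ino" on every iteration instead of holding a reference
 * or lock across the whole traversal, so iput() and d_prune_aliases() can
 * run freely and cond_resched() keeps long prunes preemptible.
 */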
4471
3c4f91e2 4472int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
f60a2364 4473{
3c4f91e2 4474 struct btrfs_root *root = dir->root;
41044b41 4475 struct btrfs_fs_info *fs_info = root->fs_info;
f60a2364
MT
4476 struct inode *inode = d_inode(dentry);
4477 struct btrfs_root *dest = BTRFS_I(inode)->root;
4478 struct btrfs_trans_handle *trans;
4479 struct btrfs_block_rsv block_rsv;
4480 u64 root_flags;
74e97958 4481 u64 qgroup_reserved = 0;
f60a2364 4482 int ret;
f60a2364 4483
3324d054
OS
4484 down_write(&fs_info->subvol_sem);
4485
f60a2364
MT
4486 /*
4487 * Don't allow to delete a subvolume with send in progress. This is
4488 * inside the inode lock so the error handling that has to drop the bit
4489 * again is not run concurrently.
4490 */
4491 spin_lock(&dest->root_item_lock);
a7176f74 4492 if (dest->send_in_progress) {
f60a2364
MT
4493 spin_unlock(&dest->root_item_lock);
4494 btrfs_warn(fs_info,
4495 "attempt to delete subvolume %llu during send",
e094f480 4496 btrfs_root_id(dest));
3324d054
OS
4497 ret = -EPERM;
4498 goto out_up_write;
f60a2364 4499 }
60021bd7
KH
4500 if (atomic_read(&dest->nr_swapfiles)) {
4501 spin_unlock(&dest->root_item_lock);
4502 btrfs_warn(fs_info,
4503 "attempt to delete subvolume %llu with active swapfile",
e094f480 4504 btrfs_root_id(root));
3324d054
OS
4505 ret = -EPERM;
4506 goto out_up_write;
60021bd7 4507 }
a7176f74
LF
4508 root_flags = btrfs_root_flags(&dest->root_item);
4509 btrfs_set_root_flags(&dest->root_item,
4510 root_flags | BTRFS_ROOT_SUBVOL_DEAD);
4511 spin_unlock(&dest->root_item_lock);
f60a2364 4512
ee0d904f
NB
4513 ret = may_destroy_subvol(dest);
4514 if (ret)
3324d054 4515 goto out_undead;
f60a2364
MT
4516
4517 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
4518	/*
4519	 * 5 units in total: one for the dir inode,
4520	 * two for the dir entries,
4521	 * two for the root ref/backref.
4522	 */
ee0d904f
NB
4523 ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
4524 if (ret)
3324d054 4525 goto out_undead;
74e97958 4526 qgroup_reserved = block_rsv.qgroup_rsv_reserved;
f60a2364
MT
4527
4528 trans = btrfs_start_transaction(root, 0);
4529 if (IS_ERR(trans)) {
ee0d904f 4530 ret = PTR_ERR(trans);
f60a2364
MT
4531 goto out_release;
4532 }
74e97958
BB
4533 btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
4534 qgroup_reserved = 0;
f60a2364
MT
4535 trans->block_rsv = &block_rsv;
4536 trans->bytes_reserved = block_rsv.size;
4537
3c4f91e2 4538 btrfs_record_snapshot_destroy(trans, dir);
f60a2364 4539
045d3967 4540 ret = btrfs_unlink_subvol(trans, dir, dentry);
f60a2364 4541 if (ret) {
f60a2364
MT
4542 btrfs_abort_transaction(trans, ret);
4543 goto out_end_trans;
4544 }
4545
2731f518
JB
4546 ret = btrfs_record_root_in_trans(trans, dest);
4547 if (ret) {
4548 btrfs_abort_transaction(trans, ret);
4549 goto out_end_trans;
4550 }
f60a2364
MT
4551
4552 memset(&dest->root_item.drop_progress, 0,
4553 sizeof(dest->root_item.drop_progress));
c8422684 4554 btrfs_set_root_drop_level(&dest->root_item, 0);
f60a2364
MT
4555 btrfs_set_root_refs(&dest->root_item, 0);
4556
4557 if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
4558 ret = btrfs_insert_orphan_item(trans,
4559 fs_info->tree_root,
e094f480 4560 btrfs_root_id(dest));
f60a2364
MT
4561 if (ret) {
4562 btrfs_abort_transaction(trans, ret);
f60a2364
MT
4563 goto out_end_trans;
4564 }
4565 }
4566
d1957791 4567 ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
e094f480 4568 BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest));
f60a2364
MT
4569 if (ret && ret != -ENOENT) {
4570 btrfs_abort_transaction(trans, ret);
f60a2364
MT
4571 goto out_end_trans;
4572 }
4573 if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
d1957791 4574 ret = btrfs_uuid_tree_remove(trans,
4575 dest->root_item.received_uuid,
4576 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
e094f480 4577 btrfs_root_id(dest));
f60a2364
MT
4578 if (ret && ret != -ENOENT) {
4579 btrfs_abort_transaction(trans, ret);
f60a2364
MT
4580 goto out_end_trans;
4581 }
4582 }
4583
082b6c97
QW
4584 free_anon_bdev(dest->anon_dev);
4585 dest->anon_dev = 0;
f60a2364
MT
4586out_end_trans:
4587 trans->block_rsv = NULL;
4588 trans->bytes_reserved = 0;
4589 ret = btrfs_end_transaction(trans);
f60a2364
MT
4590 inode->i_flags |= S_DEAD;
4591out_release:
74e97958
BB
4592 btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
4593 if (qgroup_reserved)
4594 btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
3324d054 4595out_undead:
ee0d904f 4596 if (ret) {
f60a2364
MT
4597 spin_lock(&dest->root_item_lock);
4598 root_flags = btrfs_root_flags(&dest->root_item);
4599 btrfs_set_root_flags(&dest->root_item,
4600 root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
4601 spin_unlock(&dest->root_item_lock);
3324d054
OS
4602 }
4603out_up_write:
4604 up_write(&fs_info->subvol_sem);
4605 if (!ret) {
f60a2364 4606 d_invalidate(dentry);
20a68004 4607 btrfs_prune_dentries(dest);
f60a2364 4608 ASSERT(dest->send_in_progress == 0);
f60a2364
MT
4609 }
4610
ee0d904f 4611 return ret;
f60a2364
MT
4612}
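
/*
 * Illustrative sketch, not part of this file: the "mark dead under the
 * lock, undo on failure" pattern that btrfs_delete_subvolume() uses with
 * BTRFS_ROOT_SUBVOL_DEAD. The flag is only set after checking for
 * conflicting users (send, swapfiles) under root_item_lock, and every
 * error path past that point clears it again (see out_undead above).
 * The helper name is hypothetical.
 */
static int example_mark_subvol_dead(struct btrfs_root *dest)
{
	u64 root_flags;

	spin_lock(&dest->root_item_lock);
	if (dest->send_in_progress || atomic_read(&dest->nr_swapfiles)) {
		/* A conflicting user exists, don't mark the root dead. */
		spin_unlock(&dest->root_item_lock);
		return -EPERM;
	}
	root_flags = btrfs_root_flags(&dest->root_item);
	btrfs_set_root_flags(&dest->root_item,
			     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
	spin_unlock(&dest->root_item_lock);
	return 0;
}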
4613
39279cc3
CM
4614static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4615{
2b0143b5 4616 struct inode *inode = d_inode(dentry);
813febdb 4617 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
c3a1cc8f 4618 int ret = 0;
39279cc3 4619 struct btrfs_trans_handle *trans;
44f714da 4620 u64 last_unlink_trans;
ab3c5c18 4621 struct fscrypt_name fname;
39279cc3 4622
b3ae244e 4623 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
134d4512 4624 return -ENOTEMPTY;
813febdb
JB
4625 if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
4626 if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
4627 btrfs_err(fs_info,
4628 "extent tree v2 doesn't support snapshot deletion yet");
4629 return -EOPNOTSUPP;
4630 }
3c4f91e2 4631 return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
813febdb 4632 }
134d4512 4633
c3a1cc8f
AJ
4634 ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
4635 if (ret)
4636 return ret;
ab3c5c18
STD
4637
4638 /* This needs to handle no-key deletions later on */
4639
e569b1d5 4640 trans = __unlink_start_trans(BTRFS_I(dir));
ab3c5c18 4641 if (IS_ERR(trans)) {
c3a1cc8f 4642 ret = PTR_ERR(trans);
ab3c5c18
STD
4643 goto out_notrans;
4644 }
5df6a9f6 4645
4a0cc7ca 4646 if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
c3a1cc8f 4647 ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
4df27c4d
YZ
4648 goto out;
4649 }
4650
c3a1cc8f
AJ
4651 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4652 if (ret)
4df27c4d 4653 goto out;
7b128766 4654
44f714da
FM
4655 last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
4656
39279cc3 4657 /* now the directory is empty */
c3a1cc8f 4658 ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
6db75318 4659 &fname.disk_name);
c3a1cc8f 4660 if (!ret) {
6ef06d27 4661 btrfs_i_size_write(BTRFS_I(inode), 0);
44f714da
FM
4662 /*
4663 * Propagate the last_unlink_trans value of the deleted dir to
4664 * its parent directory. This is to prevent an unrecoverable
4665 * log tree in the case we do something like this:
4666 * 1) create dir foo
4667 * 2) create snapshot under dir foo
4668 * 3) delete the snapshot
4669 * 4) rmdir foo
4670 * 5) mkdir foo
4671 * 6) fsync foo or some file inside foo
4672 */
4673 if (last_unlink_trans >= trans->transid)
4674 BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
4675 }
4df27c4d 4676out:
3a45bb20 4677 btrfs_end_transaction(trans);
ab3c5c18 4678out_notrans:
813febdb 4679 btrfs_btree_balance_dirty(fs_info);
ab3c5c18 4680 fscrypt_free_filename(&fname);
3954401f 4681
c3a1cc8f 4682 return ret;
39279cc3
CM
4683}
4684
39279cc3 4685/*
4686 * Read a block, zero part of it and write it back.
4687 *
4688 * @inode - inode that we're zeroing
4689 * @from - the offset to start zeroing
4690 * @len - the length to zero, 0 to zero the entire range relative to the
4691 *	  offset
4692 * @front - zero up to the offset instead of from the offset on
4693 *
4694 * This will find the block for the "from" offset, COW the block, and zero
4695 * the part we want to zero. This is used with truncate and hole punching.
4696 */
217f42eb
NB
4697int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
4698 int front)
39279cc3 4699{
217f42eb
NB
4700 struct btrfs_fs_info *fs_info = inode->root->fs_info;
4701 struct address_space *mapping = inode->vfs_inode.i_mapping;
4702 struct extent_io_tree *io_tree = &inode->io_tree;
e6dcd2dc 4703 struct btrfs_ordered_extent *ordered;
2ac55d41 4704 struct extent_state *cached_state = NULL;
364ecf36 4705 struct extent_changeset *data_reserved = NULL;
6d4572a9 4706 bool only_release_metadata = false;
0b246afa 4707 u32 blocksize = fs_info->sectorsize;
09cbfeaf 4708 pgoff_t index = from >> PAGE_SHIFT;
9703fefe 4709 unsigned offset = from & (blocksize - 1);
df055afe 4710 struct folio *folio;
3b16a4e3 4711 gfp_t mask = btrfs_alloc_write_mask(mapping);
6d4572a9 4712 size_t write_bytes = blocksize;
39279cc3 4713 int ret = 0;
9703fefe
CR
4714 u64 block_start;
4715 u64 block_end;
39279cc3 4716
b03ebd99
NB
4717 if (IS_ALIGNED(offset, blocksize) &&
4718 (!len || IS_ALIGNED(len, blocksize)))
39279cc3 4719 goto out;
9703fefe 4720
8b62f87b
JB
4721 block_start = round_down(from, blocksize);
4722 block_end = block_start + blocksize - 1;
4723
217f42eb 4724 ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
1daedb1d 4725 blocksize, false);
6d4572a9 4726 if (ret < 0) {
80f9d241 4727 if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
6d4572a9
QW
4728 /* For nocow case, no need to reserve data space */
4729 only_release_metadata = true;
4730 } else {
4731 goto out;
4732 }
4733 }
d4135134 4734 ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
6d4572a9
QW
4735 if (ret < 0) {
4736 if (!only_release_metadata)
4737 btrfs_free_reserved_data_space(inode, data_reserved,
4738 block_start, blocksize);
6d4572a9
QW
4739 goto out;
4740 }
211c17f5 4741again:
df055afe
GR
4742 folio = __filemap_get_folio(mapping, index,
4743 FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
4744 if (IS_ERR(folio)) {
217f42eb
NB
4745 btrfs_delalloc_release_space(inode, data_reserved, block_start,
4746 blocksize, true);
4747 btrfs_delalloc_release_extents(inode, blocksize);
ac6a2b36 4748 ret = -ENOMEM;
39279cc3 4749 goto out;
5d5e103a 4750 }
e6dcd2dc 4751
df055afe
GR
4752 if (!folio_test_uptodate(folio)) {
4753 ret = btrfs_read_folio(NULL, folio);
4754 folio_lock(folio);
4755 if (folio->mapping != mapping) {
4756 folio_unlock(folio);
4757 folio_put(folio);
211c17f5
CM
4758 goto again;
4759 }
df055afe 4760 if (!folio_test_uptodate(folio)) {
39279cc3 4761 ret = -EIO;
89642229 4762 goto out_unlock;
39279cc3
CM
4763 }
4764 }
4765
4766	/*
4767	 * We unlock the folio after the io is completed and then re-lock it
4768	 * above. release_folio() could have come in between that and cleared
4769	 * folio private, but left the folio in the mapping. Set the folio
4770	 * mapped here to make sure it's properly set for the subpage stuff.
4771	 */
df055afe 4772 ret = set_folio_extent_mapped(folio);
17b17fcd
JB
4773 if (ret < 0)
4774 goto out_unlock;
4775
df055afe 4776 folio_wait_writeback(folio);
e6dcd2dc 4777
570eb97b 4778 lock_extent(io_tree, block_start, block_end, &cached_state);
e6dcd2dc 4779
217f42eb 4780 ordered = btrfs_lookup_ordered_extent(inode, block_start);
e6dcd2dc 4781 if (ordered) {
570eb97b 4782 unlock_extent(io_tree, block_start, block_end, &cached_state);
df055afe
GR
4783 folio_unlock(folio);
4784 folio_put(folio);
36d45567 4785 btrfs_start_ordered_extent(ordered);
e6dcd2dc
CM
4786 btrfs_put_ordered_extent(ordered);
4787 goto again;
4788 }
4789
217f42eb 4790 clear_extent_bit(&inode->io_tree, block_start, block_end,
e182163d 4791 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
bd015294 4792 &cached_state);
5d5e103a 4793
217f42eb 4794 ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
330a5827 4795 &cached_state);
9ed74f2d 4796 if (ret) {
570eb97b 4797 unlock_extent(io_tree, block_start, block_end, &cached_state);
9ed74f2d
JB
4798 goto out_unlock;
4799 }
4800
9703fefe 4801 if (offset != blocksize) {
2aaa6655 4802 if (!len)
9703fefe 4803 len = blocksize - offset;
2aaa6655 4804 if (front)
df055afe
GR
4805 folio_zero_range(folio, block_start - folio_pos(folio),
4806 offset);
2aaa6655 4807 else
df055afe
GR
4808 folio_zero_range(folio,
4809 (block_start - folio_pos(folio)) + offset,
4810 len);
e6dcd2dc 4811 }
df055afe 4812 btrfs_folio_clear_checked(fs_info, folio, block_start,
55151ea9 4813 block_end + 1 - block_start);
df055afe 4814 btrfs_folio_set_dirty(fs_info, folio, block_start,
55151ea9 4815 block_end + 1 - block_start);
570eb97b 4816 unlock_extent(io_tree, block_start, block_end, &cached_state);
39279cc3 4817
6d4572a9 4818 if (only_release_metadata)
217f42eb 4819 set_extent_bit(&inode->io_tree, block_start, block_end,
1d126800 4820 EXTENT_NORESERVE, NULL);
6d4572a9 4821
89642229 4822out_unlock:
6d4572a9
QW
4823 if (ret) {
4824 if (only_release_metadata)
217f42eb 4825 btrfs_delalloc_release_metadata(inode, blocksize, true);
6d4572a9 4826 else
217f42eb 4827 btrfs_delalloc_release_space(inode, data_reserved,
4828 block_start, blocksize, true);
4829 }
217f42eb 4830 btrfs_delalloc_release_extents(inode, blocksize);
df055afe
GR
4831 folio_unlock(folio);
4832 folio_put(folio);
39279cc3 4833out:
6d4572a9 4834 if (only_release_metadata)
217f42eb 4835 btrfs_check_nocow_unlock(inode);
364ecf36 4836 extent_changeset_free(data_reserved);
39279cc3
CM
4837 return ret;
4838}
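
/*
 * Illustrative sketch, not part of this file: the block boundary math
 * used by btrfs_truncate_block() above, for the common len == 0 case.
 * Worked example with a 4K block size: from = 5000 gives
 * block_start = 4096 and offset = 904; with front == 0 the zeroed range
 * is [5000, 8192) (the tail of the block), with front != 0 it is
 * [4096, 5000) (the head of the block). The helper is hypothetical.
 */
static void example_truncate_block_range(u64 from, u32 blocksize, int front,
					 u64 *zero_start, u64 *zero_len)
{
	u64 block_start = round_down(from, blocksize);
	u32 offset = from & (blocksize - 1);

	if (front) {
		/* Zero everything in the block before @from. */
		*zero_start = block_start;
		*zero_len = offset;
	} else {
		/* Zero from @from to the end of the block. */
		*zero_start = from;
		*zero_len = blocksize - offset;
	}
}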
4839
0a325e62 4840static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len)
16e7549f 4841{
0a325e62 4842 struct btrfs_root *root = inode->root;
a4ba6cc0 4843 struct btrfs_fs_info *fs_info = root->fs_info;
16e7549f 4844 struct btrfs_trans_handle *trans;
5893dfb9 4845 struct btrfs_drop_extents_args drop_args = { 0 };
16e7549f
JB
4846 int ret;
4847
4848	/*
4849	 * If NO_HOLES is enabled, we don't need to do anything.
4850	 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
4851	 * or btrfs_update_inode() will be called, which guarantees that the
4852	 * next fsync will know this inode was changed and needs to be logged.
4853	 */
cceaa89f 4854 if (btrfs_fs_incompat(fs_info, NO_HOLES))
16e7549f 4855 return 0;
16e7549f
JB
4856
4857 /*
4858 * 1 - for the one we're dropping
4859 * 1 - for the one we're adding
4860 * 1 - for updating the inode.
4861 */
4862 trans = btrfs_start_transaction(root, 3);
4863 if (IS_ERR(trans))
4864 return PTR_ERR(trans);
4865
5893dfb9
FM
4866 drop_args.start = offset;
4867 drop_args.end = offset + len;
4868 drop_args.drop_cache = true;
4869
a4ba6cc0 4870 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
16e7549f 4871 if (ret) {
66642832 4872 btrfs_abort_transaction(trans, ret);
3a45bb20 4873 btrfs_end_transaction(trans);
16e7549f
JB
4874 return ret;
4875 }
4876
d1f68ba0 4877 ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
2766ff61 4878 if (ret) {
66642832 4879 btrfs_abort_transaction(trans, ret);
2766ff61 4880 } else {
a4ba6cc0 4881 btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
8b9d0322 4882 btrfs_update_inode(trans, inode);
2766ff61 4883 }
3a45bb20 4884 btrfs_end_transaction(trans);
16e7549f
JB
4885 return ret;
4886}
4887
4888/*
4889 * This function puts in dummy file extents for the area we're creating a hole
4890 * for. So if we are truncating this file to a larger size we need to insert
4891 * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE
4892 * for the range between oldsize and size.
4893 */
b06359a3 4894int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
39279cc3 4895{
b06359a3
NB
4896 struct btrfs_root *root = inode->root;
4897 struct btrfs_fs_info *fs_info = root->fs_info;
4898 struct extent_io_tree *io_tree = &inode->io_tree;
a22285a6 4899 struct extent_map *em = NULL;
2ac55d41 4900 struct extent_state *cached_state = NULL;
0b246afa
JM
4901 u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
4902 u64 block_end = ALIGN(size, fs_info->sectorsize);
9036c102
YZ
4903 u64 last_byte;
4904 u64 cur_offset;
4905 u64 hole_size;
5e45b044 4906 int ret = 0;
39279cc3 4907
a71754fc 4908	/*
4909	 * If our size started in the middle of a block we need to zero out the
4910	 * rest of the block before we expand the i_size, otherwise we could
4911	 * expose stale data.
4912	 */
5e45b044
AJ
4913 ret = btrfs_truncate_block(inode, oldsize, 0, 0);
4914 if (ret)
4915 return ret;
a71754fc 4916
9036c102
YZ
4917 if (size <= hole_start)
4918 return 0;
4919
b06359a3
NB
4920 btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
4921 &cached_state);
9036c102
YZ
4922 cur_offset = hole_start;
4923 while (1) {
8bab0a30 4924 em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset);
79787eaa 4925 if (IS_ERR(em)) {
5e45b044 4926 ret = PTR_ERR(em);
f2767956 4927 em = NULL;
79787eaa
JM
4928 break;
4929 }
9036c102 4930 last_byte = min(extent_map_end(em), block_end);
0b246afa 4931 last_byte = ALIGN(last_byte, fs_info->sectorsize);
9ddc959e
JB
4932 hole_size = last_byte - cur_offset;
4933
f86f7a75 4934 if (!(em->flags & EXTENT_FLAG_PREALLOC)) {
5dc562c5 4935 struct extent_map *hole_em;
9ed74f2d 4936
5e45b044
AJ
4937 ret = maybe_insert_hole(inode, cur_offset, hole_size);
4938 if (ret)
3893e33b 4939 break;
9ddc959e 4940
5e45b044 4941 ret = btrfs_inode_set_file_extent_range(inode,
9ddc959e 4942 cur_offset, hole_size);
5e45b044 4943 if (ret)
9ddc959e
JB
4944 break;
4945
5dc562c5
JB
4946 hole_em = alloc_extent_map();
4947 if (!hole_em) {
a1ba4c08
FM
4948 btrfs_drop_extent_map_range(inode, cur_offset,
4949 cur_offset + hole_size - 1,
4950 false);
23e3337f 4951 btrfs_set_inode_full_sync(inode);
5dc562c5
JB
4952 goto next;
4953 }
4954 hole_em->start = cur_offset;
4955 hole_em->len = hole_size;
8082510e 4956
3d2ac992 4957 hole_em->disk_bytenr = EXTENT_MAP_HOLE;
e8fe524d 4958 hole_em->disk_num_bytes = 0;
cc95bef6 4959 hole_em->ram_bytes = hole_size;
4a4f8fe2 4960 hole_em->generation = btrfs_get_fs_generation(fs_info);
8082510e 4961
5e45b044 4962 ret = btrfs_replace_extent_map_range(inode, hole_em, true);
5dc562c5 4963 free_extent_map(hole_em);
9ddc959e 4964 } else {
5e45b044 4965 ret = btrfs_inode_set_file_extent_range(inode,
9ddc959e 4966 cur_offset, hole_size);
5e45b044 4967 if (ret)
9ddc959e 4968 break;
9036c102 4969 }
16e7549f 4970next:
9036c102 4971 free_extent_map(em);
a22285a6 4972 em = NULL;
9036c102 4973 cur_offset = last_byte;
8082510e 4974 if (cur_offset >= block_end)
9036c102
YZ
4975 break;
4976 }
a22285a6 4977 free_extent_map(em);
570eb97b 4978 unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
5e45b044 4979 return ret;
9036c102 4980}
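
/*
 * Illustrative sketch, not part of this file: the sector alignment that
 * defines the hole range in btrfs_cont_expand() above. ALIGN() rounds up
 * to the next multiple of the sector size. Worked example with 4K
 * sectors: oldsize = 5000 gives hole_start = 8192 and size = 20000 gives
 * block_end = 20480, so dummy hole extents cover [8192, 20480). The
 * helper is hypothetical.
 */
static void example_hole_range(u64 oldsize, u64 size, u32 sectorsize,
			       u64 *hole_start, u64 *block_end)
{
	*hole_start = ALIGN(oldsize, sectorsize);
	*block_end = ALIGN(size, sectorsize);
}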
39279cc3 4981
3972f260 4982static int btrfs_setsize(struct inode *inode, struct iattr *attr)
8082510e 4983{
f4a2f4c5
MX
4984 struct btrfs_root *root = BTRFS_I(inode)->root;
4985 struct btrfs_trans_handle *trans;
a41ad394 4986 loff_t oldsize = i_size_read(inode);
3972f260
ES
4987 loff_t newsize = attr->ia_size;
4988 int mask = attr->ia_valid;
8082510e
YZ
4989 int ret;
4990
4991	/*
4992	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
4993	 * special case where we need to update the times despite not having
4994	 * these flags set. For all other operations the VFS sets these flags
4995	 * explicitly if it wants a timestamp update.
4996	 */
dff6efc3
CH
4997 if (newsize != oldsize) {
4998 inode_inc_iversion(inode);
c1867eb3 4999 if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
b1c38a13
JL
5000 inode_set_mtime_to_ts(inode,
5001 inode_set_ctime_current(inode));
c1867eb3 5002 }
dff6efc3 5003 }
3972f260 5004
a41ad394 5005 if (newsize > oldsize) {
9ea24bbe 5006		/*
5007		 * Don't do an expanding truncate while snapshotting is ongoing.
5008		 * This is to ensure the snapshot captures a fully consistent
5009		 * state of this file - if the snapshot captures this expanding
5010		 * truncation, it must capture all writes that happened before
5011		 * this truncation.
5012		 */
dcc3eb96 5013 btrfs_drew_write_lock(&root->snapshot_lock);
b06359a3 5014 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
9ea24bbe 5015 if (ret) {
dcc3eb96 5016 btrfs_drew_write_unlock(&root->snapshot_lock);
8082510e 5017 return ret;
9ea24bbe 5018 }
8082510e 5019
f4a2f4c5 5020 trans = btrfs_start_transaction(root, 1);
9ea24bbe 5021 if (IS_ERR(trans)) {
dcc3eb96 5022 btrfs_drew_write_unlock(&root->snapshot_lock);
f4a2f4c5 5023 return PTR_ERR(trans);
9ea24bbe 5024 }
f4a2f4c5
MX
5025
5026 i_size_write(inode, newsize);
76aea537 5027 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
27772b68 5028 pagecache_isize_extended(inode, oldsize, newsize);
8b9d0322 5029 ret = btrfs_update_inode(trans, BTRFS_I(inode));
dcc3eb96 5030 btrfs_drew_write_unlock(&root->snapshot_lock);
3a45bb20 5031 btrfs_end_transaction(trans);
a41ad394 5032 } else {
41044b41 5033 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
24c0a722
NA
5034
5035 if (btrfs_is_zoned(fs_info)) {
e641e323 5036 ret = btrfs_wait_ordered_range(BTRFS_I(inode),
5037 ALIGN(newsize, fs_info->sectorsize),
5038 (u64)-1);
5039 if (ret)
5040 return ret;
5041 }
8082510e 5042
a41ad394
JB
5043		/*
5044		 * We're truncating a file that used to have good data down to
5045		 * zero. Make sure any new writes to the file get on disk
5046		 * on close.
5047		 */
5048 if (newsize == 0)
1fd4033d 5049 set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
72ac3c0d 5050 &BTRFS_I(inode)->runtime_flags);
8082510e 5051
a41ad394 5052 truncate_setsize(inode, newsize);
2e60a51e 5053
2e60a51e 5054 inode_dio_wait(inode);
2e60a51e 5055
d9dcae67 5056 ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize);
7f4f6e0a
JB
5057 if (ret && inode->i_nlink) {
5058 int err;
5059
5060			/*
5061			 * Truncate failed, so fix up the in-memory size. We
5062			 * adjusted disk_i_size down as we removed extents, so
5063			 * wait for disk_i_size to be stable and then update the
5064			 * in-memory size to match.
5065			 */
e641e323 5066 err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
7f4f6e0a 5067 if (err)
f7e9e8fc
OS
5068 return err;
5069 i_size_write(inode, BTRFS_I(inode)->disk_i_size);
7f4f6e0a 5070 }
8082510e
YZ
5071 }
5072
a41ad394 5073 return ret;
8082510e
YZ
5074}
5075
c1632a0f 5076static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
549c7297 5077 struct iattr *attr)
9036c102 5078{
2b0143b5 5079 struct inode *inode = d_inode(dentry);
b83cc969 5080 struct btrfs_root *root = BTRFS_I(inode)->root;
9036c102 5081 int err;
39279cc3 5082
b83cc969
LZ
5083 if (btrfs_root_readonly(root))
5084 return -EROFS;
5085
c1632a0f 5086 err = setattr_prepare(idmap, dentry, attr);
9036c102
YZ
5087 if (err)
5088 return err;
2bf5a725 5089
5a3f23d5 5090 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
3972f260 5091 err = btrfs_setsize(inode, attr);
8082510e
YZ
5092 if (err)
5093 return err;
39279cc3 5094 }
9036c102 5095
1025774c 5096 if (attr->ia_valid) {
c1632a0f 5097 setattr_copy(idmap, inode, attr);
0c4d2d95 5098 inode_inc_iversion(inode);
7152b425 5099 err = btrfs_dirty_inode(BTRFS_I(inode));
1025774c 5100
22c44fe6 5101 if (!err && attr->ia_valid & ATTR_MODE)
13e83a49 5102 err = posix_acl_chmod(idmap, dentry, inode->i_mode);
1025774c 5103 }
33268eaf 5104
39279cc3
CM
5105 return err;
5106}
61295eb8 5107
131e404a 5108/*
5109 * While truncating the inode pages during eviction, we get the VFS
5110 * calling btrfs_invalidate_folio() against each folio of the inode. This
5111 * is slow because the calls to btrfs_invalidate_folio() result in a
5112 * huge number of calls to lock_extent() and clear_extent_bit(),
5113 * which keep merging and splitting extent_state structures over and over,
5114 * wasting lots of time.
5115 *
5116 * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
5117 * skip all those expensive operations on a per-folio basis and do only
5118 * the ordered io finishing, while we release here the extent_map and
5119 * extent_state structures, without the excessive merging and splitting.
5120 */
5121static void evict_inode_truncate_pages(struct inode *inode)
5122{
5123 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
131e404a
FDBM
5124 struct rb_node *node;
5125
5126 ASSERT(inode->i_state & I_FREEING);
91b0abe3 5127 truncate_inode_pages_final(&inode->i_data);
131e404a 5128
9c9d1b4f 5129 btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
131e404a 5130
5131	/*
5132	 * Keep looping until we have no more ranges in the io tree.
5133	 * We can have ongoing bios started by readahead that have
5134	 * their endio callback (extent_io.c:end_bio_extent_readpage)
5135	 * still in progress (they unlocked the pages in the bio but did not
5136	 * yet unlock the ranges in the io tree). This means some ranges can
5137	 * still be locked and eviction started, because before submitting
5138	 * those bios, which are executed by a separate task (a work queue
5139	 * kthread), inode references (inode->i_count) were not taken (they
5140	 * would be dropped in the end io callback of each bio).
5141	 * Therefore here we effectively end up waiting for those bios and
5142	 * anyone else holding locked ranges without having bumped the inode's
5143	 * reference count - if we don't do it, when they access the inode's
5144	 * io_tree to unlock a range it may be too late, leading to a
5145	 * use-after-free issue.
5146	 */
131e404a
FDBM
5147 spin_lock(&io_tree->lock);
5148 while (!RB_EMPTY_ROOT(&io_tree->state)) {
5149 struct extent_state *state;
5150 struct extent_state *cached_state = NULL;
6ca07097
FM
5151 u64 start;
5152 u64 end;
421f0922 5153 unsigned state_flags;
131e404a
FDBM
5154
5155 node = rb_first(&io_tree->state);
5156 state = rb_entry(node, struct extent_state, rb_node);
6ca07097
FM
5157 start = state->start;
5158 end = state->end;
421f0922 5159 state_flags = state->state;
131e404a
FDBM
5160 spin_unlock(&io_tree->lock);
5161
570eb97b 5162 lock_extent(io_tree, start, end, &cached_state);
5163
5164		/*
5165		 * If the range still has the DELALLOC flag, the extent didn't
5166		 * reach disk, and its reserved space won't be freed by delayed_ref.
5167		 * So we need to free its reserved space here.
5168		 * (Refer to the comment in btrfs_invalidate_folio, case 2)
5169		 *
5170		 * Note, end is the bytenr of the last byte, so we need + 1 here.
5171		 */
421f0922 5172 if (state_flags & EXTENT_DELALLOC)
8b8a979f 5173 btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
9e65bfca 5174 end - start + 1, NULL);
b9d0b389 5175
6ca07097 5176 clear_extent_bit(io_tree, start, end,
bd015294 5177 EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
e182163d 5178 &cached_state);
131e404a 5179
7064dd5c 5180 cond_resched();
131e404a
FDBM
5181 spin_lock(&io_tree->lock);
5182 }
5183 spin_unlock(&io_tree->lock);
5184}
5185
4b9d7b59 5186static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
ad80cf50 5187 struct btrfs_block_rsv *rsv)
4b9d7b59
OS
5188{
5189 struct btrfs_fs_info *fs_info = root->fs_info;
d3984c90 5190 struct btrfs_trans_handle *trans;
b13d57db 5191 u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1);
d3984c90 5192 int ret;
4b9d7b59 5193
d3984c90
JB
5194	/*
5195	 * Eviction should be taking place somewhere safe because of our
5196	 * delayed iputs. However the normal flushing code will run delayed
5197	 * iputs, so we cannot use FLUSH_ALL, otherwise we'd deadlock.
5198	 *
5199	 * We reserve the delayed_refs_extra here again because we can't use
5200	 * btrfs_start_transaction(root, 0) for the same deadlocky reason as
5201	 * above. We reserve our extra bit here because we generate a ton of
5202	 * delayed refs activity by truncating.
5203	 *
5204	 * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can;
5205	 * if we fail to make this reservation we can re-try without the
5206	 * delayed_refs_extra so we can make some forward progress.
5207	 */
9270501c 5208 ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
d3984c90
JB
5209 BTRFS_RESERVE_FLUSH_EVICT);
5210 if (ret) {
9270501c 5211 ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
ee6adbfd
JB
5212 BTRFS_RESERVE_FLUSH_EVICT);
5213 if (ret) {
d3984c90
JB
5214 btrfs_warn(fs_info,
5215 "could not allocate space for delete; will truncate on mount");
5216 return ERR_PTR(-ENOSPC);
5217 }
5218 delayed_refs_extra = 0;
5219 }
4b9d7b59 5220
d3984c90
JB
5221 trans = btrfs_join_transaction(root);
5222 if (IS_ERR(trans))
5223 return trans;
5224
5225 if (delayed_refs_extra) {
5226 trans->block_rsv = &fs_info->trans_block_rsv;
5227 trans->bytes_reserved = delayed_refs_extra;
5228 btrfs_block_rsv_migrate(rsv, trans->block_rsv,
4e0527de 5229 delayed_refs_extra, true);
4b9d7b59 5230 }
d3984c90 5231 return trans;
4b9d7b59
OS
5232}
5233
bd555975 5234void btrfs_evict_inode(struct inode *inode)
39279cc3 5235{
41044b41 5236 struct btrfs_fs_info *fs_info;
39279cc3
CM
5237 struct btrfs_trans_handle *trans;
5238 struct btrfs_root *root = BTRFS_I(inode)->root;
b7b1167c 5239 struct btrfs_block_rsv *rsv = NULL;
39279cc3
CM
5240 int ret;
5241
1abe9b8a 5242 trace_btrfs_inode_evict(inode);
5243
3d48d981 5244 if (!root) {
14605409 5245 fsverity_cleanup_inode(inode);
e8f1bc14 5246 clear_inode(inode);
3d48d981
NB
5247 return;
5248 }
5249
41044b41 5250 fs_info = inode_to_fs_info(inode);
131e404a
FDBM
5251 evict_inode_truncate_pages(inode);
5252
69e9c6c6
SB
5253 if (inode->i_nlink &&
5254 ((btrfs_root_refs(&root->root_item) != 0 &&
e094f480 5255 btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID) ||
70ddc553 5256 btrfs_is_free_space_inode(BTRFS_I(inode))))
b7b1167c 5257 goto out;
bd555975 5258
27919067 5259 if (is_bad_inode(inode))
b7b1167c 5260 goto out;
5f39d397 5261
7b40b695 5262 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
b7b1167c 5263 goto out;
c71bf099 5264
76dda93c 5265 if (inode->i_nlink > 0) {
69e9c6c6 5266 BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
e094f480 5267 btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID);
b7b1167c 5268 goto out;
76dda93c
YZ
5269 }
5270
5271	/*
5272	 * This makes sure the inode item in the tree is uptodate and the
5273	 * space for the inode update is released.
5274	 */
aa79021f 5275 ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
27919067 5276 if (ret)
b7b1167c 5277 goto out;
0e8c36a9 5278
5279	/*
5280	 * This drops any pending insert or delete operations we have for this
5281	 * inode. We could have a delayed dir index deletion queued up, but
5282	 * we're removing the inode completely so that'll be taken care of in
5283	 * the truncate.
5284	 */
5285 btrfs_kill_delayed_inode_items(BTRFS_I(inode));
5286
2ff7e61e 5287 rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
27919067 5288 if (!rsv)
b7b1167c 5289 goto out;
2bd36e7b 5290 rsv->size = btrfs_calc_metadata_size(fs_info, 1);
710d5921 5291 rsv->failfast = true;
4289a667 5292
6ef06d27 5293 btrfs_i_size_write(BTRFS_I(inode), 0);
5f39d397 5294
8082510e 5295 while (1) {
d9ac19c3 5296 struct btrfs_truncate_control control = {
71d18b53 5297 .inode = BTRFS_I(inode),
487e81d2 5298 .ino = btrfs_ino(BTRFS_I(inode)),
d9ac19c3
JB
5299 .new_size = 0,
5300 .min_type = 0,
5301 };
5302
ad80cf50 5303 trans = evict_refill_and_join(root, rsv);
27919067 5304 if (IS_ERR(trans))
b7b1167c 5305 goto out;
7b128766 5306
4289a667
JB
5307 trans->block_rsv = rsv;
5308
71d18b53 5309 ret = btrfs_truncate_inode_items(trans, root, &control);
27919067
OS
5310 trans->block_rsv = &fs_info->trans_block_rsv;
5311 btrfs_end_transaction(trans);
afa4b0af
FM
5312 /*
5313 * We have not added new delayed items for our inode after we
5314 * have flushed its delayed items, so no need to throttle on
5315 * delayed items. However we have modified extent buffers.
5316 */
5317 btrfs_btree_balance_dirty_nodelay(fs_info);
27919067 5318 if (ret && ret != -ENOSPC && ret != -EAGAIN)
b7b1167c 5319 goto out;
27919067 5320 else if (!ret)
8082510e 5321 break;
8082510e 5322 }
5f39d397 5323
4ef31a45 5324	/*
5325	 * Errors here aren't a big deal; they just mean we leave orphan items
5326	 * in the tree. They will be cleaned up on the next mount. If the inode
5327	 * number gets reused, cleanup deletes the orphan item without doing
5328	 * anything, and unlink reuses the existing orphan item.
5329	 *
5330	 * If it turns out that we are dropping too many of these, we might want
5331	 * to add a mechanism for retrying these after a commit.
5332	 */
ad80cf50 5333 trans = evict_refill_and_join(root, rsv);
27919067
OS
5334 if (!IS_ERR(trans)) {
5335 trans->block_rsv = rsv;
5336 btrfs_orphan_del(trans, BTRFS_I(inode));
5337 trans->block_rsv = &fs_info->trans_block_rsv;
5338 btrfs_end_transaction(trans);
5339 }
54aa1f4d 5340
b7b1167c 5341out:
27919067 5342 btrfs_free_block_rsv(fs_info, rsv);
5343	/*
5344	 * If we didn't successfully delete, the orphan item will still be in
5345	 * the tree and we'll retry on the next mount. Again, we might also want
5346	 * to retry these periodically in the future.
5347	 */
f48d1cf5 5348 btrfs_remove_delayed_node(BTRFS_I(inode));
14605409 5349 fsverity_cleanup_inode(inode);
dbd5768f 5350 clear_inode(inode);
39279cc3
CM
5351}
5352
5353/*
5354 * Return the key found in the dir entry in the location pointer, fill @type
5355 * with BTRFS_FT_*, and return 0.
5356 *
5357 * If no dir entry was found, returns -ENOENT.
5358 * If a corrupted location was found in the dir entry, returns -EUCLEAN.
5359 */
d1de429b 5360static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
6bf9e4bd 5361 struct btrfs_key *location, u8 *type)
39279cc3 5362{
39279cc3
CM
5363 struct btrfs_dir_item *di;
5364 struct btrfs_path *path;
d1de429b 5365 struct btrfs_root *root = dir->root;
0d9f7f3e 5366 int ret = 0;
ab3c5c18 5367 struct fscrypt_name fname;
39279cc3
CM
5368
5369 path = btrfs_alloc_path();
d8926bb3
MF
5370 if (!path)
5371 return -ENOMEM;
3954401f 5372
d1de429b 5373 ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
10a8857a 5374 if (ret < 0)
ab3c5c18 5375 goto out;
10a8857a
STD
5376 /*
5377 * fscrypt_setup_filename() should never return a positive value, but
5378 * gcc on sparc/parisc thinks it can, so assert that doesn't happen.
5379 */
5380 ASSERT(ret == 0);
ab3c5c18 5381
ab3c5c18
STD
5382 /* This needs to handle no-key deletions later on */
5383
d1de429b 5384 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir),
6db75318 5385 &fname.disk_name, 0);
3cf5068f
LB
5386 if (IS_ERR_OR_NULL(di)) {
5387 ret = di ? PTR_ERR(di) : -ENOENT;
005d6712
SY
5388 goto out;
5389 }
d397712b 5390
5f39d397 5391 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
56a0e706
LB
5392 if (location->type != BTRFS_INODE_ITEM_KEY &&
5393 location->type != BTRFS_ROOT_ITEM_KEY) {
005d6712 5394 ret = -EUCLEAN;
56a0e706
LB
5395 btrfs_warn(root->fs_info,
5396"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
d1de429b 5397 __func__, fname.disk_name.name, btrfs_ino(dir),
56a0e706 5398 location->objectid, location->type, location->offset);
56a0e706 5399 }
6bf9e4bd 5400 if (!ret)
94a48aef 5401 *type = btrfs_dir_ftype(path->nodes[0], di);
39279cc3 5402out:
ab3c5c18 5403 fscrypt_free_filename(&fname);
39279cc3
CM
5404 btrfs_free_path(path);
5405 return ret;
5406}
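
/*
 * Illustrative sketch, not part of this file: how a caller consumes the
 * btrfs_inode_by_name() contract documented above, mirroring the type
 * dispatch done by btrfs_lookup_dentry() below. The helper name is
 * hypothetical; the error values are the documented ones (-ENOENT: no
 * entry, -EUCLEAN: corrupted location).
 */
static int example_resolve_entry(struct btrfs_inode *dir, struct dentry *dentry,
				 bool *crosses_subvol)
{
	struct btrfs_key location = { 0 };
	u8 di_type = 0;
	int ret;

	ret = btrfs_inode_by_name(dir, dentry, &location, &di_type);
	if (ret < 0)
		return ret;
	/*
	 * INODE_ITEM means a plain entry in this subvolume;
	 * ROOT_ITEM means the entry points at another subvolume's root.
	 */
	*crosses_subvol = (location.type == BTRFS_ROOT_ITEM_KEY);
	return 0;
}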
5407
5408/*
5409 * when we hit a tree root in a directory, the btrfs part of the inode
5410 * needs to be changed to reflect the root directory of the tree root. This
5411 * is kind of like crossing a mount point.
5412 */
2ff7e61e 5413static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
3c1b1c4c 5414 struct btrfs_inode *dir,
4df27c4d
YZ
5415 struct dentry *dentry,
5416 struct btrfs_key *location,
5417 struct btrfs_root **sub_root)
39279cc3 5418{
4df27c4d
YZ
5419 struct btrfs_path *path;
5420 struct btrfs_root *new_root;
5421 struct btrfs_root_ref *ref;
5422 struct extent_buffer *leaf;
1d4c08e0 5423 struct btrfs_key key;
4df27c4d
YZ
5424 int ret;
5425 int err = 0;
ab3c5c18 5426 struct fscrypt_name fname;
ab3c5c18 5427
3c1b1c4c 5428 ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname);
ab3c5c18
STD
5429 if (ret)
5430 return ret;
39279cc3 5431
4df27c4d
YZ
5432 path = btrfs_alloc_path();
5433 if (!path) {
5434 err = -ENOMEM;
5435 goto out;
5436 }
39279cc3 5437
4df27c4d 5438 err = -ENOENT;
e094f480 5439 key.objectid = btrfs_root_id(dir->root);
1d4c08e0
DS
5440 key.type = BTRFS_ROOT_REF_KEY;
5441 key.offset = location->objectid;
5442
0b246afa 5443 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4df27c4d
YZ
5444 if (ret) {
5445 if (ret < 0)
5446 err = ret;
5447 goto out;
5448 }
39279cc3 5449
4df27c4d
YZ
5450 leaf = path->nodes[0];
5451 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
3c1b1c4c 5452 if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
6db75318 5453 btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len)
4df27c4d 5454 goto out;
39279cc3 5455
6db75318
STD
5456 ret = memcmp_extent_buffer(leaf, fname.disk_name.name,
5457 (unsigned long)(ref + 1), fname.disk_name.len);
4df27c4d
YZ
5458 if (ret)
5459 goto out;
5460
b3b4aa74 5461 btrfs_release_path(path);
4df27c4d 5462
56e9357a 5463 new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
4df27c4d
YZ
5464 if (IS_ERR(new_root)) {
5465 err = PTR_ERR(new_root);
5466 goto out;
5467 }
5468
4df27c4d
YZ
5469 *sub_root = new_root;
5470 location->objectid = btrfs_root_dirid(&new_root->root_item);
5471 location->type = BTRFS_INODE_ITEM_KEY;
5472 location->offset = 0;
5473 err = 0;
5474out:
5475 btrfs_free_path(path);
ab3c5c18 5476 fscrypt_free_filename(&fname);
4df27c4d 5477 return err;
39279cc3
CM
5478}
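
/*
 * Illustrative sketch, not part of this file: the ROOT_REF item resolved
 * by fixup_tree_root_location() above. For a hypothetical subvolume
 * "snap" with tree id 257, created in directory 256 of subvolume 5, the
 * root tree holds key (5 ROOT_REF 257) whose btrfs_root_ref payload has
 * dirid = 256 and name_len = 4, followed by the name bytes "snap".
 * Building the search key for that item:
 */
static void example_root_ref_key(struct btrfs_key *key, u64 parent_root_id,
				 u64 subvol_id)
{
	key->objectid = parent_root_id;	/* the tree containing the entry */
	key->type = BTRFS_ROOT_REF_KEY;
	key->offset = subvol_id;	/* the tree the entry points to */
}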
5479
061ea858 5480static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
5d4f98a2 5481{
4c45a4f4 5482 struct btrfs_root *root = inode->root;
310b2f5d
FM
5483 struct btrfs_inode *existing;
5484 const u64 ino = btrfs_ino(inode);
5485 int ret;
5d4f98a2 5486
4c45a4f4 5487 if (inode_unhashed(&inode->vfs_inode))
310b2f5d 5488 return 0;
5d4f98a2 5489
061ea858
FM
5490 if (prealloc) {
5491 ret = xa_reserve(&root->inodes, ino, GFP_NOFS);
5492 if (ret)
5493 return ret;
5d4f98a2 5494 }
310b2f5d 5495
310b2f5d 5496 existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC);
5d4f98a2 5497
310b2f5d
FM
5498 if (xa_is_err(existing)) {
5499 ret = xa_err(existing);
5500 ASSERT(ret != -EINVAL);
5501 ASSERT(ret != -ENOMEM);
5502 return ret;
5503 } else if (existing) {
5504 WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING)));
5d4f98a2 5505 }
310b2f5d
FM
5506
5507 return 0;
5d4f98a2
YZ
5508}
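
/*
 * Illustrative sketch, not part of this file: the xa_reserve() +
 * xa_store() pattern used by btrfs_add_inode_to_root() above. Reserving
 * the slot first lets the allocation happen with a gfp mask that may
 * sleep (GFP_NOFS), so the later store can use GFP_ATOMIC and does not
 * fail for lack of memory. The helper name is hypothetical.
 */
static int example_xa_insert(struct xarray *xa, unsigned long index, void *item)
{
	void *existing;
	int ret;

	/* Preallocate the slot; this may sleep. */
	ret = xa_reserve(xa, index, GFP_NOFS);
	if (ret)
		return ret;

	/* Fill the reserved slot; no allocation is needed at this point. */
	existing = xa_store(xa, index, item, GFP_ATOMIC);
	if (xa_is_err(existing))
		return xa_err(existing);
	/* @existing, if not NULL, is whatever previously occupied the slot. */
	return 0;
}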
5509
310b2f5d 5510static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
5d4f98a2 5511{
b79b7249 5512 struct btrfs_root *root = inode->root;
310b2f5d
FM
5513 struct btrfs_inode *entry;
5514 bool empty = false;
5d4f98a2 5515
e2844cce
FM
5516 xa_lock(&root->inodes);
5517 entry = __xa_erase(&root->inodes, btrfs_ino(inode));
310b2f5d
FM
5518 if (entry == inode)
5519 empty = xa_empty(&root->inodes);
e2844cce 5520 xa_unlock(&root->inodes);
76dda93c 5521
69e9c6c6 5522 if (empty && btrfs_root_refs(&root->root_item) == 0) {
e2844cce 5523 xa_lock(&root->inodes);
310b2f5d 5524 empty = xa_empty(&root->inodes);
e2844cce 5525 xa_unlock(&root->inodes);
76dda93c
YZ
5526 if (empty)
5527 btrfs_add_dead_root(root);
5528 }
5529}
5530
5d4f98a2 5531
e02119d5
CM
5532static int btrfs_init_locked_inode(struct inode *inode, void *p)
5533{
5534 struct btrfs_iget_args *args = p;
0202e83f 5535
7a7bc214 5536 btrfs_set_inode_number(BTRFS_I(inode), args->ino);
5c8fd99f 5537 BTRFS_I(inode)->root = btrfs_grab_root(args->root);
9b9b8854
JB
5538
5539 if (args->root && args->root == args->root->fs_info->tree_root &&
5540 args->ino != BTRFS_BTREE_INODE_OBJECTID)
5541 set_bit(BTRFS_INODE_FREE_SPACE_INODE,
5542 &BTRFS_I(inode)->runtime_flags);
39279cc3
CM
5543 return 0;
5544}
5545
5546static int btrfs_find_actor(struct inode *inode, void *opaque)
5547{
5548 struct btrfs_iget_args *args = opaque;
0202e83f 5549
7a7bc214 5550 return args->ino == btrfs_ino(BTRFS_I(inode)) &&
d397712b 5551 args->root == BTRFS_I(inode)->root;
39279cc3
CM
5552}
5553
b7519157 5554static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
39279cc3
CM
5555{
5556 struct inode *inode;
5557 struct btrfs_iget_args args;
0202e83f 5558 unsigned long hashval = btrfs_inode_hash(ino, root);
778ba82b 5559
0202e83f 5560 args.ino = ino;
39279cc3
CM
5561 args.root = root;
5562
a1b547f0 5563 inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor,
5564 btrfs_init_locked_inode,
5565 (void *)&args);
5566 return inode;
5567}
5568
4c66e0d4 5569/*
5570 * Get an inode object given its inode number and corresponding root.
5571 * The path can be preallocated to prevent recursing back to iget through
5572 * the allocator. NULL is also valid but may require an additional
5573 * allocation later.
5574 */
d383eb69
FM
5575struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
5576 struct btrfs_path *path)
1a54ef8c
BR
5577{
5578 struct inode *inode;
d25f4ec1 5579 int ret;
1a54ef8c 5580
b7519157 5581 inode = btrfs_iget_locked(ino, root);
1a54ef8c 5582 if (!inode)
5d4f98a2 5583 return ERR_PTR(-ENOMEM);
1a54ef8c 5584
d25f4ec1
FM
5585 if (!(inode->i_state & I_NEW))
5586 return inode;
67710892 5587
d25f4ec1
FM
5588 ret = btrfs_read_locked_inode(inode, path);
5589	/*
5590	 * ret > 0 can come from btrfs_search_slot called by
5591	 * btrfs_read_locked_inode(); it means the inode item was not found.
5592	 */
5593 if (ret > 0)
5594 ret = -ENOENT;
5595 if (ret < 0)
5596 goto error;
5597
5598 ret = btrfs_add_inode_to_root(BTRFS_I(inode), true);
5599 if (ret < 0)
5600 goto error;
5601
5602 unlock_new_inode(inode);
1748f843 5603
1a54ef8c 5604 return inode;
d25f4ec1
FM
5605error:
5606 iget_failed(inode);
5607 return ERR_PTR(ret);
1a54ef8c
BR
5608}
5609
d13240dd 5610struct inode *btrfs_iget(u64 ino, struct btrfs_root *root)
4222ea71 5611{
d383eb69 5612 return btrfs_iget_path(ino, root, NULL);
4222ea71
FM
5613}
5614
94628ad9 5615static struct inode *new_simple_dir(struct inode *dir,
4df27c4d
YZ
5616 struct btrfs_key *key,
5617 struct btrfs_root *root)
5618{
d5acbc60 5619 struct timespec64 ts;
94628ad9 5620 struct inode *inode = new_inode(dir->i_sb);
4df27c4d
YZ
5621
5622 if (!inode)
5623 return ERR_PTR(-ENOMEM);
5624
5c8fd99f 5625 BTRFS_I(inode)->root = btrfs_grab_root(root);
7a7bc214 5626 BTRFS_I(inode)->ref_root_id = key->objectid;
068fc8f9 5627 set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags);
72ac3c0d 5628 set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
4df27c4d 5629
7a7bc214 5630 btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID);
5631	/*
5632	 * We only need lookup; the rest is read-only and there's no inode
5633	 * associated with the dentry.
5634	 */
5635 inode->i_op = &simple_dir_inode_operations;
1fdf4194 5636 inode->i_opflags &= ~IOP_XATTR;
4df27c4d
YZ
5637 inode->i_fop = &simple_dir_operations;
5638 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
d5acbc60
LT
5639
5640 ts = inode_set_ctime_current(inode);
5641 inode_set_mtime_to_ts(inode, ts);
b1c38a13 5642 inode_set_atime_to_ts(inode, inode_get_atime(dir));
d5acbc60
LT
5643 BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
5644 BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
5645
94628ad9
LT
5646 inode->i_uid = dir->i_uid;
5647 inode->i_gid = dir->i_gid;
4df27c4d
YZ
5648
5649 return inode;
5650}
5651
a55e65b8
DS
5652static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
5653static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
5654static_assert(BTRFS_FT_DIR == FT_DIR);
5655static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
5656static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
5657static_assert(BTRFS_FT_FIFO == FT_FIFO);
5658static_assert(BTRFS_FT_SOCK == FT_SOCK);
5659static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
5660
6bf9e4bd
QW
5661static inline u8 btrfs_inode_type(struct inode *inode)
5662{
6bf9e4bd
QW
5663 return fs_umode_to_ftype(inode->i_mode);
5664}
5665
3de4586c 5666struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
39279cc3 5667{
41044b41 5668 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
d397712b 5669 struct inode *inode;
4df27c4d 5670 struct btrfs_root *root = BTRFS_I(dir)->root;
39279cc3 5671 struct btrfs_root *sub_root = root;
b8e947e9 5672 struct btrfs_key location = { 0 };
6bf9e4bd 5673 u8 di_type = 0;
b4aff1f8 5674 int ret = 0;
39279cc3
CM
5675
5676 if (dentry->d_name.len > BTRFS_NAME_LEN)
5677 return ERR_PTR(-ENAMETOOLONG);
5f39d397 5678
d1de429b 5679 ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type);
39279cc3
CM
5680 if (ret < 0)
5681 return ERR_PTR(ret);
5f39d397 5682
4df27c4d 5683 if (location.type == BTRFS_INODE_ITEM_KEY) {
d13240dd 5684 inode = btrfs_iget(location.objectid, root);
6bf9e4bd
QW
5685 if (IS_ERR(inode))
5686 return inode;
5687
5688 /* Do extra check against inode mode with di_type */
5689 if (btrfs_inode_type(inode) != di_type) {
5690 btrfs_crit(fs_info,
5691"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
5692 inode->i_mode, btrfs_inode_type(inode),
5693 di_type);
5694 iput(inode);
5695 return ERR_PTR(-EUCLEAN);
5696 }
4df27c4d
YZ
5697 return inode;
5698 }
5699
3c1b1c4c 5700 ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
4df27c4d
YZ
5701 &location, &sub_root);
5702 if (ret < 0) {
5703 if (ret != -ENOENT)
5704 inode = ERR_PTR(ret);
5705 else
94628ad9 5706 inode = new_simple_dir(dir, &location, root);
4df27c4d 5707 } else {
d13240dd 5708 inode = btrfs_iget(location.objectid, sub_root);
00246528 5709 btrfs_put_root(sub_root);
76dda93c 5710
fc8b235f
NB
5711 if (IS_ERR(inode))
5712 return inode;
5713
0b246afa 5714 down_read(&fs_info->cleanup_work_sem);
bc98a42c 5715 if (!sb_rdonly(inode->i_sb))
66b4ffd1 5716 ret = btrfs_orphan_cleanup(sub_root);
0b246afa 5717 up_read(&fs_info->cleanup_work_sem);
01cd3367
JB
5718 if (ret) {
5719 iput(inode);
66b4ffd1 5720 inode = ERR_PTR(ret);
01cd3367 5721 }
c71bf099
YZ
5722 }
5723
3de4586c
CM
5724 return inode;
5725}
5726
fe15ce44 5727static int btrfs_dentry_delete(const struct dentry *dentry)
76dda93c
YZ
5728{
5729 struct btrfs_root *root;
2b0143b5 5730 struct inode *inode = d_inode(dentry);
76dda93c 5731
848cce0d 5732 if (!inode && !IS_ROOT(dentry))
2b0143b5 5733 inode = d_inode(dentry->d_parent);
76dda93c 5734
848cce0d
LZ
5735 if (inode) {
5736 root = BTRFS_I(inode)->root;
efefb143
YZ
5737 if (btrfs_root_refs(&root->root_item) == 0)
5738 return 1;
848cce0d 5739
4a0cc7ca 5740 if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
848cce0d 5741 return 1;
efefb143 5742 }
76dda93c
YZ
5743 return 0;
5744}
5745
3de4586c 5746static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
00cd8dd3 5747 unsigned int flags)
3de4586c 5748{
3837d208 5749 struct inode *inode = btrfs_lookup_dentry(dir, dentry);
5662344b 5750
3837d208
AV
5751 if (inode == ERR_PTR(-ENOENT))
5752 inode = NULL;
41d28bca 5753 return d_splice_alias(inode, dentry);
39279cc3
CM
5754}
5755
9b378f6a
FM
5756/*
5757 * Find the highest existing sequence number in a directory and then set the
5758 * in-memory index_cnt variable to the first free sequence number.
5759 */
5760static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
5761{
5762 struct btrfs_root *root = inode->root;
5763 struct btrfs_key key, found_key;
5764 struct btrfs_path *path;
5765 struct extent_buffer *leaf;
5766 int ret;
5767
5768 key.objectid = btrfs_ino(inode);
5769 key.type = BTRFS_DIR_INDEX_KEY;
5770 key.offset = (u64)-1;
5771
5772 path = btrfs_alloc_path();
5773 if (!path)
5774 return -ENOMEM;
5775
5776 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5777 if (ret < 0)
5778 goto out;
5779 /* FIXME: we should be able to handle this */
5780 if (ret == 0)
5781 goto out;
5782 ret = 0;
5783
5784 if (path->slots[0] == 0) {
5785 inode->index_cnt = BTRFS_DIR_START_INDEX;
5786 goto out;
5787 }
5788
5789 path->slots[0]--;
5790
5791 leaf = path->nodes[0];
5792 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5793
5794 if (found_key.objectid != btrfs_ino(inode) ||
5795 found_key.type != BTRFS_DIR_INDEX_KEY) {
5796 inode->index_cnt = BTRFS_DIR_START_INDEX;
5797 goto out;
5798 }
5799
5800 inode->index_cnt = found_key.offset + 1;
5801out:
5802 btrfs_free_path(path);
5803 return ret;
5804}
5805
5806static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
5807{
8e7f82de 5808 int ret = 0;
9b378f6a 5809
8e7f82de
FM
5810 btrfs_inode_lock(dir, 0);
5811 if (dir->index_cnt == (u64)-1) {
9b378f6a
FM
5812 ret = btrfs_inode_delayed_dir_index_count(dir);
5813 if (ret) {
5814 ret = btrfs_set_inode_index_count(dir);
5815 if (ret)
8e7f82de 5816 goto out;
9b378f6a
FM
5817 }
5818 }
5819
5820	/* index_cnt is the index number of the next new entry, so decrement it. */
5821 *index = dir->index_cnt - 1;
8e7f82de
FM
5822out:
5823 btrfs_inode_unlock(dir, 0);
9b378f6a 5824
8e7f82de 5825 return ret;
9b378f6a
FM
5826}
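
/*
 * Illustrative worked example, not part of this file, of the index_cnt
 * bookkeeping above: if the highest existing BTRFS_DIR_INDEX_KEY offset
 * in a directory is 41, index_cnt is set to 42 (the index the next new
 * entry will receive) and the last index readdir may return right now is
 * index_cnt - 1 = 41. An empty directory starts at BTRFS_DIR_START_INDEX.
 * The helper is hypothetical.
 */
static u64 example_last_readdir_index(u64 highest_existing_offset)
{
	u64 index_cnt = highest_existing_offset + 1;	/* next free index */

	return index_cnt - 1;				/* last existing entry */
}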
5827
23b5ec74
JB
5828/*
5829 * All this infrastructure exists because dir_emit can fault, and we are holding
5830 * the tree lock when doing readdir. For now just allocate a buffer and copy
5831 * our information into that, and then dir_emit from the buffer. This is
5832 * similar to what NFS does, only we don't keep the buffer around in pagecache
5833 * because I'm afraid I'll mess that up. Long term we need to make filldir do
5834 * copy_to_user_inatomic so we don't have to worry about page faulting under the
5835 * tree lock.
5836 */
5837static int btrfs_opendir(struct inode *inode, struct file *file)
5838{
5839 struct btrfs_file_private *private;
9b378f6a
FM
5840 u64 last_index;
5841 int ret;
5842
5843 ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index);
5844 if (ret)
5845 return ret;
23b5ec74
JB
5846
5847 private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
5848 if (!private)
5849 return -ENOMEM;
9b378f6a 5850 private->last_index = last_index;
23b5ec74
JB
5851 private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
5852 if (!private->filldir_buf) {
5853 kfree(private);
5854 return -ENOMEM;
5855 }
5856 file->private_data = private;
5857 return 0;
5858}
5859
e60aa5da
FM
5860static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence)
5861{
5862 struct btrfs_file_private *private = file->private_data;
5863 int ret;
5864
5865 ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)),
5866 &private->last_index);
5867 if (ret)
5868 return ret;
5869
5870 return generic_file_llseek(file, offset, whence);
5871}
5872
23b5ec74
JB
5873struct dir_entry {
5874 u64 ino;
5875 u64 offset;
5876 unsigned type;
5877 int name_len;
5878};
5879
5880static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
5881{
5882 while (entries--) {
5883 struct dir_entry *entry = addr;
5884 char *name = (char *)(entry + 1);
5885
92d32170
DS
5886 ctx->pos = get_unaligned(&entry->offset);
5887 if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
5888 get_unaligned(&entry->ino),
5889 get_unaligned(&entry->type)))
23b5ec74 5890 return 1;
92d32170
DS
5891 addr += sizeof(struct dir_entry) +
5892 get_unaligned(&entry->name_len);
23b5ec74
JB
5893 ctx->pos++;
5894 }
5895 return 0;
5896}
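
/*
 * Illustrative sketch, not part of this file: how readdir entries are
 * packed into the filldir buffer consumed by btrfs_filldir() above. Each
 * record is a struct dir_entry header immediately followed by name_len
 * name bytes, so headers after the first are not naturally aligned; that
 * is why the fields are accessed with put_unaligned()/get_unaligned().
 * The helper name is hypothetical.
 */
static void *example_pack_dir_entry(void *addr, u64 ino, u64 offset,
				    unsigned int type, const char *name,
				    int name_len)
{
	struct dir_entry *entry = addr;
	char *name_ptr = (char *)(entry + 1);

	put_unaligned(ino, &entry->ino);
	put_unaligned(offset, &entry->offset);
	put_unaligned(type, &entry->type);
	put_unaligned(name_len, &entry->name_len);
	memcpy(name_ptr, name, name_len);

	/* The next record starts right after the name bytes. */
	return addr + sizeof(struct dir_entry) + name_len;
}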
5897
9cdda8d3 5898static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
39279cc3 5899{
9cdda8d3 5900 struct inode *inode = file_inode(file);
39279cc3 5901 struct btrfs_root *root = BTRFS_I(inode)->root;
23b5ec74 5902 struct btrfs_file_private *private = file->private_data;
39279cc3
CM
5903 struct btrfs_dir_item *di;
5904 struct btrfs_key key;
5f39d397 5905 struct btrfs_key found_key;
39279cc3 5906 struct btrfs_path *path;
23b5ec74 5907 void *addr;
84af994b
RJ
5908 LIST_HEAD(ins_list);
5909 LIST_HEAD(del_list);
39279cc3 5910 int ret;
5f39d397
CM
5911 char *name_ptr;
5912 int name_len;
23b5ec74
JB
5913 int entries = 0;
5914 int total_len = 0;
02dbfc99 5915 bool put = false;
c2951f32 5916 struct btrfs_key location;
5f39d397 5917
9cdda8d3
AV
5918 if (!dir_emit_dots(file, ctx))
5919 return 0;
5920
49593bfa 5921 path = btrfs_alloc_path();
16cdcec7
MX
5922 if (!path)
5923 return -ENOMEM;
ff5714cc 5924
23b5ec74 5925 addr = private->filldir_buf;
e4058b54 5926 path->reada = READA_FORWARD;
49593bfa 5927
a0d7e98c 5928 put = btrfs_readdir_get_delayed_items(BTRFS_I(inode), private->last_index,
9b378f6a 5929 &ins_list, &del_list);
16cdcec7 5930
23b5ec74 5931again:
c2951f32 5932 key.type = BTRFS_DIR_INDEX_KEY;
9cdda8d3 5933 key.offset = ctx->pos;
4a0cc7ca 5934 key.objectid = btrfs_ino(BTRFS_I(inode));
5f39d397 5935
a8ce68fd 5936 btrfs_for_each_slot(root, &key, &found_key, path, ret) {
23b5ec74 5937 struct dir_entry *entry;
a8ce68fd 5938 struct extent_buffer *leaf = path->nodes[0];
94a48aef 5939 u8 ftype;
5f39d397
CM
5940
5941 if (found_key.objectid != key.objectid)
39279cc3 5942 break;
c2951f32 5943 if (found_key.type != BTRFS_DIR_INDEX_KEY)
39279cc3 5944 break;
9cdda8d3 5945 if (found_key.offset < ctx->pos)
a8ce68fd 5946 continue;
9b378f6a
FM
5947 if (found_key.offset > private->last_index)
5948 break;
c2951f32 5949 if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
a8ce68fd
GN
5950 continue;
5951 di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
c2951f32 5952 name_len = btrfs_dir_name_len(leaf, di);
23b5ec74
JB
5953 if ((total_len + sizeof(struct dir_entry) + name_len) >=
5954 PAGE_SIZE) {
5955 btrfs_release_path(path);
5956 ret = btrfs_filldir(private->filldir_buf, entries, ctx);
5957 if (ret)
5958 goto nopos;
5959 addr = private->filldir_buf;
5960 entries = 0;
5961 total_len = 0;
5962 goto again;
c2951f32 5963 }
23b5ec74 5964
94a48aef 5965 ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di));
23b5ec74 5966 entry = addr;
23b5ec74 5967 name_ptr = (char *)(entry + 1);
94a48aef
OS
5968 read_extent_buffer(leaf, name_ptr,
5969 (unsigned long)(di + 1), name_len);
5970 put_unaligned(name_len, &entry->name_len);
5971 put_unaligned(fs_ftype_to_dtype(ftype), &entry->type);
c2951f32 5972 btrfs_dir_item_key_to_cpu(leaf, di, &location);
92d32170
DS
5973 put_unaligned(location.objectid, &entry->ino);
5974 put_unaligned(found_key.offset, &entry->offset);
23b5ec74
JB
5975 entries++;
5976 addr += sizeof(struct dir_entry) + name_len;
5977 total_len += sizeof(struct dir_entry) + name_len;
39279cc3 5978 }
a8ce68fd
GN
5979 /* Catch error encountered during iteration */
5980 if (ret < 0)
5981 goto err;
5982
23b5ec74
JB
5983 btrfs_release_path(path);
5984
5985 ret = btrfs_filldir(private->filldir_buf, entries, ctx);
5986 if (ret)
5987 goto nopos;
49593bfa 5988
d2fbb2b5 5989 ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
c2951f32 5990 if (ret)
bc4ef759
DS
5991 goto nopos;
5992
db62efbb
ZB
5993 /*
5994 * Stop new entries from being returned after we return the last
5995 * entry.
5996 *
5997 * New directory entries are assigned a strictly increasing
5998 * offset. This means that new entries created during readdir
5999 * are *guaranteed* to be seen in the future by that readdir.
6000 * This has broken buggy programs which operate on names as
6001 * they're returned by readdir. Until we re-use freed offsets
6002 * we have this hack to stop new entries from being returned
6003 * under the assumption that they'll never reach this huge
6004 * offset.
6005 *
6006 * This is being careful not to overflow 32bit loff_t unless the
6007 * last entry requires it because doing so has broken 32bit apps
6008 * in the past.
6009 */
c2951f32
JM
6010 if (ctx->pos >= INT_MAX)
6011 ctx->pos = LLONG_MAX;
6012 else
6013 ctx->pos = INT_MAX;
39279cc3
CM
6014nopos:
6015 ret = 0;
6016err:
02dbfc99 6017 if (put)
849c01ae 6018 btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list);
39279cc3 6019 btrfs_free_path(path);
39279cc3
CM
6020 return ret;
6021}
6022
39279cc3 6023/*
6024 * This is somewhat expensive, updating the tree every time the
6025 * inode changes. But it is most likely to find the inode in cache.
6026 * FIXME, needs more benchmarking... there are no reasons other than
6027 * performance to keep or drop this code.
6028 */
7152b425 6029static int btrfs_dirty_inode(struct btrfs_inode *inode)
39279cc3 6030{
7152b425
DS
6031 struct btrfs_root *root = inode->root;
6032 struct btrfs_fs_info *fs_info = root->fs_info;
39279cc3 6033 struct btrfs_trans_handle *trans;
8929ecfa
YZ
6034 int ret;
6035
7152b425 6036 if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags))
22c44fe6 6037 return 0;
39279cc3 6038
7a7eaa40 6039 trans = btrfs_join_transaction(root);
22c44fe6
JB
6040 if (IS_ERR(trans))
6041 return PTR_ERR(trans);
8929ecfa 6042
8b9d0322 6043 ret = btrfs_update_inode(trans, inode);
2199cb0f 6044 if (ret == -ENOSPC || ret == -EDQUOT) {
94b60442 6045 /* whoops, lets try again with the full transaction */
3a45bb20 6046 btrfs_end_transaction(trans);
94b60442 6047 trans = btrfs_start_transaction(root, 1);
22c44fe6
JB
6048 if (IS_ERR(trans))
6049 return PTR_ERR(trans);
8929ecfa 6050
8b9d0322 6051 ret = btrfs_update_inode(trans, inode);
94b60442 6052 }
3a45bb20 6053 btrfs_end_transaction(trans);
7152b425 6054 if (inode->delayed_node)
2ff7e61e 6055 btrfs_balance_delayed_items(fs_info);
22c44fe6
JB
6056
6057 return ret;
6058}
6059
6060/*
6061 * This is a copy of file_update_time. We need this so we can return an error
6062 * on ENOSPC when updating the inode in the case of file write and mmap writes.
6063 */
913e9928 6064static int btrfs_update_time(struct inode *inode, int flags)
22c44fe6 6065{
2bc55652 6066 struct btrfs_root *root = BTRFS_I(inode)->root;
a666ce9b 6067 bool dirty;
2bc55652
AB
6068
6069 if (btrfs_root_readonly(root))
6070 return -EROFS;
6071
bb7cc0a6 6072 dirty = inode_update_timestamps(inode, flags);
7152b425 6073 return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0;
39279cc3
CM
6074}
6075
d352ac68 6076/*
6077 * Helper to find a free sequence number in a given directory. The current
6078 * code is very simple; later versions will do smarter things in the btree.
6079 */
877574e2 6080int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
aec7477b
JB
6081{
6082 int ret = 0;
6083
877574e2
NB
6084 if (dir->index_cnt == (u64)-1) {
6085 ret = btrfs_inode_delayed_dir_index_count(dir);
16cdcec7
MX
6086 if (ret) {
6087 ret = btrfs_set_inode_index_count(dir);
6088 if (ret)
6089 return ret;
6090 }
aec7477b
JB
6091 }
6092
877574e2
NB
6093 *index = dir->index_cnt;
6094 dir->index_cnt++;
aec7477b
JB
6095
6096 return ret;
6097}
6098
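/*
 * Usage sketch (illustrative, not part of the original source): a caller
 * creating two names in the same directory gets strictly increasing indexes,
 * which later become the readdir offsets of the new entries:
 *
 *	u64 first, second;
 *
 *	ret = btrfs_set_inode_index(dir, &first);	// e.g. returns 35
 *	ret = btrfs_set_inode_index(dir, &second);	// then 36
 *
 * The (u64)-1 check above covers a directory whose index_cnt was never
 * loaded; it is then recomputed from the delayed items or, failing that,
 * from the last dir index item in the tree.
 */
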
static int btrfs_insert_inode_locked(struct inode *inode)
{
	struct btrfs_iget_args args;

	args.ino = btrfs_ino(BTRFS_I(inode));
	args.root = BTRFS_I(inode)->root;

	return insert_inode_locked4(inode,
		   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
		   btrfs_find_actor, &args);
}

int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
			    unsigned int *trans_num_items)
{
	struct inode *dir = args->dir;
	struct inode *inode = args->inode;
	int ret;

	if (!args->orphan) {
		ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0,
					     &args->fname);
		if (ret)
			return ret;
	}

	ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
	if (ret) {
		fscrypt_free_filename(&args->fname);
		return ret;
	}

	/* 1 to add inode item */
	*trans_num_items = 1;
	/* 1 to add compression property */
	if (BTRFS_I(dir)->prop_compress)
		(*trans_num_items)++;
	/* 1 to add default ACL xattr */
	if (args->default_acl)
		(*trans_num_items)++;
	/* 1 to add access ACL xattr */
	if (args->acl)
		(*trans_num_items)++;
#ifdef CONFIG_SECURITY
	/* 1 to add LSM xattr */
	if (dir->i_security)
		(*trans_num_items)++;
#endif
	if (args->orphan) {
		/* 1 to add orphan item */
		(*trans_num_items)++;
	} else {
		/*
		 * 1 to add dir item
		 * 1 to add dir index
		 * 1 to update parent inode item
		 *
		 * No need for 1 unit for the inode ref item because it is
		 * inserted in a batch together with the inode item at
		 * btrfs_create_new_inode().
		 */
		*trans_num_items += 3;
	}
	return 0;
}

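/*
 * Worked example (illustrative, not part of the original source): for a
 * regular, non-orphan create in a directory with no compression property,
 * where posix_acl_create() produced both a default and an access ACL and
 * CONFIG_SECURITY gives the directory an i_security blob, the count is:
 *
 *	1 (inode item) + 1 (default ACL) + 1 (access ACL) + 1 (LSM xattr)
 *	+ 3 (dir item, dir index, parent inode update) = 7 units
 *
 * which is the reservation btrfs_create_common() passes to
 * btrfs_start_transaction().
 */
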
void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
{
	posix_acl_release(args->acl);
	posix_acl_release(args->default_acl);
	fscrypt_free_filename(&args->fname);
}

/*
 * Inherit flags from the parent inode.
 *
 * Currently only the compression flags and the cow flags are inherited.
 */
static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir)
{
	unsigned int flags;

	flags = dir->flags;

	if (flags & BTRFS_INODE_NOCOMPRESS) {
		inode->flags &= ~BTRFS_INODE_COMPRESS;
		inode->flags |= BTRFS_INODE_NOCOMPRESS;
	} else if (flags & BTRFS_INODE_COMPRESS) {
		inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
		inode->flags |= BTRFS_INODE_COMPRESS;
	}

	if (flags & BTRFS_INODE_NODATACOW) {
		inode->flags |= BTRFS_INODE_NODATACOW;
		if (S_ISREG(inode->vfs_inode.i_mode))
			inode->flags |= BTRFS_INODE_NODATASUM;
	}

	btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
}

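/*
 * Example (illustrative, not part of the original source): creating a
 * regular file in a directory marked NODATACOW gives the new inode both
 * NODATACOW and NODATASUM, since in-place overwrites cannot keep checksums
 * coherent. If the parent somehow carried both NOCOMPRESS and COMPRESS,
 * NOCOMPRESS wins because it is checked first above.
 */
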
int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_new_inode_args *args)
{
	struct timespec64 ts;
	struct inode *dir = args->dir;
	struct inode *inode = args->inode;
	const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
	struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
	struct btrfs_root *root;
	struct btrfs_inode_item *inode_item;
	struct btrfs_path *path;
	u64 objectid;
	struct btrfs_inode_ref *ref;
	struct btrfs_key key[2];
	u32 sizes[2];
	struct btrfs_item_batch batch;
	unsigned long ptr;
	int ret;
	bool xa_reserved = false;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!args->subvol)
		BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
	root = BTRFS_I(inode)->root;

	ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
	if (ret)
		goto out;

	ret = btrfs_get_free_objectid(root, &objectid);
	if (ret)
		goto out;
	btrfs_set_inode_number(BTRFS_I(inode), objectid);

	ret = xa_reserve(&root->inodes, objectid, GFP_NOFS);
	if (ret)
		goto out;
	xa_reserved = true;

	if (args->orphan) {
		/*
		 * O_TMPFILE, set link count to 0, so that after this point, we
		 * fill in an inode item with the correct link count.
		 */
		set_nlink(inode, 0);
	} else {
		trace_btrfs_inode_request(dir);

		ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
		if (ret)
			goto out;
	}

	if (S_ISDIR(inode->i_mode))
		BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;

	BTRFS_I(inode)->generation = trans->transid;
	inode->i_generation = BTRFS_I(inode)->generation;

	/*
	 * We don't have any capability xattrs set here yet, shortcut any
	 * queries for the xattrs here. If we add them later via the inode
	 * security init path or any other path this flag will be cleared.
	 */
	set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);

	/*
	 * Subvolumes don't inherit flags from their parent directory.
	 * Originally this was probably by accident, but we probably can't
	 * change it now without compatibility issues.
	 */
	if (!args->subvol)
		btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));

	if (S_ISREG(inode->i_mode)) {
		if (btrfs_test_opt(fs_info, NODATASUM))
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
		if (btrfs_test_opt(fs_info, NODATACOW))
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
				BTRFS_INODE_NODATASUM;
	}

	ret = btrfs_insert_inode_locked(inode);
	if (ret < 0) {
		if (!args->orphan)
			BTRFS_I(dir)->index_cnt--;
		goto out;
	}

	/*
	 * We could have gotten an inode number from somebody who was fsynced
	 * and then removed in this same transaction, so let's just set full
	 * sync since it will be a full sync anyway and this will blow away the
	 * old info in the log.
	 */
	btrfs_set_inode_full_sync(BTRFS_I(inode));

	key[0].objectid = objectid;
	key[0].type = BTRFS_INODE_ITEM_KEY;
	key[0].offset = 0;

	sizes[0] = sizeof(struct btrfs_inode_item);

	if (!args->orphan) {
		/*
		 * Start new inodes with an inode_ref. This is slightly more
		 * efficient for small numbers of hard links since they will
		 * be packed into one item. Extended refs will kick in if we
		 * add more hard links than can fit in the ref item.
		 */
		key[1].objectid = objectid;
		key[1].type = BTRFS_INODE_REF_KEY;
		if (args->subvol) {
			key[1].offset = objectid;
			sizes[1] = 2 + sizeof(*ref);
		} else {
			key[1].offset = btrfs_ino(BTRFS_I(dir));
			sizes[1] = name->len + sizeof(*ref);
		}
	}

	batch.keys = &key[0];
	batch.data_sizes = &sizes[0];
	batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
	batch.nr = args->orphan ? 1 : 2;
	ret = btrfs_insert_empty_items(trans, root, path, &batch);
	if (ret != 0) {
		btrfs_abort_transaction(trans, ret);
		goto discard;
	}

	ts = simple_inode_init_ts(inode);
	BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
	BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;

	/*
	 * We're going to fill the inode item now, so at this point the inode
	 * must be fully initialized.
	 */

	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_inode_item);
	memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
			      sizeof(*inode_item));
	fill_inode_item(trans, path->nodes[0], inode_item, inode);

	if (!args->orphan) {
		ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
				     struct btrfs_inode_ref);
		ptr = (unsigned long)(ref + 1);
		if (args->subvol) {
			btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
			btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
			write_extent_buffer(path->nodes[0], "..", ptr, 2);
		} else {
			btrfs_set_inode_ref_name_len(path->nodes[0], ref,
						     name->len);
			btrfs_set_inode_ref_index(path->nodes[0], ref,
						  BTRFS_I(inode)->dir_index);
			write_extent_buffer(path->nodes[0], name->name, ptr,
					    name->len);
		}
	}

	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
	/*
	 * We don't need the path anymore, plus inheriting properties, adding
	 * ACLs, security xattrs, orphan item or adding the link, will result in
	 * allocating yet another path. So just free our path.
	 */
	btrfs_free_path(path);
	path = NULL;

	if (args->subvol) {
		struct inode *parent;

		/*
		 * Subvolumes inherit properties from their parent subvolume,
		 * not the directory they were created in.
		 */
		parent = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, BTRFS_I(dir)->root);
		if (IS_ERR(parent)) {
			ret = PTR_ERR(parent);
		} else {
			ret = btrfs_inode_inherit_props(trans, inode, parent);
			iput(parent);
		}
	} else {
		ret = btrfs_inode_inherit_props(trans, inode, dir);
	}
	if (ret) {
		btrfs_err(fs_info,
			  "error inheriting props for ino %llu (root %llu): %d",
			  btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret);
	}

	/*
	 * Subvolumes don't inherit ACLs or get passed to the LSM. This is
	 * probably a bug.
	 */
	if (!args->subvol) {
		ret = btrfs_init_inode_security(trans, args);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto discard;
		}
	}

	ret = btrfs_add_inode_to_root(BTRFS_I(inode), false);
	if (WARN_ON(ret)) {
		/* Shouldn't happen, we used xa_reserve() before. */
		btrfs_abort_transaction(trans, ret);
		goto discard;
	}

	trace_btrfs_inode_new(inode);
	btrfs_set_inode_last_trans(trans, BTRFS_I(inode));

	btrfs_update_root_times(trans, root);

	if (args->orphan) {
		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
	} else {
		ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
				     0, BTRFS_I(inode)->dir_index);
	}
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto discard;
	}

	return 0;

discard:
	/*
	 * discard_new_inode() calls iput(), but the caller owns the reference
	 * to the inode.
	 */
	ihold(inode);
	discard_new_inode(inode);
out:
	if (xa_reserved)
		xa_release(&root->inodes, objectid);

	btrfs_free_path(path);
	return ret;
}

/*
 * utility function to add 'inode' into 'parent_inode' with
 * a given name and a given sequence number.
 * if 'add_backref' is true, also insert a backref from the
 * inode to the parent directory.
 */
int btrfs_add_link(struct btrfs_trans_handle *trans,
		   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
		   const struct fscrypt_str *name, int add_backref, u64 index)
{
	int ret = 0;
	struct btrfs_key key;
	struct btrfs_root *root = parent_inode->root;
	u64 ino = btrfs_ino(inode);
	u64 parent_ino = btrfs_ino(parent_inode);

	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
		memcpy(&key, &inode->root->root_key, sizeof(key));
	} else {
		key.objectid = ino;
		key.type = BTRFS_INODE_ITEM_KEY;
		key.offset = 0;
	}

	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
		ret = btrfs_add_root_ref(trans, key.objectid,
					 btrfs_root_id(root), parent_ino,
					 index, name);
	} else if (add_backref) {
		ret = btrfs_insert_inode_ref(trans, root, name,
					     ino, parent_ino, index);
	}

	/* Nothing to clean up yet */
	if (ret)
		return ret;

	ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
				    btrfs_inode_type(&inode->vfs_inode), index);
	if (ret == -EEXIST || ret == -EOVERFLOW)
		goto fail_dir_item;
	else if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
			   name->len * 2);
	inode_inc_iversion(&parent_inode->vfs_inode);
	/*
	 * If we are replaying a log tree, we do not want to update the mtime
	 * and ctime of the parent directory with the current time, since the
	 * log replay procedure is responsible for setting them to their correct
	 * values (the ones it had when the fsync was done).
	 */
	if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
		inode_set_mtime_to_ts(&parent_inode->vfs_inode,
				      inode_set_ctime_current(&parent_inode->vfs_inode));

	ret = btrfs_update_inode(trans, parent_inode);
	if (ret)
		btrfs_abort_transaction(trans, ret);
	return ret;

fail_dir_item:
	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
		u64 local_index;
		int err;

		err = btrfs_del_root_ref(trans, key.objectid,
					 btrfs_root_id(root), parent_ino,
					 &local_index, name);
		if (err)
			btrfs_abort_transaction(trans, err);
	} else if (add_backref) {
		u64 local_index;
		int err;

		err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino,
					  &local_index);
		if (err)
			btrfs_abort_transaction(trans, err);
	}

	/* Return the original error code */
	return ret;
}

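/*
 * Worked example (illustrative, not part of the original source): the
 * i_size update above counts each name twice because a new link inserts two
 * directory entries, a DIR_ITEM (hashed by name) and a DIR_INDEX (ordered
 * by index). Linking "foo" (name->len == 3) therefore grows the parent
 * directory's i_size by 6 bytes.
 */
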
static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
			       struct inode *inode)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_new_inode_args new_inode_args = {
		.dir = dir,
		.dentry = dentry,
		.inode = inode,
	};
	unsigned int trans_num_items;
	struct btrfs_trans_handle *trans;
	int err;

	err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
	if (err)
		goto out_inode;

	trans = btrfs_start_transaction(root, trans_num_items);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_new_inode_args;
	}

	err = btrfs_create_new_inode(trans, &new_inode_args);
	if (!err)
		d_instantiate_new(dentry, inode);

	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(fs_info);
out_new_inode_args:
	btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
	if (err)
		iput(inode);
	return err;
}

static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
		       struct dentry *dentry, umode_t mode, dev_t rdev)
{
	struct inode *inode;

	inode = new_inode(dir->i_sb);
	if (!inode)
		return -ENOMEM;
	inode_init_owner(idmap, inode, dir, mode);
	inode->i_op = &btrfs_special_inode_operations;
	init_special_inode(inode, inode->i_mode, rdev);
	return btrfs_create_common(dir, dentry, inode);
}

static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir,
			struct dentry *dentry, umode_t mode, bool excl)
{
	struct inode *inode;

	inode = new_inode(dir->i_sb);
	if (!inode)
		return -ENOMEM;
	inode_init_owner(idmap, inode, dir, mode);
	inode->i_fop = &btrfs_file_operations;
	inode->i_op = &btrfs_file_inode_operations;
	inode->i_mapping->a_ops = &btrfs_aops;
	return btrfs_create_common(dir, dentry, inode);
}

static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
		      struct dentry *dentry)
{
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct inode *inode = d_inode(old_dentry);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct fscrypt_name fname;
	u64 index;
	int err;
	int drop_inode = 0;

	/* do not allow sys_link's with other subvols of the same device */
	if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root))
		return -EXDEV;

	if (inode->i_nlink >= BTRFS_LINK_MAX)
		return -EMLINK;

	err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
	if (err)
		goto fail;

	err = btrfs_set_inode_index(BTRFS_I(dir), &index);
	if (err)
		goto fail;

	/*
	 * 2 items for inode and inode ref
	 * 2 items for dir items
	 * 1 item for parent inode
	 * 1 item for orphan item deletion if O_TMPFILE
	 */
	trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		trans = NULL;
		goto fail;
	}

	/* There are several dir indexes for this inode, clear the cache. */
	BTRFS_I(inode)->dir_index = 0ULL;
	inc_nlink(inode);
	inode_inc_iversion(inode);
	inode_set_ctime_current(inode);
	ihold(inode);
	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);

	err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
			     &fname.disk_name, 1, index);

	if (err) {
		drop_inode = 1;
	} else {
		struct dentry *parent = dentry->d_parent;

		err = btrfs_update_inode(trans, BTRFS_I(inode));
		if (err)
			goto fail;
		if (inode->i_nlink == 1) {
			/*
			 * If new hard link count is 1, it's a file created
			 * with open(2) O_TMPFILE flag.
			 */
			err = btrfs_orphan_del(trans, BTRFS_I(inode));
			if (err)
				goto fail;
		}
		d_instantiate(dentry, inode);
		btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
	}

fail:
	fscrypt_free_filename(&fname);
	if (trans)
		btrfs_end_transaction(trans);
	if (drop_inode) {
		inode_dec_link_count(inode);
		iput(inode);
	}
	btrfs_btree_balance_dirty(fs_info);
	return err;
}

static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
		       struct dentry *dentry, umode_t mode)
{
	struct inode *inode;

	inode = new_inode(dir->i_sb);
	if (!inode)
		return -ENOMEM;
	inode_init_owner(idmap, inode, dir, S_IFDIR | mode);
	inode->i_op = &btrfs_dir_inode_operations;
	inode->i_fop = &btrfs_dir_file_operations;
	return btrfs_create_common(dir, dentry, inode);
}

static noinline int uncompress_inline(struct btrfs_path *path,
				      struct page *page,
				      struct btrfs_file_extent_item *item)
{
	int ret;
	struct extent_buffer *leaf = path->nodes[0];
	char *tmp;
	size_t max_size;
	unsigned long inline_size;
	unsigned long ptr;
	int compress_type;

	compress_type = btrfs_file_extent_compression(leaf, item);
	max_size = btrfs_file_extent_ram_bytes(leaf, item);
	inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
	tmp = kmalloc(inline_size, GFP_NOFS);
	if (!tmp)
		return -ENOMEM;
	ptr = btrfs_file_extent_inline_start(item);

	read_extent_buffer(leaf, tmp, ptr, inline_size);

	max_size = min_t(unsigned long, PAGE_SIZE, max_size);
	ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size);

	/*
	 * decompression code contains a memset to fill in any space between the end
	 * of the uncompressed data and the end of max_size in case the decompressed
	 * data ends up shorter than ram_bytes. That doesn't cover the hole between
	 * the end of an inline extent and the beginning of the next block, so we
	 * cover that region here.
	 */

	if (max_size < PAGE_SIZE)
		memzero_page(page, max_size, PAGE_SIZE - max_size);
	kfree(tmp);
	return ret;
}

static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path,
			      struct page *page)
{
	struct btrfs_file_extent_item *fi;
	void *kaddr;
	size_t copy_size;

	if (!page || PageUptodate(page))
		return 0;

	ASSERT(page_offset(page) == 0);

	fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
			    struct btrfs_file_extent_item);
	if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
		return uncompress_inline(path, page, fi);

	copy_size = min_t(u64, PAGE_SIZE,
			  btrfs_file_extent_ram_bytes(path->nodes[0], fi));
	kaddr = kmap_local_page(page);
	read_extent_buffer(path->nodes[0], kaddr,
			   btrfs_file_extent_inline_start(fi), copy_size);
	kunmap_local(kaddr);
	if (copy_size < PAGE_SIZE)
		memzero_page(page, copy_size, PAGE_SIZE - copy_size);
	return 0;
}

/*
 * Lookup the first extent overlapping a range in a file.
 *
 * @inode:	file to search in
 * @page:	page to read extent data into if the extent is inline
 * @start:	file offset
 * @len:	length of range starting at @start
 *
 * Return the first &struct extent_map which overlaps the given range, reading
 * it from the B-tree and caching it if necessary. Note that there may be more
 * extents which overlap the given range after the returned extent_map.
 *
 * If @page is not NULL and the extent is inline, this also reads the extent
 * data directly into the page and marks the extent up to date in the io_tree.
 *
 * Return: ERR_PTR on error, non-NULL extent_map on success.
 */
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
				    struct page *page, u64 start, u64 len)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	int ret = 0;
	u64 extent_start = 0;
	u64 extent_end = 0;
	u64 objectid = btrfs_ino(inode);
	int extent_type = -1;
	struct btrfs_path *path = NULL;
	struct btrfs_root *root = inode->root;
	struct btrfs_file_extent_item *item;
	struct extent_buffer *leaf;
	struct btrfs_key found_key;
	struct extent_map *em = NULL;
	struct extent_map_tree *em_tree = &inode->extent_tree;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, len);
	read_unlock(&em_tree->lock);

	if (em) {
		if (em->start > start || em->start + em->len <= start)
			free_extent_map(em);
		else if (em->disk_bytenr == EXTENT_MAP_INLINE && page)
			free_extent_map(em);
		else
			goto out;
	}
	em = alloc_extent_map();
	if (!em) {
		ret = -ENOMEM;
		goto out;
	}
	em->start = EXTENT_MAP_HOLE;
	em->disk_bytenr = EXTENT_MAP_HOLE;
	em->len = (u64)-1;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	/* Chances are we'll be called again, so go ahead and do readahead */
	path->reada = READA_FORWARD;

	/*
	 * The same explanation in load_free_space_cache applies here as well,
	 * we only read when we're loading the free space cache, and at that
	 * point the commit_root has everything we need.
	 */
	if (btrfs_is_free_space_inode(inode)) {
		path->search_commit_root = 1;
		path->skip_locking = 1;
	}

	ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
	if (ret < 0) {
		goto out;
	} else if (ret > 0) {
		if (path->slots[0] == 0)
			goto not_found;
		path->slots[0]--;
		ret = 0;
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0],
			      struct btrfs_file_extent_item);
	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
	if (found_key.objectid != objectid ||
	    found_key.type != BTRFS_EXTENT_DATA_KEY) {
		/*
		 * If we backup past the first extent we want to move forward
		 * and see if there is an extent in front of us, otherwise we'll
		 * say there is a hole for our whole search range which can
		 * cause problems.
		 */
		extent_end = start;
		goto next;
	}

	extent_type = btrfs_file_extent_type(leaf, item);
	extent_start = found_key.offset;
	extent_end = btrfs_file_extent_end(path);
	if (extent_type == BTRFS_FILE_EXTENT_REG ||
	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
		/* Only regular file could have regular/prealloc extent */
		if (!S_ISREG(inode->vfs_inode.i_mode)) {
			ret = -EUCLEAN;
			btrfs_crit(fs_info,
		"regular/prealloc extent found for non-regular inode %llu",
				   btrfs_ino(inode));
			goto out;
		}
		trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
						       extent_start);
	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
		trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
						      path->slots[0],
						      extent_start);
	}
next:
	if (start >= extent_end) {
		path->slots[0]++;
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				goto not_found;

			leaf = path->nodes[0];
		}
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid != objectid ||
		    found_key.type != BTRFS_EXTENT_DATA_KEY)
			goto not_found;
		if (start + len <= found_key.offset)
			goto not_found;
		if (start > found_key.offset)
			goto next;

		/* New extent overlaps with existing one */
		em->start = start;
		em->len = found_key.offset - start;
		em->disk_bytenr = EXTENT_MAP_HOLE;
		goto insert;
	}

	btrfs_extent_item_to_extent_map(inode, path, item, em);

	if (extent_type == BTRFS_FILE_EXTENT_REG ||
	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
		goto insert;
	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
		/*
		 * Inline extent can only exist at file offset 0. This is
		 * ensured by tree-checker and inline extent creation path.
		 * Thus all members representing file offsets should be zero.
		 */
		ASSERT(extent_start == 0);
		ASSERT(em->start == 0);

		/*
		 * btrfs_extent_item_to_extent_map() should have properly
		 * initialized em members already.
		 *
		 * Other members are not utilized for inline extents.
		 */
		ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);
		ASSERT(em->len == fs_info->sectorsize);

		ret = read_inline_extent(inode, path, page);
		if (ret < 0)
			goto out;
		goto insert;
	}
not_found:
	em->start = start;
	em->len = len;
	em->disk_bytenr = EXTENT_MAP_HOLE;
insert:
	ret = 0;
	btrfs_release_path(path);
	if (em->start > start || extent_map_end(em) <= start) {
		btrfs_err(fs_info,
			  "bad extent! em: [%llu %llu] passed [%llu %llu]",
			  em->start, em->len, start, len);
		ret = -EIO;
		goto out;
	}

	write_lock(&em_tree->lock);
	ret = btrfs_add_extent_mapping(inode, &em, start, len);
	write_unlock(&em_tree->lock);
out:
	btrfs_free_path(path);

	trace_btrfs_get_extent(root, inode, em);

	if (ret) {
		free_extent_map(em);
		return ERR_PTR(ret);
	}
	return em;
}

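/*
 * Usage sketch (illustrative, not part of the original source): callers must
 * drop the reference they get, and holes come back as a valid map rather
 * than an error:
 *
 *	em = btrfs_get_extent(inode, NULL, start, len);
 *	if (IS_ERR(em))
 *		return PTR_ERR(em);
 *	if (em->disk_bytenr == EXTENT_MAP_HOLE) {
 *		// no data extent backs [start, start + em->len)
 *	}
 *	free_extent_map(em);
 */
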
static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_block_group *block_group;
	bool readonly = false;

	block_group = btrfs_lookup_block_group(fs_info, bytenr);
	if (!block_group || block_group->ro)
		readonly = true;
	if (block_group)
		btrfs_put_block_group(block_group);
	return readonly;
}

/*
 * Check if we can do nocow write into the range [@offset, @offset + @len)
 *
 * @offset:	File offset
 * @len:	The length to write, will be updated to the nocow writeable
 *		range
 * @file_extent: (optional) Return the found file extent information
 * @nowait:	if true, do the btree search in nowait mode, never blocking
 * @strict:	if true, omit optimizations that might force us into unnecessary
 *		cow. e.g., don't trust generation number.
 *
 * Return:
 * >0	and update @len if we can do nocow write
 *  0	if we can't do nocow write
 * <0	if error happened
 *
 * NOTE: This only checks the file extents, caller is responsible to wait for
 *	 any ordered extents.
 */
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
			      struct btrfs_file_extent *file_extent,
			      bool nowait, bool strict)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct can_nocow_file_extent_args nocow_args = { 0 };
	struct btrfs_path *path;
	int ret;
	struct extent_buffer *leaf;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	int found_type;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->nowait = nowait;

	ret = btrfs_lookup_file_extent(NULL, root, path,
				       btrfs_ino(BTRFS_I(inode)), offset, 0);
	if (ret < 0)
		goto out;

	if (ret == 1) {
		if (path->slots[0] == 0) {
			/* can't find the item, must cow */
			ret = 0;
			goto out;
		}
		path->slots[0]--;
	}
	ret = 0;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
	    key.type != BTRFS_EXTENT_DATA_KEY) {
		/* not our file or wrong item type, must cow */
		goto out;
	}

	if (key.offset > offset) {
		/* Wrong offset, must cow */
		goto out;
	}

	if (btrfs_file_extent_end(path) <= offset)
		goto out;

	fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(leaf, fi);

	nocow_args.start = offset;
	nocow_args.end = offset + *len - 1;
	nocow_args.strict = strict;
	nocow_args.free_path = true;

	ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
	/* can_nocow_file_extent() has freed the path. */
	path = NULL;

	if (ret != 1) {
		/* Treat errors as not being able to NOCOW. */
		ret = 0;
		goto out;
	}

	ret = 0;
	if (btrfs_extent_readonly(fs_info,
				  nocow_args.file_extent.disk_bytenr +
				  nocow_args.file_extent.offset))
		goto out;

	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 range_end;

		range_end = round_up(offset + nocow_args.file_extent.num_bytes,
				     root->fs_info->sectorsize) - 1;
		ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC);
		if (ret) {
			ret = -EAGAIN;
			goto out;
		}
	}

	if (file_extent)
		memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent));

	*len = nocow_args.file_extent.num_bytes;
	ret = 1;
out:
	btrfs_free_path(path);
	return ret;
}

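/*
 * Usage sketch (illustrative, not part of the original source): a write path
 * checking whether it may skip COW for its range. @len is in/out, so on
 * success it is clamped to the nocow-able prefix:
 *
 *	u64 len = write_bytes;
 *	struct btrfs_file_extent fe;
 *
 *	ret = can_nocow_extent(inode, pos, &len, &fe, false, false);
 *	if (ret > 0) {
 *		// safe to write the first 'len' bytes in place, once any
 *		// ordered extents in the range have been waited on
 *	}
 */
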
/* The callers of this must take lock_extent() */
struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
				      const struct btrfs_file_extent *file_extent,
				      int type)
{
	struct extent_map *em;
	int ret;

	/*
	 * Note the missing NOCOW type.
	 *
	 * For pure NOCOW writes, we should not create an io extent map, but
	 * just reuse the existing one.
	 * Only PREALLOC writes (NOCOW write into preallocated range) can
	 * create an io extent map.
	 */
	ASSERT(type == BTRFS_ORDERED_PREALLOC ||
	       type == BTRFS_ORDERED_COMPRESSED ||
	       type == BTRFS_ORDERED_REGULAR);

	switch (type) {
	case BTRFS_ORDERED_PREALLOC:
		/* We're only referring to part of a larger preallocated extent. */
		ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
		break;
	case BTRFS_ORDERED_REGULAR:
		/* COW results in a new extent matching our file extent size. */
		ASSERT(file_extent->disk_num_bytes == file_extent->num_bytes);
		ASSERT(file_extent->ram_bytes == file_extent->num_bytes);

		/* Since it's a new extent, we should not have any offset. */
		ASSERT(file_extent->offset == 0);
		break;
	case BTRFS_ORDERED_COMPRESSED:
		/* Must be compressed. */
		ASSERT(file_extent->compression != BTRFS_COMPRESS_NONE);

		/*
		 * Encoded write can make us refer to part of the
		 * uncompressed extent.
		 */
		ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
		break;
	}

	em = alloc_extent_map();
	if (!em)
		return ERR_PTR(-ENOMEM);

	em->start = start;
	em->len = file_extent->num_bytes;
	em->disk_bytenr = file_extent->disk_bytenr;
	em->disk_num_bytes = file_extent->disk_num_bytes;
	em->ram_bytes = file_extent->ram_bytes;
	em->generation = -1;
	em->offset = file_extent->offset;
	em->flags |= EXTENT_FLAG_PINNED;
	if (type == BTRFS_ORDERED_COMPRESSED)
		extent_map_set_compression(em, file_extent->compression);

	ret = btrfs_replace_extent_map_range(inode, em, true);
	if (ret) {
		free_extent_map(em);
		return ERR_PTR(ret);
	}

	/* em got 2 refs now, callers need to call free_extent_map once. */
	return em;
}

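/*
 * Worked example (illustrative, not part of the original source): a plain
 * 16K COW write that allocated a fresh 16K extent passes
 * BTRFS_ORDERED_REGULAR with num_bytes == disk_num_bytes == ram_bytes == 16K
 * and offset == 0, satisfying the asserts above. A write of data compressed
 * from 64K down to 16K instead passes BTRFS_ORDERED_COMPRESSED with
 * ram_bytes == 64K, disk_num_bytes == 16K, and num_bytes <= ram_bytes (an
 * encoded write may use a non-zero offset to refer into the middle of the
 * uncompressed extent).
 */
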
/*
 * For release_folio() and invalidate_folio() we have a race window where
 * folio_end_writeback() is called but the subpage spinlock is not yet released.
 * If we continue to release/invalidate the page, we could cause use-after-free
 * for subpage spinlock. So this function is to spin and wait for subpage
 * spinlock.
 */
static void wait_subpage_spinlock(struct page *page)
{
	struct btrfs_fs_info *fs_info = page_to_fs_info(page);
	struct folio *folio = page_folio(page);
	struct btrfs_subpage *subpage;

	if (!btrfs_is_subpage(fs_info, page->mapping))
		return;

	ASSERT(folio_test_private(folio) && folio_get_private(folio));
	subpage = folio_get_private(folio);

	/*
	 * This may look insane as we just acquire the spinlock and release it,
	 * without doing anything. But we just want to make sure no one is
	 * still holding the subpage spinlock.
	 * And since the page is not dirty nor writeback, and we have page
	 * locked, the only possible way to hold a spinlock is from the endio
	 * function to clear page writeback.
	 *
	 * Here we just acquire the spinlock so that all existing callers
	 * should exit and we're safe to release/invalidate the page.
	 */
	spin_lock_irq(&subpage->lock);
	spin_unlock_irq(&subpage->lock);
}

static int btrfs_launder_folio(struct folio *folio)
{
	return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio),
				      PAGE_SIZE, NULL);
}

static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
{
	if (try_release_extent_mapping(&folio->page, gfp_flags)) {
		wait_subpage_spinlock(&folio->page);
		clear_page_extent_mapped(&folio->page);
		return true;
	}
	return false;
}

static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
{
	if (folio_test_writeback(folio) || folio_test_dirty(folio))
		return false;
	return __btrfs_release_folio(folio, gfp_flags);
}

#ifdef CONFIG_MIGRATION
static int btrfs_migrate_folio(struct address_space *mapping,
			       struct folio *dst, struct folio *src,
			       enum migrate_mode mode)
{
	int ret = filemap_migrate_folio(mapping, dst, src, mode);

	if (ret != MIGRATEPAGE_SUCCESS)
		return ret;

	if (folio_test_ordered(src)) {
		folio_clear_ordered(src);
		folio_set_ordered(dst);
	}

	return MIGRATEPAGE_SUCCESS;
}
#else
#define btrfs_migrate_folio NULL
#endif

static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
				   size_t length)
{
	struct btrfs_inode *inode = folio_to_inode(folio);
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_io_tree *tree = &inode->io_tree;
	struct extent_state *cached_state = NULL;
	u64 page_start = folio_pos(folio);
	u64 page_end = page_start + folio_size(folio) - 1;
	u64 cur;
	int inode_evicting = inode->vfs_inode.i_state & I_FREEING;

	/*
	 * We have folio locked so no new ordered extent can be created on this
	 * page, nor bio can be submitted for this folio.
	 *
	 * But already submitted bio can still be finished on this folio.
	 * Furthermore, endio function won't skip folio which has Ordered
	 * (Private2) already cleared, so it's possible for endio and
	 * invalidate_folio to do the same ordered extent accounting twice
	 * on one folio.
	 *
	 * So here we wait for any submitted bios to finish, so that we won't
	 * do double ordered extent accounting on the same folio.
	 */
	folio_wait_writeback(folio);
	wait_subpage_spinlock(&folio->page);

	/*
	 * For subpage case, we have call sites like
	 * btrfs_punch_hole_lock_range() which passes range not aligned to
	 * sectorsize.
	 * If the range doesn't cover the full folio, we don't need to and
	 * shouldn't clear page extent mapped, as folio->private can still
	 * record subpage dirty bits for other part of the range.
	 *
	 * For cases that invalidate the full folio even the range doesn't
	 * cover the full folio, like invalidating the last folio, we're
	 * still safe to wait for ordered extent to finish.
	 */
	if (!(offset == 0 && length == folio_size(folio))) {
		btrfs_release_folio(folio, GFP_NOFS);
		return;
	}

	if (!inode_evicting)
		lock_extent(tree, page_start, page_end, &cached_state);

	cur = page_start;
	while (cur < page_end) {
		struct btrfs_ordered_extent *ordered;
		u64 range_end;
		u32 range_len;
		u32 extra_flags = 0;

		ordered = btrfs_lookup_first_ordered_range(inode, cur,
							   page_end + 1 - cur);
		if (!ordered) {
			range_end = page_end;
			/*
			 * No ordered extent covering this range, we are safe
			 * to delete all extent states in the range.
			 */
			extra_flags = EXTENT_CLEAR_ALL_BITS;
			goto next;
		}
		if (ordered->file_offset > cur) {
			/*
			 * There is a range between [cur, oe->file_offset) not
			 * covered by any ordered extent.
			 * We are safe to delete all extent states, and handle
			 * the ordered extent in the next iteration.
			 */
			range_end = ordered->file_offset - 1;
			extra_flags = EXTENT_CLEAR_ALL_BITS;
			goto next;
		}

		range_end = min(ordered->file_offset + ordered->num_bytes - 1,
				page_end);
		ASSERT(range_end + 1 - cur < U32_MAX);
		range_len = range_end + 1 - cur;
		if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
			/*
			 * If Ordered (Private2) is cleared, it means endio has
			 * already been executed for the range.
			 * We can't delete the extent states as
			 * btrfs_finish_ordered_io() may still use some of them.
			 */
			goto next;
		}
		btrfs_folio_clear_ordered(fs_info, folio, cur, range_len);

		/*
		 * IO on this page will never be started, so we need to account
		 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
		 * here, must leave that up for the ordered extent completion.
		 *
		 * This will also unlock the range for incoming
		 * btrfs_finish_ordered_io().
		 */
		if (!inode_evicting)
			clear_extent_bit(tree, cur, range_end,
					 EXTENT_DELALLOC |
					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
					 EXTENT_DEFRAG, &cached_state);

		spin_lock_irq(&inode->ordered_tree_lock);
		set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
		ordered->truncated_len = min(ordered->truncated_len,
					     cur - ordered->file_offset);
		spin_unlock_irq(&inode->ordered_tree_lock);

		/*
		 * If the ordered extent has finished, we're safe to delete all
		 * the extent states of the range, otherwise
		 * btrfs_finish_ordered_io() will get executed by endio for
		 * other pages, so we can't delete extent states.
		 */
		if (btrfs_dec_test_ordered_pending(inode, &ordered,
						   cur, range_end + 1 - cur)) {
			btrfs_finish_ordered_io(ordered);
			/*
			 * The ordered extent has finished, now we're again
			 * safe to delete all extent states of the range.
			 */
			extra_flags = EXTENT_CLEAR_ALL_BITS;
		}
next:
		if (ordered)
			btrfs_put_ordered_extent(ordered);
		/*
		 * Qgroup reserved space handler
		 * Sector(s) here will be either:
		 *
		 * 1) Already written to disk or bio already finished
		 *    Then its QGROUP_RESERVED bit in io_tree is already cleared.
		 *    Qgroup will be handled by its qgroup_record then.
		 *    btrfs_qgroup_free_data() call will do nothing here.
		 *
		 * 2) Not written to disk yet
		 *    Then btrfs_qgroup_free_data() call will clear the
		 *    QGROUP_RESERVED bit of its io_tree, and free the qgroup
		 *    reserved data space.
		 *    Since the IO will never happen for this page.
		 */
		btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
		if (!inode_evicting) {
			clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
					 EXTENT_DELALLOC | EXTENT_UPTODATE |
					 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
					 extra_flags, &cached_state);
		}
		cur = range_end + 1;
	}
	/*
	 * We have iterated through all ordered extents of the page, the page
	 * should not have Ordered (Private2) anymore, or the above iteration
	 * did something wrong.
	 */
	ASSERT(!folio_test_ordered(folio));
	btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
	if (!inode_evicting)
		__btrfs_release_folio(folio, GFP_NOFS);
	clear_page_extent_mapped(&folio->page);
}

static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
{
	struct btrfs_truncate_control control = {
		.inode = inode,
		.ino = btrfs_ino(inode),
		.min_type = BTRFS_EXTENT_DATA_KEY,
		.clear_extent_range = true,
	};
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *rsv;
	int ret;
	struct btrfs_trans_handle *trans;
	u64 mask = fs_info->sectorsize - 1;
	const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);

	if (!skip_writeback) {
		ret = btrfs_wait_ordered_range(inode,
					       inode->vfs_inode.i_size & (~mask),
					       (u64)-1);
		if (ret)
			return ret;
	}

	/*
	 * Yes ladies and gentlemen, this is indeed ugly. We have a couple of
	 * things going on here:
	 *
	 * 1) We need to reserve space to update our inode.
	 *
	 * 2) We need to have something to cache all the space that is going to
	 * be free'd up by the truncate operation, but also have some slack
	 * space reserved in case it uses space during the truncate (thank you
	 * very much snapshotting).
	 *
	 * And we need these to be separate. The fact is we can use a lot of
	 * space doing the truncate, and we have no earthly idea how much space
	 * we will use, so we need the truncate reservation to be separate so it
	 * doesn't end up using space reserved for updating the inode. We also
	 * need to be able to stop the transaction and start a new one, which
	 * means we need to be able to update the inode several times, and we
	 * have no idea of knowing how many times that will be, so we can't just
	 * reserve 1 item for the entirety of the operation, so that has to be
	 * done separately as well.
	 *
	 * So that leaves us with
	 *
	 * 1) rsv - for the truncate reservation, which we will steal from the
	 * transaction reservation.
	 * 2) fs_info->trans_block_rsv - this will have 1 item's worth left for
	 * updating the inode.
	 */
	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
	if (!rsv)
		return -ENOMEM;
	rsv->size = min_size;
	rsv->failfast = true;

	/*
	 * 1 for the truncate slack space
	 * 1 for updating the inode.
	 */
	trans = btrfs_start_transaction(root, 2);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out;
	}

	/* Migrate the slack space for the truncate to our reserve */
	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
				      min_size, false);
	/*
	 * We have reserved 2 metadata units when we started the transaction and
	 * min_size matches 1 unit, so this should never fail, but if it does,
	 * it's not critical, we just fail truncation.
	 */
	if (WARN_ON(ret)) {
		btrfs_end_transaction(trans);
		goto out;
	}

	trans->block_rsv = rsv;

	while (1) {
		struct extent_state *cached_state = NULL;
		const u64 new_size = inode->vfs_inode.i_size;
		const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);

		control.new_size = new_size;
		lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
		/*
		 * We want to drop from the next block forward in case this new
		 * size is not block aligned since we will be keeping the last
		 * block of the extent just the way it is.
		 */
		btrfs_drop_extent_map_range(inode,
					    ALIGN(new_size, fs_info->sectorsize),
					    (u64)-1, false);

		ret = btrfs_truncate_inode_items(trans, root, &control);

		inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
		btrfs_inode_safe_disk_i_size_write(inode, control.last_size);

		unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);

		trans->block_rsv = &fs_info->trans_block_rsv;
		if (ret != -ENOSPC && ret != -EAGAIN)
			break;

		ret = btrfs_update_inode(trans, inode);
		if (ret)
			break;

		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);

		trans = btrfs_start_transaction(root, 2);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			break;
		}

		btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
					      rsv, min_size, false);
		/*
		 * We have reserved 2 metadata units when we started the
		 * transaction and min_size matches 1 unit, so this should never
		 * fail, but if it does, it's not critical, we just fail
		 * truncation.
		 */
		if (WARN_ON(ret))
			break;

		trans->block_rsv = rsv;
	}

	/*
	 * We can't call btrfs_truncate_block inside a trans handle as we could
	 * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we
	 * know we've truncated everything except the last little bit, and can
	 * do btrfs_truncate_block and then update the disk_i_size.
	 */
	if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);

		ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0);
		if (ret)
			goto out;
		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			goto out;
		}
		btrfs_inode_safe_disk_i_size_write(inode, 0);
	}

	if (trans) {
		int ret2;

		trans->block_rsv = &fs_info->trans_block_rsv;
		ret2 = btrfs_update_inode(trans, inode);
		if (ret2 && !ret)
			ret = ret2;

		ret2 = btrfs_end_transaction(trans);
		if (ret2 && !ret)
			ret = ret2;
		btrfs_btree_balance_dirty(fs_info);
	}
out:
	btrfs_free_block_rsv(fs_info, rsv);
	/*
	 * So if we truncate and then write and fsync we normally would just
	 * write the extents that changed, which is a problem if we need to
	 * first truncate that entire inode. So set this flag so we write out
	 * all of the extents in the inode to the sync log so we're completely
	 * safe.
	 *
	 * If no extents were dropped or trimmed we don't need to force the next
	 * fsync to truncate all the inode's items from the log and re-log them
	 * all. This means the truncate operation did not change the file size,
	 * or changed it to a smaller size but there was only an implicit hole
	 * between the old i_size and the new i_size, and there were no prealloc
	 * extents beyond i_size to drop.
	 */
	if (control.extents_found > 0)
		btrfs_set_inode_full_sync(inode);

	return ret;
}

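/*
 * Worked example (illustrative, not part of the original source): the
 * reservation dance above starts each transaction with 2 metadata units
 * (2 * btrfs_calc_metadata_size(fs_info, 1)). One unit is immediately
 * migrated into the local rsv as slack for btrfs_truncate_inode_items();
 * the one left in fs_info->trans_block_rsv pays for the final
 * btrfs_update_inode(). That is why the WARN_ON(ret) paths treat a failed
 * migrate as "should never happen".
 */
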
struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap,
				     struct inode *dir)
{
	struct inode *inode;

	inode = new_inode(dir->i_sb);
	if (inode) {
		/*
		 * Subvolumes don't inherit the sgid bit or the parent's gid if
		 * the parent's sgid bit is set. This is probably a bug.
		 */
		inode_init_owner(idmap, inode, NULL,
				 S_IFDIR | (~current_umask() & S_IRWXUGO));
		inode->i_op = &btrfs_dir_inode_operations;
		inode->i_fop = &btrfs_dir_file_operations;
	}
	return inode;
}

struct inode *btrfs_alloc_inode(struct super_block *sb)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
	struct btrfs_inode *ei;
	struct inode *inode;

	ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
	if (!ei)
		return NULL;

	ei->root = NULL;
	ei->generation = 0;
	ei->last_trans = 0;
	ei->last_sub_trans = 0;
	ei->logged_trans = 0;
	ei->delalloc_bytes = 0;
	ei->new_delalloc_bytes = 0;
	ei->defrag_bytes = 0;
	ei->disk_i_size = 0;
	ei->flags = 0;
	ei->ro_flags = 0;
	/*
	 * ->index_cnt will be properly initialized later when creating a new
	 * inode (btrfs_create_new_inode()) or when reading an existing inode
	 * from disk (btrfs_read_locked_inode()).
	 */
	ei->csum_bytes = 0;
	ei->dir_index = 0;
	ei->last_unlink_trans = 0;
	ei->last_reflink_trans = 0;
	ei->last_log_commit = 0;

	spin_lock_init(&ei->lock);
	ei->outstanding_extents = 0;
	if (sb->s_magic != BTRFS_TEST_MAGIC)
		btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
					      BTRFS_BLOCK_RSV_DELALLOC);
	ei->runtime_flags = 0;
	ei->prop_compress = BTRFS_COMPRESS_NONE;
	ei->defrag_compress = BTRFS_COMPRESS_NONE;

	ei->delayed_node = NULL;

	ei->i_otime_sec = 0;
	ei->i_otime_nsec = 0;

	inode = &ei->vfs_inode;
	extent_map_tree_init(&ei->extent_tree);

	/* This io tree sets the valid inode. */
	extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
	ei->io_tree.inode = ei;

	ei->file_extent_tree = NULL;

	mutex_init(&ei->log_mutex);
	spin_lock_init(&ei->ordered_tree_lock);
	ei->ordered_tree = RB_ROOT;
	ei->ordered_tree_last = NULL;
	INIT_LIST_HEAD(&ei->delalloc_inodes);
	INIT_LIST_HEAD(&ei->delayed_iput);
	init_rwsem(&ei->i_mmap_lock);

	return inode;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_destroy_inode(struct inode *inode)
{
	btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
	kfree(BTRFS_I(inode)->file_extent_tree);
	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
#endif

void btrfs_free_inode(struct inode *inode)
{
	kfree(BTRFS_I(inode)->file_extent_tree);
	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}

void btrfs_destroy_inode(struct inode *vfs_inode)
{
	struct btrfs_ordered_extent *ordered;
	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
	struct btrfs_root *root = inode->root;
	bool freespace_inode;

	WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
	WARN_ON(vfs_inode->i_data.nrpages);
	WARN_ON(inode->block_rsv.reserved);
	WARN_ON(inode->block_rsv.size);
	WARN_ON(inode->outstanding_extents);
	if (!S_ISDIR(vfs_inode->i_mode)) {
		WARN_ON(inode->delalloc_bytes);
		WARN_ON(inode->new_delalloc_bytes);
		WARN_ON(inode->csum_bytes);
	}
	if (!root || !btrfs_is_data_reloc_root(root))
		WARN_ON(inode->defrag_bytes);

	/*
	 * This can happen where we create an inode, but somebody else also
	 * created the same inode and we need to destroy the one we already
	 * created.
	 */
	if (!root)
		return;

	/*
	 * If this is a free space inode do not take the ordered extents
	 * lockdep map.
	 */
	freespace_inode = btrfs_is_free_space_inode(inode);

	while (1) {
		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
		if (!ordered)
			break;
		else {
			btrfs_err(root->fs_info,
				  "found ordered extent %llu %llu on inode cleanup",
				  ordered->file_offset, ordered->num_bytes);

			if (!freespace_inode)
				btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);

			btrfs_remove_ordered_extent(inode, ordered);
			/* Once for the lookup reference, once for the tree. */
			btrfs_put_ordered_extent(ordered);
			btrfs_put_ordered_extent(ordered);
		}
	}
	btrfs_qgroup_check_reserved_leak(inode);
	btrfs_del_inode_from_root(inode);
	btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
	btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
	btrfs_put_root(inode->root);
}

int btrfs_drop_inode(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;

	if (root == NULL)
		return 1;

	/* The snapshot/subvolume tree is being deleted. */
	if (btrfs_root_refs(&root->root_item) == 0)
		return 1;
	else
		return generic_drop_inode(inode);
}

static void init_once(void *foo)
{
	struct btrfs_inode *ei = foo;

	inode_init_once(&ei->vfs_inode);
}

void __cold btrfs_destroy_cachep(void)
{
	/*
	 * Make sure all delayed rcu free inodes are flushed before we
	 * destroy the cache.
	 */
	rcu_barrier();
	kmem_cache_destroy(btrfs_inode_cachep);
}

int __init btrfs_init_cachep(void)
{
	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
					       sizeof(struct btrfs_inode), 0,
					       SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
					       init_once);
	if (!btrfs_inode_cachep)
		return -ENOMEM;

	return 0;
}

static int btrfs_getattr(struct mnt_idmap *idmap,
			 const struct path *path, struct kstat *stat,
			 u32 request_mask, unsigned int flags)
{
	u64 delalloc_bytes;
	u64 inode_bytes;
	struct inode *inode = d_inode(path->dentry);
	u32 blocksize = btrfs_sb(inode->i_sb)->sectorsize;
	u32 bi_flags = BTRFS_I(inode)->flags;
	u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;

	stat->result_mask |= STATX_BTIME;
	stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec;
	stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec;
	if (bi_flags & BTRFS_INODE_APPEND)
		stat->attributes |= STATX_ATTR_APPEND;
	if (bi_flags & BTRFS_INODE_COMPRESS)
		stat->attributes |= STATX_ATTR_COMPRESSED;
	if (bi_flags & BTRFS_INODE_IMMUTABLE)
		stat->attributes |= STATX_ATTR_IMMUTABLE;
	if (bi_flags & BTRFS_INODE_NODUMP)
		stat->attributes |= STATX_ATTR_NODUMP;
	if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
		stat->attributes |= STATX_ATTR_VERITY;

	stat->attributes_mask |= (STATX_ATTR_APPEND |
				  STATX_ATTR_COMPRESSED |
				  STATX_ATTR_IMMUTABLE |
				  STATX_ATTR_NODUMP);

	generic_fillattr(idmap, request_mask, inode, stat);
	stat->dev = BTRFS_I(inode)->root->anon_dev;

	stat->subvol = BTRFS_I(inode)->root->root_key.objectid;
	stat->result_mask |= STATX_SUBVOL;

	spin_lock(&BTRFS_I(inode)->lock);
	delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
	inode_bytes = inode_get_bytes(inode);
	spin_unlock(&BTRFS_I(inode)->lock);
	stat->blocks = (ALIGN(inode_bytes, blocksize) +
			ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT;
	return 0;
}

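/*
 * Illustrative sketch (not part of the code above): how btrfs_getattr()
 * arrives at stat->blocks. Both the on-disk byte count and the
 * not-yet-flushed delalloc byte count are rounded up to the sector size
 * before being converted to 512-byte units, so a 1-byte buffered write on a
 * 4K-sectorsize filesystem already accounts for 8 blocks. The helper name
 * and the numbers in the comments are made up for the example.
 */
static inline u64 example_getattr_blocks(u64 inode_bytes, u64 delalloc_bytes,
					 u32 blocksize)
{
	/* e.g. inode_bytes == 8192, delalloc_bytes == 1, blocksize == 4096 */
	return (ALIGN(inode_bytes, blocksize) +
		ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT;
	/* (8192 + 4096) >> 9 == 24 sectors of 512 bytes */
}
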
static int btrfs_rename_exchange(struct inode *old_dir,
				 struct dentry *old_dentry,
				 struct inode *new_dir,
				 struct dentry *new_dentry)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
	struct btrfs_trans_handle *trans;
	unsigned int trans_num_items;
	struct btrfs_root *root = BTRFS_I(old_dir)->root;
	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
	struct inode *new_inode = new_dentry->d_inode;
	struct inode *old_inode = old_dentry->d_inode;
	struct btrfs_rename_ctx old_rename_ctx;
	struct btrfs_rename_ctx new_rename_ctx;
	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
	u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
	u64 old_idx = 0;
	u64 new_idx = 0;
	int ret;
	int ret2;
	bool need_abort = false;
	struct fscrypt_name old_fname, new_fname;
	struct fscrypt_str *old_name, *new_name;

	/*
	 * For non-subvolumes allow exchange only within one subvolume, in the
	 * same inode namespace. Two subvolumes (represented as directories)
	 * can be exchanged as they're a logical link and have a fixed inode
	 * number.
	 */
	if (root != dest &&
	    (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
	     new_ino != BTRFS_FIRST_FREE_OBJECTID))
		return -EXDEV;

	ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
	if (ret)
		return ret;

	ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
	if (ret) {
		fscrypt_free_filename(&old_fname);
		return ret;
	}

	old_name = &old_fname.disk_name;
	new_name = &new_fname.disk_name;

	/* Close the race window with snapshot create/destroy ioctl. */
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
	    new_ino == BTRFS_FIRST_FREE_OBJECTID)
		down_read(&fs_info->subvol_sem);

	/*
	 * For each inode:
	 * 1 to remove old dir item
	 * 1 to remove old dir index
	 * 1 to add new dir item
	 * 1 to add new dir index
	 * 1 to update parent inode
	 *
	 * If the parents are the same, we only need to account for one parent
	 * inode update.
	 */
	trans_num_items = (old_dir == new_dir ? 9 : 10);
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
		/*
		 * 1 to remove old root ref
		 * 1 to remove old root backref
		 * 1 to add new root ref
		 * 1 to add new root backref
		 */
		trans_num_items += 4;
	} else {
		/*
		 * 1 to update inode item
		 * 1 to remove old inode ref
		 * 1 to add new inode ref
		 */
		trans_num_items += 3;
	}
	if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
		trans_num_items += 4;
	else
		trans_num_items += 3;
	trans = btrfs_start_transaction(root, trans_num_items);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_notrans;
	}

	if (dest != root) {
		ret = btrfs_record_root_in_trans(trans, dest);
		if (ret)
			goto out_fail;
	}

	/*
	 * We need to find a free sequence number both in the source and
	 * in the destination directory for the exchange.
	 */
	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
	if (ret)
		goto out_fail;
	ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
	if (ret)
		goto out_fail;

	BTRFS_I(old_inode)->dir_index = 0ULL;
	BTRFS_I(new_inode)->dir_index = 0ULL;

	/* Reference for the source. */
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
		/* Force a full log commit if a subvolume is involved. */
		btrfs_set_log_full_commit(trans);
	} else {
		ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino,
					     btrfs_ino(BTRFS_I(new_dir)),
					     old_idx);
		if (ret)
			goto out_fail;
		need_abort = true;
	}

	/* And now for the dest. */
	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
		/* Force a full log commit if a subvolume is involved. */
		btrfs_set_log_full_commit(trans);
	} else {
		ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino,
					     btrfs_ino(BTRFS_I(old_dir)),
					     new_idx);
		if (ret) {
			if (need_abort)
				btrfs_abort_transaction(trans, ret);
			goto out_fail;
		}
	}

	/* Update inode version and ctime/mtime. */
	inode_inc_iversion(old_dir);
	inode_inc_iversion(new_dir);
	inode_inc_iversion(old_inode);
	inode_inc_iversion(new_inode);
	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);

	if (old_dentry->d_parent != new_dentry->d_parent) {
		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
					BTRFS_I(old_inode), true);
		btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
					BTRFS_I(new_inode), true);
	}

	/* src is a subvolume */
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
		ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
	} else { /* src is an inode */
		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
					   BTRFS_I(old_dentry->d_inode),
					   old_name, &old_rename_ctx);
		if (!ret)
			ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
	}
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	/* dest is a subvolume */
	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
		ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
	} else { /* dest is an inode */
		ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
					   BTRFS_I(new_dentry->d_inode),
					   new_name, &new_rename_ctx);
		if (!ret)
			ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
	}
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
			     new_name, 0, old_idx);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
			     old_name, 0, new_idx);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	if (old_inode->i_nlink == 1)
		BTRFS_I(old_inode)->dir_index = old_idx;
	if (new_inode->i_nlink == 1)
		BTRFS_I(new_inode)->dir_index = new_idx;

	/*
	 * Now pin the logs of the roots. We do it to ensure that no other task
	 * can sync the logs while we are in progress with the rename, because
	 * that could result in an inconsistency in case any of the inodes that
	 * are part of this rename operation were logged before.
	 */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_pin_log_trans(root);
	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_pin_log_trans(dest);

	/* Do the log updates for all inodes. */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
				   old_rename_ctx.index, new_dentry->d_parent);
	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
				   new_rename_ctx.index, old_dentry->d_parent);

	/* Now unpin the logs. */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_end_log_trans(root);
	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_end_log_trans(dest);
out_fail:
	ret2 = btrfs_end_transaction(trans);
	ret = ret ? ret : ret2;
out_notrans:
	if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
	    old_ino == BTRFS_FIRST_FREE_OBJECTID)
		up_read(&fs_info->subvol_sem);

	fscrypt_free_filename(&new_fname);
	fscrypt_free_filename(&old_fname);
	return ret;
}

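/*
 * Worked example for the reservation math in btrfs_rename_exchange() above
 * (an illustration only, never called): exchanging two regular files that
 * share one parent directory reserves 9 base items (dir item + dir index
 * removed and re-added for both names, plus a single parent inode update)
 * and 3 more per inode (inode item update, old inode ref removed, new
 * inode ref added).
 */
static inline unsigned int example_exchange_items_same_dir(void)
{
	unsigned int items = 9;		/* dir items/indexes + shared parent */

	items += 3;			/* source inode */
	items += 3;			/* destination inode */
	return items;			/* 15 metadata items */
}
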
static struct inode *new_whiteout_inode(struct mnt_idmap *idmap,
					struct inode *dir)
{
	struct inode *inode;

	inode = new_inode(dir->i_sb);
	if (inode) {
		inode_init_owner(idmap, inode, dir,
				 S_IFCHR | WHITEOUT_MODE);
		inode->i_op = &btrfs_special_inode_operations;
		init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
	}
	return inode;
}

static int btrfs_rename(struct mnt_idmap *idmap,
			struct inode *old_dir, struct dentry *old_dentry,
			struct inode *new_dir, struct dentry *new_dentry,
			unsigned int flags)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
	struct btrfs_new_inode_args whiteout_args = {
		.dir = old_dir,
		.dentry = old_dentry,
	};
	struct btrfs_trans_handle *trans;
	unsigned int trans_num_items;
	struct btrfs_root *root = BTRFS_I(old_dir)->root;
	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
	struct inode *new_inode = d_inode(new_dentry);
	struct inode *old_inode = d_inode(old_dentry);
	struct btrfs_rename_ctx rename_ctx;
	u64 index = 0;
	int ret;
	int ret2;
	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
	struct fscrypt_name old_fname, new_fname;

	if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
		return -EPERM;

	/* We only allow renaming a subvolume link between subvolumes. */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
		return -EXDEV;

	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
	    (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
		return -ENOTEMPTY;

	if (S_ISDIR(old_inode->i_mode) && new_inode &&
	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
		return -ENOTEMPTY;

	ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
	if (ret)
		return ret;

	ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
	if (ret) {
		fscrypt_free_filename(&old_fname);
		return ret;
	}

	/* Check for collisions, even if the name isn't there. */
	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name);
	if (ret) {
		if (ret == -EEXIST) {
			/* We shouldn't get -EEXIST without a new_inode. */
			if (WARN_ON(!new_inode))
				goto out_fscrypt_names;
		} else {
			/* Maybe -EOVERFLOW. */
			goto out_fscrypt_names;
		}
	}
	ret = 0;

	/*
	 * We're using rename to replace one file with another. Start IO on it
	 * now so we don't add too much work to the end of the transaction.
	 */
	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
		filemap_flush(old_inode->i_mapping);

	if (flags & RENAME_WHITEOUT) {
		whiteout_args.inode = new_whiteout_inode(idmap, old_dir);
		if (!whiteout_args.inode) {
			ret = -ENOMEM;
			goto out_fscrypt_names;
		}
		ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
		if (ret)
			goto out_whiteout_inode;
	} else {
		/* 1 to update the old parent inode. */
		trans_num_items = 1;
	}

	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
		/* Close the race window with snapshot create/destroy ioctl. */
		down_read(&fs_info->subvol_sem);
		/*
		 * 1 to remove old root ref
		 * 1 to remove old root backref
		 * 1 to add new root ref
		 * 1 to add new root backref
		 */
		trans_num_items += 4;
	} else {
		/*
		 * 1 to update inode
		 * 1 to remove old inode ref
		 * 1 to add new inode ref
		 */
		trans_num_items += 3;
	}
	/*
	 * 1 to remove old dir item
	 * 1 to remove old dir index
	 * 1 to add new dir item
	 * 1 to add new dir index
	 */
	trans_num_items += 4;
	/* 1 to update the new parent inode if it's not the same as the old one. */
	if (new_dir != old_dir)
		trans_num_items++;
	if (new_inode) {
		/*
		 * 1 to update inode
		 * 1 to remove inode ref
		 * 1 to remove dir item
		 * 1 to remove dir index
		 * 1 to possibly add orphan item
		 */
		trans_num_items += 5;
	}
	trans = btrfs_start_transaction(root, trans_num_items);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_notrans;
	}

	if (dest != root) {
		ret = btrfs_record_root_in_trans(trans, dest);
		if (ret)
			goto out_fail;
	}

	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
	if (ret)
		goto out_fail;

	BTRFS_I(old_inode)->dir_index = 0ULL;
	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
		/* Force a full log commit if a subvolume is involved. */
		btrfs_set_log_full_commit(trans);
	} else {
		ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name,
					     old_ino, btrfs_ino(BTRFS_I(new_dir)),
					     index);
		if (ret)
			goto out_fail;
	}

	inode_inc_iversion(old_dir);
	inode_inc_iversion(new_dir);
	inode_inc_iversion(old_inode);
	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);

	if (old_dentry->d_parent != new_dentry->d_parent)
		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
					BTRFS_I(old_inode), true);

	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
		ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
	} else {
		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
					   BTRFS_I(d_inode(old_dentry)),
					   &old_fname.disk_name, &rename_ctx);
		if (!ret)
			ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
	}
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	if (new_inode) {
		inode_inc_iversion(new_inode);
		if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
			ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
			BUG_ON(new_inode->i_nlink == 0);
		} else {
			ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
						 BTRFS_I(d_inode(new_dentry)),
						 &new_fname.disk_name);
		}
		if (!ret && new_inode->i_nlink == 0)
			ret = btrfs_orphan_add(trans,
					       BTRFS_I(d_inode(new_dentry)));
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out_fail;
		}
	}

	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
			     &new_fname.disk_name, 0, index);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	if (old_inode->i_nlink == 1)
		BTRFS_I(old_inode)->dir_index = index;

	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
				   rename_ctx.index, new_dentry->d_parent);

	if (flags & RENAME_WHITEOUT) {
		ret = btrfs_create_new_inode(trans, &whiteout_args);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out_fail;
		} else {
			unlock_new_inode(whiteout_args.inode);
			iput(whiteout_args.inode);
			whiteout_args.inode = NULL;
		}
	}
out_fail:
	ret2 = btrfs_end_transaction(trans);
	ret = ret ? ret : ret2;
out_notrans:
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
		up_read(&fs_info->subvol_sem);
	if (flags & RENAME_WHITEOUT)
		btrfs_new_inode_args_destroy(&whiteout_args);
out_whiteout_inode:
	if (flags & RENAME_WHITEOUT)
		iput(whiteout_args.inode);
out_fscrypt_names:
	fscrypt_free_filename(&old_fname);
	fscrypt_free_filename(&new_fname);
	return ret;
}

static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir,
			 struct dentry *old_dentry, struct inode *new_dir,
			 struct dentry *new_dentry, unsigned int flags)
{
	int ret;

	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
		return -EINVAL;

	if (flags & RENAME_EXCHANGE)
		ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
					    new_dentry);
	else
		ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir,
				   new_dentry, flags);

	btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);

	return ret;
}

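/*
 * Userspace view of the entry points above (a hedged sketch, not kernel
 * code): RENAME_EXCHANGE atomically swaps two existing names and
 * RENAME_WHITEOUT leaves a whiteout behind, both via renameat2(2). Assumes
 * glibc >= 2.28 for the wrapper and flag definitions; the paths are
 * hypothetical.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	int example_exchange(void)
 *	{
 *		// Atomically swap "a" and "b"; both names must exist.
 *		return renameat2(AT_FDCWD, "a", AT_FDCWD, "b",
 *				 RENAME_EXCHANGE);
 *	}
 */
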
struct btrfs_delalloc_work {
	struct inode *inode;
	struct completion completion;
	struct list_head list;
	struct btrfs_work work;
};

static void btrfs_run_delalloc_work(struct btrfs_work *work)
{
	struct btrfs_delalloc_work *delalloc_work;
	struct inode *inode;

	delalloc_work = container_of(work, struct btrfs_delalloc_work,
				     work);
	inode = delalloc_work->inode;
	filemap_flush(inode->i_mapping);
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
		     &BTRFS_I(inode)->runtime_flags))
		filemap_flush(inode->i_mapping);

	iput(inode);
	complete(&delalloc_work->completion);
}

static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
{
	struct btrfs_delalloc_work *work;

	work = kmalloc(sizeof(*work), GFP_NOFS);
	if (!work)
		return NULL;

	init_completion(&work->completion);
	INIT_LIST_HEAD(&work->list);
	work->inode = inode;
	btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL);

	return work;
}

/*
 * Some fairly slow code that needs optimization. This walks the list
 * of all the inodes with pending delalloc and forces them to disk.
 */
static int start_delalloc_inodes(struct btrfs_root *root,
				 struct writeback_control *wbc, bool snapshot,
				 bool in_reclaim_context)
{
	struct btrfs_inode *binode;
	struct inode *inode;
	struct btrfs_delalloc_work *work, *next;
	LIST_HEAD(works);
	LIST_HEAD(splice);
	int ret = 0;
	bool full_flush = wbc->nr_to_write == LONG_MAX;

	mutex_lock(&root->delalloc_mutex);
	spin_lock(&root->delalloc_lock);
	list_splice_init(&root->delalloc_inodes, &splice);
	while (!list_empty(&splice)) {
		binode = list_entry(splice.next, struct btrfs_inode,
				    delalloc_inodes);

		list_move_tail(&binode->delalloc_inodes,
			       &root->delalloc_inodes);

		if (in_reclaim_context &&
		    test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
			continue;

		inode = igrab(&binode->vfs_inode);
		if (!inode) {
			cond_resched_lock(&root->delalloc_lock);
			continue;
		}
		spin_unlock(&root->delalloc_lock);

		if (snapshot)
			set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
				&binode->runtime_flags);
		if (full_flush) {
			work = btrfs_alloc_delalloc_work(inode);
			if (!work) {
				iput(inode);
				ret = -ENOMEM;
				goto out;
			}
			list_add_tail(&work->list, &works);
			btrfs_queue_work(root->fs_info->flush_workers,
					 &work->work);
		} else {
			ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
			btrfs_add_delayed_iput(BTRFS_I(inode));
			if (ret || wbc->nr_to_write <= 0)
				goto out;
		}
		cond_resched();
		spin_lock(&root->delalloc_lock);
	}
	spin_unlock(&root->delalloc_lock);

out:
	list_for_each_entry_safe(work, next, &works, list) {
		list_del_init(&work->list);
		wait_for_completion(&work->completion);
		kfree(work);
	}

	if (!list_empty(&splice)) {
		spin_lock(&root->delalloc_lock);
		list_splice_tail(&splice, &root->delalloc_inodes);
		spin_unlock(&root->delalloc_lock);
	}
	mutex_unlock(&root->delalloc_mutex);
	return ret;
}

int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
{
	struct writeback_control wbc = {
		.nr_to_write = LONG_MAX,
		.sync_mode = WB_SYNC_NONE,
		.range_start = 0,
		.range_end = LLONG_MAX,
	};
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (BTRFS_FS_ERROR(fs_info))
		return -EROFS;

	return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
}

int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
			       bool in_reclaim_context)
{
	struct writeback_control wbc = {
		.nr_to_write = nr,
		.sync_mode = WB_SYNC_NONE,
		.range_start = 0,
		.range_end = LLONG_MAX,
	};
	struct btrfs_root *root;
	LIST_HEAD(splice);
	int ret;

	if (BTRFS_FS_ERROR(fs_info))
		return -EROFS;

	mutex_lock(&fs_info->delalloc_root_mutex);
	spin_lock(&fs_info->delalloc_root_lock);
	list_splice_init(&fs_info->delalloc_roots, &splice);
	while (!list_empty(&splice)) {
		/*
		 * Reset nr_to_write here so we know that we're doing a full
		 * flush.
		 */
		if (nr == LONG_MAX)
			wbc.nr_to_write = LONG_MAX;

		root = list_first_entry(&splice, struct btrfs_root,
					delalloc_root);
		root = btrfs_grab_root(root);
		BUG_ON(!root);
		list_move_tail(&root->delalloc_root,
			       &fs_info->delalloc_roots);
		spin_unlock(&fs_info->delalloc_root_lock);

		ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
		btrfs_put_root(root);
		if (ret < 0 || wbc.nr_to_write <= 0)
			goto out;
		spin_lock(&fs_info->delalloc_root_lock);
	}
	spin_unlock(&fs_info->delalloc_root_lock);

	ret = 0;
out:
	if (!list_empty(&splice)) {
		spin_lock(&fs_info->delalloc_root_lock);
		list_splice_tail(&splice, &fs_info->delalloc_roots);
		spin_unlock(&fs_info->delalloc_root_lock);
	}
	mutex_unlock(&fs_info->delalloc_root_mutex);
	return ret;
}

static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
			 struct dentry *dentry, const char *symname)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct inode *inode;
	struct btrfs_new_inode_args new_inode_args = {
		.dir = dir,
		.dentry = dentry,
	};
	unsigned int trans_num_items;
	int err;
	int name_len;
	int datasize;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	struct extent_buffer *leaf;

	name_len = strlen(symname);
	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
		return -ENAMETOOLONG;

	inode = new_inode(dir->i_sb);
	if (!inode)
		return -ENOMEM;
	inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO);
	inode->i_op = &btrfs_symlink_inode_operations;
	inode_nohighmem(inode);
	inode->i_mapping->a_ops = &btrfs_aops;
	btrfs_i_size_write(BTRFS_I(inode), name_len);
	inode_set_bytes(inode, name_len);

	new_inode_args.inode = inode;
	err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
	if (err)
		goto out_inode;
	/* 1 additional item for the inline extent */
	trans_num_items++;

	trans = btrfs_start_transaction(root, trans_num_items);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_new_inode_args;
	}

	err = btrfs_create_new_inode(trans, &new_inode_args);
	if (err)
		goto out;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		btrfs_abort_transaction(trans, err);
		discard_new_inode(inode);
		inode = NULL;
		goto out;
	}
	key.objectid = btrfs_ino(BTRFS_I(inode));
	key.offset = 0;
	key.type = BTRFS_EXTENT_DATA_KEY;
	datasize = btrfs_file_extent_calc_inline_size(name_len);
	err = btrfs_insert_empty_item(trans, root, path, &key,
				      datasize);
	if (err) {
		btrfs_abort_transaction(trans, err);
		btrfs_free_path(path);
		discard_new_inode(inode);
		inode = NULL;
		goto out;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei,
				   BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_compression(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);

	ptr = btrfs_file_extent_inline_start(ei);
	write_extent_buffer(leaf, symname, ptr, name_len);
	btrfs_mark_buffer_dirty(trans, leaf);
	btrfs_free_path(path);

	d_instantiate_new(dentry, inode);
	err = 0;
out:
	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(fs_info);
out_new_inode_args:
	btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
	if (err)
		iput(inode);
	return err;
}

static struct btrfs_trans_handle *insert_prealloc_file_extent(
				  struct btrfs_trans_handle *trans_in,
				  struct btrfs_inode *inode,
				  struct btrfs_key *ins,
				  u64 file_offset)
{
	struct btrfs_file_extent_item stack_fi;
	struct btrfs_replace_extent_info extent_info;
	struct btrfs_trans_handle *trans = trans_in;
	struct btrfs_path *path;
	u64 start = ins->objectid;
	u64 len = ins->offset;
	u64 qgroup_released = 0;
	int ret;

	memset(&stack_fi, 0, sizeof(stack_fi));

	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
	btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
	btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
	/* Encryption and other encoding is reserved and all 0. */

	ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released);
	if (ret < 0)
		return ERR_PTR(ret);

	if (trans) {
		ret = insert_reserved_file_extent(trans, inode,
						  file_offset, &stack_fi,
						  true, qgroup_released);
		if (ret)
			goto free_qgroup;
		return trans;
	}

	extent_info.disk_offset = start;
	extent_info.disk_len = len;
	extent_info.data_offset = 0;
	extent_info.data_len = len;
	extent_info.file_offset = file_offset;
	extent_info.extent_buf = (char *)&stack_fi;
	extent_info.is_new_extent = true;
	extent_info.update_times = true;
	extent_info.qgroup_reserved = qgroup_released;
	extent_info.insertions = 0;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto free_qgroup;
	}

	ret = btrfs_replace_file_extents(inode, path, file_offset,
					 file_offset + len - 1, &extent_info,
					 &trans);
	btrfs_free_path(path);
	if (ret)
		goto free_qgroup;
	return trans;

free_qgroup:
	/*
	 * We have released the qgroup data range at the beginning of the
	 * function, and normally qgroup_released bytes will be freed when
	 * committing the transaction.
	 * But if we error out early, we have to free what we have released,
	 * or we leak the qgroup data reservation.
	 */
	btrfs_qgroup_free_refroot(inode->root->fs_info,
				  btrfs_root_id(inode->root), qgroup_released,
				  BTRFS_QGROUP_RSV_DATA);
	return ERR_PTR(ret);
}

static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
				       u64 start, u64 num_bytes, u64 min_size,
				       loff_t actual_len, u64 *alloc_hint,
				       struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_key ins;
	u64 cur_offset = start;
	u64 clear_offset = start;
	u64 i_size;
	u64 cur_bytes;
	u64 last_alloc = (u64)-1;
	int ret = 0;
	bool own_trans = true;
	u64 end = start + num_bytes - 1;

	if (trans)
		own_trans = false;
	while (num_bytes > 0) {
		cur_bytes = min_t(u64, num_bytes, SZ_256M);
		cur_bytes = max(cur_bytes, min_size);
		/*
		 * If we are severely fragmented we could end up with really
		 * small allocations, so if the allocator is returning small
		 * chunks let's make its job easier by only searching for
		 * those sized chunks.
		 */
		cur_bytes = min(cur_bytes, last_alloc);
		ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
					   min_size, 0, *alloc_hint, &ins, 1, 0);
		if (ret)
			break;

		/*
		 * We've reserved this space, and thus converted it from
		 * ->bytes_may_use to ->bytes_reserved. Any error that happens
		 * from here on out we will only need to clear our reservation
		 * for the remaining unreserved area, so advance our
		 * clear_offset by our extent size.
		 */
		clear_offset += ins.offset;

		last_alloc = ins.offset;
		trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
						    &ins, cur_offset);
		/*
		 * Now that we inserted the prealloc extent we can finally
		 * decrement the number of reservations in the block group.
		 * If we did it before, we could race with relocation and have
		 * relocation miss the reserved extent, making it fail later.
		 */
		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			btrfs_free_reserved_extent(fs_info, ins.objectid,
						   ins.offset, 0);
			break;
		}

		em = alloc_extent_map();
		if (!em) {
			btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
						    cur_offset + ins.offset - 1, false);
			btrfs_set_inode_full_sync(BTRFS_I(inode));
			goto next;
		}

		em->start = cur_offset;
		em->len = ins.offset;
		em->disk_bytenr = ins.objectid;
		em->offset = 0;
		em->disk_num_bytes = ins.offset;
		em->ram_bytes = ins.offset;
		em->flags |= EXTENT_FLAG_PREALLOC;
		em->generation = trans->transid;

		ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
		free_extent_map(em);
next:
		num_bytes -= ins.offset;
		cur_offset += ins.offset;
		*alloc_hint = ins.objectid + ins.offset;

		inode_inc_iversion(inode);
		inode_set_ctime_current(inode);
		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
		    (actual_len > inode->i_size) &&
		    (cur_offset > inode->i_size)) {
			if (cur_offset > actual_len)
				i_size = actual_len;
			else
				i_size = cur_offset;
			i_size_write(inode, i_size);
			btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
		}

		ret = btrfs_update_inode(trans, BTRFS_I(inode));

		if (ret) {
			btrfs_abort_transaction(trans, ret);
			if (own_trans)
				btrfs_end_transaction(trans);
			break;
		}

		if (own_trans) {
			btrfs_end_transaction(trans);
			trans = NULL;
		}
	}
	if (clear_offset < end)
		btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
					       end - clear_offset + 1);
	return ret;
}

int btrfs_prealloc_file_range(struct inode *inode, int mode,
			      u64 start, u64 num_bytes, u64 min_size,
			      loff_t actual_len, u64 *alloc_hint)
{
	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
					   min_size, actual_len, alloc_hint,
					   NULL);
}

int btrfs_prealloc_file_range_trans(struct inode *inode,
				    struct btrfs_trans_handle *trans, int mode,
				    u64 start, u64 num_bytes, u64 min_size,
				    loff_t actual_len, u64 *alloc_hint)
{
	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
					   min_size, actual_len, alloc_hint, trans);
}

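/*
 * Userspace counterpart of the preallocation helpers above (an illustrative
 * sketch, not kernel code): fallocate(2) with FALLOC_FL_KEEP_SIZE reaches
 * btrfs_prealloc_file_range() and creates PREALLOC extents without changing
 * i_size. The fd and length are made up.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *
 *	int example_prealloc(int fd)
 *	{
 *		// Reserve 16 MiB at offset 0 without extending i_size.
 *		return fallocate(fd, FALLOC_FL_KEEP_SIZE, 0,
 *				 16 * 1024 * 1024);
 *	}
 */
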
static int btrfs_permission(struct mnt_idmap *idmap,
			    struct inode *inode, int mask)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	umode_t mode = inode->i_mode;

	if (mask & MAY_WRITE &&
	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
		if (btrfs_root_readonly(root))
			return -EROFS;
		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
			return -EACCES;
	}
	return generic_permission(idmap, inode, mask);
}

static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
			 struct file *file, umode_t mode)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct inode *inode;
	struct btrfs_new_inode_args new_inode_args = {
		.dir = dir,
		.dentry = file->f_path.dentry,
		.orphan = true,
	};
	unsigned int trans_num_items;
	int ret;

	inode = new_inode(dir->i_sb);
	if (!inode)
		return -ENOMEM;
	inode_init_owner(idmap, inode, dir, mode);
	inode->i_fop = &btrfs_file_operations;
	inode->i_op = &btrfs_file_inode_operations;
	inode->i_mapping->a_ops = &btrfs_aops;

	new_inode_args.inode = inode;
	ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
	if (ret)
		goto out_inode;

	trans = btrfs_start_transaction(root, trans_num_items);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_new_inode_args;
	}

	ret = btrfs_create_new_inode(trans, &new_inode_args);

	/*
	 * We set the number of links to 0 in btrfs_create_new_inode(), and
	 * here we set it to 1 because d_tmpfile() will issue a warning if the
	 * count is 0, through:
	 *
	 * d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
	 */
	set_nlink(inode, 1);

	if (!ret) {
		d_tmpfile(file, inode);
		unlock_new_inode(inode);
		mark_inode_dirty(inode);
	}

	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(fs_info);
out_new_inode_args:
	btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
	if (ret)
		iput(inode);
	return finish_open_simple(file, ret);
}

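/*
 * Userspace path into btrfs_tmpfile() above (a sketch with error handling
 * trimmed, not kernel code): O_TMPFILE creates an unnamed file that starts
 * life on the orphan list, and linkat(2) can later give it a name. The
 * mount point and file name are hypothetical.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int example_tmpfile(void)
 *	{
 *		char path[64];
 *		int fd = open("/mnt/btrfs", O_TMPFILE | O_RDWR, 0600);
 *
 *		if (fd < 0)
 *			return -1;
 *		// Optionally materialize the inode with a real name.
 *		snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
 *		linkat(AT_FDCWD, path, AT_FDCWD, "/mnt/btrfs/file",
 *		       AT_SYMLINK_FOLLOW);
 *		return fd;
 *	}
 */
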
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;
	struct page *page;
	u32 len;

	ASSERT(end + 1 - start <= U32_MAX);
	len = end + 1 - start;
	while (index <= end_index) {
		page = find_get_page(inode->vfs_inode.i_mapping, index);
		ASSERT(page); /* Pages should be in the extent_io_tree */

		/* This is for data, which doesn't yet support larger folios. */
		ASSERT(folio_order(page_folio(page)) == 0);
		btrfs_folio_set_writeback(fs_info, page_folio(page), start, len);
		put_page(page);
		index++;
	}
}

int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
					     int compress_type)
{
	switch (compress_type) {
	case BTRFS_COMPRESS_NONE:
		return BTRFS_ENCODED_IO_COMPRESSION_NONE;
	case BTRFS_COMPRESS_ZLIB:
		return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
	case BTRFS_COMPRESS_LZO:
		/*
		 * The LZO format depends on the sector size. 64K is the
		 * maximum sector size that we support.
		 */
		if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
			return -EINVAL;
		return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
		       (fs_info->sectorsize_bits - 12);
	case BTRFS_COMPRESS_ZSTD:
		return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
	default:
		return -EUCLEAN;
	}
}

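/*
 * Worked example for the LZO branch above (illustrative): the returned
 * value encodes the sector size into the compression type. With 4K sectors,
 * sectorsize_bits is 12, so the result is
 * BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 0; with 64K sectors (bits == 16) it
 * is BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 4, i.e. the 64K variant.
 */
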
static ssize_t btrfs_encoded_read_inline(
				struct kiocb *iocb,
				struct iov_iter *iter, u64 start,
				u64 lockend,
				struct extent_state **cached_state,
				u64 extent_start, size_t count,
				struct btrfs_ioctl_encoded_io_args *encoded,
				bool *unlocked)
{
	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_io_tree *io_tree = &inode->io_tree;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *item;
	u64 ram_bytes;
	unsigned long ptr;
	void *tmp;
	ssize_t ret;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}
	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
				       extent_start, 0);
	if (ret) {
		if (ret > 0) {
			/* The extent item disappeared? */
			ret = -EIO;
		}
		goto out;
	}
	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);

	ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
	ptr = btrfs_file_extent_inline_start(item);

	encoded->len = min_t(u64, extent_start + ram_bytes,
			     inode->vfs_inode.i_size) - iocb->ki_pos;
	ret = btrfs_encoded_io_compression_from_extent(fs_info,
				btrfs_file_extent_compression(leaf, item));
	if (ret < 0)
		goto out;
	encoded->compression = ret;
	if (encoded->compression) {
		size_t inline_size;

		inline_size = btrfs_file_extent_inline_item_len(leaf,
								path->slots[0]);
		if (inline_size > count) {
			ret = -ENOBUFS;
			goto out;
		}
		count = inline_size;
		encoded->unencoded_len = ram_bytes;
		encoded->unencoded_offset = iocb->ki_pos - extent_start;
	} else {
		count = min_t(u64, count, encoded->len);
		encoded->len = count;
		encoded->unencoded_len = count;
		ptr += iocb->ki_pos - extent_start;
	}

	tmp = kmalloc(count, GFP_NOFS);
	if (!tmp) {
		ret = -ENOMEM;
		goto out;
	}
	read_extent_buffer(leaf, tmp, ptr, count);
	btrfs_release_path(path);
	unlock_extent(io_tree, start, lockend, cached_state);
	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
	*unlocked = true;

	ret = copy_to_iter(tmp, count, iter);
	if (ret != count)
		ret = -EFAULT;
	kfree(tmp);
out:
	btrfs_free_path(path);
	return ret;
}

9082
9083struct btrfs_encoded_read_private {
1881fba8
OS
9084 wait_queue_head_t wait;
9085 atomic_t pending;
9086 blk_status_t status;
1881fba8
OS
9087};
9088
917f32a2 9089static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
1881fba8 9090{
917f32a2 9091 struct btrfs_encoded_read_private *priv = bbio->private;
1881fba8 9092
7609afac 9093 if (bbio->bio.bi_status) {
1881fba8
OS
9094 /*
9095 * The memory barrier implied by the atomic_dec_return() here
9096 * pairs with the memory barrier implied by the
9097 * atomic_dec_return() or io_wait_event() in
9098 * btrfs_encoded_read_regular_fill_pages() to ensure that this
9099 * write is observed before the load of status in
9100 * btrfs_encoded_read_regular_fill_pages().
9101 */
7609afac 9102 WRITE_ONCE(priv->status, bbio->bio.bi_status);
1881fba8
OS
9103 }
9104 if (!atomic_dec_return(&priv->pending))
9105 wake_up(&priv->wait);
917f32a2 9106 bio_put(&bbio->bio);
1881fba8
OS
9107}
9108
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
					  u64 file_offset, u64 disk_bytenr,
					  u64 disk_io_size, struct page **pages)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_encoded_read_private priv = {
		.pending = ATOMIC_INIT(1),
	};
	unsigned long i = 0;
	struct btrfs_bio *bbio;

	init_waitqueue_head(&priv.wait);

	bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
			       btrfs_encoded_read_endio, &priv);
	bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
	bbio->inode = inode;

	do {
		size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);

		if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
			atomic_inc(&priv.pending);
			btrfs_submit_bio(bbio, 0);

			bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
					       btrfs_encoded_read_endio, &priv);
			bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
			bbio->inode = inode;
			continue;
		}

		i++;
		disk_bytenr += bytes;
		disk_io_size -= bytes;
	} while (disk_io_size);

	atomic_inc(&priv.pending);
	btrfs_submit_bio(bbio, 0);

	if (atomic_dec_return(&priv.pending))
		io_wait_event(priv.wait, !atomic_read(&priv.pending));
	/* See btrfs_encoded_read_endio() for ordering. */
	return blk_status_to_errno(READ_ONCE(priv.status));
}

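/*
 * Generic shape of the completion counting used above (a standalone sketch
 * under made-up names, not part of the driver): the pending counter is
 * biased to 1 so that only the submitter's final decrement can observe the
 * drop to zero, even if every bio completes before submission finishes.
 */
static inline bool example_submitter_must_wait(atomic_t *pending)
{
	/*
	 * Drop the submitter's bias. A non-zero result means completions
	 * are still outstanding and the caller has to wait; a zero result
	 * means every btrfs_encoded_read_endio() already ran.
	 */
	return atomic_dec_return(pending) != 0;
}
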
static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
					  struct iov_iter *iter,
					  u64 start, u64 lockend,
					  struct extent_state **cached_state,
					  u64 disk_bytenr, u64 disk_io_size,
					  size_t count, bool compressed,
					  bool *unlocked)
{
	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
	struct extent_io_tree *io_tree = &inode->io_tree;
	struct page **pages;
	unsigned long nr_pages, i;
	u64 cur;
	size_t page_offset;
	ssize_t ret;

	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
	if (!pages)
		return -ENOMEM;
	ret = btrfs_alloc_page_array(nr_pages, pages, false);
	if (ret) {
		ret = -ENOMEM;
		goto out;
	}

	ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
						    disk_io_size, pages);
	if (ret)
		goto out;

	unlock_extent(io_tree, start, lockend, cached_state);
	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
	*unlocked = true;

	if (compressed) {
		i = 0;
		page_offset = 0;
	} else {
		i = (iocb->ki_pos - start) >> PAGE_SHIFT;
		page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
	}
	cur = 0;
	while (cur < count) {
		size_t bytes = min_t(size_t, count - cur,
				     PAGE_SIZE - page_offset);

		if (copy_page_to_iter(pages[i], page_offset, bytes,
				      iter) != bytes) {
			ret = -EFAULT;
			goto out;
		}
		i++;
		cur += bytes;
		page_offset = 0;
	}
	ret = count;
out:
	for (i = 0; i < nr_pages; i++) {
		if (pages[i])
			__free_page(pages[i]);
	}
	kfree(pages);
	return ret;
}

9221ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
9222 struct btrfs_ioctl_encoded_io_args *encoded)
9223{
9224 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9225 struct btrfs_fs_info *fs_info = inode->root->fs_info;
9226 struct extent_io_tree *io_tree = &inode->io_tree;
9227 ssize_t ret;
9228 size_t count = iov_iter_count(iter);
9229 u64 start, lockend, disk_bytenr, disk_io_size;
9230 struct extent_state *cached_state = NULL;
9231 struct extent_map *em;
9232 bool unlocked = false;
9233
9234 file_accessed(iocb->ki_filp);
9235
29b6352b 9236 btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
1881fba8
OS
9237
9238 if (iocb->ki_pos >= inode->vfs_inode.i_size) {
e5d4d75b 9239 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
1881fba8
OS
9240 return 0;
9241 }
9242 start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
9243 /*
9244 * We don't know how long the extent containing iocb->ki_pos is, but if
9245 * it's compressed we know that it won't be longer than this.
9246 */
9247 lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
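/*
 * Example (not in the original): BTRFS_MAX_UNCOMPRESSED is 128KiB, so for
 * ki_pos == 1MiB the locked range is [1MiB, 1MiB + 128KiB - 1]. An
 * uncompressed extent may well extend past lockend, which is why the
 * non-compressed branch below clamps disk_io_size to the locked range.
 */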
9248
9249 for (;;) {
9250 struct btrfs_ordered_extent *ordered;
9251
e641e323 9252 ret = btrfs_wait_ordered_range(inode, start,
9253 lockend - start + 1);
9254 if (ret)
9255 goto out_unlock_inode;
570eb97b 9256 lock_extent(io_tree, start, lockend, &cached_state);
9257 ordered = btrfs_lookup_ordered_range(inode, start,
9258 lockend - start + 1);
9259 if (!ordered)
9260 break;
9261 btrfs_put_ordered_extent(ordered);
570eb97b 9262 unlock_extent(io_tree, start, lockend, &cached_state);
9263 cond_resched();
9264 }
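/*
 * Note on the loop above (explanatory, not in the original): this is the
 * usual btrfs drain-and-lock pattern. Waiting for ordered extents cannot be
 * done while holding the extent lock, since ordered extent completion needs
 * to lock the range too. So we wait unlocked, take the lock, and re-check;
 * if an ordered extent raced in, unlock and retry until the range is
 * quiescent under the lock.
 */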
9265
8bab0a30 9266 em = btrfs_get_extent(inode, NULL, start, lockend - start + 1);
9267 if (IS_ERR(em)) {
9268 ret = PTR_ERR(em);
9269 goto out_unlock_extent;
9270 }
9271
c77a8c61 9272 if (em->disk_bytenr == EXTENT_MAP_INLINE) {
9273 u64 extent_start = em->start;
9274
9275 /*
9276 * For inline extents we get everything we need out of the
9277 * extent item.
9278 */
9279 free_extent_map(em);
9280 em = NULL;
9281 ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
9282 &cached_state, extent_start,
9283 count, encoded, &unlocked);
9284 goto out;
9285 }
9286
9287 /*
9288 * We only want to return up to EOF even if the extent extends beyond
9289 * that.
9290 */
9291 encoded->len = min_t(u64, extent_map_end(em),
9292 inode->vfs_inode.i_size) - iocb->ki_pos;
c77a8c61 9293 if (em->disk_bytenr == EXTENT_MAP_HOLE ||
f86f7a75 9294 (em->flags & EXTENT_FLAG_PREALLOC)) {
9295 disk_bytenr = EXTENT_MAP_HOLE;
9296 count = min_t(u64, count, encoded->len);
9297 encoded->len = count;
9298 encoded->unencoded_len = count;
f86f7a75 9299 } else if (extent_map_is_compressed(em)) {
c77a8c61 9300 disk_bytenr = em->disk_bytenr;
9301 /*
9302 * Bail if the buffer isn't large enough to return the whole
9303 * compressed extent.
9304 */
e28b851e 9305 if (em->disk_num_bytes > count) {
9306 ret = -ENOBUFS;
9307 goto out_em;
9308 }
9309 disk_io_size = em->disk_num_bytes;
9310 count = em->disk_num_bytes;
1881fba8 9311 encoded->unencoded_len = em->ram_bytes;
4aa7b5d1 9312 encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset);
1881fba8 9313 ret = btrfs_encoded_io_compression_from_extent(fs_info,
f86f7a75 9314 extent_map_compression(em));
9315 if (ret < 0)
9316 goto out_em;
9317 encoded->compression = ret;
9318 } else {
c77a8c61 9319 disk_bytenr = extent_map_block_start(em) + (start - em->start);
9320 if (encoded->len > count)
9321 encoded->len = count;
9322 /*
9323 * Don't read beyond what we locked. This also limits the page
9324 * allocations that we'll do.
9325 */
9326 disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
9327 count = start + disk_io_size - iocb->ki_pos;
9328 encoded->len = count;
9329 encoded->unencoded_len = count;
9330 disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
9331 }
9332 free_extent_map(em);
9333 em = NULL;
9334
9335 if (disk_bytenr == EXTENT_MAP_HOLE) {
570eb97b 9336 unlock_extent(io_tree, start, lockend, &cached_state);
e5d4d75b 9337 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9338 unlocked = true;
9339 ret = iov_iter_zero(count, iter);
9340 if (ret != count)
9341 ret = -EFAULT;
9342 } else {
9343 ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
9344 &cached_state, disk_bytenr,
9345 disk_io_size, count,
9346 encoded->compression,
9347 &unlocked);
9348 }
9349
9350out:
9351 if (ret >= 0)
9352 iocb->ki_pos += encoded->len;
9353out_em:
9354 free_extent_map(em);
9355out_unlock_extent:
9356 if (!unlocked)
570eb97b 9357 unlock_extent(io_tree, start, lockend, &cached_state);
9358out_unlock_inode:
9359 if (!unlocked)
e5d4d75b 9360 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9361 return ret;
9362}
9363
9364ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
9365 const struct btrfs_ioctl_encoded_io_args *encoded)
9366{
9367 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9368 struct btrfs_root *root = inode->root;
9369 struct btrfs_fs_info *fs_info = root->fs_info;
9370 struct extent_io_tree *io_tree = &inode->io_tree;
9371 struct extent_changeset *data_reserved = NULL;
9372 struct extent_state *cached_state = NULL;
d611935b 9373 struct btrfs_ordered_extent *ordered;
3d2ac992 9374 struct btrfs_file_extent file_extent;
9375 int compression;
9376 size_t orig_count;
9377 u64 start, end;
9378 u64 num_bytes, ram_bytes, disk_num_bytes;
9379 unsigned long nr_folios, i;
9380 struct folio **folios;
9381 struct btrfs_key ins;
9382 bool extent_reserved = false;
9383 struct extent_map *em;
9384 ssize_t ret;
9385
9386 switch (encoded->compression) {
9387 case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
9388 compression = BTRFS_COMPRESS_ZLIB;
9389 break;
9390 case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
9391 compression = BTRFS_COMPRESS_ZSTD;
9392 break;
9393 case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
9394 case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
9395 case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
9396 case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
9397 case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
9398 /* The sector size must match for LZO. */
9399 if (encoded->compression -
9400 BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
9401 fs_info->sectorsize_bits)
9402 return -EINVAL;
9403 compression = BTRFS_COMPRESS_LZO;
9404 break;
9405 default:
9406 return -EINVAL;
9407 }
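/*
 * Worked example for the LZO check above (explanatory, not in the
 * original): the LZO_4K..LZO_64K values are consecutive, so
 * "encoded->compression - BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12" maps
 * them to 12..16, i.e. log2 of the 4K..64K block size the data was
 * compressed with. On a filesystem with 4K sectors (sectorsize_bits == 12)
 * only BTRFS_ENCODED_IO_COMPRESSION_LZO_4K passes.
 */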
9408 if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
9409 return -EINVAL;
9410
9411 /*
9412 * Compressed extents should always have checksums, so error out if we
 9413 	 * have a NOCOW file or the inode was created while mounted with NODATASUM.
9414 */
9415 if (inode->flags & BTRFS_INODE_NODATASUM)
9416 return -EINVAL;
9417
9418 orig_count = iov_iter_count(from);
9419
9420 /* The extent size must be sane. */
9421 if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
9422 orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
9423 return -EINVAL;
9424
9425 /*
9426 * The compressed data must be smaller than the decompressed data.
9427 *
9428 * It's of course possible for data to compress to larger or the same
9429 * size, but the buffered I/O path falls back to no compression for such
9430 * data, and we don't want to break any assumptions by creating these
9431 * extents.
9432 *
9433 * Note that this is less strict than the current check we have that the
9434 * compressed data must be at least one sector smaller than the
9435 * decompressed data. We only want to enforce the weaker requirement
9436 * from old kernels that it is at least one byte smaller.
9437 */
9438 if (orig_count >= encoded->unencoded_len)
9439 return -EINVAL;
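/*
 * Example (not in the original): a write with unencoded_len == 16KiB must
 * supply at most 16KiB - 1 bytes of compressed payload; a payload of
 * exactly 16KiB is rejected here even though the on-disk format could
 * represent it.
 */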
9440
9441 /* The extent must start on a sector boundary. */
9442 start = iocb->ki_pos;
9443 if (!IS_ALIGNED(start, fs_info->sectorsize))
9444 return -EINVAL;
9445
9446 /*
9447 * The extent must end on a sector boundary. However, we allow a write
9448 * which ends at or extends i_size to have an unaligned length; we round
9449 * up the extent size and set i_size to the unaligned end.
9450 */
9451 if (start + encoded->len < inode->vfs_inode.i_size &&
9452 !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
9453 return -EINVAL;
9454
9455 /* Finally, the offset in the unencoded data must be sector-aligned. */
9456 if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
9457 return -EINVAL;
9458
9459 num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
9460 ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
9461 end = start + num_bytes - 1;
9462
9463 /*
9464 * If the extent cannot be inline, the compressed data on disk must be
9465 * sector-aligned. For convenience, we extend it with zeroes if it
9466 * isn't.
9467 */
9468 disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
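/*
 * Example (not in the original): with 4K sectors, orig_count == 5000 gives
 * disk_num_bytes == 8192, and the copy loop below zero-fills the trailing
 * 3192 bytes of the second folio so the compressed data written out stays
 * sector aligned.
 */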
9469 nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
 9470 	folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT);
9471 if (!folios)
7c0c7269 9472 return -ENOMEM;
400b172b 9473 for (i = 0; i < nr_folios; i++) {
9474 size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
9475 char *kaddr;
9476
9477 folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0);
9478 if (!folios[i]) {
7c0c7269 9479 ret = -ENOMEM;
400b172b 9480 goto out_folios;
7c0c7269 9481 }
400b172b 9482 kaddr = kmap_local_folio(folios[i], 0);
7c0c7269 9483 if (copy_from_iter(kaddr, bytes, from) != bytes) {
70826b6b 9484 kunmap_local(kaddr);
7c0c7269 9485 ret = -EFAULT;
400b172b 9486 goto out_folios;
9487 }
9488 if (bytes < PAGE_SIZE)
9489 memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
70826b6b 9490 kunmap_local(kaddr);
9491 }
9492
9493 for (;;) {
9494 struct btrfs_ordered_extent *ordered;
9495
e641e323 9496 ret = btrfs_wait_ordered_range(inode, start, num_bytes);
7c0c7269 9497 if (ret)
400b172b 9498 goto out_folios;
9499 ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
9500 start >> PAGE_SHIFT,
9501 end >> PAGE_SHIFT);
9502 if (ret)
400b172b 9503 goto out_folios;
570eb97b 9504 lock_extent(io_tree, start, end, &cached_state);
9505 ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
9506 if (!ordered &&
9507 !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
9508 break;
9509 if (ordered)
9510 btrfs_put_ordered_extent(ordered);
570eb97b 9511 unlock_extent(io_tree, start, end, &cached_state);
9512 cond_resched();
9513 }
9514
9515 /*
9516 * We don't use the higher-level delalloc space functions because our
9517 * num_bytes and disk_num_bytes are different.
9518 */
9519 ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
9520 if (ret)
9521 goto out_unlock;
9522 ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
9523 if (ret)
9524 goto out_free_data_space;
9525 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
9526 false);
9527 if (ret)
9528 goto out_qgroup_free_data;
9529
9530 /* Try an inline extent first. */
6eecfa22 9531 if (encoded->unencoded_len == encoded->len &&
9532 encoded->unencoded_offset == 0 &&
9533 can_cow_file_range_inline(inode, start, encoded->len, orig_count)) {
9534 ret = __cow_file_range_inline(inode, start, encoded->len,
9535 orig_count, compression, folios[0],
9536 true);
9537 if (ret <= 0) {
9538 if (ret == 0)
9539 ret = orig_count;
9540 goto out_delalloc_release;
9541 }
9542 }
9543
9544 ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
9545 disk_num_bytes, 0, 0, &ins, 1, 1);
9546 if (ret)
9547 goto out_delalloc_release;
9548 extent_reserved = true;
9549
9550 file_extent.disk_bytenr = ins.objectid;
9551 file_extent.disk_num_bytes = ins.offset;
9552 file_extent.num_bytes = num_bytes;
9553 file_extent.ram_bytes = ram_bytes;
9554 file_extent.offset = encoded->unencoded_offset;
9555 file_extent.compression = compression;
9aa29a20 9556 em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
9557 if (IS_ERR(em)) {
9558 ret = PTR_ERR(em);
9559 goto out_free_reserved;
9560 }
9561 free_extent_map(em);
9562
e9ea31fb 9563 ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
7c0c7269 9564 (1 << BTRFS_ORDERED_ENCODED) |
e9ea31fb 9565 (1 << BTRFS_ORDERED_COMPRESSED));
d611935b 9566 if (IS_ERR(ordered)) {
4c0c8cfc 9567 btrfs_drop_extent_map_range(inode, start, end, false);
d611935b 9568 ret = PTR_ERR(ordered);
9569 goto out_free_reserved;
9570 }
9571 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9572
9573 if (start + encoded->len > inode->vfs_inode.i_size)
9574 i_size_write(&inode->vfs_inode, start + encoded->len);
9575
570eb97b 9576 unlock_extent(io_tree, start, end, &cached_state);
9577
9578 btrfs_delalloc_release_extents(inode, num_bytes);
9579
400b172b 9580 btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false);
9581 ret = orig_count;
9582 goto out;
9583
9584out_free_reserved:
9585 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9586 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
9587out_delalloc_release:
9588 btrfs_delalloc_release_extents(inode, num_bytes);
9589 btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
9590out_qgroup_free_data:
9591 if (ret < 0)
9e65bfca 9592 btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL);
9593out_free_data_space:
9594 /*
9595 * If btrfs_reserve_extent() succeeded, then we already decremented
9596 * bytes_may_use.
9597 */
9598 if (!extent_reserved)
9599 btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
9600out_unlock:
570eb97b 9601 unlock_extent(io_tree, start, end, &cached_state);
9602out_folios:
9603 for (i = 0; i < nr_folios; i++) {
9604 if (folios[i])
da0386c1 9605 folio_put(folios[i]);
7c0c7269 9606 }
400b172b 9607 kvfree(folios);
9608out:
9609 if (ret >= 0)
9610 iocb->ki_pos += encoded->len;
9611 return ret;
9612}
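/*
 * Hedged usage sketch (userspace, not part of inode.c; field names assumed
 * to match struct btrfs_ioctl_encoded_io_args in
 * include/uapi/linux/btrfs.h): submitting one pre-compressed extent through
 * the encoded-write ioctl that ends up in the function above:
 *
 *	struct iovec iov = { .iov_base = zstd_buf, .iov_len = zstd_len };
 *	struct btrfs_ioctl_encoded_io_args args = {
 *		.iov = &iov,
 *		.iovcnt = 1,
 *		.offset = file_offset,		// must be sector aligned
 *		.len = uncompressed_len,
 *		.unencoded_len = uncompressed_len,
 *		.unencoded_offset = 0,
 *		.compression = BTRFS_ENCODED_IO_COMPRESSION_ZSTD,
 *	};
 *	ssize_t ret = ioctl(fd, BTRFS_IOC_ENCODED_WRITE, &args);
 */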
9613
9614#ifdef CONFIG_SWAP
9615/*
9616 * Add an entry indicating a block group or device which is pinned by a
9617 * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
9618 * negative errno on failure.
9619 */
9620static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
9621 bool is_block_group)
9622{
9623 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
9624 struct btrfs_swapfile_pin *sp, *entry;
9625 struct rb_node **p;
9626 struct rb_node *parent = NULL;
9627
9628 sp = kmalloc(sizeof(*sp), GFP_NOFS);
9629 if (!sp)
9630 return -ENOMEM;
9631 sp->ptr = ptr;
9632 sp->inode = inode;
9633 sp->is_block_group = is_block_group;
195a49ea 9634 sp->bg_extent_count = 1;
9635
9636 spin_lock(&fs_info->swapfile_pins_lock);
9637 p = &fs_info->swapfile_pins.rb_node;
9638 while (*p) {
9639 parent = *p;
9640 entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
9641 if (sp->ptr < entry->ptr ||
9642 (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
9643 p = &(*p)->rb_left;
9644 } else if (sp->ptr > entry->ptr ||
9645 (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
9646 p = &(*p)->rb_right;
9647 } else {
9648 if (is_block_group)
9649 entry->bg_extent_count++;
9650 spin_unlock(&fs_info->swapfile_pins_lock);
9651 kfree(sp);
9652 return 1;
9653 }
9654 }
9655 rb_link_node(&sp->node, parent, p);
9656 rb_insert_color(&sp->node, &fs_info->swapfile_pins);
9657 spin_unlock(&fs_info->swapfile_pins_lock);
9658 return 0;
9659}
9660
9661/* Free all of the entries pinned by this swapfile. */
9662static void btrfs_free_swapfile_pins(struct inode *inode)
9663{
9664 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
9665 struct btrfs_swapfile_pin *sp;
9666 struct rb_node *node, *next;
9667
9668 spin_lock(&fs_info->swapfile_pins_lock);
9669 node = rb_first(&fs_info->swapfile_pins);
9670 while (node) {
9671 next = rb_next(node);
9672 sp = rb_entry(node, struct btrfs_swapfile_pin, node);
9673 if (sp->inode == inode) {
9674 rb_erase(&sp->node, &fs_info->swapfile_pins);
9675 if (sp->is_block_group) {
9676 btrfs_dec_block_group_swap_extents(sp->ptr,
9677 sp->bg_extent_count);
ed46ff3d 9678 btrfs_put_block_group(sp->ptr);
195a49ea 9679 }
9680 kfree(sp);
9681 }
9682 node = next;
9683 }
9684 spin_unlock(&fs_info->swapfile_pins_lock);
9685}
9686
9687struct btrfs_swap_info {
9688 u64 start;
9689 u64 block_start;
9690 u64 block_len;
9691 u64 lowest_ppage;
9692 u64 highest_ppage;
9693 unsigned long nr_pages;
9694 int nr_extents;
9695};
9696
9697static int btrfs_add_swap_extent(struct swap_info_struct *sis,
9698 struct btrfs_swap_info *bsi)
9699{
9700 unsigned long nr_pages;
c2f82263 9701 unsigned long max_pages;
9702 u64 first_ppage, first_ppage_reported, next_ppage;
9703 int ret;
9704
9705 /*
9706 * Our swapfile may have had its size extended after the swap header was
9707 * written. In that case activating the swapfile should not go beyond
9708 * the max size set in the swap header.
9709 */
9710 if (bsi->nr_pages >= sis->max)
9711 return 0;
9712
9713 max_pages = sis->max - bsi->nr_pages;
9714 first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT;
9715 next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT;
9716
9717 if (first_ppage >= next_ppage)
9718 return 0;
9719 nr_pages = next_ppage - first_ppage;
c2f82263 9720 nr_pages = min(nr_pages, max_pages);
9721
9722 first_ppage_reported = first_ppage;
9723 if (bsi->start == 0)
9724 first_ppage_reported++;
9725 if (bsi->lowest_ppage > first_ppage_reported)
9726 bsi->lowest_ppage = first_ppage_reported;
9727 if (bsi->highest_ppage < (next_ppage - 1))
9728 bsi->highest_ppage = next_ppage - 1;
9729
9730 ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
9731 if (ret < 0)
9732 return ret;
9733 bsi->nr_extents += ret;
9734 bsi->nr_pages += nr_pages;
9735 return 0;
9736}
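/*
 * Worked example (4K pages, not in the original): block_start == 0x101200
 * and block_len == 0x3000 give
 * first_ppage == PAGE_ALIGN(0x101200) >> 12 == 0x102 and
 * next_ppage == PAGE_ALIGN_DOWN(0x104200) >> 12 == 0x104, so only the two
 * whole pages inside the physical byte range become swap pages; the
 * unaligned head and tail are skipped.
 */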
9737
9738static void btrfs_swap_deactivate(struct file *file)
9739{
9740 struct inode *inode = file_inode(file);
9741
9742 btrfs_free_swapfile_pins(inode);
9743 atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
9744}
9745
9746static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
9747 sector_t *span)
9748{
9749 struct inode *inode = file_inode(file);
9750 struct btrfs_root *root = BTRFS_I(inode)->root;
9751 struct btrfs_fs_info *fs_info = root->fs_info;
9752 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
9753 struct extent_state *cached_state = NULL;
9754 struct extent_map *em = NULL;
7dc66abb 9755 struct btrfs_chunk_map *map = NULL;
9756 struct btrfs_device *device = NULL;
9757 struct btrfs_swap_info bsi = {
9758 .lowest_ppage = (sector_t)-1ULL,
9759 };
9760 int ret = 0;
9761 u64 isize;
9762 u64 start;
9763
9764 /*
9765 * If the swap file was just created, make sure delalloc is done. If the
9766 * file changes again after this, the user is doing something stupid and
9767 * we don't really care.
9768 */
e641e323 9769 ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
9770 if (ret)
9771 return ret;
9772
9773 /*
9774 * The inode is locked, so these flags won't change after we check them.
9775 */
9776 if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
9777 btrfs_warn(fs_info, "swapfile must not be compressed");
9778 return -EINVAL;
9779 }
9780 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
9781 btrfs_warn(fs_info, "swapfile must not be copy-on-write");
9782 return -EINVAL;
9783 }
9784 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
9785 btrfs_warn(fs_info, "swapfile must not be checksummed");
9786 return -EINVAL;
9787 }
9788
9789 /*
9790 * Balance or device remove/replace/resize can move stuff around from
9791 * under us. The exclop protection makes sure they aren't running/won't
9792 * run concurrently while we are mapping the swap extents, and
9793 * fs_info->swapfile_pins prevents them from running while the swap
9794 * file is active and moving the extents. Note that this also prevents
9795 * a concurrent device add which isn't actually necessary, but it's not
9796 * really worth the trouble to allow it.
9797 */
c3e1f96c 9798 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
9799 btrfs_warn(fs_info,
9800 "cannot activate swapfile while exclusive operation is running");
9801 return -EBUSY;
9802 }
9803
9804 /*
9805 * Prevent snapshot creation while we are activating the swap file.
9806 * We do not want to race with snapshot creation. If snapshot creation
9807 * already started before we bumped nr_swapfiles from 0 to 1 and
9808 * completes before the first write into the swap file after it is
 9809 	 * activated, then that write would fall back to COW.
9810 */
9811 if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
9812 btrfs_exclop_finish(fs_info);
9813 btrfs_warn(fs_info,
9814 "cannot activate swapfile because snapshot creation is in progress");
9815 return -EINVAL;
9816 }
9817 /*
9818 * Snapshots can create extents which require COW even if NODATACOW is
9819 * set. We use this counter to prevent snapshots. We must increment it
9820 * before walking the extents because we don't want a concurrent
9821 * snapshot to run after we've already checked the extents.
9822 *
9823 * It is possible that subvolume is marked for deletion but still not
9824 * removed yet. To prevent this race, we check the root status before
9825 * activating the swapfile.
ed46ff3d 9826 */
9827 spin_lock(&root->root_item_lock);
9828 if (btrfs_root_dead(root)) {
9829 spin_unlock(&root->root_item_lock);
9830
9831 btrfs_exclop_finish(fs_info);
9832 btrfs_warn(fs_info,
9833 "cannot activate swapfile because subvolume %llu is being deleted",
e094f480 9834 btrfs_root_id(root));
9835 return -EPERM;
9836 }
dd0734f2 9837 atomic_inc(&root->nr_swapfiles);
60021bd7 9838 spin_unlock(&root->root_item_lock);
9839
9840 isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
9841
570eb97b 9842 lock_extent(io_tree, 0, isize - 1, &cached_state);
9843 start = 0;
9844 while (start < isize) {
9845 u64 logical_block_start, physical_block_start;
32da5386 9846 struct btrfs_block_group *bg;
9847 u64 len = isize - start;
9848
8bab0a30 9849 em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
9850 if (IS_ERR(em)) {
9851 ret = PTR_ERR(em);
9852 goto out;
9853 }
9854
c77a8c61 9855 if (em->disk_bytenr == EXTENT_MAP_HOLE) {
9856 btrfs_warn(fs_info, "swapfile must not have holes");
9857 ret = -EINVAL;
9858 goto out;
9859 }
c77a8c61 9860 if (em->disk_bytenr == EXTENT_MAP_INLINE) {
9861 /*
9862 * It's unlikely we'll ever actually find ourselves
9863 * here, as a file small enough to fit inline won't be
9864 * big enough to store more than the swap header, but in
9865 * case something changes in the future, let's catch it
9866 * here rather than later.
9867 */
9868 btrfs_warn(fs_info, "swapfile must not be inline");
9869 ret = -EINVAL;
9870 goto out;
9871 }
f86f7a75 9872 if (extent_map_is_compressed(em)) {
9873 btrfs_warn(fs_info, "swapfile must not be compressed");
9874 ret = -EINVAL;
9875 goto out;
9876 }
9877
c77a8c61 9878 logical_block_start = extent_map_block_start(em) + (start - em->start);
9879 len = min(len, em->len - (start - em->start));
9880 free_extent_map(em);
9881 em = NULL;
9882
cdc627e6 9883 ret = can_nocow_extent(inode, start, &len, NULL, false, true);
9884 if (ret < 0) {
9885 goto out;
9886 } else if (ret) {
9887 ret = 0;
9888 } else {
9889 btrfs_warn(fs_info,
9890 "swapfile must not be copy-on-write");
9891 ret = -EINVAL;
9892 goto out;
9893 }
9894
9895 map = btrfs_get_chunk_map(fs_info, logical_block_start, len);
9896 if (IS_ERR(map)) {
9897 ret = PTR_ERR(map);
9898 goto out;
9899 }
9900
7dc66abb 9901 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
9902 btrfs_warn(fs_info,
9903 "swapfile must have single data profile");
9904 ret = -EINVAL;
9905 goto out;
9906 }
9907
9908 if (device == NULL) {
7dc66abb 9909 device = map->stripes[0].dev;
9910 ret = btrfs_add_swapfile_pin(inode, device, false);
9911 if (ret == 1)
9912 ret = 0;
9913 else if (ret)
9914 goto out;
7dc66abb 9915 } else if (device != map->stripes[0].dev) {
9916 btrfs_warn(fs_info, "swapfile must be on one device");
9917 ret = -EINVAL;
9918 goto out;
9919 }
9920
9921 physical_block_start = (map->stripes[0].physical +
9922 (logical_block_start - map->start));
9923 len = min(len, map->chunk_len - (logical_block_start - map->start));
9924 btrfs_free_chunk_map(map);
9925 map = NULL;
9926
9927 bg = btrfs_lookup_block_group(fs_info, logical_block_start);
9928 if (!bg) {
9929 btrfs_warn(fs_info,
9930 "could not find block group containing swapfile");
9931 ret = -EINVAL;
9932 goto out;
9933 }
9934
9935 if (!btrfs_inc_block_group_swap_extents(bg)) {
9936 btrfs_warn(fs_info,
9937 "block group for swapfile at %llu is read-only%s",
9938 bg->start,
9939 atomic_read(&fs_info->scrubs_running) ?
9940 " (scrub running)" : "");
9941 btrfs_put_block_group(bg);
9942 ret = -EINVAL;
9943 goto out;
9944 }
9945
9946 ret = btrfs_add_swapfile_pin(inode, bg, true);
9947 if (ret) {
9948 btrfs_put_block_group(bg);
9949 if (ret == 1)
9950 ret = 0;
9951 else
9952 goto out;
9953 }
9954
9955 if (bsi.block_len &&
9956 bsi.block_start + bsi.block_len == physical_block_start) {
9957 bsi.block_len += len;
9958 } else {
9959 if (bsi.block_len) {
9960 ret = btrfs_add_swap_extent(sis, &bsi);
9961 if (ret)
9962 goto out;
9963 }
9964 bsi.start = start;
9965 bsi.block_start = physical_block_start;
9966 bsi.block_len = len;
9967 }
9968
9969 start += len;
9970 }
9971
9972 if (bsi.block_len)
9973 ret = btrfs_add_swap_extent(sis, &bsi);
9974
9975out:
9976 if (!IS_ERR_OR_NULL(em))
9977 free_extent_map(em);
9978 if (!IS_ERR_OR_NULL(map))
9979 btrfs_free_chunk_map(map);
ed46ff3d 9980
570eb97b 9981 unlock_extent(io_tree, 0, isize - 1, &cached_state);
9982
9983 if (ret)
9984 btrfs_swap_deactivate(file);
9985
9986 btrfs_drew_write_unlock(&root->snapshot_lock);
9987
c3e1f96c 9988 btrfs_exclop_finish(fs_info);
9989
9990 if (ret)
9991 return ret;
9992
9993 if (device)
9994 sis->bdev = device->bdev;
9995 *span = bsi.highest_ppage - bsi.lowest_ppage + 1;
9996 sis->max = bsi.nr_pages;
9997 sis->pages = bsi.nr_pages - 1;
9998 sis->highest_bit = bsi.nr_pages - 1;
9999 return bsi.nr_extents;
10000}
10001#else
10002static void btrfs_swap_deactivate(struct file *file)
10003{
10004}
10005
10006static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10007 sector_t *span)
10008{
10009 return -EOPNOTSUPP;
10010}
10011#endif
10012
10013/*
10014 * Update the number of bytes used in the VFS' inode. When we replace extents in
10015 * a range (clone, dedupe, fallocate's zero range), we must update the number of
10016 * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
10017 * always get a correct value.
10018 */
10019void btrfs_update_inode_bytes(struct btrfs_inode *inode,
10020 const u64 add_bytes,
10021 const u64 del_bytes)
10022{
10023 if (add_bytes == del_bytes)
10024 return;
10025
10026 spin_lock(&inode->lock);
10027 if (del_bytes > 0)
10028 inode_sub_bytes(&inode->vfs_inode, del_bytes);
10029 if (add_bytes > 0)
10030 inode_add_bytes(&inode->vfs_inode, add_bytes);
10031 spin_unlock(&inode->lock);
10032}
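/*
 * Hedged usage sketch (not in the original): a clone that replaces a 1MiB
 * extent with a 512KiB extent over the same file range would call
 *
 *	btrfs_update_inode_bytes(inode, SZ_512K, SZ_1M);
 *
 * shrinking i_blocks by the difference inside one critical section, so a
 * concurrent stat(2) never observes the intermediate state.
 */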
10033
43dd529a 10034/*
10035 * Verify that there are no ordered extents for a given file range.
10036 *
10037 * @inode: The target inode.
10038 * @start: Start offset of the file range, should be sector size aligned.
10039 * @end: End offset (inclusive) of the file range, its value +1 should be
10040 * sector size aligned.
10041 *
10042 	 * This should typically be used in cases where we have locked the inode's
10043 	 * VFS lock in exclusive mode, locked the inode's i_mmap_lock in exclusive mode,
10044 * we have flushed all delalloc in the range, we have waited for all ordered
10045 * extents in the range to complete and finally we have locked the file range in
10046 * the inode's io_tree.
10047 */
10048void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
10049{
10050 struct btrfs_root *root = inode->root;
10051 struct btrfs_ordered_extent *ordered;
10052
10053 if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
10054 return;
10055
10056 ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start);
10057 if (ordered) {
10058 btrfs_err(root->fs_info,
10059"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
e094f480 10060 start, end, btrfs_ino(inode), btrfs_root_id(root),
10061 ordered->file_offset,
10062 ordered->file_offset + ordered->num_bytes - 1);
10063 btrfs_put_ordered_extent(ordered);
10064 }
10065
10066 ASSERT(ordered == NULL);
10067}
10068
10069/*
10070 * Find the first inode with a minimum number.
10071 *
10072 * @root: The root to search for.
10073 * @min_ino: The minimum inode number.
10074 *
10075 * Find the first inode in the @root with a number >= @min_ino and return it.
10076 * Returns NULL if no such inode found.
10077 */
10078struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino)
10079{
5e485ac6 10080 struct btrfs_inode *inode;
310b2f5d 10081 unsigned long from = min_ino;
5e485ac6 10082
e2844cce 10083 xa_lock(&root->inodes);
10084 while (true) {
10085 inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
10086 if (!inode)
10087 break;
10088 if (igrab(&inode->vfs_inode))
5e485ac6 10089 break;
5e485ac6 10090
310b2f5d 10091 from = btrfs_ino(inode) + 1;
e2844cce 10092 cond_resched_lock(&root->inodes.xa_lock);
5e485ac6 10093 }
e2844cce 10094 xa_unlock(&root->inodes);
5e485ac6 10095
310b2f5d 10096 return inode;
10097}
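/*
 * Hedged usage sketch (not in the original): callers typically scan all
 * cached inodes of a root with this helper; each returned inode carries a
 * reference from igrab() that must be dropped:
 *
 *	u64 min_ino = 0;
 *	struct btrfs_inode *inode;
 *
 *	while ((inode = btrfs_find_first_inode(root, min_ino))) {
 *		min_ino = btrfs_ino(inode) + 1;
 *		// ... use inode ...
 *		iput(&inode->vfs_inode);
 *	}
 */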
10098
6e1d5dcc 10099static const struct inode_operations btrfs_dir_inode_operations = {
3394e160 10100 .getattr = btrfs_getattr,
10101 .lookup = btrfs_lookup,
10102 .create = btrfs_create,
10103 .unlink = btrfs_unlink,
10104 .link = btrfs_link,
10105 .mkdir = btrfs_mkdir,
10106 .rmdir = btrfs_rmdir,
2773bf00 10107 .rename = btrfs_rename2,
10108 .symlink = btrfs_symlink,
10109 .setattr = btrfs_setattr,
618e21d5 10110 .mknod = btrfs_mknod,
5103e947 10111 .listxattr = btrfs_listxattr,
fdebe2bd 10112 .permission = btrfs_permission,
cac2f8b8 10113 .get_inode_acl = btrfs_get_acl,
996a710d 10114 .set_acl = btrfs_set_acl,
93fd63c2 10115 .update_time = btrfs_update_time,
ef3b9af5 10116 .tmpfile = btrfs_tmpfile,
10117 .fileattr_get = btrfs_fileattr_get,
10118 .fileattr_set = btrfs_fileattr_set,
39279cc3 10119};
76dda93c 10120
828c0950 10121static const struct file_operations btrfs_dir_file_operations = {
e60aa5da 10122 .llseek = btrfs_dir_llseek,
39279cc3 10123 .read = generic_read_dir,
02dbfc99 10124 .iterate_shared = btrfs_real_readdir,
23b5ec74 10125 .open = btrfs_opendir,
34287aa3 10126 .unlocked_ioctl = btrfs_ioctl,
39279cc3 10127#ifdef CONFIG_COMPAT
4c63c245 10128 .compat_ioctl = btrfs_compat_ioctl,
39279cc3 10129#endif
6bf13c0c 10130 .release = btrfs_release_file,
e02119d5 10131 .fsync = btrfs_sync_file,
10132};
10133
10134/*
10135 * btrfs doesn't support the bmap operation because swapfiles
10136 * use bmap to make a mapping of extents in the file. They assume
10137 * these extents won't change over the life of the file and they
10138 * use the bmap result to do IO directly to the drive.
10139 *
10140 * the btrfs bmap call would return logical addresses that aren't
10141 * suitable for IO and they also will change frequently as COW
10142 * operations happen. So, swapfile + btrfs == corruption.
10143 *
10144 * For now we're avoiding this by dropping bmap.
10145 */
7f09410b 10146static const struct address_space_operations btrfs_aops = {
fb12489b 10147 .read_folio = btrfs_read_folio,
b293f02e 10148 .writepages = btrfs_writepages,
ba206a02 10149 .readahead = btrfs_readahead,
895586eb 10150 .invalidate_folio = btrfs_invalidate_folio,
872617a0 10151 .launder_folio = btrfs_launder_folio,
f913cff3 10152 .release_folio = btrfs_release_folio,
e7a60a17 10153 .migrate_folio = btrfs_migrate_folio,
187c82cb 10154 .dirty_folio = filemap_dirty_folio,
af7628d6 10155 .error_remove_folio = generic_error_remove_folio,
10156 .swap_activate = btrfs_swap_activate,
10157 .swap_deactivate = btrfs_swap_deactivate,
10158};
10159
6e1d5dcc 10160static const struct inode_operations btrfs_file_inode_operations = {
10161 .getattr = btrfs_getattr,
10162 .setattr = btrfs_setattr,
5103e947 10163 .listxattr = btrfs_listxattr,
fdebe2bd 10164 .permission = btrfs_permission,
1506fcc8 10165 .fiemap = btrfs_fiemap,
cac2f8b8 10166 .get_inode_acl = btrfs_get_acl,
996a710d 10167 .set_acl = btrfs_set_acl,
e41f941a 10168 .update_time = btrfs_update_time,
10169 .fileattr_get = btrfs_fileattr_get,
10170 .fileattr_set = btrfs_fileattr_set,
39279cc3 10171};
6e1d5dcc 10172static const struct inode_operations btrfs_special_inode_operations = {
10173 .getattr = btrfs_getattr,
10174 .setattr = btrfs_setattr,
fdebe2bd 10175 .permission = btrfs_permission,
33268eaf 10176 .listxattr = btrfs_listxattr,
cac2f8b8 10177 .get_inode_acl = btrfs_get_acl,
996a710d 10178 .set_acl = btrfs_set_acl,
e41f941a 10179 .update_time = btrfs_update_time,
618e21d5 10180};
6e1d5dcc 10181static const struct inode_operations btrfs_symlink_inode_operations = {
6b255391 10182 .get_link = page_get_link,
f209561a 10183 .getattr = btrfs_getattr,
22c44fe6 10184 .setattr = btrfs_setattr,
fdebe2bd 10185 .permission = btrfs_permission,
0279b4cd 10186 .listxattr = btrfs_listxattr,
e41f941a 10187 .update_time = btrfs_update_time,
39279cc3 10188};
76dda93c 10189
82d339d9 10190const struct dentry_operations btrfs_dentry_operations = {
10191 .d_delete = btrfs_dentry_delete,
10192};