fs/btrfs/inode.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2007 Oracle.  All rights reserved.
   4  */
   5
   6 #include <crypto/hash.h>
   7 #include <linux/kernel.h>
   8 #include <linux/bio.h>
   9 #include <linux/blk-cgroup.h>
  10 #include <linux/file.h>
  11 #include <linux/fs.h>
  12 #include <linux/pagemap.h>
  13 #include <linux/highmem.h>
  14 #include <linux/time.h>
  15 #include <linux/init.h>
  16 #include <linux/string.h>
  17 #include <linux/backing-dev.h>
  18 #include <linux/writeback.h>
  19 #include <linux/compat.h>
  20 #include <linux/xattr.h>
  21 #include <linux/posix_acl.h>
  22 #include <linux/falloc.h>
  23 #include <linux/slab.h>
  24 #include <linux/ratelimit.h>
  25 #include <linux/btrfs.h>
  26 #include <linux/blkdev.h>
  27 #include <linux/posix_acl_xattr.h>
  28 #include <linux/uio.h>
  29 #include <linux/magic.h>
  30 #include <linux/iversion.h>
  31 #include <linux/swap.h>
  32 #include <linux/migrate.h>
  33 #include <linux/sched/mm.h>
  34 #include <linux/iomap.h>
  35 #include <asm/unaligned.h>
  36 #include <linux/fsverity.h>
  37 #include "misc.h"
  38 #include "ctree.h"
  39 #include "disk-io.h"
  40 #include "transaction.h"
  41 #include "btrfs_inode.h"
  42 #include "print-tree.h"
  43 #include "ordered-data.h"
  44 #include "xattr.h"
  45 #include "tree-log.h"
  46 #include "volumes.h"
  47 #include "compression.h"
  48 #include "locking.h"
  49 #include "free-space-cache.h"
  50 #include "props.h"
  51 #include "qgroup.h"
  52 #include "delalloc-space.h"
  53 #include "block-group.h"
  54 #include "space-info.h"
  55 #include "zoned.h"
  56 #include "subpage.h"
  57
  58 struct btrfs_iget_args {
  59         u64 ino;
  60         struct btrfs_root *root;
  61 };
  62
  63 struct btrfs_dio_data {
  64         u64 reserve;
  65         loff_t length;
  66         ssize_t submitted;
  67         struct extent_changeset *data_reserved;
  68 };
  69
  70 static const struct inode_operations btrfs_dir_inode_operations;
  71 static const struct inode_operations btrfs_symlink_inode_operations;
  72 static const struct inode_operations btrfs_special_inode_operations;
  73 static const struct inode_operations btrfs_file_inode_operations;
  74 static const struct address_space_operations btrfs_aops;
  75 static const struct file_operations btrfs_dir_file_operations;
  76
  77 static struct kmem_cache *btrfs_inode_cachep;
  78 struct kmem_cache *btrfs_trans_handle_cachep;
  79 struct kmem_cache *btrfs_path_cachep;
  80 struct kmem_cache *btrfs_free_space_cachep;
  81 struct kmem_cache *btrfs_free_space_bitmap_cachep;
  82
  83 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
  84 static int btrfs_truncate(struct inode *inode, bool skip_writeback);
  85 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
  86 static noinline int cow_file_range(struct btrfs_inode *inode,
  87                                    struct page *locked_page,
  88                                    u64 start, u64 end, int *page_started,
  89                                    unsigned long *nr_written, int unlock);
  90 static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
  91                                        u64 len, u64 orig_start, u64 block_start,
  92                                        u64 block_len, u64 orig_block_len,
  93                                        u64 ram_bytes, int compress_type,
  94                                        int type);
  95
  96 static void __endio_write_update_ordered(struct btrfs_inode *inode,
  97                                          const u64 offset, const u64 bytes,
  98                                          const bool uptodate);
  99
 100 /*
 101  * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
 102  *
 103  * ilock_flags can have the following bit set:
 104  *
 105  * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
 106  * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt
 107  *                   return -EAGAIN
 108  * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
 109  */
 110 int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
 111 {
 112         if (ilock_flags & BTRFS_ILOCK_SHARED) {
 113                 if (ilock_flags & BTRFS_ILOCK_TRY) {
 114                         if (!inode_trylock_shared(inode))
 115                                 return -EAGAIN;
 116                         else
 117                                 return 0;
 118                 }
 119                 inode_lock_shared(inode);
 120         } else {
 121                 if (ilock_flags & BTRFS_ILOCK_TRY) {
 122                         if (!inode_trylock(inode))
 123                                 return -EAGAIN;
 124                         else
 125                                 return 0;
 126                 }
 127                 inode_lock(inode);
 128         }
 129         if (ilock_flags & BTRFS_ILOCK_MMAP)
 130                 down_write(&BTRFS_I(inode)->i_mmap_lock);
 131         return 0;
 132 }
 133
 134 /*
 135  * btrfs_inode_unlock - unock inode i_rwsem
 136  *
 137  * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
 138  * to decide whether the lock acquired is shared or exclusive.
 139  */
 140 void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags)
 141 {
 142         if (ilock_flags & BTRFS_ILOCK_MMAP)
 143                 up_write(&BTRFS_I(inode)->i_mmap_lock);
 144         if (ilock_flags & BTRFS_ILOCK_SHARED)
 145                 inode_unlock_shared(inode);
 146         else
 147                 inode_unlock(inode);
 148 }
 149
 150 /*
 151  * Cleanup all submitted ordered extents in specified range to handle errors
 152  * from the btrfs_run_delalloc_range() callback.
 153  *
 154  * NOTE: caller must ensure that when an error happens, it can not call
 155  * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 156  * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 157  * to be released, which we want to happen only when finishing the ordered
 158  * extent (btrfs_finish_ordered_io()).
 159  */
 160 static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
 161                                                  struct page *locked_page,
 162                                                  u64 offset, u64 bytes)
 163 {
 164         unsigned long index = offset >> PAGE_SHIFT;
 165         unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
 166         u64 page_start = page_offset(locked_page);
 167         u64 page_end = page_start + PAGE_SIZE - 1;
 168
 169         struct page *page;
 170
 171         while (index <= end_index) {
 172                 /*
 173                  * For locked page, we will call end_extent_writepage() on it
 174                  * in run_delalloc_range() for the error handling.  That
 175                  * end_extent_writepage() function will call
 176                  * btrfs_mark_ordered_io_finished() to clear page Ordered and
 177                  * run the ordered extent accounting.
 178                  *
 179                  * Here we can't just clear the Ordered bit, or
 180                  * btrfs_mark_ordered_io_finished() would skip the accounting
 181                  * for the page range, and the ordered extent will never finish.
 182                  */
 183                 if (index == (page_offset(locked_page) >> PAGE_SHIFT)) {
 184                         index++;
 185                         continue;
 186                 }
 187                 page = find_get_page(inode->vfs_inode.i_mapping, index);
 188                 index++;
 189                 if (!page)
 190                         continue;
 191
 192                 /*
 193                  * Here we just clear all Ordered bits for every page in the
 194                  * range, then __endio_write_update_ordered() will handle
 195                  * the ordered extent accounting for the range.
 196                  */
 197                 btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
 198                                                offset, bytes);
 199                 put_page(page);
 200         }
 201
 202         /* The locked page covers the full range, nothing needs to be done */
 203         if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE)
 204                 return;
 205         /*
 206          * In case this page belongs to the delalloc range being instantiated
 207          * then skip it, since the first page of a range is going to be
 208          * properly cleaned up by the caller of run_delalloc_range
 209          */
 210         if (page_start >= offset && page_end <= (offset + bytes - 1)) {
 211                 bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
 212                 offset = page_offset(locked_page) + PAGE_SIZE;
 213         }
 214
 215         return __endio_write_update_ordered(inode, offset, bytes, false);
 216 }
 217
 218 static int btrfs_dirty_inode(struct inode *inode);
 219
 220 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
 221                                      struct inode *inode,  struct inode *dir,
 222                                      const struct qstr *qstr)
 223 {
 224         int err;
 225
 226         err = btrfs_init_acl(trans, inode, dir);
 227         if (!err)
 228                 err = btrfs_xattr_security_init(trans, inode, dir, qstr);
 229         return err;
 230 }
 231
 232 /*
 233  * this does all the hard work for inserting an inline extent into
 234  * the btree.  The caller should have done a btrfs_drop_extents so that
 235  * no overlapping inline items exist in the btree
 236  */
 237 static int insert_inline_extent(struct btrfs_trans_handle *trans,
 238                                 struct btrfs_path *path, bool extent_inserted,
 239                                 struct btrfs_root *root, struct inode *inode,
 240                                 u64 start, size_t size, size_t compressed_size,
 241                                 int compress_type,
 242                                 struct page **compressed_pages)
 243 {
 244         struct extent_buffer *leaf;
 245         struct page *page = NULL;
 246         char *kaddr;
 247         unsigned long ptr;
 248         struct btrfs_file_extent_item *ei;
 249         int ret;
 250         size_t cur_size = size;
 251         unsigned long offset;
 252
 253         ASSERT((compressed_size > 0 && compressed_pages) ||
 254                (compressed_size == 0 && !compressed_pages));
 255
 256         if (compressed_size && compressed_pages)
 257                 cur_size = compressed_size;
 258
 259         if (!extent_inserted) {
 260                 struct btrfs_key key;
 261                 size_t datasize;
 262
 263                 key.objectid = btrfs_ino(BTRFS_I(inode));
 264                 key.offset = start;
 265                 key.type = BTRFS_EXTENT_DATA_KEY;
 266
 267                 datasize = btrfs_file_extent_calc_inline_size(cur_size);
 268                 ret = btrfs_insert_empty_item(trans, root, path, &key,
 269                                               datasize);
 270                 if (ret)
 271                         goto fail;
 272         }
 273         leaf = path->nodes[0];
 274         ei = btrfs_item_ptr(leaf, path->slots[0],
 275                             struct btrfs_file_extent_item);
 276         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
 277         btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
 278         btrfs_set_file_extent_encryption(leaf, ei, 0);
 279         btrfs_set_file_extent_other_encoding(leaf, ei, 0);
 280         btrfs_set_file_extent_ram_bytes(leaf, ei, size);
 281         ptr = btrfs_file_extent_inline_start(ei);
 282
 283         if (compress_type != BTRFS_COMPRESS_NONE) {
 284                 struct page *cpage;
 285                 int i = 0;
 286                 while (compressed_size > 0) {
 287                         cpage = compressed_pages[i];
 288                         cur_size = min_t(unsigned long, compressed_size,
 289                                        PAGE_SIZE);
 290
 291                         kaddr = kmap_atomic(cpage);
 292                         write_extent_buffer(leaf, kaddr, ptr, cur_size);
 293                         kunmap_atomic(kaddr);
 294
 295                         i++;
 296                         ptr += cur_size;
 297                         compressed_size -= cur_size;
 298                 }
 299                 btrfs_set_file_extent_compression(leaf, ei,
 300                                                   compress_type);
 301         } else {
 302                 page = find_get_page(inode->i_mapping,
 303                                      start >> PAGE_SHIFT);
 304                 btrfs_set_file_extent_compression(leaf, ei, 0);
 305                 kaddr = kmap_atomic(page);
 306                 offset = offset_in_page(start);
 307                 write_extent_buffer(leaf, kaddr + offset, ptr, size);
 308                 kunmap_atomic(kaddr);
 309                 put_page(page);
 310         }
 311         btrfs_mark_buffer_dirty(leaf);
 312         btrfs_release_path(path);
 313
 314         /*
 315          * We align size to sectorsize for inline extents just for simplicity
 316          * sake.
 317          */
 318         size = ALIGN(size, root->fs_info->sectorsize);
 319         ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
 320         if (ret)
 321                 goto fail;
 322
 323         /*
 324          * we're an inline extent, so nobody can
 325          * extend the file past i_size without locking
 326          * a page we already have locked.
 327          *
 328          * We must do any isize and inode updates
 329          * before we unlock the pages.  Otherwise we
 330          * could end up racing with unlink.
 331          */
 332         BTRFS_I(inode)->disk_i_size = inode->i_size;
 333 fail:
 334         return ret;
 335 }
 336
 337
 338 /*
 339  * conditionally insert an inline extent into the file.  This
 340  * does the checks required to make sure the data is small enough
 341  * to fit as an inline extent.
 342  */
 343 static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
 344                                           u64 end, size_t compressed_size,
 345                                           int compress_type,
 346                                           struct page **compressed_pages)
 347 {
 348         struct btrfs_drop_extents_args drop_args = { 0 };
 349         struct btrfs_root *root = inode->root;
 350         struct btrfs_fs_info *fs_info = root->fs_info;
 351         struct btrfs_trans_handle *trans;
 352         u64 isize = i_size_read(&inode->vfs_inode);
 353         u64 actual_end = min(end + 1, isize);
 354         u64 inline_len = actual_end - start;
 355         u64 aligned_end = ALIGN(end, fs_info->sectorsize);
 356         u64 data_len = inline_len;
 357         int ret;
 358         struct btrfs_path *path;
 359
 360         if (compressed_size)
 361                 data_len = compressed_size;
 362
 363         if (start > 0 ||
 364             actual_end > fs_info->sectorsize ||
 365             data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
 366             (!compressed_size &&
 367             (actual_end & (fs_info->sectorsize - 1)) == 0) ||
 368             end + 1 < isize ||
 369             data_len > fs_info->max_inline) {
 370                 return 1;
 371         }
 372
 373         path = btrfs_alloc_path();
 374         if (!path)
 375                 return -ENOMEM;
 376
 377         trans = btrfs_join_transaction(root);
 378         if (IS_ERR(trans)) {
 379                 btrfs_free_path(path);
 380                 return PTR_ERR(trans);
 381         }
 382         trans->block_rsv = &inode->block_rsv;
 383
 384         drop_args.path = path;
 385         drop_args.start = start;
 386         drop_args.end = aligned_end;
 387         drop_args.drop_cache = true;
 388         drop_args.replace_extent = true;
 389
 390         if (compressed_size && compressed_pages)
 391                 drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
 392                    compressed_size);
 393         else
 394                 drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
 395                     inline_len);
 396
 397         ret = btrfs_drop_extents(trans, root, inode, &drop_args);
 398         if (ret) {
 399                 btrfs_abort_transaction(trans, ret);
 400                 goto out;
 401         }
 402
 403         if (isize > actual_end)
 404                 inline_len = min_t(u64, isize, actual_end);
 405         ret = insert_inline_extent(trans, path, drop_args.extent_inserted,
 406                                    root, &inode->vfs_inode, start,
 407                                    inline_len, compressed_size,
 408                                    compress_type, compressed_pages);
 409         if (ret && ret != -ENOSPC) {
 410                 btrfs_abort_transaction(trans, ret);
 411                 goto out;
 412         } else if (ret == -ENOSPC) {
 413                 ret = 1;
 414                 goto out;
 415         }
 416
 417         btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found);
 418         ret = btrfs_update_inode(trans, root, inode);
 419         if (ret && ret != -ENOSPC) {
 420                 btrfs_abort_transaction(trans, ret);
 421                 goto out;
 422         } else if (ret == -ENOSPC) {
 423                 ret = 1;
 424                 goto out;
 425         }
 426
 427         set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
 428 out:
 429         /*
 430          * Don't forget to free the reserved space, as for inlined extent
 431          * it won't count as data extent, free them directly here.
 432          * And at reserve time, it's always aligned to page size, so
 433          * just free one page here.
 434          */
 435         btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
 436         btrfs_free_path(path);
 437         btrfs_end_transaction(trans);
 438         return ret;
 439 }
 440
 441 struct async_extent {
 442         u64 start;
 443         u64 ram_size;
 444         u64 compressed_size;
 445         struct page **pages;
 446         unsigned long nr_pages;
 447         int compress_type;
 448         struct list_head list;
 449 };
 450
 451 struct async_chunk {
 452         struct inode *inode;
 453         struct page *locked_page;
 454         u64 start;
 455         u64 end;
 456         unsigned int write_flags;
 457         struct list_head extents;
 458         struct cgroup_subsys_state *blkcg_css;
 459         struct btrfs_work work;
 460         struct async_cow *async_cow;
 461 };
 462
 463 struct async_cow {
 464         atomic_t num_chunks;
 465         struct async_chunk chunks[];
 466 };
 467
 468 static noinline int add_async_extent(struct async_chunk *cow,
 469                                      u64 start, u64 ram_size,
 470                                      u64 compressed_size,
 471                                      struct page **pages,
 472                                      unsigned long nr_pages,
 473                                      int compress_type)
 474 {
 475         struct async_extent *async_extent;
 476
 477         async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
 478         BUG_ON(!async_extent); /* -ENOMEM */
 479         async_extent->start = start;
 480         async_extent->ram_size = ram_size;
 481         async_extent->compressed_size = compressed_size;
 482         async_extent->pages = pages;
 483         async_extent->nr_pages = nr_pages;
 484         async_extent->compress_type = compress_type;
 485         list_add_tail(&async_extent->list, &cow->extents);
 486         return 0;
 487 }
 488
 489 /*
 490  * Check if the inode has flags compatible with compression
 491  */
 492 static inline bool inode_can_compress(struct btrfs_inode *inode)
 493 {
 494         if (inode->flags & BTRFS_INODE_NODATACOW ||
 495             inode->flags & BTRFS_INODE_NODATASUM)
 496                 return false;
 497         return true;
 498 }
 499
 500 /*
 501  * Check if the inode needs to be submitted to compression, based on mount
 502  * options, defragmentation, properties or heuristics.
 503  */
 504 static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
 505                                       u64 end)
 506 {
 507         struct btrfs_fs_info *fs_info = inode->root->fs_info;
 508
 509         if (!inode_can_compress(inode)) {
 510                 WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
 511                         KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
 512                         btrfs_ino(inode));
 513                 return 0;
 514         }
 515         /*
 516          * Special check for subpage.
 517          *
 518          * We lock the full page then run each delalloc range in the page, thus
 519          * for the following case, we will hit some subpage specific corner case:
 520          *
 521          * 0            32K             64K
 522          * |    |///////|       |///////|
 523          *              \- A            \- B
 524          *
 525          * In above case, both range A and range B will try to unlock the full
 526          * page [0, 64K), causing the one finished later will have page
 527          * unlocked already, triggering various page lock requirement BUG_ON()s.
 528          *
 529          * So here we add an artificial limit that subpage compression can only
 530          * if the range is fully page aligned.
 531          *
 532          * In theory we only need to ensure the first page is fully covered, but
 533          * the tailing partial page will be locked until the full compression
 534          * finishes, delaying the write of other range.
 535          *
 536          * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range
 537          * first to prevent any submitted async extent to unlock the full page.
 538          * By this, we can ensure for subpage case that only the last async_cow
 539          * will unlock the full page.
 540          */
 541         if (fs_info->sectorsize < PAGE_SIZE) {
 542                 if (!IS_ALIGNED(start, PAGE_SIZE) ||
 543                     !IS_ALIGNED(end + 1, PAGE_SIZE))
 544                         return 0;
 545         }
 546
 547         /* force compress */
 548         if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
 549                 return 1;
 550         /* defrag ioctl */
 551         if (inode->defrag_compress)
 552                 return 1;
 553         /* bad compression ratios */
 554         if (inode->flags & BTRFS_INODE_NOCOMPRESS)
 555                 return 0;
 556         if (btrfs_test_opt(fs_info, COMPRESS) ||
 557             inode->flags & BTRFS_INODE_COMPRESS ||
 558             inode->prop_compress)
 559                 return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
 560         return 0;
 561 }
 562
 563 static inline void inode_should_defrag(struct btrfs_inode *inode,
 564                 u64 start, u64 end, u64 num_bytes, u64 small_write)
 565 {
 566         /* If this is a small write inside eof, kick off a defrag */
 567         if (num_bytes < small_write &&
 568             (start > 0 || end + 1 < inode->disk_i_size))
 569                 btrfs_add_inode_defrag(NULL, inode);
 570 }
 571
 572 /*
 573  * we create compressed extents in two phases.  The first
 574  * phase compresses a range of pages that have already been
 575  * locked (both pages and state bits are locked).
 576  *
 577  * This is done inside an ordered work queue, and the compression
 578  * is spread across many cpus.  The actual IO submission is step
 579  * two, and the ordered work queue takes care of making sure that
 580  * happens in the same order things were put onto the queue by
 581  * writepages and friends.
 582  *
 583  * If this code finds it can't get good compression, it puts an
 584  * entry onto the work queue to write the uncompressed bytes.  This
 585  * makes sure that both compressed inodes and uncompressed inodes
 586  * are written in the same order that the flusher thread sent them
 587  * down.
 588  */
 589 static noinline int compress_file_range(struct async_chunk *async_chunk)
 590 {
 591         struct inode *inode = async_chunk->inode;
 592         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 593         u64 blocksize = fs_info->sectorsize;
 594         u64 start = async_chunk->start;
 595         u64 end = async_chunk->end;
 596         u64 actual_end;
 597         u64 i_size;
 598         int ret = 0;
 599         struct page **pages = NULL;
 600         unsigned long nr_pages;
 601         unsigned long total_compressed = 0;
 602         unsigned long total_in = 0;
 603         int i;
 604         int will_compress;
 605         int compress_type = fs_info->compress_type;
 606         int compressed_extents = 0;
 607         int redirty = 0;
 608
 609         inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
 610                         SZ_16K);
 611
 612         /*
 613          * We need to save i_size before now because it could change in between
 614          * us evaluating the size and assigning it.  This is because we lock and
 615          * unlock the page in truncate and fallocate, and then modify the i_size
 616          * later on.
 617          *
 618          * The barriers are to emulate READ_ONCE, remove that once i_size_read
 619          * does that for us.
 620          */
 621         barrier();
 622         i_size = i_size_read(inode);
 623         barrier();
 624         actual_end = min_t(u64, i_size, end + 1);
 625 again:
 626         will_compress = 0;
 627         nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
 628         BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
 629         nr_pages = min_t(unsigned long, nr_pages,
 630                         BTRFS_MAX_COMPRESSED / PAGE_SIZE);
 631
 632         /*
 633          * we don't want to send crud past the end of i_size through
 634          * compression, that's just a waste of CPU time.  So, if the
 635          * end of the file is before the start of our current
 636          * requested range of bytes, we bail out to the uncompressed
 637          * cleanup code that can deal with all of this.
 638          *
 639          * It isn't really the fastest way to fix things, but this is a
 640          * very uncommon corner.
 641          */
 642         if (actual_end <= start)
 643                 goto cleanup_and_bail_uncompressed;
 644
 645         total_compressed = actual_end - start;
 646
 647         /*
 648          * Skip compression for a small file range(<=blocksize) that
 649          * isn't an inline extent, since it doesn't save disk space at all.
 650          */
 651         if (total_compressed <= blocksize &&
 652            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
 653                 goto cleanup_and_bail_uncompressed;
 654
 655         /*
 656          * For subpage case, we require full page alignment for the sector
 657          * aligned range.
 658          * Thus we must also check against @actual_end, not just @end.
 659          */
 660         if (blocksize < PAGE_SIZE) {
 661                 if (!IS_ALIGNED(start, PAGE_SIZE) ||
 662                     !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE))
 663                         goto cleanup_and_bail_uncompressed;
 664         }
 665
 666         total_compressed = min_t(unsigned long, total_compressed,
 667                         BTRFS_MAX_UNCOMPRESSED);
 668         total_in = 0;
 669         ret = 0;
 670
 671         /*
 672          * we do compression for mount -o compress and when the
 673          * inode has not been flagged as nocompress.  This flag can
 674          * change at any time if we discover bad compression ratios.
 675          */
 676         if (inode_need_compress(BTRFS_I(inode), start, end)) {
 677                 WARN_ON(pages);
 678                 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
 679                 if (!pages) {
 680                         /* just bail out to the uncompressed code */
 681                         nr_pages = 0;
 682                         goto cont;
 683                 }
 684
 685                 if (BTRFS_I(inode)->defrag_compress)
 686                         compress_type = BTRFS_I(inode)->defrag_compress;
 687                 else if (BTRFS_I(inode)->prop_compress)
 688                         compress_type = BTRFS_I(inode)->prop_compress;
 689
 690                 /*
 691                  * we need to call clear_page_dirty_for_io on each
 692                  * page in the range.  Otherwise applications with the file
 693                  * mmap'd can wander in and change the page contents while
 694                  * we are compressing them.
 695                  *
 696                  * If the compression fails for any reason, we set the pages
 697                  * dirty again later on.
 698                  *
 699                  * Note that the remaining part is redirtied, the start pointer
 700                  * has moved, the end is the original one.
 701                  */
 702                 if (!redirty) {
 703                         extent_range_clear_dirty_for_io(inode, start, end);
 704                         redirty = 1;
 705                 }
 706
 707                 /* Compression level is applied here and only here */
 708                 ret = btrfs_compress_pages(
 709                         compress_type | (fs_info->compress_level << 4),
 710                                            inode->i_mapping, start,
 711                                            pages,
 712                                            &nr_pages,
 713                                            &total_in,
 714                                            &total_compressed);
 715
 716                 if (!ret) {
 717                         unsigned long offset = offset_in_page(total_compressed);
 718                         struct page *page = pages[nr_pages - 1];
 719
 720                         /* zero the tail end of the last page, we might be
 721                          * sending it down to disk
 722                          */
 723                         if (offset)
 724                                 memzero_page(page, offset, PAGE_SIZE - offset);
 725                         will_compress = 1;
 726                 }
 727         }
 728 cont:
 729         /*
 730          * Check cow_file_range() for why we don't even try to create inline
 731          * extent for subpage case.
 732          */
 733         if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
 734                 /* lets try to make an inline extent */
 735                 if (ret || total_in < actual_end) {
 736                         /* we didn't compress the entire range, try
 737                          * to make an uncompressed inline extent.
 738                          */
 739                         ret = cow_file_range_inline(BTRFS_I(inode), start, end,
 740                                                     0, BTRFS_COMPRESS_NONE,
 741                                                     NULL);
 742                 } else {
 743                         /* try making a compressed inline extent */
 744                         ret = cow_file_range_inline(BTRFS_I(inode), start, end,
 745                                                     total_compressed,
 746                                                     compress_type, pages);
 747                 }
 748                 if (ret <= 0) {
 749                         unsigned long clear_flags = EXTENT_DELALLOC |
 750                                 EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
 751                                 EXTENT_DO_ACCOUNTING;
 752                         unsigned long page_error_op;
 753
 754                         page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
 755
 756                         /*
 757                          * inline extent creation worked or returned error,
 758                          * we don't need to create any more async work items.
 759                          * Unlock and free up our temp pages.
 760                          *
 761                          * We use DO_ACCOUNTING here because we need the
 762                          * delalloc_release_metadata to be done _after_ we drop
 763                          * our outstanding extent for clearing delalloc for this
 764                          * range.
 765                          */
 766                         extent_clear_unlock_delalloc(BTRFS_I(inode), start, end,
 767                                                      NULL,
 768                                                      clear_flags,
 769                                                      PAGE_UNLOCK |
 770                                                      PAGE_START_WRITEBACK |
 771                                                      page_error_op |
 772                                                      PAGE_END_WRITEBACK);
 773
 774                         /*
 775                          * Ensure we only free the compressed pages if we have
 776                          * them allocated, as we can still reach here with
 777                          * inode_need_compress() == false.
 778                          */
 779                         if (pages) {
 780                                 for (i = 0; i < nr_pages; i++) {
 781                                         WARN_ON(pages[i]->mapping);
 782                                         put_page(pages[i]);
 783                                 }
 784                                 kfree(pages);
 785                         }
 786                         return 0;
 787                 }
 788         }
 789
 790         if (will_compress) {
 791                 /*
 792                  * we aren't doing an inline extent round the compressed size
 793                  * up to a block size boundary so the allocator does sane
 794                  * things
 795                  */
 796                 total_compressed = ALIGN(total_compressed, blocksize);
 797
 798                 /*
 799                  * one last check to make sure the compression is really a
 800                  * win, compare the page count read with the blocks on disk,
 801                  * compression must free at least one sector size
 802                  */
 803                 total_in = round_up(total_in, fs_info->sectorsize);
 804                 if (total_compressed + blocksize <= total_in) {
 805                         compressed_extents++;
 806
 807                         /*
 808                          * The async work queues will take care of doing actual
 809                          * allocation on disk for these compressed pages, and
 810                          * will submit them to the elevator.
 811                          */
 812                         add_async_extent(async_chunk, start, total_in,
 813                                         total_compressed, pages, nr_pages,
 814                                         compress_type);
 815
 816                         if (start + total_in < end) {
 817                                 start += total_in;
 818                                 pages = NULL;
 819                                 cond_resched();
 820                                 goto again;
 821                         }
 822                         return compressed_extents;
 823                 }
 824         }
 825         if (pages) {
 826                 /*
 827                  * the compression code ran but failed to make things smaller,
 828                  * free any pages it allocated and our page pointer array
 829                  */
 830                 for (i = 0; i < nr_pages; i++) {
 831                         WARN_ON(pages[i]->mapping);
 832                         put_page(pages[i]);
 833                 }
 834                 kfree(pages);
 835                 pages = NULL;
 836                 total_compressed = 0;
 837                 nr_pages = 0;
 838
 839                 /* flag the file so we don't compress in the future */
 840                 if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
 841                     !(BTRFS_I(inode)->prop_compress)) {
 842                         BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
 843                 }
 844         }
 845 cleanup_and_bail_uncompressed:
 846         /*
 847          * No compression, but we still need to write the pages in the file
 848          * we've been given so far.  redirty the locked page if it corresponds
 849          * to our extent and set things up for the async work queue to run
 850          * cow_file_range to do the normal delalloc dance.
 851          */
 852         if (async_chunk->locked_page &&
 853             (page_offset(async_chunk->locked_page) >= start &&
 854              page_offset(async_chunk->locked_page)) <= end) {
 855                 __set_page_dirty_nobuffers(async_chunk->locked_page);
 856                 /* unlocked later on in the async handlers */
 857         }
 858
 859         if (redirty)
 860                 extent_range_redirty_for_io(inode, start, end);
 861         add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
 862                          BTRFS_COMPRESS_NONE);
 863         compressed_extents++;
 864
 865         return compressed_extents;
 866 }
 867
 868 static void free_async_extent_pages(struct async_extent *async_extent)
 869 {
 870         int i;
 871
 872         if (!async_extent->pages)
 873                 return;
 874
 875         for (i = 0; i < async_extent->nr_pages; i++) {
 876                 WARN_ON(async_extent->pages[i]->mapping);
 877                 put_page(async_extent->pages[i]);
 878         }
 879         kfree(async_extent->pages);
 880         async_extent->nr_pages = 0;
 881         async_extent->pages = NULL;
 882 }
 883
 884 static int submit_uncompressed_range(struct btrfs_inode *inode,
 885                                      struct async_extent *async_extent,
 886                                      struct page *locked_page)
 887 {
 888         u64 start = async_extent->start;
 889         u64 end = async_extent->start + async_extent->ram_size - 1;
 890         unsigned long nr_written = 0;
 891         int page_started = 0;
 892         int ret;
 893
 894         /*
 895          * Call cow_file_range() to run the delalloc range directly, since we
 896          * won't go to NOCOW or async path again.
 897          *
 898          * Also we call cow_file_range() with @unlock_page == 0, so that we
 899          * can directly submit them without interruption.
 900          */
 901         ret = cow_file_range(inode, locked_page, start, end, &page_started,
 902                              &nr_written, 0);
 903         /* Inline extent inserted, page gets unlocked and everything is done */
 904         if (page_started) {
 905                 ret = 0;
 906                 goto out;
 907         }
 908         if (ret < 0) {
 909                 if (locked_page)
 910                         unlock_page(locked_page);
 911                 goto out;
 912         }
 913
 914         ret = extent_write_locked_range(&inode->vfs_inode, start, end);
 915         /* All pages will be unlocked, including @locked_page */
 916 out:
 917         kfree(async_extent);
 918         return ret;
 919 }
 920
 921 static int submit_one_async_extent(struct btrfs_inode *inode,
 922                                    struct async_chunk *async_chunk,
 923                                    struct async_extent *async_extent,
 924                                    u64 *alloc_hint)
 925 {
 926         struct extent_io_tree *io_tree = &inode->io_tree;
 927         struct btrfs_root *root = inode->root;
 928         struct btrfs_fs_info *fs_info = root->fs_info;
 929         struct btrfs_key ins;
 930         struct page *locked_page = NULL;
 931         struct extent_map *em;
 932         int ret = 0;
 933         u64 start = async_extent->start;
 934         u64 end = async_extent->start + async_extent->ram_size - 1;
 935
 936         /*
 937          * If async_chunk->locked_page is in the async_extent range, we need to
 938          * handle it.
 939          */
 940         if (async_chunk->locked_page) {
 941                 u64 locked_page_start = page_offset(async_chunk->locked_page);
 942                 u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
 943
 944                 if (!(start >= locked_page_end || end <= locked_page_start))
 945                         locked_page = async_chunk->locked_page;
 946         }
 947         lock_extent(io_tree, start, end);
 948
 949         /* We have fall back to uncompressed write */
 950         if (!async_extent->pages)
 951                 return submit_uncompressed_range(inode, async_extent, locked_page);
 952
 953         ret = btrfs_reserve_extent(root, async_extent->ram_size,
 954                                    async_extent->compressed_size,
 955                                    async_extent->compressed_size,
 956                                    0, *alloc_hint, &ins, 1, 1);
 957         if (ret) {
 958                 free_async_extent_pages(async_extent);
 959                 /*
 960                  * Here we used to try again by going back to non-compressed
 961                  * path for ENOSPC.  But we can't reserve space even for
 962                  * compressed size, how could it work for uncompressed size
 963                  * which requires larger size?  So here we directly go error
 964                  * path.
 965                  */
 966                 goto out_free;
 967         }
 968
 969         /* Here we're doing allocation and writeback of the compressed pages */
 970         em = create_io_em(inode, start,
 971                           async_extent->ram_size,       /* len */
 972                           start,                        /* orig_start */
 973                           ins.objectid,                 /* block_start */
 974                           ins.offset,                   /* block_len */
 975                           ins.offset,                   /* orig_block_len */
 976                           async_extent->ram_size,       /* ram_bytes */
 977                           async_extent->compress_type,
 978                           BTRFS_ORDERED_COMPRESSED);
 979         if (IS_ERR(em)) {
 980                 ret = PTR_ERR(em);
 981                 goto out_free_reserve;
 982         }
 983         free_extent_map(em);
 984
 985         ret = btrfs_add_ordered_extent_compress(inode, start,   /* file_offset */
 986                                         ins.objectid,           /* disk_bytenr */
 987                                         async_extent->ram_size, /* num_bytes */
 988                                         ins.offset,             /* disk_num_bytes */
 989                                         async_extent->compress_type);
 990         if (ret) {
 991                 btrfs_drop_extent_cache(inode, start, end, 0);
 992                 goto out_free_reserve;
 993         }
 994         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
 995
 996         /* Clear dirty, set writeback and unlock the pages. */
 997         extent_clear_unlock_delalloc(inode, start, end,
 998                         NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
 999                         PAGE_UNLOCK | PAGE_START_WRITEBACK);
1000         if (btrfs_submit_compressed_write(inode, start, /* file_offset */
1001                             async_extent->ram_size,     /* num_bytes */
1002                             ins.objectid,               /* disk_bytenr */
1003                             ins.offset,                 /* compressed_len */
1004                             async_extent->pages,        /* compressed_pages */
1005                             async_extent->nr_pages,
1006                             async_chunk->write_flags,
1007                             async_chunk->blkcg_css)) {
1008                 const u64 start = async_extent->start;
1009                 const u64 end = start + async_extent->ram_size - 1;
1010
1011                 btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0);
1012
1013                 extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
1014                                              PAGE_END_WRITEBACK | PAGE_SET_ERROR);
1015                 free_async_extent_pages(async_extent);
1016         }
1017         *alloc_hint = ins.objectid + ins.offset;
1018         kfree(async_extent);
1019         return ret;
1020
1021 out_free_reserve:
1022         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1023         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1024 out_free:
1025         extent_clear_unlock_delalloc(inode, start, end,
1026                                      NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
1027                                      EXTENT_DELALLOC_NEW |
1028                                      EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
1029                                      PAGE_UNLOCK | PAGE_START_WRITEBACK |
1030                                      PAGE_END_WRITEBACK | PAGE_SET_ERROR);
1031         free_async_extent_pages(async_extent);
1032         kfree(async_extent);
1033         return ret;
1034 }
1035
1036 /*
1037  * Phase two of compressed writeback.  This is the ordered portion of the code,
1038  * which only gets called in the order the work was queued.  We walk all the
1039  * async extents created by compress_file_range and send them down to the disk.
1040  */
1041 static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
1042 {
1043         struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
1044         struct btrfs_fs_info *fs_info = inode->root->fs_info;
1045         struct async_extent *async_extent;
1046         u64 alloc_hint = 0;
1047         int ret = 0;
1048
1049         while (!list_empty(&async_chunk->extents)) {
1050                 u64 extent_start;
1051                 u64 ram_size;
1052
1053                 async_extent = list_entry(async_chunk->extents.next,
1054                                           struct async_extent, list);
1055                 list_del(&async_extent->list);
1056                 extent_start = async_extent->start;
1057                 ram_size = async_extent->ram_size;
1058
1059                 ret = submit_one_async_extent(inode, async_chunk, async_extent,
1060                                               &alloc_hint);
1061                 btrfs_debug(fs_info,
1062 "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
1063                             inode->root->root_key.objectid,
1064                             btrfs_ino(inode), extent_start, ram_size, ret);
1065         }
1066 }
1067
1068 static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
1069                                       u64 num_bytes)
1070 {
1071         struct extent_map_tree *em_tree = &inode->extent_tree;
1072         struct extent_map *em;
1073         u64 alloc_hint = 0;
1074
1075         read_lock(&em_tree->lock);
1076         em = search_extent_mapping(em_tree, start, num_bytes);
1077         if (em) {
1078                 /*
1079                  * if block start isn't an actual block number then find the
1080                  * first block in this inode and use that as a hint.  If that
1081                  * block is also bogus then just don't worry about it.
1082                  */
1083                 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1084                         free_extent_map(em);
1085                         em = search_extent_mapping(em_tree, 0, 0);
1086                         if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
1087                                 alloc_hint = em->block_start;
1088                         if (em)
1089                                 free_extent_map(em);
1090                 } else {
1091                         alloc_hint = em->block_start;
1092                         free_extent_map(em);
1093                 }
1094         }
1095         read_unlock(&em_tree->lock);
1096
1097         return alloc_hint;
1098 }
1099
/*
 * Run copy-on-write allocation for the delalloc range [@start, @end].
 *
 * When extent_io.c finds a delayed allocation range in the file, the
 * callbacks end up in this code.  The basic idea is to allocate extents on
 * disk for the range, and create ordered data structs in RAM to track those
 * extents.
 *
 * @inode:        inode the range belongs to.  A free space inode is rejected
 *                with -EINVAL (and a one-time warning).
 * @locked_page:  the page that writepage had locked already.  We use it to
 *                make sure we don't do extra locks or unlocks.
 * @start:        first byte of the range.
 * @end:          last byte of the range (inclusive).
 * @page_started: set to one if we unlock locked_page and do everything
 *                required to start IO on it.  It may be clean and already
 *                done with IO when we return.  Within this function that is
 *                only done after an inline extent was created.
 * @nr_written:   increased by the number of pages of the range after an inline
 *                extent was created; untouched otherwise.
 * @unlock:       when non-zero, PAGE_UNLOCK is added to the page operations
 *                applied to each successfully allocated part of the range.
 *
 * Returns 0 on success.  On failure the error of the failing step is returned
 * after the part of the range that was not turned into an ordered extent has
 * been cleaned up at the out_unlock label.
 */
static noinline int cow_file_range(struct btrfs_inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, int *page_started,
                                   unsigned long *nr_written, int unlock)
{
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 alloc_hint = 0;
        u64 num_bytes;
        unsigned long ram_size;
        u64 cur_alloc_size = 0;
        u64 min_alloc_size;
        u64 blocksize = fs_info->sectorsize;
        struct btrfs_key ins;
        struct extent_map *em;
        unsigned clear_bits;
        unsigned long page_ops;
        bool extent_reserved = false;
        int ret = 0;

        if (btrfs_is_free_space_inode(inode)) {
                WARN_ON_ONCE(1);
                ret = -EINVAL;
                goto out_unlock;
        }

        /*
         * Length of the range rounded up to whole blocks, and never less
         * than one block.
         */
        num_bytes = ALIGN(end - start + 1, blocksize);
        num_bytes = max(blocksize,  num_bytes);
        ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));

        inode_should_defrag(inode, start, end, num_bytes, SZ_64K);

        /*
         * Due to the page size limit, for subpage we can only trigger the
         * writeback for the dirty sectors of page, that means data writeback
         * is doing more writeback than what we want.
         *
         * This is especially unexpected for some call sites like fallocate,
         * where we only increase i_size after everything is done.
         * This means we can trigger inline extent even if we didn't want to.
         * So here we skip inline extent creation completely.
         */
        if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
                /* Let's try to make an inline extent. */
                ret = cow_file_range_inline(inode, start, end, 0,
                                            BTRFS_COMPRESS_NONE, NULL);
                if (ret == 0) {
                        /*
                         * We use DO_ACCOUNTING here because we need the
                         * delalloc_release_metadata to be run _after_ we drop
                         * our outstanding extent for clearing delalloc for this
                         * range.
                         */
                        extent_clear_unlock_delalloc(inode, start, end,
                                     locked_page,
                                     EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
                                     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
                                     PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
                        /* Account every page of the range as written. */
                        *nr_written = *nr_written +
                             (end - start + PAGE_SIZE) / PAGE_SIZE;
                        *page_started = 1;
                        /*
                         * locked_page is locked by the caller of
                         * writepage_delalloc(), not locked by
                         * __process_pages_contig().
                         *
                         * We can't let __process_pages_contig() to unlock it,
                         * as it doesn't have any subpage::writers recorded.
                         *
                         * Here we manually unlock the page, since the caller
                         * can't use page_started to determine if it's an
                         * inline extent or a compressed extent.
                         */
                        unlock_page(locked_page);
                        goto out;
                } else if (ret < 0) {
                        goto out_unlock;
                }
                /*
                 * A positive return value falls through to the regular
                 * allocation below (presumably it means that no inline extent
                 * could be created for this range).
                 */
        }

        alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
        btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);

        /*
         * Relocation relies on the relocated extents to have exactly the same
         * size as the original extents. Normally writeback for relocation data
         * extents follows a NOCOW path because relocation preallocates the
         * extents. However, due to an operation such as scrub turning a block
         * group to RO mode, it may fallback to COW mode, so we must make sure
         * an extent allocated during COW has exactly the requested size and can
         * not be split into smaller extents, otherwise relocation breaks and
         * fails during the stage where it updates the bytenr of file extent
         * items.
         */
        if (btrfs_is_data_reloc_root(root))
                min_alloc_size = num_bytes;
        else
                min_alloc_size = fs_info->sectorsize;

        /*
         * Allocate the range piece by piece.  Each round asks for everything
         * that is still left; the size actually reserved is taken from
         * ins.offset and the loop advances by that amount.
         */
        while (num_bytes > 0) {
                cur_alloc_size = num_bytes;
                ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
                                           min_alloc_size, 0, alloc_hint,
                                           &ins, 1, 1);
                if (ret < 0)
                        goto out_unlock;
                cur_alloc_size = ins.offset;
                /*
                 * From here until the ordered extent exists, the error path
                 * has to treat [start, start + cur_alloc_size) specially.
                 */
                extent_reserved = true;

                ram_size = ins.offset;
                em = create_io_em(inode, start, ins.offset, /* len */
                                  start, /* orig_start */
                                  ins.objectid, /* block_start */
                                  ins.offset, /* block_len */
                                  ins.offset, /* orig_block_len */
                                  ram_size, /* ram_bytes */
                                  BTRFS_COMPRESS_NONE, /* compress_type */
                                  BTRFS_ORDERED_REGULAR /* type */);
                if (IS_ERR(em)) {
                        ret = PTR_ERR(em);
                        goto out_reserve;
                }
                free_extent_map(em);

                ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
                                               ram_size, cur_alloc_size,
                                               BTRFS_ORDERED_REGULAR);
                if (ret)
                        goto out_drop_extent_cache;

                if (btrfs_is_data_reloc_root(root)) {
                        ret = btrfs_reloc_clone_csums(inode, start,
                                                      cur_alloc_size);
                        /*
                         * Only drop cache here, and process as normal.
                         *
                         * We must not allow extent_clear_unlock_delalloc()
                         * at out_unlock label to free meta of this ordered
                         * extent, as its meta should be freed by
                         * btrfs_finish_ordered_io().
                         *
                         * So we must continue until @start is increased to
                         * skip current ordered extent.
                         */
                        if (ret)
                                btrfs_drop_extent_cache(inode, start,
                                                start + ram_size - 1, 0);
                }

                btrfs_dec_block_group_reservations(fs_info, ins.objectid);

                /*
                 * We're not doing compressed IO, don't unlock the first page
                 * (which the caller expects to stay locked), don't clear any
                 * dirty bits and don't set any writeback bits
                 *
                 * Do set the Ordered (Private2) bit so we know this page was
                 * properly setup for writepage.
                 */
                page_ops = unlock ? PAGE_UNLOCK : 0;
                page_ops |= PAGE_SET_ORDERED;

                extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
                                             locked_page,
                                             EXTENT_LOCKED | EXTENT_DELALLOC,
                                             page_ops);
                /* num_bytes is unsigned: clamp at zero instead of wrapping. */
                if (num_bytes < cur_alloc_size)
                        num_bytes = 0;
                else
                        num_bytes -= cur_alloc_size;
                /* Next round, hint at the space right behind this extent. */
                alloc_hint = ins.objectid + ins.offset;
                start += cur_alloc_size;
                extent_reserved = false;

                /*
                 * btrfs_reloc_clone_csums() error, since start is increased
                 * extent_clear_unlock_delalloc() at out_unlock label won't
                 * free metadata of current ordered extent, we're OK to exit.
                 */
                if (ret)
                        goto out_unlock;
        }
out:
        return ret;

        /*
         * Error unwinding.  The labels below fall through into each other, so
         * a later failure undoes more of the work of the current round.
         */
out_drop_extent_cache:
        btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
out_reserve:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
        clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
                EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
        page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
        /*
         * If we reserved an extent for our delalloc range (or a subrange) and
         * failed to create the respective ordered extent, then it means that
         * when we reserved the extent we decremented the extent's size from
         * the data space_info's bytes_may_use counter and incremented the
         * space_info's bytes_reserved counter by the same amount. We must make
         * sure extent_clear_unlock_delalloc() does not try to decrement again
         * the data space_info's bytes_may_use counter, therefore we do not pass
         * it the flag EXTENT_CLEAR_DATA_RESV.
         */
        if (extent_reserved) {
                extent_clear_unlock_delalloc(inode, start,
                                             start + cur_alloc_size - 1,
                                             locked_page,
                                             clear_bits,
                                             page_ops);
                start += cur_alloc_size;
                /* Nothing is left behind the part handled just above. */
                if (start >= end)
                        goto out;
        }
        /*
         * Clean up what remains of the range, this time also releasing its
         * data reservation.
         */
        extent_clear_unlock_delalloc(inode, start, end, locked_page,
                                     clear_bits | EXTENT_CLEAR_DATA_RESV,
                                     page_ops);
        goto out;
}
1333
1334 /*
1335  * work queue call back to started compression on a file and pages
1336  */
1337 static noinline void async_cow_start(struct btrfs_work *work)
1338 {
1339         struct async_chunk *async_chunk;
1340         int compressed_extents;
1341
1342         async_chunk = container_of(work, struct async_chunk, work);
1343
1344         compressed_extents = compress_file_range(async_chunk);
1345         if (compressed_extents == 0) {
1346                 btrfs_add_delayed_iput(async_chunk->inode);
1347                 async_chunk->inode = NULL;
1348         }
1349 }
1350
1351 /*
1352  * work queue call back to submit previously compressed pages
1353  */
1354 static noinline void async_cow_submit(struct btrfs_work *work)
1355 {
1356         struct async_chunk *async_chunk = container_of(work, struct async_chunk,
1357                                                      work);
1358         struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
1359         unsigned long nr_pages;
1360
1361         nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
1362                 PAGE_SHIFT;
1363
1364         /*
1365          * ->inode could be NULL if async_chunk_start has failed to compress,
1366          * in which case we don't have anything to submit, yet we need to
1367          * always adjust ->async_delalloc_pages as its paired with the init
1368          * happening in cow_file_range_async
1369          */
1370         if (async_chunk->inode)
1371                 submit_compressed_extents(async_chunk);
1372
1373         /* atomic_sub_return implies a barrier */
1374         if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
1375             5 * SZ_1M)
1376                 cond_wake_up_nomb(&fs_info->async_submit_wait);
1377 }
1378
1379 static noinline void async_cow_free(struct btrfs_work *work)
1380 {
1381         struct async_chunk *async_chunk;
1382         struct async_cow *async_cow;
1383
1384         async_chunk = container_of(work, struct async_chunk, work);
1385         if (async_chunk->inode)
1386                 btrfs_add_delayed_iput(async_chunk->inode);
1387         if (async_chunk->blkcg_css)
1388                 css_put(async_chunk->blkcg_css);
1389
1390         async_cow = async_chunk->async_cow;
1391         if (atomic_dec_and_test(&async_cow->num_chunks))
1392                 kvfree(async_cow);
1393 }
1394
1395 static int cow_file_range_async(struct btrfs_inode *inode,
1396                                 struct writeback_control *wbc,
1397                                 struct page *locked_page,
1398                                 u64 start, u64 end, int *page_started,
1399                                 unsigned long *nr_written)
1400 {
1401         struct btrfs_fs_info *fs_info = inode->root->fs_info;
1402         struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
1403         struct async_cow *ctx;
1404         struct async_chunk *async_chunk;
1405         unsigned long nr_pages;
1406         u64 cur_end;
1407         u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
1408         int i;
1409         bool should_compress;
1410         unsigned nofs_flag;
1411         const unsigned int write_flags = wbc_to_write_flags(wbc);
1412
1413         unlock_extent(&inode->io_tree, start, end);
1414
1415         if (inode->flags & BTRFS_INODE_NOCOMPRESS &&
1416             !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
1417                 num_chunks = 1;
1418                 should_compress = false;
1419         } else {
1420                 should_compress = true;
1421         }
1422
1423         nofs_flag = memalloc_nofs_save();
1424         ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
1425         memalloc_nofs_restore(nofs_flag);
1426
1427         if (!ctx) {
1428                 unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
1429                         EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
1430                         EXTENT_DO_ACCOUNTING;
1431                 unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK |
1432                                          PAGE_END_WRITEBACK | PAGE_SET_ERROR;
1433
1434                 extent_clear_unlock_delalloc(inode, start, end, locked_page,
1435                                              clear_bits, page_ops);
1436                 return -ENOMEM;
1437         }
1438
1439         async_chunk = ctx->chunks;
1440         atomic_set(&ctx->num_chunks, num_chunks);
1441
1442         for (i = 0; i < num_chunks; i++) {
1443                 if (should_compress)
1444                         cur_end = min(end, start + SZ_512K - 1);
1445                 else
1446                         cur_end = end;
1447
1448                 /*
1449                  * igrab is called higher up in the call chain, take only the
1450                  * lightweight reference for the callback lifetime
1451                  */
1452                 ihold(&inode->vfs_inode);
1453                 async_chunk[i].async_cow = ctx;
1454                 async_chunk[i].inode = &inode->vfs_inode;
1455                 async_chunk[i].start = start;
1456                 async_chunk[i].end = cur_end;
1457                 async_chunk[i].write_flags = write_flags;
1458                 INIT_LIST_HEAD(&async_chunk[i].extents);
1459
1460                 /*
1461                  * The locked_page comes all the way from writepage and its
1462                  * the original page we were actually given.  As we spread
1463                  * this large delalloc region across multiple async_chunk
1464                  * structs, only the first struct needs a pointer to locked_page
1465                  *
1466                  * This way we don't need racey decisions about who is supposed
1467                  * to unlock it.
1468                  */
1469                 if (locked_page) {
1470                         /*
1471                          * Depending on the compressibility, the pages might or
1472                          * might not go through async.  We want all of them to
1473                          * be accounted against wbc once.  Let's do it here
1474                          * before the paths diverge.  wbc accounting is used
1475                          * only for foreign writeback detection and doesn't
1476                          * need full accuracy.  Just account the whole thing
1477                          * against the first page.
1478                          */
1479                         wbc_account_cgroup_owner(wbc, locked_page,
1480                                                  cur_end - start);
1481                         async_chunk[i].locked_page = locked_page;
1482                         locked_page = NULL;
1483                 } else {
1484                         async_chunk[i].locked_page = NULL;
1485                 }
1486
1487                 if (blkcg_css != blkcg_root_css) {
1488                         css_get(blkcg_css);
1489                         async_chunk[i].blkcg_css = blkcg_css;
1490                 } else {
1491                         async_chunk[i].blkcg_css = NULL;
1492                 }
1493
1494                 btrfs_init_work(&async_chunk[i].work, async_cow_start,
1495                                 async_cow_submit, async_cow_free);
1496
1497                 nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
1498                 atomic_add(nr_pages, &fs_info->async_delalloc_pages);
1499
1500                 btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
1501
1502                 *nr_written += nr_pages;
1503                 start = cur_end + 1;
1504         }
1505         *page_started = 1;
1506         return 0;
1507 }
1508
1509 static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
1510                                        struct page *locked_page, u64 start,
1511                                        u64 end, int *page_started,
1512                                        unsigned long *nr_written)
1513 {
1514         int ret;
1515
1516         ret = cow_file_range(inode, locked_page, start, end, page_started,
1517                              nr_written, 0);
1518         if (ret)
1519                 return ret;
1520
1521         if (*page_started)
1522                 return 0;
1523
1524         __set_page_dirty_nobuffers(locked_page);
1525         account_page_redirty(locked_page);
1526         extent_write_locked_range(&inode->vfs_inode, start, end);
1527         *page_started = 1;
1528
1529         return 0;
1530 }
1531
1532 static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
1533                                         u64 bytenr, u64 num_bytes)
1534 {
1535         int ret;
1536         struct btrfs_ordered_sum *sums;
1537         LIST_HEAD(list);
1538
1539         ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
1540                                        bytenr + num_bytes - 1, &list, 0);
1541         if (ret == 0 && list_empty(&list))
1542                 return 0;
1543
1544         while (!list_empty(&list)) {
1545                 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1546                 list_del(&sums->list);
1547                 kfree(sums);
1548         }
1549         if (ret < 0)
1550                 return ret;
1551         return 1;
1552 }
1553
1554 static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
1555                            const u64 start, const u64 end,
1556                            int *page_started, unsigned long *nr_written)
1557 {
1558         const bool is_space_ino = btrfs_is_free_space_inode(inode);
1559         const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
1560         const u64 range_bytes = end + 1 - start;
1561         struct extent_io_tree *io_tree = &inode->io_tree;
1562         u64 range_start = start;
1563         u64 count;
1564
1565         /*
1566          * If EXTENT_NORESERVE is set it means that when the buffered write was
1567          * made we had not enough available data space and therefore we did not
1568          * reserve data space for it, since we though we could do NOCOW for the
1569          * respective file range (either there is prealloc extent or the inode
1570          * has the NOCOW bit set).
1571          *
1572          * However when we need to fallback to COW mode (because for example the
1573          * block group for the corresponding extent was turned to RO mode by a
1574          * scrub or relocation) we need to do the following:
1575          *
1576          * 1) We increment the bytes_may_use counter of the data space info.
1577          *    If COW succeeds, it allocates a new data extent and after doing
1578          *    that it decrements the space info's bytes_may_use counter and
1579          *    increments its bytes_reserved counter by the same amount (we do
1580          *    this at btrfs_add_reserved_bytes()). So we need to increment the
1581          *    bytes_may_use counter to compensate (when space is reserved at
1582          *    buffered write time, the bytes_may_use counter is incremented);
1583          *
1584          * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
1585          *    that if the COW path fails for any reason, it decrements (through
1586          *    extent_clear_unlock_delalloc()) the bytes_may_use counter of the
1587          *    data space info, which we incremented in the step above.
1588          *
1589          * If we need to fallback to cow and the inode corresponds to a free
1590          * space cache inode or an inode of the data relocation tree, we must
1591          * also increment bytes_may_use of the data space_info for the same
1592          * reason. Space caches and relocated data extents always get a prealloc
1593          * extent for them, however scrub or balance may have set the block
1594          * group that contains that extent to RO mode and therefore force COW
1595          * when starting writeback.
1596          */
1597         count = count_range_bits(io_tree, &range_start, end, range_bytes,
1598                                  EXTENT_NORESERVE, 0);
1599         if (count > 0 || is_space_ino || is_reloc_ino) {
1600                 u64 bytes = count;
1601                 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1602                 struct btrfs_space_info *sinfo = fs_info->data_sinfo;
1603
1604                 if (is_space_ino || is_reloc_ino)
1605                         bytes = range_bytes;
1606
1607                 spin_lock(&sinfo->lock);
1608                 btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
1609                 spin_unlock(&sinfo->lock);
1610
1611                 if (count > 0)
1612                         clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
1613                                          0, 0, NULL);
1614         }
1615
1616         return cow_file_range(inode, locked_page, start, end, page_started,
1617                               nr_written, 1);
1618 }
1619
1620 /*
1621  * NOCOW writeback callback.  This checks for snapshots or COW copies
1622  * of the extents that exist in the file, and COWs the file as required.
1623  *
1624  * If no cow copies or snapshots exist, we write directly to the existing
1625  * blocks on disk
      *
      * The file extent items overlapping [@start, @end] are walked one by one.
      * An extent that passes every check gets an ordered extent on top of its
      * existing disk blocks (BTRFS_ORDERED_PREALLOC for prealloc extents,
      * BTRFS_ORDERED_NOCOW otherwise) and its delalloc range is cleared.
      * Consecutive sub-ranges that fail a check are accumulated, starting at
      * cow_start, and handed to fallback_to_cow() as soon as a NOCOW-able
      * extent follows them or the walk ends.
      *
      * Returns 0 on success, -ENOMEM if no path could be allocated, or the
      * error of the helper that failed.  On error, whatever is left between
      * cur_offset and @end is released via extent_clear_unlock_delalloc().
1626  */
1627 static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
1628                                        struct page *locked_page,
1629                                        const u64 start, const u64 end,
1630                                        int *page_started,
1631                                        unsigned long *nr_written)
1632 {
1633         struct btrfs_fs_info *fs_info = inode->root->fs_info;
1634         struct btrfs_root *root = inode->root;
1635         struct btrfs_path *path;
1636         u64 cow_start = (u64)-1;
1637         u64 cur_offset = start;
1638         int ret;
1639         bool check_prev = true;
1640         const bool freespace_inode = btrfs_is_free_space_inode(inode);
1641         u64 ino = btrfs_ino(inode);
1642         bool nocow = false;
1643         u64 disk_bytenr = 0;
1644         const bool force = inode->flags & BTRFS_INODE_NODATACOW;
1645
             /*
              * Without a path no extent can be examined: release the whole
              * range through extent_clear_unlock_delalloc() and fail.
              */
1646         path = btrfs_alloc_path();
1647         if (!path) {
1648                 extent_clear_unlock_delalloc(inode, start, end, locked_page,
1649                                              EXTENT_LOCKED | EXTENT_DELALLOC |
1650                                              EXTENT_DO_ACCOUNTING |
1651                                              EXTENT_DEFRAG, PAGE_UNLOCK |
1652                                              PAGE_START_WRITEBACK |
1653                                              PAGE_END_WRITEBACK);
1654                 return -ENOMEM;
1655         }
1656
1657         while (1) {
1658                 struct btrfs_key found_key;
1659                 struct btrfs_file_extent_item *fi;
1660                 struct extent_buffer *leaf;
1661                 u64 extent_end;
1662                 u64 extent_offset;
1663                 u64 num_bytes = 0;
1664                 u64 disk_num_bytes;
1665                 u64 ram_bytes;
1666                 int extent_type;
1667
                     /*
                      * nocow only becomes true after btrfs_inc_nocow_writers()
                      * succeeds below, and is cleared again once the matching
                      * btrfs_dec_nocow_writers() has run.
                      */
1668                 nocow = false;
1669
1670                 ret = btrfs_lookup_file_extent(NULL, root, path, ino,
1671                                                cur_offset, 0);
1672                 if (ret < 0)
1673                         goto error;
1674
1675                 /*
1676                  * If there is no extent for our range when doing the initial
1677                  * search, then go back to the previous slot as it will be the
1678                  * one containing the search offset
1679                  */
1680                 if (ret > 0 && path->slots[0] > 0 && check_prev) {
1681                         leaf = path->nodes[0];
1682                         btrfs_item_key_to_cpu(leaf, &found_key,
1683                                               path->slots[0] - 1);
1684                         if (found_key.objectid == ino &&
1685                             found_key.type == BTRFS_EXTENT_DATA_KEY)
1686                                 path->slots[0]--;
1687                 }
1688                 check_prev = false;
1689 next_slot:
1690                 /* Go to next leaf if we have exhausted the current one */
1691                 leaf = path->nodes[0];
1692                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1693                         ret = btrfs_next_leaf(root, path);
1694                         if (ret < 0) {
1695                                 if (cow_start != (u64)-1)
1696                                         cur_offset = cow_start;
1697                                 goto error;
1698                         }
1699                         if (ret > 0)
1700                                 break;
1701                         leaf = path->nodes[0];
1702                 }
1703
1704                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1705
1706                 /* Didn't find anything for our INO */
1707                 if (found_key.objectid > ino)
1708                         break;
1709                 /*
1710                  * Keep searching until we find an EXTENT_ITEM or there are no
1711                  * more extents for this inode
1712                  */
1713                 if (WARN_ON_ONCE(found_key.objectid < ino) ||
1714                     found_key.type < BTRFS_EXTENT_DATA_KEY) {
1715                         path->slots[0]++;
1716                         goto next_slot;
1717                 }
1718
1719                 /* Found key is not EXTENT_DATA_KEY or starts after req range */
1720                 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
1721                     found_key.offset > end)
1722                         break;
1723
1724                 /*
1725                  * If the found extent starts after requested offset, then
1726                  * adjust extent_end to be right before this extent begins
1727                  */
1728                 if (found_key.offset > cur_offset) {
1729                         extent_end = found_key.offset;
1730                         extent_type = 0;
1731                         goto out_check;
1732                 }
1733
1734                 /*
1735                  * Found extent which begins before our range and potentially
1736                  * intersect it
1737                  */
1738                 fi = btrfs_item_ptr(leaf, path->slots[0],
1739                                     struct btrfs_file_extent_item);
1740                 extent_type = btrfs_file_extent_type(leaf, fi);
1741
1742                 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1743                 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1744                     extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1745                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1746                         extent_offset = btrfs_file_extent_offset(leaf, fi);
1747                         extent_end = found_key.offset +
1748                                 btrfs_file_extent_num_bytes(leaf, fi);
1749                         disk_num_bytes =
1750                                 btrfs_file_extent_disk_num_bytes(leaf, fi);
1751                         /*
1752                          * If the extent we got ends before our current offset,
1753                          * skip to the next extent.
1754                          */
1755                         if (extent_end <= cur_offset) {
1756                                 path->slots[0]++;
1757                                 goto next_slot;
1758                         }
1759                         /* Skip holes */
1760                         if (disk_bytenr == 0)
1761                                 goto out_check;
1762                         /* Skip compressed/encrypted/encoded extents */
1763                         if (btrfs_file_extent_compression(leaf, fi) ||
1764                             btrfs_file_extent_encryption(leaf, fi) ||
1765                             btrfs_file_extent_other_encoding(leaf, fi))
1766                                 goto out_check;
1767                         /*
1768                          * If extent is created before the last volume's snapshot
1769                          * this implies the extent is shared, hence we can't do
1770                          * nocow. This is the same check as in
1771                          * btrfs_cross_ref_exist but without calling
1772                          * btrfs_search_slot.
1773                          */
1774                         if (!freespace_inode &&
1775                             btrfs_file_extent_generation(leaf, fi) <=
1776                             btrfs_root_last_snapshot(&root->root_item))
1777                                 goto out_check;
                             /*
                              * A regular (non-prealloc) extent is only written
                              * in place when the inode has NODATACOW set.
                              */
1778                         if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1779                                 goto out_check;
1780
1781                         /*
1782                          * The following checks can be expensive, as they need to
1783                          * take other locks and do btree or rbtree searches, so
1784                          * release the path to avoid blocking other tasks for too
1785                          * long.
1786                          */
1787                         btrfs_release_path(path);
1788
1789                         ret = btrfs_cross_ref_exist(root, ino,
1790                                                     found_key.offset -
1791                                                     extent_offset, disk_bytenr, false);
1792                         if (ret) {
1793                                 /*
1794                                  * ret could be -EIO if the above fails to read
1795                                  * metadata.
1796                                  */
1797                                 if (ret < 0) {
1798                                         if (cow_start != (u64)-1)
1799                                                 cur_offset = cow_start;
1800                                         goto error;
1801                                 }
1802
1803                                 WARN_ON_ONCE(freespace_inode);
1804                                 goto out_check;
1805                         }
                             /*
                              * Turn the extent's start on disk into the disk
                              * address that corresponds to cur_offset, and clamp
                              * the length to the requested range.
                              */
1806                         disk_bytenr += extent_offset;
1807                         disk_bytenr += cur_offset - found_key.offset;
1808                         num_bytes = min(end + 1, extent_end) - cur_offset;
1809                         /*
1810                          * If there are pending snapshots for this root, we
1811                          * fall into common COW way
1812                          */
1813                         if (!freespace_inode && atomic_read(&root->snapshot_force_cow))
1814                                 goto out_check;
1815                         /*
1816                          * force cow if csum exists in the range.
1817                          * this ensure that csum for a given extent are
1818                          * either valid or do not exist.
1819                          */
1820                         ret = csum_exist_in_range(fs_info, disk_bytenr,
1821                                                   num_bytes);
1822                         if (ret) {
1823                                 /*
1824                                  * ret could be -EIO if the above fails to read
1825                                  * metadata.
1826                                  */
1827                                 if (ret < 0) {
1828                                         if (cow_start != (u64)-1)
1829                                                 cur_offset = cow_start;
1830                                         goto error;
1831                                 }
1832                                 WARN_ON_ONCE(freespace_inode);
1833                                 goto out_check;
1834                         }
1835                         /* If the extent's block group is RO, we must COW */
1836                         if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
1837                                 goto out_check;
1838                         nocow = true;
1839                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1840                         extent_end = found_key.offset + ram_bytes;
1841                         extent_end = ALIGN(extent_end, fs_info->sectorsize);
1842                         /* Skip extents outside of our requested range */
1843                         if (extent_end <= start) {
1844                                 path->slots[0]++;
1845                                 goto next_slot;
1846                         }
1847                 } else {
1848                         /* If this triggers then we have a memory corruption */
1849                         BUG();
1850                 }
1851 out_check:
1852                 /*
1853                  * If nocow is false then record the beginning of the range
1854                  * that needs to be COWed
1855                  */
1856                 if (!nocow) {
1857                         if (cow_start == (u64)-1)
1858                                 cow_start = cur_offset;
1859                         cur_offset = extent_end;
1860                         if (cur_offset > end)
1861                                 break;
                             /*
                              * The path holds no leaf when it was released
                              * before the cross-ref check above: restart the
                              * loop so the extent at the new cur_offset is
                              * looked up again instead of advancing the slot.
                              */
1862                         if (!path->nodes[0])
1863                                 continue;
1864                         path->slots[0]++;
1865                         goto next_slot;
1866                 }
1867
1868                 /*
1869                  * COW range from cow_start to found_key.offset - 1. As the key
1870                  * will contain the beginning of the first extent that can be
1871                  * NOCOW, following one which needs to be COW'ed
1872                  */
1873                 if (cow_start != (u64)-1) {
1874                         ret = fallback_to_cow(inode, locked_page,
1875                                               cow_start, found_key.offset - 1,
1876                                               page_started, nr_written);
1877                         if (ret)
1878                                 goto error;
1879                         cow_start = (u64)-1;
1880                 }
1881
1882                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1883                         u64 orig_start = found_key.offset - extent_offset;
1884                         struct extent_map *em;
1885
1886                         em = create_io_em(inode, cur_offset, num_bytes,
1887                                           orig_start,
1888                                           disk_bytenr, /* block_start */
1889                                           num_bytes, /* block_len */
1890                                           disk_num_bytes, /* orig_block_len */
1891                                           ram_bytes, BTRFS_COMPRESS_NONE,
1892                                           BTRFS_ORDERED_PREALLOC);
1893                         if (IS_ERR(em)) {
1894                                 ret = PTR_ERR(em);
1895                                 goto error;
1896                         }
1897                         free_extent_map(em);
1898                         ret = btrfs_add_ordered_extent(inode, cur_offset,
1899                                                        disk_bytenr, num_bytes,
1900                                                        num_bytes,
1901                                                        BTRFS_ORDERED_PREALLOC);
1902                         if (ret) {
1903                                 btrfs_drop_extent_cache(inode, cur_offset,
1904                                                         cur_offset + num_bytes - 1,
1905                                                         0);
1906                                 goto error;
1907                         }
1908                 } else {
1909                         ret = btrfs_add_ordered_extent(inode, cur_offset,
1910                                                        disk_bytenr, num_bytes,
1911                                                        num_bytes,
1912                                                        BTRFS_ORDERED_NOCOW);
1913                         if (ret)
1914                                 goto error;
1915                 }
1916
                     /*
                      * The ordered extent exists now, so drop the count taken
                      * by btrfs_inc_nocow_writers().  nocow is always true
                      * here: the !nocow case left through out_check above.
                      */
1917                 if (nocow)
1918                         btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1919                 nocow = false;
1920
1921                 if (btrfs_is_data_reloc_root(root))
1922                         /*
1923                          * Error handled later, as we must prevent
1924                          * extent_clear_unlock_delalloc() in error handler
1925                          * from freeing metadata of created ordered extent.
1926                          */
1927                         ret = btrfs_reloc_clone_csums(inode, cur_offset,
1928                                                       num_bytes);
1929
1930                 extent_clear_unlock_delalloc(inode, cur_offset,
1931                                              cur_offset + num_bytes - 1,
1932                                              locked_page, EXTENT_LOCKED |
1933                                              EXTENT_DELALLOC |
1934                                              EXTENT_CLEAR_DATA_RESV,
1935                                              PAGE_UNLOCK | PAGE_SET_ORDERED);
1936
1937                 cur_offset = extent_end;
1938
1939                 /*
1940                  * btrfs_reloc_clone_csums() error, now we're OK to call error
1941                  * handler, as metadata for created ordered extent will only
1942                  * be freed by btrfs_finish_ordered_io().
1943                  */
1944                 if (ret)
1945                         goto error;
1946                 if (cur_offset > end)
1947                         break;
1948         }
1949         btrfs_release_path(path);
1950
             /*
              * The walk stopped before reaching @end and no COW range is
              * pending: the remainder of the range has to be COWed.
              */
1951         if (cur_offset <= end && cow_start == (u64)-1)
1952                 cow_start = cur_offset;
1953
1954         if (cow_start != (u64)-1) {
                     /*
                      * With cur_offset == end the cleanup under the error label
                      * is skipped should fallback_to_cow() fail (it tests
                      * cur_offset < end).  NOTE(review): presumably because the
                      * COW path releases the range itself on failure - confirm.
                      */
1955                 cur_offset = end;
1956                 ret = fallback_to_cow(inode, locked_page, cow_start, end,
1957                                       page_started, nr_written);
1958                 if (ret)
1959                         goto error;
1960         }
1961
1962 error:
             /*
              * Still true only when we jumped here between
              * btrfs_inc_nocow_writers() and its matching decrement.
              */
1963         if (nocow)
1964                 btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1965
1966         if (ret && cur_offset < end)
1967                 extent_clear_unlock_delalloc(inode, cur_offset, end,
1968                                              locked_page, EXTENT_LOCKED |
1969                                              EXTENT_DELALLOC | EXTENT_DEFRAG |
1970                                              EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1971                                              PAGE_START_WRITEBACK |
1972                                              PAGE_END_WRITEBACK);
1973         btrfs_free_path(path);
1974         return ret;
1975 }
1976
1977 static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
1978 {
1979         if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
1980                 if (inode->defrag_bytes &&
1981                     test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
1982                                    0, NULL))
1983                         return false;
1984                 return true;
1985         }
1986         return false;
1987 }
1988
1989 /*
1990  * Function to process delayed allocation (create CoW) for ranges which are
1991  * being touched for the first time.
1992  */
1993 int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
1994                 u64 start, u64 end, int *page_started, unsigned long *nr_written,
1995                 struct writeback_control *wbc)
1996 {
1997         int ret;
1998         const bool zoned = btrfs_is_zoned(inode->root->fs_info);
1999
2000         /*
2001          * The range must cover part of the @locked_page, or the returned
2002          * @page_started can confuse the caller.
2003          */
2004         ASSERT(!(end <= page_offset(locked_page) ||
2005                  start >= page_offset(locked_page) + PAGE_SIZE));
2006
2007         if (should_nocow(inode, start, end)) {
2008                 /*
2009                  * Normally on a zoned device we're only doing COW writes, but
2010                  * in case of relocation on a zoned filesystem we have taken
2011                  * precaution, that we're only writing sequentially. It's safe
2012                  * to use run_delalloc_nocow() here, like for  regular
2013                  * preallocated inodes.
2014                  */
2015                 ASSERT(!zoned ||
2016                        (zoned && btrfs_is_data_reloc_root(inode->root)));
2017                 ret = run_delalloc_nocow(inode, locked_page, start, end,
2018                                          page_started, nr_written);
2019         } else if (!inode_can_compress(inode) ||
2020                    !inode_need_compress(inode, start, end)) {
2021                 if (zoned)
2022                         ret = run_delalloc_zoned(inode, locked_page, start, end,
2023                                                  page_started, nr_written);
2024                 else
2025                         ret = cow_file_range(inode, locked_page, start, end,
2026                                              page_started, nr_written, 1);
2027         } else {
2028                 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
2029                 ret = cow_file_range_async(inode, wbc, locked_page, start, end,
2030                                            page_started, nr_written);
2031         }
2032         ASSERT(ret <= 0);
2033         if (ret)
2034                 btrfs_cleanup_ordered_extents(inode, locked_page, start,
2035                                               end - start + 1);
2036         return ret;
2037 }
2038
2039 void btrfs_split_delalloc_extent(struct inode *inode,
2040                                  struct extent_state *orig, u64 split)
2041 {
2042         u64 size;
2043
2044         /* not delalloc, ignore it */
2045         if (!(orig->state & EXTENT_DELALLOC))
2046                 return;
2047
2048         size = orig->end - orig->start + 1;
2049         if (size > BTRFS_MAX_EXTENT_SIZE) {
2050                 u32 num_extents;
2051                 u64 new_size;
2052
2053                 /*
2054                  * See the explanation in btrfs_merge_delalloc_extent, the same
2055                  * applies here, just in reverse.
2056                  */
2057                 new_size = orig->end - split + 1;
2058                 num_extents = count_max_extents(new_size);
2059                 new_size = split - orig->start;
2060                 num_extents += count_max_extents(new_size);
2061                 if (count_max_extents(size) >= num_extents)
2062                         return;
2063         }
2064
2065         spin_lock(&BTRFS_I(inode)->lock);
2066         btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
2067         spin_unlock(&BTRFS_I(inode)->lock);
2068 }
2069
2070 /*
2071  * Handle merged delayed allocation extents so we can keep track of new extents
2072  * that are just merged onto old extents, such as when we are doing sequential
2073  * writes, so we can properly account for the metadata space we'll need.
2074  */
2075 void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
2076                                  struct extent_state *other)
2077 {
2078         u64 new_size, old_size;
2079         u32 num_extents;
2080
2081         /* not delalloc, ignore it */
2082         if (!(other->state & EXTENT_DELALLOC))
2083                 return;
2084
2085         if (new->start > other->start)
2086                 new_size = new->end - other->start + 1;
2087         else
2088                 new_size = other->end - new->start + 1;
2089
2090         /* we're not bigger than the max, unreserve the space and go */
2091         if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
2092                 spin_lock(&BTRFS_I(inode)->lock);
2093                 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
2094                 spin_unlock(&BTRFS_I(inode)->lock);
2095                 return;
2096         }
2097
2098         /*
2099          * We have to add up either side to figure out how many extents were
2100          * accounted for before we merged into one big extent.  If the number of
2101          * extents we accounted for is <= the amount we need for the new range
2102          * then we can return, otherwise drop.  Think of it like this
2103          *
2104          * [ 4k][MAX_SIZE]
2105          *
2106          * So we've grown the extent by a MAX_SIZE extent, this would mean we
2107          * need 2 outstanding extents, on one side we have 1 and the other side
2108          * we have 1 so they are == and we can return.  But in this case
2109          *
2110          * [MAX_SIZE+4k][MAX_SIZE+4k]
2111          *
2112          * Each range on their own accounts for 2 extents, but merged together
2113          * they are only 3 extents worth of accounting, so we need to drop in
2114          * this case.
2115          */
2116         old_size = other->end - other->start + 1;
2117         num_extents = count_max_extents(old_size);
2118         old_size = new->end - new->start + 1;
2119         num_extents += count_max_extents(old_size);
2120         if (count_max_extents(new_size) >= num_extents)
2121                 return;
2122
2123         spin_lock(&BTRFS_I(inode)->lock);
2124         btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
2125         spin_unlock(&BTRFS_I(inode)->lock);
2126 }
2127
2128 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
2129                                       struct inode *inode)
2130 {
2131         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2132
2133         spin_lock(&root->delalloc_lock);
2134         if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
2135                 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
2136                               &root->delalloc_inodes);
2137                 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2138                         &BTRFS_I(inode)->runtime_flags);
2139                 root->nr_delalloc_inodes++;
2140                 if (root->nr_delalloc_inodes == 1) {
2141                         spin_lock(&fs_info->delalloc_root_lock);
2142                         BUG_ON(!list_empty(&root->delalloc_root));
2143                         list_add_tail(&root->delalloc_root,
2144                                       &fs_info->delalloc_roots);
2145                         spin_unlock(&fs_info->delalloc_root_lock);
2146                 }
2147         }
2148         spin_unlock(&root->delalloc_lock);
2149 }
2150
2151
2152 void __btrfs_del_delalloc_inode(struct btrfs_root *root,
2153                                 struct btrfs_inode *inode)
2154 {
2155         struct btrfs_fs_info *fs_info = root->fs_info;
2156
2157         if (!list_empty(&inode->delalloc_inodes)) {
2158                 list_del_init(&inode->delalloc_inodes);
2159                 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2160                           &inode->runtime_flags);
2161                 root->nr_delalloc_inodes--;
2162                 if (!root->nr_delalloc_inodes) {
2163                         ASSERT(list_empty(&root->delalloc_inodes));
2164                         spin_lock(&fs_info->delalloc_root_lock);
2165                         BUG_ON(list_empty(&root->delalloc_root));
2166                         list_del_init(&root->delalloc_root);
2167                         spin_unlock(&fs_info->delalloc_root_lock);
2168                 }
2169         }
2170 }
2171
2172 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
2173                                      struct btrfs_inode *inode)
2174 {
2175         spin_lock(&root->delalloc_lock);
2176         __btrfs_del_delalloc_inode(root, inode);
2177         spin_unlock(&root->delalloc_lock);
2178 }
2179
2180 /*
2181  * Properly track delayed allocation bytes in the inode and to maintain the
2182  * list of inodes that have pending delalloc work to be done.
2183  */
2184 void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
2185                                unsigned *bits)
2186 {
2187         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2188
2189         if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
2190                 WARN_ON(1);
2191         /*
2192          * set_bit and clear bit hooks normally require _irqsave/restore
2193          * but in this case, we are only testing for the DELALLOC
2194          * bit, which is only set or cleared with irqs on
2195          */
2196         if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
2197                 struct btrfs_root *root = BTRFS_I(inode)->root;
2198                 u64 len = state->end + 1 - state->start;
2199                 u32 num_extents = count_max_extents(len);
2200                 bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
2201
2202                 spin_lock(&BTRFS_I(inode)->lock);
2203                 btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
2204                 spin_unlock(&BTRFS_I(inode)->lock);
2205
2206                 /* For sanity tests */
2207                 if (btrfs_is_testing(fs_info))
2208                         return;
2209
2210                 percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
2211                                          fs_info->delalloc_batch);
2212                 spin_lock(&BTRFS_I(inode)->lock);
2213                 BTRFS_I(inode)->delalloc_bytes += len;
2214                 if (*bits & EXTENT_DEFRAG)
2215                         BTRFS_I(inode)->defrag_bytes += len;
2216                 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2217                                          &BTRFS_I(inode)->runtime_flags))
2218                         btrfs_add_delalloc_inodes(root, inode);
2219                 spin_unlock(&BTRFS_I(inode)->lock);
2220         }
2221
2222         if (!(state->state & EXTENT_DELALLOC_NEW) &&
2223             (*bits & EXTENT_DELALLOC_NEW)) {
2224                 spin_lock(&BTRFS_I(inode)->lock);
2225                 BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
2226                         state->start;
2227                 spin_unlock(&BTRFS_I(inode)->lock);
2228         }
2229 }
2230
2231 /*
2232  * Once a range is no longer delalloc this function ensures that proper
2233  * accounting happens.
2234  */
2235 void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
2236                                  struct extent_state *state, unsigned *bits)
2237 {
2238         struct btrfs_inode *inode = BTRFS_I(vfs_inode);
2239         struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
2240         u64 len = state->end + 1 - state->start;
2241         u32 num_extents = count_max_extents(len);
2242
2243         if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
2244                 spin_lock(&inode->lock);
2245                 inode->defrag_bytes -= len;
2246                 spin_unlock(&inode->lock);
2247         }
2248
2249         /*
2250          * set_bit and clear bit hooks normally require _irqsave/restore
2251          * but in this case, we are only testing for the DELALLOC
2252          * bit, which is only set or cleared with irqs on
2253          */
2254         if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
2255                 struct btrfs_root *root = inode->root;
2256                 bool do_list = !btrfs_is_free_space_inode(inode);
2257
2258                 spin_lock(&inode->lock);
2259                 btrfs_mod_outstanding_extents(inode, -num_extents);
2260                 spin_unlock(&inode->lock);
2261
2262                 /*
2263                  * We don't reserve metadata space for space cache inodes so we
2264                  * don't need to call delalloc_release_metadata if there is an
2265                  * error.
2266                  */
2267                 if (*bits & EXTENT_CLEAR_META_RESV &&
2268                     root != fs_info->tree_root)
2269                         btrfs_delalloc_release_metadata(inode, len, false);
2270
2271                 /* For sanity tests. */
2272                 if (btrfs_is_testing(fs_info))
2273                         return;
2274
2275                 if (!btrfs_is_data_reloc_root(root) &&
2276                     do_list && !(state->state & EXTENT_NORESERVE) &&
2277                     (*bits & EXTENT_CLEAR_DATA_RESV))
2278                         btrfs_free_reserved_data_space_noquota(fs_info, len);
2279
2280                 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
2281                                          fs_info->delalloc_batch);
2282                 spin_lock(&inode->lock);
2283                 inode->delalloc_bytes -= len;
2284                 if (do_list && inode->delalloc_bytes == 0 &&
2285                     test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2286                                         &inode->runtime_flags))
2287                         btrfs_del_delalloc_inode(root, inode);
2288                 spin_unlock(&inode->lock);
2289         }
2290
2291         if ((state->state & EXTENT_DELALLOC_NEW) &&
2292             (*bits & EXTENT_DELALLOC_NEW)) {
2293                 spin_lock(&inode->lock);
2294                 ASSERT(inode->new_delalloc_bytes >= len);
2295                 inode->new_delalloc_bytes -= len;
2296                 if (*bits & EXTENT_ADD_INODE_BYTES)
2297                         inode_add_bytes(&inode->vfs_inode, len);
2298                 spin_unlock(&inode->lock);
2299         }
2300 }
2301
2302 /*
2303  * in order to insert checksums into the metadata in large chunks,
2304  * we wait until bio submission time.   All the pages in the bio are
2305  * checksummed and sums are attached onto the ordered extent record.
2306  *
2307  * At IO completion time the cums attached on the ordered extent record
2308  * are inserted into the btree
2309  */
2310 static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
2311                                            u64 dio_file_offset)
2312 {
2313         return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
2314 }
2315
2316 /*
2317  * Split an extent_map at [start, start + len]
2318  *
2319  * This function is intended to be used only for extract_ordered_extent().
2320  */
2321 static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
2322                           u64 pre, u64 post)
2323 {
2324         struct extent_map_tree *em_tree = &inode->extent_tree;
2325         struct extent_map *em;
2326         struct extent_map *split_pre = NULL;
2327         struct extent_map *split_mid = NULL;
2328         struct extent_map *split_post = NULL;
2329         int ret = 0;
2330         unsigned long flags;
2331
2332         /* Sanity check */
2333         if (pre == 0 && post == 0)
2334                 return 0;
2335
2336         split_pre = alloc_extent_map();
2337         if (pre)
2338                 split_mid = alloc_extent_map();
2339         if (post)
2340                 split_post = alloc_extent_map();
2341         if (!split_pre || (pre && !split_mid) || (post && !split_post)) {
2342                 ret = -ENOMEM;
2343                 goto out;
2344         }
2345
2346         ASSERT(pre + post < len);
2347
2348         lock_extent(&inode->io_tree, start, start + len - 1);
2349         write_lock(&em_tree->lock);
2350         em = lookup_extent_mapping(em_tree, start, len);
2351         if (!em) {
2352                 ret = -EIO;
2353                 goto out_unlock;
2354         }
2355
2356         ASSERT(em->len == len);
2357         ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
2358         ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
2359         ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
2360         ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
2361         ASSERT(!list_empty(&em->list));
2362
2363         flags = em->flags;
2364         clear_bit(EXTENT_FLAG_PINNED, &em->flags);
2365
2366         /* First, replace the em with a new extent_map starting from * em->start */
2367         split_pre->start = em->start;
2368         split_pre->len = (pre ? pre : em->len - post);
2369         split_pre->orig_start = split_pre->start;
2370         split_pre->block_start = em->block_start;
2371         split_pre->block_len = split_pre->len;
2372         split_pre->orig_block_len = split_pre->block_len;
2373         split_pre->ram_bytes = split_pre->len;
2374         split_pre->flags = flags;
2375         split_pre->compress_type = em->compress_type;
2376         split_pre->generation = em->generation;
2377
2378         replace_extent_mapping(em_tree, em, split_pre, 1);
2379
2380         /*
2381          * Now we only have an extent_map at:
2382          *     [em->start, em->start + pre] if pre != 0
2383          *     [em->start, em->start + em->len - post] if pre == 0
2384          */
2385
2386         if (pre) {
2387                 /* Insert the middle extent_map */
2388                 split_mid->start = em->start + pre;
2389                 split_mid->len = em->len - pre - post;
2390                 split_mid->orig_start = split_mid->start;
2391                 split_mid->block_start = em->block_start + pre;
2392                 split_mid->block_len = split_mid->len;
2393                 split_mid->orig_block_len = split_mid->block_len;
2394                 split_mid->ram_bytes = split_mid->len;
2395                 split_mid->flags = flags;
2396                 split_mid->compress_type = em->compress_type;
2397                 split_mid->generation = em->generation;
2398                 add_extent_mapping(em_tree, split_mid, 1);
2399         }
2400
2401         if (post) {
2402                 split_post->start = em->start + em->len - post;
2403                 split_post->len = post;
2404                 split_post->orig_start = split_post->start;
2405                 split_post->block_start = em->block_start + em->len - post;
2406                 split_post->block_len = split_post->len;
2407                 split_post->orig_block_len = split_post->block_len;
2408                 split_post->ram_bytes = split_post->len;
2409                 split_post->flags = flags;
2410                 split_post->compress_type = em->compress_type;
2411                 split_post->generation = em->generation;
2412                 add_extent_mapping(em_tree, split_post, 1);
2413         }
2414
2415         /* Once for us */
2416         free_extent_map(em);
2417         /* Once for the tree */
2418         free_extent_map(em);
2419
2420 out_unlock:
2421         write_unlock(&em_tree->lock);
2422         unlock_extent(&inode->io_tree, start, start + len - 1);
2423 out:
2424         free_extent_map(split_pre);
2425         free_extent_map(split_mid);
2426         free_extent_map(split_post);
2427
2428         return ret;
2429 }
2430
2431 static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
2432                                            struct bio *bio, loff_t file_offset)
2433 {
2434         struct btrfs_ordered_extent *ordered;
2435         u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
2436         u64 file_len;
2437         u64 len = bio->bi_iter.bi_size;
2438         u64 end = start + len;
2439         u64 ordered_end;
2440         u64 pre, post;
2441         int ret = 0;
2442
2443         ordered = btrfs_lookup_ordered_extent(inode, file_offset);
2444         if (WARN_ON_ONCE(!ordered))
2445                 return BLK_STS_IOERR;
2446
2447         /* No need to split */
2448         if (ordered->disk_num_bytes == len)
2449                 goto out;
2450
2451         /* We cannot split once end_bio'd ordered extent */
2452         if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) {
2453                 ret = -EINVAL;
2454                 goto out;
2455         }
2456
2457         /* We cannot split a compressed ordered extent */
2458         if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) {
2459                 ret = -EINVAL;
2460                 goto out;
2461         }
2462
2463         ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes;
2464         /* bio must be in one ordered extent */
2465         if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) {
2466                 ret = -EINVAL;
2467                 goto out;
2468         }
2469
2470         /* Checksum list should be empty */
2471         if (WARN_ON_ONCE(!list_empty(&ordered->list))) {
2472                 ret = -EINVAL;
2473                 goto out;
2474         }
2475
2476         file_len = ordered->num_bytes;
2477         pre = start - ordered->disk_bytenr;
2478         post = ordered_end - end;
2479
2480         ret = btrfs_split_ordered_extent(ordered, pre, post);
2481         if (ret)
2482                 goto out;
2483         ret = split_zoned_em(inode, file_offset, file_len, pre, post);
2484
2485 out:
2486         btrfs_put_ordered_extent(ordered);
2487
2488         return errno_to_blk_status(ret);
2489 }
2490
2491 /*
2492  * extent_io.c submission hook. This does the right thing for csum calculation
2493  * on write, or reading the csums from the tree before a read.
2494  *
2495  * Rules about async/sync submit,
2496  * a) read:                             sync submit
2497  *
2498  * b) write without checksum:           sync submit
2499  *
2500  * c) write with checksum:
2501  *    c-1) if bio is issued by fsync:   sync submit
2502  *         (sync_writers != 0)
2503  *
2504  *    c-2) if root is reloc root:       sync submit
2505  *         (only in case of buffered IO)
2506  *
2507  *    c-3) otherwise:                   async submit
2508  */
2509 blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
2510                                    int mirror_num, unsigned long bio_flags)
2511
2512 {
2513         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2514         struct btrfs_root *root = BTRFS_I(inode)->root;
2515         enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
2516         blk_status_t ret = 0;
2517         int skip_sum;
2518         int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
2519
2520         skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
2521                    !fs_info->csum_root;
2522
2523         if (btrfs_is_free_space_inode(BTRFS_I(inode)))
2524                 metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
2525
2526         if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
2527                 struct page *page = bio_first_bvec_all(bio)->bv_page;
2528                 loff_t file_offset = page_offset(page);
2529
2530                 ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset);
2531                 if (ret)
2532                         goto out;
2533         }
2534
2535         if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
2536                 ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
2537                 if (ret)
2538                         goto out;
2539
2540                 if (bio_flags & EXTENT_BIO_COMPRESSED) {
2541                         ret = btrfs_submit_compressed_read(inode, bio,
2542                                                            mirror_num,
2543                                                            bio_flags);
2544                         goto out;
2545                 } else {
2546                         /*
2547                          * Lookup bio sums does extra checks around whether we
2548                          * need to csum or not, which is why we ignore skip_sum
2549                          * here.
2550                          */
2551                         ret = btrfs_lookup_bio_sums(inode, bio, NULL);
2552                         if (ret)
2553                                 goto out;
2554                 }
2555                 goto mapit;
2556         } else if (async && !skip_sum) {
2557                 /* csum items have already been cloned */
2558                 if (btrfs_is_data_reloc_root(root))
2559                         goto mapit;
2560                 /* we're doing a write, do the async checksumming */
2561                 ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags,
2562                                           0, btrfs_submit_bio_start);
2563                 goto out;
2564         } else if (!skip_sum) {
2565                 ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
2566                 if (ret)
2567                         goto out;
2568         }
2569
2570 mapit:
2571         ret = btrfs_map_bio(fs_info, bio, mirror_num);
2572
2573 out:
2574         if (ret) {
2575                 bio->bi_status = ret;
2576                 bio_endio(bio);
2577         }
2578         return ret;
2579 }
2580
2581 /*
2582  * given a list of ordered sums record them in the inode.  This happens
2583  * at IO completion time based on sums calculated at bio submission time.
2584  */
2585 static int add_pending_csums(struct btrfs_trans_handle *trans,
2586                              struct list_head *list)
2587 {
2588         struct btrfs_ordered_sum *sum;
2589         int ret;
2590
2591         list_for_each_entry(sum, list, list) {
2592                 trans->adding_csums = true;
2593                 ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum);
2594                 trans->adding_csums = false;
2595                 if (ret)
2596                         return ret;
2597         }
2598         return 0;
2599 }
2600
2601 static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
2602                                          const u64 start,
2603                                          const u64 len,
2604                                          struct extent_state **cached_state)
2605 {
2606         u64 search_start = start;
2607         const u64 end = start + len - 1;
2608
2609         while (search_start < end) {
2610                 const u64 search_len = end - search_start + 1;
2611                 struct extent_map *em;
2612                 u64 em_len;
2613                 int ret = 0;
2614
2615                 em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
2616                 if (IS_ERR(em))
2617                         return PTR_ERR(em);
2618
2619                 if (em->block_start != EXTENT_MAP_HOLE)
2620                         goto next;
2621
2622                 em_len = em->len;
2623                 if (em->start < search_start)
2624                         em_len -= search_start - em->start;
2625                 if (em_len > search_len)
2626                         em_len = search_len;
2627
2628                 ret = set_extent_bit(&inode->io_tree, search_start,
2629                                      search_start + em_len - 1,
2630                                      EXTENT_DELALLOC_NEW, 0, NULL, cached_state,
2631                                      GFP_NOFS, NULL);
2632 next:
2633                 search_start = extent_map_end(em);
2634                 free_extent_map(em);
2635                 if (ret)
2636                         return ret;
2637         }
2638         return 0;
2639 }
2640
2641 int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2642                               unsigned int extra_bits,
2643                               struct extent_state **cached_state)
2644 {
2645         WARN_ON(PAGE_ALIGNED(end));
2646
2647         if (start >= i_size_read(&inode->vfs_inode) &&
2648             !(inode->flags & BTRFS_INODE_PREALLOC)) {
2649                 /*
2650                  * There can't be any extents following eof in this case so just
2651                  * set the delalloc new bit for the range directly.
2652                  */
2653                 extra_bits |= EXTENT_DELALLOC_NEW;
2654         } else {
2655                 int ret;
2656
2657                 ret = btrfs_find_new_delalloc_bytes(inode, start,
2658                                                     end + 1 - start,
2659                                                     cached_state);
2660                 if (ret)
2661                         return ret;
2662         }
2663
2664         return set_extent_delalloc(&inode->io_tree, start, end, extra_bits,
2665                                    cached_state);
2666 }
2667
2668 /* see btrfs_writepage_start_hook for details on why this is required */
2669 struct btrfs_writepage_fixup {
2670         struct page *page;
2671         struct inode *inode;
2672         struct btrfs_work work;
2673 };
2674
2675 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2676 {
2677         struct btrfs_writepage_fixup *fixup;
2678         struct btrfs_ordered_extent *ordered;
2679         struct extent_state *cached_state = NULL;
2680         struct extent_changeset *data_reserved = NULL;
2681         struct page *page;
2682         struct btrfs_inode *inode;
2683         u64 page_start;
2684         u64 page_end;
2685         int ret = 0;
2686         bool free_delalloc_space = true;
2687
2688         fixup = container_of(work, struct btrfs_writepage_fixup, work);
2689         page = fixup->page;
2690         inode = BTRFS_I(fixup->inode);
2691         page_start = page_offset(page);
2692         page_end = page_offset(page) + PAGE_SIZE - 1;
2693
2694         /*
2695          * This is similar to page_mkwrite, we need to reserve the space before
2696          * we take the page lock.
2697          */
2698         ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2699                                            PAGE_SIZE);
2700 again:
2701         lock_page(page);
2702
2703         /*
2704          * Before we queued this fixup, we took a reference on the page.
2705          * page->mapping may go NULL, but it shouldn't be moved to a different
2706          * address space.
2707          */
2708         if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2709                 /*
2710                  * Unfortunately this is a little tricky, either
2711                  *
2712                  * 1) We got here and our page had already been dealt with and
2713                  *    we reserved our space, thus ret == 0, so we need to just
2714                  *    drop our space reservation and bail.  This can happen the
2715                  *    first time we come into the fixup worker, or could happen
2716                  *    while waiting for the ordered extent.
2717                  * 2) Our page was already dealt with, but we happened to get an
2718                  *    ENOSPC above from the btrfs_delalloc_reserve_space.  In
2719                  *    this case we obviously don't have anything to release, but
2720                  *    because the page was already dealt with we don't want to
2721                  *    mark the page with an error, so make sure we're resetting
2722                  *    ret to 0.  This is why we have this check _before_ the ret
2723                  *    check, because we do not want to have a surprise ENOSPC
2724                  *    when the page was already properly dealt with.
2725                  */
2726                 if (!ret) {
2727                         btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2728                         btrfs_delalloc_release_space(inode, data_reserved,
2729                                                      page_start, PAGE_SIZE,
2730                                                      true);
2731                 }
2732                 ret = 0;
2733                 goto out_page;
2734         }
2735
2736         /*
2737          * We can't mess with the page state unless it is locked, so now that
2738          * it is locked bail if we failed to make our space reservation.
2739          */
2740         if (ret)
2741                 goto out_page;
2742
2743         lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
2744
2745         /* already ordered? We're done */
2746         if (PageOrdered(page))
2747                 goto out_reserved;
2748
2749         ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
2750         if (ordered) {
2751                 unlock_extent_cached(&inode->io_tree, page_start, page_end,
2752                                      &cached_state);
2753                 unlock_page(page);
2754                 btrfs_start_ordered_extent(ordered, 1);
2755                 btrfs_put_ordered_extent(ordered);
2756                 goto again;
2757         }
2758
2759         ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2760                                         &cached_state);
2761         if (ret)
2762                 goto out_reserved;
2763
2764         /*
2765          * Everything went as planned, we're now the owner of a dirty page with
2766          * delayed allocation bits set and space reserved for our COW
2767          * destination.
2768          *
2769          * The page was dirty when we started, nothing should have cleaned it.
2770          */
2771         BUG_ON(!PageDirty(page));
2772         free_delalloc_space = false;
2773 out_reserved:
2774         btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2775         if (free_delalloc_space)
2776                 btrfs_delalloc_release_space(inode, data_reserved, page_start,
2777                                              PAGE_SIZE, true);
2778         unlock_extent_cached(&inode->io_tree, page_start, page_end,
2779                              &cached_state);
2780 out_page:
2781         if (ret) {
2782                 /*
2783                  * We hit ENOSPC or other errors.  Update the mapping and page
2784                  * to reflect the errors and clean the page.
2785                  */
2786                 mapping_set_error(page->mapping, ret);
2787                 end_extent_writepage(page, ret, page_start, page_end);
2788                 clear_page_dirty_for_io(page);
2789                 SetPageError(page);
2790         }
2791         btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE);
2792         unlock_page(page);
2793         put_page(page);
2794         kfree(fixup);
2795         extent_changeset_free(data_reserved);
2796         /*
2797          * As a precaution, do a delayed iput in case it would be the last iput
2798          * that could need flushing space. Recursing back to fixup worker would
2799          * deadlock.
2800          */
2801         btrfs_add_delayed_iput(&inode->vfs_inode);
2802 }
2803
2804 /*
2805  * There are a few paths in the higher layers of the kernel that directly
2806  * set the page dirty bit without asking the filesystem if it is a
2807  * good idea.  This causes problems because we want to make sure COW
2808  * properly happens and the data=ordered rules are followed.
2809  *
2810  * In our case any range that doesn't have the ORDERED bit set
2811  * hasn't been properly setup for IO.  We kick off an async process
2812  * to fix it up.  The async helper will wait for ordered extents, set
2813  * the delalloc bit and make it safe to write the page.
2814  */
2815 int btrfs_writepage_cow_fixup(struct page *page)
2816 {
2817         struct inode *inode = page->mapping->host;
2818         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2819         struct btrfs_writepage_fixup *fixup;
2820
2821         /* This page has ordered extent covering it already */
2822         if (PageOrdered(page))
2823                 return 0;
2824
2825         /*
2826          * PageChecked is set below when we create a fixup worker for this page,
2827          * don't try to create another one if we're already PageChecked()
2828          *
2829          * The extent_io writepage code will redirty the page if we send back
2830          * EAGAIN.
2831          */
2832         if (PageChecked(page))
2833                 return -EAGAIN;
2834
2835         fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2836         if (!fixup)
2837                 return -EAGAIN;
2838
2839         /*
2840          * We are already holding a reference to this inode from
2841          * write_cache_pages.  We need to hold it because the space reservation
2842          * takes place outside of the page lock, and we can't trust
2843          * page->mapping outside of the page lock.
2844          */
2845         ihold(inode);
2846         btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
2847         get_page(page);
2848         btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
2849         fixup->page = page;
2850         fixup->inode = inode;
2851         btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2852
2853         return -EAGAIN;
2854 }
2855
2856 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2857                                        struct btrfs_inode *inode, u64 file_pos,
2858                                        struct btrfs_file_extent_item *stack_fi,
2859                                        const bool update_inode_bytes,
2860                                        u64 qgroup_reserved)
2861 {
2862         struct btrfs_root *root = inode->root;
2863         const u64 sectorsize = root->fs_info->sectorsize;
2864         struct btrfs_path *path;
2865         struct extent_buffer *leaf;
2866         struct btrfs_key ins;
2867         u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
2868         u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
2869         u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
2870         u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
2871         struct btrfs_drop_extents_args drop_args = { 0 };
2872         int ret;
2873
2874         path = btrfs_alloc_path();
2875         if (!path)
2876                 return -ENOMEM;
2877
2878         /*
2879          * we may be replacing one extent in the tree with another.
2880          * The new extent is pinned in the extent map, and we don't want
2881          * to drop it from the cache until it is completely in the btree.
2882          *
2883          * So, tell btrfs_drop_extents to leave this extent in the cache.
2884          * the caller is expected to unpin it and allow it to be merged
2885          * with the others.
2886          */
2887         drop_args.path = path;
2888         drop_args.start = file_pos;
2889         drop_args.end = file_pos + num_bytes;
2890         drop_args.replace_extent = true;
2891         drop_args.extent_item_size = sizeof(*stack_fi);
2892         ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2893         if (ret)
2894                 goto out;
2895
2896         if (!drop_args.extent_inserted) {
2897                 ins.objectid = btrfs_ino(inode);
2898                 ins.offset = file_pos;
2899                 ins.type = BTRFS_EXTENT_DATA_KEY;
2900
2901                 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2902                                               sizeof(*stack_fi));
2903                 if (ret)
2904                         goto out;
2905         }
2906         leaf = path->nodes[0];
2907         btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
2908         write_extent_buffer(leaf, stack_fi,
2909                         btrfs_item_ptr_offset(leaf, path->slots[0]),
2910                         sizeof(struct btrfs_file_extent_item));
2911
2912         btrfs_mark_buffer_dirty(leaf);
2913         btrfs_release_path(path);
2914
2915         /*
2916          * If we dropped an inline extent here, we know the range where it is
2917          * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
2918          * number of bytes only for that range containing the inline extent.
2919          * The remaining of the range will be processed when clearning the
2920          * EXTENT_DELALLOC_BIT bit through the ordered extent completion.
2921          */
2922         if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
2923                 u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
2924
2925                 inline_size = drop_args.bytes_found - inline_size;
2926                 btrfs_update_inode_bytes(inode, sectorsize, inline_size);
2927                 drop_args.bytes_found -= inline_size;
2928                 num_bytes -= sectorsize;
2929         }
2930
2931         if (update_inode_bytes)
2932                 btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
2933
2934         ins.objectid = disk_bytenr;
2935         ins.offset = disk_num_bytes;
2936         ins.type = BTRFS_EXTENT_ITEM_KEY;
2937
2938         ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
2939         if (ret)
2940                 goto out;
2941
2942         ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
2943                                                file_pos, qgroup_reserved, &ins);
2944 out:
2945         btrfs_free_path(path);
2946
2947         return ret;
2948 }
2949
2950 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2951                                          u64 start, u64 len)
2952 {
2953         struct btrfs_block_group *cache;
2954
2955         cache = btrfs_lookup_block_group(fs_info, start);
2956         ASSERT(cache);
2957
2958         spin_lock(&cache->lock);
2959         cache->delalloc_bytes -= len;
2960         spin_unlock(&cache->lock);
2961
2962         btrfs_put_block_group(cache);
2963 }
2964
2965 static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
2966                                              struct btrfs_ordered_extent *oe)
2967 {
2968         struct btrfs_file_extent_item stack_fi;
2969         u64 logical_len;
2970         bool update_inode_bytes;
2971
2972         memset(&stack_fi, 0, sizeof(stack_fi));
2973         btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
2974         btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
2975         btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
2976                                                    oe->disk_num_bytes);
2977         if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
2978                 logical_len = oe->truncated_len;
2979         else
2980                 logical_len = oe->num_bytes;
2981         btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len);
2982         btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len);
2983         btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
2984         /* Encryption and other encoding is reserved and all 0 */
2985
2986         /*
2987          * For delalloc, when completing an ordered extent we update the inode's
2988          * bytes when clearing the range in the inode's io tree, so pass false
2989          * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
2990          * except if the ordered extent was truncated.
2991          */
2992         update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
2993                              test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
2994
2995         return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
2996                                            oe->file_offset, &stack_fi,
2997                                            update_inode_bytes, oe->qgroup_rsv);
2998 }
2999
3000 /*
3001  * As ordered data IO finishes, this gets called so we can finish
3002  * an ordered extent if the range of bytes in the file it covers are
3003  * fully written.
3004  */
3005 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
3006 {
3007         struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
3008         struct btrfs_root *root = inode->root;
3009         struct btrfs_fs_info *fs_info = root->fs_info;
3010         struct btrfs_trans_handle *trans = NULL;
3011         struct extent_io_tree *io_tree = &inode->io_tree;
3012         struct extent_state *cached_state = NULL;
3013         u64 start, end;
3014         int compress_type = 0;
3015         int ret = 0;
3016         u64 logical_len = ordered_extent->num_bytes;
3017         bool freespace_inode;
3018         bool truncated = false;
3019         bool clear_reserved_extent = true;
3020         unsigned int clear_bits = EXTENT_DEFRAG;
3021
3022         start = ordered_extent->file_offset;
3023         end = start + ordered_extent->num_bytes - 1;
3024
3025         if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3026             !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
3027             !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
3028                 clear_bits |= EXTENT_DELALLOC_NEW;
3029
3030         freespace_inode = btrfs_is_free_space_inode(inode);
3031
3032         if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
3033                 ret = -EIO;
3034                 goto out;
3035         }
3036
3037         /* A valid bdev implies a write on a sequential zone */
3038         if (ordered_extent->bdev) {
3039                 btrfs_rewrite_logical_zoned(ordered_extent);
3040                 btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
3041                                         ordered_extent->disk_num_bytes);
3042         }
3043
3044         btrfs_free_io_failure_record(inode, start, end);
3045
3046         if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
3047                 truncated = true;
3048                 logical_len = ordered_extent->truncated_len;
3049                 /* Truncated the entire extent, don't bother adding */
3050                 if (!logical_len)
3051                         goto out;
3052         }
3053
3054         if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
3055                 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
3056
3057                 btrfs_inode_safe_disk_i_size_write(inode, 0);
3058                 if (freespace_inode)
3059                         trans = btrfs_join_transaction_spacecache(root);
3060                 else
3061                         trans = btrfs_join_transaction(root);
3062                 if (IS_ERR(trans)) {
3063                         ret = PTR_ERR(trans);
3064                         trans = NULL;
3065                         goto out;
3066                 }
3067                 trans->block_rsv = &inode->block_rsv;
3068                 ret = btrfs_update_inode_fallback(trans, root, inode);
3069                 if (ret) /* -ENOMEM or corruption */
3070                         btrfs_abort_transaction(trans, ret);
3071                 goto out;
3072         }
3073
3074         clear_bits |= EXTENT_LOCKED;
3075         lock_extent_bits(io_tree, start, end, &cached_state);
3076
3077         if (freespace_inode)
3078                 trans = btrfs_join_transaction_spacecache(root);
3079         else
3080                 trans = btrfs_join_transaction(root);
3081         if (IS_ERR(trans)) {
3082                 ret = PTR_ERR(trans);
3083                 trans = NULL;
3084                 goto out;
3085         }
3086
3087         trans->block_rsv = &inode->block_rsv;
3088
3089         if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3090                 compress_type = ordered_extent->compress_type;
3091         if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3092                 BUG_ON(compress_type);
3093                 ret = btrfs_mark_extent_written(trans, inode,
3094                                                 ordered_extent->file_offset,
3095                                                 ordered_extent->file_offset +
3096                                                 logical_len);
3097         } else {
3098                 BUG_ON(root == fs_info->tree_root);
3099                 ret = insert_ordered_extent_file_extent(trans, ordered_extent);
3100                 if (!ret) {
3101                         clear_reserved_extent = false;
3102                         btrfs_release_delalloc_bytes(fs_info,
3103                                                 ordered_extent->disk_bytenr,
3104                                                 ordered_extent->disk_num_bytes);
3105                 }
3106         }
3107         unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
3108                            ordered_extent->num_bytes, trans->transid);
3109         if (ret < 0) {
3110                 btrfs_abort_transaction(trans, ret);
3111                 goto out;
3112         }
3113
3114         ret = add_pending_csums(trans, &ordered_extent->list);
3115         if (ret) {
3116                 btrfs_abort_transaction(trans, ret);
3117                 goto out;
3118         }
3119
3120         /*
3121          * If this is a new delalloc range, clear its new delalloc flag to
3122          * update the inode's number of bytes. This needs to be done first
3123          * before updating the inode item.
3124          */
3125         if ((clear_bits & EXTENT_DELALLOC_NEW) &&
3126             !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
3127                 clear_extent_bit(&inode->io_tree, start, end,
3128                                  EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
3129                                  0, 0, &cached_state);
3130
3131         btrfs_inode_safe_disk_i_size_write(inode, 0);
3132         ret = btrfs_update_inode_fallback(trans, root, inode);
3133         if (ret) { /* -ENOMEM or corruption */
3134                 btrfs_abort_transaction(trans, ret);
3135                 goto out;
3136         }
3137         ret = 0;
3138 out:
3139         clear_extent_bit(&inode->io_tree, start, end, clear_bits,
3140                          (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0,
3141                          &cached_state);
3142
3143         if (trans)
3144                 btrfs_end_transaction(trans);
3145
3146         if (ret || truncated) {
3147                 u64 unwritten_start = start;
3148
3149                 /*
3150                  * If we failed to finish this ordered extent for any reason we
3151                  * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
3152                  * extent, and mark the inode with the error if it wasn't
3153                  * already set.  Any error during writeback would have already
3154                  * set the mapping error, so we need to set it if we're the ones
3155                  * marking this ordered extent as failed.
3156                  */
3157                 if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
3158                                              &ordered_extent->flags))
3159                         mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
3160
3161                 if (truncated)
3162                         unwritten_start += logical_len;
3163                 clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
3164
3165                 /* Drop the cache for the part of the extent we didn't write. */
3166                 btrfs_drop_extent_cache(inode, unwritten_start, end, 0);
3167
3168                 /*
3169                  * If the ordered extent had an IOERR or something else went
3170                  * wrong we need to return the space for this ordered extent
3171                  * back to the allocator.  We only free the extent in the
3172                  * truncated case if we didn't write out the extent at all.
3173                  *
3174                  * If we made it past insert_reserved_file_extent before we
3175                  * errored out then we don't need to do this as the accounting
3176                  * has already been done.
3177                  */
3178                 if ((ret || !logical_len) &&
3179                     clear_reserved_extent &&
3180                     !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3181                     !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3182                         /*
3183                          * Discard the range before returning it back to the
3184                          * free space pool
3185                          */
3186                         if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
3187                                 btrfs_discard_extent(fs_info,
3188                                                 ordered_extent->disk_bytenr,
3189                                                 ordered_extent->disk_num_bytes,
3190                                                 NULL);
3191                         btrfs_free_reserved_extent(fs_info,
3192                                         ordered_extent->disk_bytenr,
3193                                         ordered_extent->disk_num_bytes, 1);
3194                 }
3195         }
3196
3197         /*
3198          * This needs to be done to make sure anybody waiting knows we are done
3199          * updating everything for this ordered extent.
3200          */
3201         btrfs_remove_ordered_extent(inode, ordered_extent);
3202
3203         /* once for us */
3204         btrfs_put_ordered_extent(ordered_extent);
3205         /* once for the tree */
3206         btrfs_put_ordered_extent(ordered_extent);
3207
3208         return ret;
3209 }
3210
3211 static void finish_ordered_fn(struct btrfs_work *work)
3212 {
3213         struct btrfs_ordered_extent *ordered_extent;
3214         ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
3215         btrfs_finish_ordered_io(ordered_extent);
3216 }
3217
3218 void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
3219                                           struct page *page, u64 start,
3220                                           u64 end, bool uptodate)
3221 {
3222         trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
3223
3224         btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start,
3225                                        finish_ordered_fn, uptodate);
3226 }
3227
3228 /*
3229  * check_data_csum - verify checksum of one sector of uncompressed data
3230  * @inode:      inode
3231  * @io_bio:     btrfs_io_bio which contains the csum
3232  * @bio_offset: offset to the beginning of the bio (in bytes)
3233  * @page:       page where is the data to be verified
3234  * @pgoff:      offset inside the page
3235  * @start:      logical offset in the file
3236  *
3237  * The length of such check is always one sector size.
3238  */
3239 static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
3240                            u32 bio_offset, struct page *page, u32 pgoff,
3241                            u64 start)
3242 {
3243         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3244         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3245         char *kaddr;
3246         u32 len = fs_info->sectorsize;
3247         const u32 csum_size = fs_info->csum_size;
3248         unsigned int offset_sectors;
3249         u8 *csum_expected;
3250         u8 csum[BTRFS_CSUM_SIZE];
3251
3252         ASSERT(pgoff + len <= PAGE_SIZE);
3253
3254         offset_sectors = bio_offset >> fs_info->sectorsize_bits;
3255         csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size;
3256
3257         kaddr = kmap_atomic(page);
3258         shash->tfm = fs_info->csum_shash;
3259
3260         crypto_shash_digest(shash, kaddr + pgoff, len, csum);
3261
3262         if (memcmp(csum, csum_expected, csum_size))
3263                 goto zeroit;
3264
3265         kunmap_atomic(kaddr);
3266         return 0;
3267 zeroit:
3268         btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
3269                                     bbio->mirror_num);
3270         if (bbio->device)
3271                 btrfs_dev_stat_inc_and_print(bbio->device,
3272                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
3273         memset(kaddr + pgoff, 1, len);
3274         flush_dcache_page(page);
3275         kunmap_atomic(kaddr);
3276         return -EIO;
3277 }
3278
3279 /*
3280  * When reads are done, we need to check csums to verify the data is correct.
3281  * if there's a match, we allow the bio to finish.  If not, the code in
3282  * extent_io.c will try to find good copies for us.
3283  *
3284  * @bio_offset: offset to the beginning of the bio (in bytes)
3285  * @start:      file offset of the range start
3286  * @end:        file offset of the range end (inclusive)
3287  *
3288  * Return a bitmap where bit set means a csum mismatch, and bit not set means
3289  * csum match.
3290  */
3291 unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
3292                                     u32 bio_offset, struct page *page,
3293                                     u64 start, u64 end)
3294 {
3295         struct inode *inode = page->mapping->host;
3296         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3297         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3298         struct btrfs_root *root = BTRFS_I(inode)->root;
3299         const u32 sectorsize = root->fs_info->sectorsize;
3300         u32 pg_off;
3301         unsigned int result = 0;
3302
3303         if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) {
3304                 btrfs_page_clear_checked(fs_info, page, start, end + 1 - start);
3305                 return 0;
3306         }
3307
3308         /*
3309          * This only happens for NODATASUM or compressed read.
3310          * Normally this should be covered by above check for compressed read
3311          * or the next check for NODATASUM.  Just do a quicker exit here.
3312          */
3313         if (bbio->csum == NULL)
3314                 return 0;
3315
3316         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
3317                 return 0;
3318
3319         if (!root->fs_info->csum_root)
3320                 return 0;
3321
3322         ASSERT(page_offset(page) <= start &&
3323                end <= page_offset(page) + PAGE_SIZE - 1);
3324         for (pg_off = offset_in_page(start);
3325              pg_off < offset_in_page(end);
3326              pg_off += sectorsize, bio_offset += sectorsize) {
3327                 u64 file_offset = pg_off + page_offset(page);
3328                 int ret;
3329
3330                 if (btrfs_is_data_reloc_root(root) &&
3331                     test_range_bit(io_tree, file_offset,
3332                                    file_offset + sectorsize - 1,
3333                                    EXTENT_NODATASUM, 1, NULL)) {
3334                         /* Skip the range without csum for data reloc inode */
3335                         clear_extent_bits(io_tree, file_offset,
3336                                           file_offset + sectorsize - 1,
3337                                           EXTENT_NODATASUM);
3338                         continue;
3339                 }
3340                 ret = check_data_csum(inode, bbio, bio_offset, page, pg_off,
3341                                       page_offset(page) + pg_off);
3342                 if (ret < 0) {
3343                         const int nr_bit = (pg_off - offset_in_page(start)) >>
3344                                      root->fs_info->sectorsize_bits;
3345
3346                         result |= (1U << nr_bit);
3347                 }
3348         }
3349         return result;
3350 }
3351
3352 /*
3353  * btrfs_add_delayed_iput - perform a delayed iput on @inode
3354  *
3355  * @inode: The inode we want to perform iput on
3356  *
3357  * This function uses the generic vfs_inode::i_count to track whether we should
3358  * just decrement it (in case it's > 1) or if this is the last iput then link
3359  * the inode to the delayed iput machinery. Delayed iputs are processed at
3360  * transaction commit time/superblock commit/cleaner kthread.
3361  */
3362 void btrfs_add_delayed_iput(struct inode *inode)
3363 {
3364         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3365         struct btrfs_inode *binode = BTRFS_I(inode);
3366
3367         if (atomic_add_unless(&inode->i_count, -1, 1))
3368                 return;
3369
3370         atomic_inc(&fs_info->nr_delayed_iputs);
3371         spin_lock(&fs_info->delayed_iput_lock);
3372         ASSERT(list_empty(&binode->delayed_iput));
3373         list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
3374         spin_unlock(&fs_info->delayed_iput_lock);
3375         if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
3376                 wake_up_process(fs_info->cleaner_kthread);
3377 }
3378
3379 static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
3380                                     struct btrfs_inode *inode)
3381 {
3382         list_del_init(&inode->delayed_iput);
3383         spin_unlock(&fs_info->delayed_iput_lock);
3384         iput(&inode->vfs_inode);
3385         if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
3386                 wake_up(&fs_info->delayed_iputs_wait);
3387         spin_lock(&fs_info->delayed_iput_lock);
3388 }
3389
3390 static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
3391                                    struct btrfs_inode *inode)
3392 {
3393         if (!list_empty(&inode->delayed_iput)) {
3394                 spin_lock(&fs_info->delayed_iput_lock);
3395                 if (!list_empty(&inode->delayed_iput))
3396                         run_delayed_iput_locked(fs_info, inode);
3397                 spin_unlock(&fs_info->delayed_iput_lock);
3398         }
3399 }
3400
3401 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3402 {
3403
3404         spin_lock(&fs_info->delayed_iput_lock);
3405         while (!list_empty(&fs_info->delayed_iputs)) {
3406                 struct btrfs_inode *inode;
3407
3408                 inode = list_first_entry(&fs_info->delayed_iputs,
3409                                 struct btrfs_inode, delayed_iput);
3410                 run_delayed_iput_locked(fs_info, inode);
3411                 cond_resched_lock(&fs_info->delayed_iput_lock);
3412         }
3413         spin_unlock(&fs_info->delayed_iput_lock);
3414 }
3415
3416 /**
3417  * Wait for flushing all delayed iputs
3418  *
3419  * @fs_info:  the filesystem
3420  *
3421  * This will wait on any delayed iputs that are currently running with KILLABLE
3422  * set.  Once they are all done running we will return, unless we are killed in
3423  * which case we return EINTR. This helps in user operations like fallocate etc
3424  * that might get blocked on the iputs.
3425  *
3426  * Return EINTR if we were killed, 0 if nothing's pending
3427  */
3428 int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
3429 {
3430         int ret = wait_event_killable(fs_info->delayed_iputs_wait,
3431                         atomic_read(&fs_info->nr_delayed_iputs) == 0);
3432         if (ret)
3433                 return -EINTR;
3434         return 0;
3435 }
3436
3437 /*
3438  * This creates an orphan entry for the given inode in case something goes wrong
3439  * in the middle of an unlink.
3440  */
3441 int btrfs_orphan_add(struct btrfs_trans_handle *trans,
3442                      struct btrfs_inode *inode)
3443 {
3444         int ret;
3445
3446         ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
3447         if (ret && ret != -EEXIST) {
3448                 btrfs_abort_transaction(trans, ret);
3449                 return ret;
3450         }
3451
3452         return 0;
3453 }
3454
3455 /*
3456  * We have done the delete so we can go ahead and remove the orphan item for
3457  * this particular inode.
3458  */
3459 static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3460                             struct btrfs_inode *inode)
3461 {
3462         return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
3463 }
3464
3465 /*
3466  * this cleans up any orphans that may be left on the list from the last use
3467  * of this root.
3468  */
3469 int btrfs_orphan_cleanup(struct btrfs_root *root)
3470 {
3471         struct btrfs_fs_info *fs_info = root->fs_info;
3472         struct btrfs_path *path;
3473         struct extent_buffer *leaf;
3474         struct btrfs_key key, found_key;
3475         struct btrfs_trans_handle *trans;
3476         struct inode *inode;
3477         u64 last_objectid = 0;
3478         int ret = 0, nr_unlink = 0;
3479
3480         if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
3481                 return 0;
3482
3483         path = btrfs_alloc_path();
3484         if (!path) {
3485                 ret = -ENOMEM;
3486                 goto out;
3487         }
3488         path->reada = READA_BACK;
3489
3490         key.objectid = BTRFS_ORPHAN_OBJECTID;
3491         key.type = BTRFS_ORPHAN_ITEM_KEY;
3492         key.offset = (u64)-1;
3493
3494         while (1) {
3495                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3496                 if (ret < 0)
3497                         goto out;
3498
3499                 /*
3500                  * if ret == 0 means we found what we were searching for, which
3501                  * is weird, but possible, so only screw with path if we didn't
3502                  * find the key and see if we have stuff that matches
3503                  */
3504                 if (ret > 0) {
3505                         ret = 0;
3506                         if (path->slots[0] == 0)
3507                                 break;
3508                         path->slots[0]--;
3509                 }
3510
3511                 /* pull out the item */
3512                 leaf = path->nodes[0];
3513                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3514
3515                 /* make sure the item matches what we want */
3516                 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3517                         break;
3518                 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3519                         break;
3520
3521                 /* release the path since we're done with it */
3522                 btrfs_release_path(path);
3523
3524                 /*
3525                  * this is where we are basically btrfs_lookup, without the
3526                  * crossing root thing.  we store the inode number in the
3527                  * offset of the orphan item.
3528                  */
3529
3530                 if (found_key.offset == last_objectid) {
3531                         btrfs_err(fs_info,
3532                                   "Error removing orphan entry, stopping orphan cleanup");
3533                         ret = -EINVAL;
3534                         goto out;
3535                 }
3536
3537                 last_objectid = found_key.offset;
3538
3539                 found_key.objectid = found_key.offset;
3540                 found_key.type = BTRFS_INODE_ITEM_KEY;
3541                 found_key.offset = 0;
3542                 inode = btrfs_iget(fs_info->sb, last_objectid, root);
3543                 ret = PTR_ERR_OR_ZERO(inode);
3544                 if (ret && ret != -ENOENT)
3545                         goto out;
3546
3547                 if (ret == -ENOENT && root == fs_info->tree_root) {
3548                         struct btrfs_root *dead_root;
3549                         int is_dead_root = 0;
3550
3551                         /*
3552                          * This is an orphan in the tree root. Currently these
3553                          * could come from 2 sources:
3554                          *  a) a root (snapshot/subvolume) deletion in progress
3555                          *  b) a free space cache inode
3556                          * We need to distinguish those two, as the orphan item
3557                          * for a root must not get deleted before the deletion
3558                          * of the snapshot/subvolume's tree completes.
3559                          *
3560                          * btrfs_find_orphan_roots() ran before us, which has
3561                          * found all deleted roots and loaded them into
3562                          * fs_info->fs_roots_radix. So here we can find if an
3563                          * orphan item corresponds to a deleted root by looking
3564                          * up the root from that radix tree.
3565                          */
3566
3567                         spin_lock(&fs_info->fs_roots_radix_lock);
3568                         dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
3569                                                          (unsigned long)found_key.objectid);
3570                         if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
3571                                 is_dead_root = 1;
3572                         spin_unlock(&fs_info->fs_roots_radix_lock);
3573
3574                         if (is_dead_root) {
3575                                 /* prevent this orphan from being found again */
3576                                 key.offset = found_key.objectid - 1;
3577                                 continue;
3578                         }
3579
3580                 }
3581
3582                 /*
3583                  * If we have an inode with links, there are a couple of
3584                  * possibilities:
3585                  *
3586                  * 1. We were halfway through creating fsverity metadata for the
3587                  * file. In that case, the orphan item represents incomplete
3588                  * fsverity metadata which must be cleaned up with
3589                  * btrfs_drop_verity_items and deleting the orphan item.
3590
3591                  * 2. Old kernels (before v3.12) used to create an
3592                  * orphan item for truncate indicating that there were possibly
3593                  * extent items past i_size that needed to be deleted. In v3.12,
3594                  * truncate was changed to update i_size in sync with the extent
3595                  * items, but the (useless) orphan item was still created. Since
3596                  * v4.18, we don't create the orphan item for truncate at all.
3597                  *
3598                  * So, this item could mean that we need to do a truncate, but
3599                  * only if this filesystem was last used on a pre-v3.12 kernel
3600                  * and was not cleanly unmounted. The odds of that are quite
3601                  * slim, and it's a pain to do the truncate now, so just delete
3602                  * the orphan item.
3603                  *
3604                  * It's also possible that this orphan item was supposed to be
3605                  * deleted but wasn't. The inode number may have been reused,
3606                  * but either way, we can delete the orphan item.
3607                  */
3608                 if (ret == -ENOENT || inode->i_nlink) {
3609                         if (!ret) {
3610                                 ret = btrfs_drop_verity_items(BTRFS_I(inode));
3611                                 iput(inode);
3612                                 if (ret)
3613                                         goto out;
3614                         }
3615                         trans = btrfs_start_transaction(root, 1);
3616                         if (IS_ERR(trans)) {
3617                                 ret = PTR_ERR(trans);
3618                                 goto out;
3619                         }
3620                         btrfs_debug(fs_info, "auto deleting %Lu",
3621                                     found_key.objectid);
3622                         ret = btrfs_del_orphan_item(trans, root,
3623                                                     found_key.objectid);
3624                         btrfs_end_transaction(trans);
3625                         if (ret)
3626                                 goto out;
3627                         continue;
3628                 }
3629
3630                 nr_unlink++;
3631
3632                 /* this will do delete_inode and everything for us */
3633                 iput(inode);
3634         }
3635         /* release the path since we're done with it */
3636         btrfs_release_path(path);
3637
3638         root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
3639
3640         if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3641                 trans = btrfs_join_transaction(root);
3642                 if (!IS_ERR(trans))
3643                         btrfs_end_transaction(trans);
3644         }
3645
3646         if (nr_unlink)
3647                 btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
3648
3649 out:
3650         if (ret)
3651                 btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
3652         btrfs_free_path(path);
3653         return ret;
3654 }
3655
3656 /*
3657  * very simple check to peek ahead in the leaf looking for xattrs.  If we
3658  * don't find any xattrs, we know there can't be any acls.
3659  *
3660  * slot is the slot the inode is in, objectid is the objectid of the inode
3661  */
3662 static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3663                                           int slot, u64 objectid,
3664                                           int *first_xattr_slot)
3665 {
3666         u32 nritems = btrfs_header_nritems(leaf);
3667         struct btrfs_key found_key;
3668         static u64 xattr_access = 0;
3669         static u64 xattr_default = 0;
3670         int scanned = 0;
3671
3672         if (!xattr_access) {
3673                 xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
3674                                         strlen(XATTR_NAME_POSIX_ACL_ACCESS));
3675                 xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
3676                                         strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
3677         }
3678
3679         slot++;
3680         *first_xattr_slot = -1;
3681         while (slot < nritems) {
3682                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3683
3684                 /* we found a different objectid, there must not be acls */
3685                 if (found_key.objectid != objectid)
3686                         return 0;
3687
3688                 /* we found an xattr, assume we've got an acl */
3689                 if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3690                         if (*first_xattr_slot == -1)
3691                                 *first_xattr_slot = slot;
3692                         if (found_key.offset == xattr_access ||
3693                             found_key.offset == xattr_default)
3694                                 return 1;
3695                 }
3696
3697                 /*
3698                  * we found a key greater than an xattr key, there can't
3699                  * be any acls later on
3700                  */
3701                 if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3702                         return 0;
3703
3704                 slot++;
3705                 scanned++;
3706
3707                 /*
3708                  * it goes inode, inode backrefs, xattrs, extents,
3709                  * so if there are a ton of hard links to an inode there can
3710                  * be a lot of backrefs.  Don't waste time searching too hard,
3711                  * this is just an optimization
3712                  */
3713                 if (scanned >= 8)
3714                         break;
3715         }
3716         /* we hit the end of the leaf before we found an xattr or
3717          * something larger than an xattr.  We have to assume the inode
3718          * has acls
3719          */
3720         if (*first_xattr_slot == -1)
3721                 *first_xattr_slot = slot;
3722         return 1;
3723 }
3724
3725 /*
3726  * read an inode from the btree into the in-memory inode
3727  */
3728 static int btrfs_read_locked_inode(struct inode *inode,
3729                                    struct btrfs_path *in_path)
3730 {
3731         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3732         struct btrfs_path *path = in_path;
3733         struct extent_buffer *leaf;
3734         struct btrfs_inode_item *inode_item;
3735         struct btrfs_root *root = BTRFS_I(inode)->root;
3736         struct btrfs_key location;
3737         unsigned long ptr;
3738         int maybe_acls;
3739         u32 rdev;
3740         int ret;
3741         bool filled = false;
3742         int first_xattr_slot;
3743
3744         ret = btrfs_fill_inode(inode, &rdev);
3745         if (!ret)
3746                 filled = true;
3747
3748         if (!path) {
3749                 path = btrfs_alloc_path();
3750                 if (!path)
3751                         return -ENOMEM;
3752         }
3753
3754         memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3755
3756         ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3757         if (ret) {
3758                 if (path != in_path)
3759                         btrfs_free_path(path);
3760                 return ret;
3761         }
3762
3763         leaf = path->nodes[0];
3764
3765         if (filled)
3766                 goto cache_index;
3767
3768         inode_item = btrfs_item_ptr(leaf, path->slots[0],
3769                                     struct btrfs_inode_item);
3770         inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3771         set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3772         i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3773         i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3774         btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
3775         btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
3776                         round_up(i_size_read(inode), fs_info->sectorsize));
3777
3778         inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
3779         inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
3780
3781         inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
3782         inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
3783
3784         inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
3785         inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
3786
3787         BTRFS_I(inode)->i_otime.tv_sec =
3788                 btrfs_timespec_sec(leaf, &inode_item->otime);
3789         BTRFS_I(inode)->i_otime.tv_nsec =
3790                 btrfs_timespec_nsec(leaf, &inode_item->otime);
3791
3792         inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3793         BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3794         BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3795
3796         inode_set_iversion_queried(inode,
3797                                    btrfs_inode_sequence(leaf, inode_item));
3798         inode->i_generation = BTRFS_I(inode)->generation;
3799         inode->i_rdev = 0;
3800         rdev = btrfs_inode_rdev(leaf, inode_item);
3801
3802         BTRFS_I(inode)->index_cnt = (u64)-1;
3803         btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
3804                                 &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
3805
3806 cache_index:
3807         /*
3808          * If we were modified in the current generation and evicted from memory
3809          * and then re-read we need to do a full sync since we don't have any
3810          * idea about which extents were modified before we were evicted from
3811          * cache.
3812          *
3813          * This is required for both inode re-read from disk and delayed inode
3814          * in delayed_nodes_tree.
3815          */
3816         if (BTRFS_I(inode)->last_trans == fs_info->generation)
3817                 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3818                         &BTRFS_I(inode)->runtime_flags);
3819
3820         /*
3821          * We don't persist the id of the transaction where an unlink operation
3822          * against the inode was last made. So here we assume the inode might
3823          * have been evicted, and therefore the exact value of last_unlink_trans
3824          * lost, and set it to last_trans to avoid metadata inconsistencies
3825          * between the inode and its parent if the inode is fsync'ed and the log
3826          * replayed. For example, in the scenario:
3827          *
3828          * touch mydir/foo
3829          * ln mydir/foo mydir/bar
3830          * sync
3831          * unlink mydir/bar
3832          * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
3833          * xfs_io -c fsync mydir/foo
3834          * <power failure>
3835          * mount fs, triggers fsync log replay
3836          *
3837          * We must make sure that when we fsync our inode foo we also log its
3838          * parent inode, otherwise after log replay the parent still has the
3839          * dentry with the "bar" name but our inode foo has a link count of 1
3840          * and doesn't have an inode ref with the name "bar" anymore.
3841          *
3842          * Setting last_unlink_trans to last_trans is a pessimistic approach,
3843          * but it guarantees correctness at the expense of occasional full
3844          * transaction commits on fsync if our inode is a directory, or if our
3845          * inode is not a directory, logging its parent unnecessarily.
3846          */
3847         BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
3848
3849         /*
3850          * Same logic as for last_unlink_trans. We don't persist the generation
3851          * of the last transaction where this inode was used for a reflink
3852          * operation, so after eviction and reloading the inode we must be
3853          * pessimistic and assume the last transaction that modified the inode.
3854          */
3855         BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
3856
3857         path->slots[0]++;
3858         if (inode->i_nlink != 1 ||
3859             path->slots[0] >= btrfs_header_nritems(leaf))
3860                 goto cache_acl;
3861
3862         btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
3863         if (location.objectid != btrfs_ino(BTRFS_I(inode)))
3864                 goto cache_acl;
3865
3866         ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3867         if (location.type == BTRFS_INODE_REF_KEY) {
3868                 struct btrfs_inode_ref *ref;
3869
3870                 ref = (struct btrfs_inode_ref *)ptr;
3871                 BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
3872         } else if (location.type == BTRFS_INODE_EXTREF_KEY) {
3873                 struct btrfs_inode_extref *extref;
3874
3875                 extref = (struct btrfs_inode_extref *)ptr;
3876                 BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
3877                                                                      extref);
3878         }
3879 cache_acl:
3880         /*
3881          * try to precache a NULL acl entry for files that don't have
3882          * any xattrs or acls
3883          */
3884         maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3885                         btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
3886         if (first_xattr_slot != -1) {
3887                 path->slots[0] = first_xattr_slot;
3888                 ret = btrfs_load_inode_props(inode, path);
3889                 if (ret)
3890                         btrfs_err(fs_info,
3891                                   "error loading props for ino %llu (root %llu): %d",
3892                                   btrfs_ino(BTRFS_I(inode)),
3893                                   root->root_key.objectid, ret);
3894         }
3895         if (path != in_path)
3896                 btrfs_free_path(path);
3897
3898         if (!maybe_acls)
3899                 cache_no_acl(inode);
3900
3901         switch (inode->i_mode & S_IFMT) {
3902         case S_IFREG:
3903                 inode->i_mapping->a_ops = &btrfs_aops;
3904                 inode->i_fop = &btrfs_file_operations;
3905                 inode->i_op = &btrfs_file_inode_operations;
3906                 break;
3907         case S_IFDIR:
3908                 inode->i_fop = &btrfs_dir_file_operations;
3909                 inode->i_op = &btrfs_dir_inode_operations;
3910                 break;
3911         case S_IFLNK:
3912                 inode->i_op = &btrfs_symlink_inode_operations;
3913                 inode_nohighmem(inode);
3914                 inode->i_mapping->a_ops = &btrfs_aops;
3915                 break;
3916         default:
3917                 inode->i_op = &btrfs_special_inode_operations;
3918                 init_special_inode(inode, inode->i_mode, rdev);
3919                 break;
3920         }
3921
3922         btrfs_sync_inode_flags_to_i_flags(inode);
3923         return 0;
3924 }
3925
3926 /*
3927  * given a leaf and an inode, copy the inode fields into the leaf
3928  */
3929 static void fill_inode_item(struct btrfs_trans_handle *trans,
3930                             struct extent_buffer *leaf,
3931                             struct btrfs_inode_item *item,
3932                             struct inode *inode)
3933 {
3934         struct btrfs_map_token token;
3935         u64 flags;
3936
3937         btrfs_init_map_token(&token, leaf);
3938
3939         btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
3940         btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
3941         btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
3942         btrfs_set_token_inode_mode(&token, item, inode->i_mode);
3943         btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
3944
3945         btrfs_set_token_timespec_sec(&token, &item->atime,
3946                                      inode->i_atime.tv_sec);
3947         btrfs_set_token_timespec_nsec(&token, &item->atime,
3948                                       inode->i_atime.tv_nsec);
3949
3950         btrfs_set_token_timespec_sec(&token, &item->mtime,
3951                                      inode->i_mtime.tv_sec);
3952         btrfs_set_token_timespec_nsec(&token, &item->mtime,
3953                                       inode->i_mtime.tv_nsec);
3954
3955         btrfs_set_token_timespec_sec(&token, &item->ctime,
3956                                      inode->i_ctime.tv_sec);
3957         btrfs_set_token_timespec_nsec(&token, &item->ctime,
3958                                       inode->i_ctime.tv_nsec);
3959
3960         btrfs_set_token_timespec_sec(&token, &item->otime,
3961                                      BTRFS_I(inode)->i_otime.tv_sec);
3962         btrfs_set_token_timespec_nsec(&token, &item->otime,
3963                                       BTRFS_I(inode)->i_otime.tv_nsec);
3964
3965         btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
3966         btrfs_set_token_inode_generation(&token, item,
3967                                          BTRFS_I(inode)->generation);
3968         btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
3969         btrfs_set_token_inode_transid(&token, item, trans->transid);
3970         btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
3971         flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
3972                                           BTRFS_I(inode)->ro_flags);
3973         btrfs_set_token_inode_flags(&token, item, flags);
3974         btrfs_set_token_inode_block_group(&token, item, 0);
3975 }
3976
3977 /*
3978  * copy everything in the in-memory inode into the btree.
3979  */
3980 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3981                                 struct btrfs_root *root,
3982                                 struct btrfs_inode *inode)
3983 {
3984         struct btrfs_inode_item *inode_item;
3985         struct btrfs_path *path;
3986         struct extent_buffer *leaf;
3987         int ret;
3988
3989         path = btrfs_alloc_path();
3990         if (!path)
3991                 return -ENOMEM;
3992
3993         ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
3994         if (ret) {
3995                 if (ret > 0)
3996                         ret = -ENOENT;
3997                 goto failed;
3998         }
3999
4000         leaf = path->nodes[0];
4001         inode_item = btrfs_item_ptr(leaf, path->slots[0],
4002                                     struct btrfs_inode_item);
4003
4004         fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
4005         btrfs_mark_buffer_dirty(leaf);
4006         btrfs_set_inode_last_trans(trans, inode);
4007         ret = 0;
4008 failed:
4009         btrfs_free_path(path);
4010         return ret;
4011 }
4012
4013 /*
4014  * copy everything in the in-memory inode into the btree.
4015  */
4016 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
4017                                 struct btrfs_root *root,
4018                                 struct btrfs_inode *inode)
4019 {
4020         struct btrfs_fs_info *fs_info = root->fs_info;
4021         int ret;
4022
4023         /*
4024          * If the inode is a free space inode, we can deadlock during commit
4025          * if we put it into the delayed code.
4026          *
4027          * The data relocation inode should also be directly updated
4028          * without delay
4029          */
4030         if (!btrfs_is_free_space_inode(inode)
4031             && !btrfs_is_data_reloc_root(root)
4032             && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
4033                 btrfs_update_root_times(trans, root);
4034
4035                 ret = btrfs_delayed_update_inode(trans, root, inode);
4036                 if (!ret)
4037                         btrfs_set_inode_last_trans(trans, inode);
4038                 return ret;
4039         }
4040
4041         return btrfs_update_inode_item(trans, root, inode);
4042 }
4043
4044 int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
4045                                 struct btrfs_root *root, struct btrfs_inode *inode)
4046 {
4047         int ret;
4048
4049         ret = btrfs_update_inode(trans, root, inode);
4050         if (ret == -ENOSPC)
4051                 return btrfs_update_inode_item(trans, root, inode);
4052         return ret;
4053 }
4054
4055 /*
4056  * unlink helper that gets used here in inode.c and in the tree logging
4057  * recovery code.  It remove a link in a directory with a given name, and
4058  * also drops the back refs in the inode to the directory
4059  */
4060 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4061                                 struct btrfs_inode *dir,
4062                                 struct btrfs_inode *inode,
4063                                 const char *name, int name_len)
4064 {
4065         struct btrfs_root *root = dir->root;
4066         struct btrfs_fs_info *fs_info = root->fs_info;
4067         struct btrfs_path *path;
4068         int ret = 0;
4069         struct btrfs_dir_item *di;
4070         u64 index;
4071         u64 ino = btrfs_ino(inode);
4072         u64 dir_ino = btrfs_ino(dir);
4073
4074         path = btrfs_alloc_path();
4075         if (!path) {
4076                 ret = -ENOMEM;
4077                 goto out;
4078         }
4079
4080         di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4081                                     name, name_len, -1);
4082         if (IS_ERR_OR_NULL(di)) {
4083                 ret = di ? PTR_ERR(di) : -ENOENT;
4084                 goto err;
4085         }
4086         ret = btrfs_delete_one_dir_name(trans, root, path, di);
4087         if (ret)
4088                 goto err;
4089         btrfs_release_path(path);
4090
4091         /*
4092          * If we don't have dir index, we have to get it by looking up
4093          * the inode ref, since we get the inode ref, remove it directly,
4094          * it is unnecessary to do delayed deletion.
4095          *
4096          * But if we have dir index, needn't search inode ref to get it.
4097          * Since the inode ref is close to the inode item, it is better
4098          * that we delay to delete it, and just do this deletion when
4099          * we update the inode item.
4100          */
4101         if (inode->dir_index) {
4102                 ret = btrfs_delayed_delete_inode_ref(inode);
4103                 if (!ret) {
4104                         index = inode->dir_index;
4105                         goto skip_backref;
4106                 }
4107         }
4108
4109         ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
4110                                   dir_ino, &index);
4111         if (ret) {
4112                 btrfs_info(fs_info,
4113                         "failed to delete reference to %.*s, inode %llu parent %llu",
4114                         name_len, name, ino, dir_ino);
4115                 btrfs_abort_transaction(trans, ret);
4116                 goto err;
4117         }
4118 skip_backref:
4119         ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4120         if (ret) {
4121                 btrfs_abort_transaction(trans, ret);
4122                 goto err;
4123         }
4124
4125         btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
4126                                    dir_ino);
4127         btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index);
4128
4129         /*
4130          * If we have a pending delayed iput we could end up with the final iput
4131          * being run in btrfs-cleaner context.  If we have enough of these built
4132          * up we can end up burning a lot of time in btrfs-cleaner without any
4133          * way to throttle the unlinks.  Since we're currently holding a ref on
4134          * the inode we can run the delayed iput here without any issues as the
4135          * final iput won't be done until after we drop the ref we're currently
4136          * holding.
4137          */
4138         btrfs_run_delayed_iput(fs_info, inode);
4139 err:
4140         btrfs_free_path(path);
4141         if (ret)
4142                 goto out;
4143
4144         btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2);
4145         inode_inc_iversion(&inode->vfs_inode);
4146         inode_inc_iversion(&dir->vfs_inode);
4147         inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime =
4148                 dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
4149         ret = btrfs_update_inode(trans, root, dir);
4150 out:
4151         return ret;
4152 }
4153
4154 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4155                        struct btrfs_inode *dir, struct btrfs_inode *inode,
4156                        const char *name, int name_len)
4157 {
4158         int ret;
4159         ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len);
4160         if (!ret) {
4161                 drop_nlink(&inode->vfs_inode);
4162                 ret = btrfs_update_inode(trans, inode->root, inode);
4163         }
4164         return ret;
4165 }
4166
4167 /*
4168  * helper to start transaction for unlink and rmdir.
4169  *
4170  * unlink and rmdir are special in btrfs, they do not always free space, so
4171  * if we cannot make our reservations the normal way try and see if there is
4172  * plenty of slack room in the global reserve to migrate, otherwise we cannot
4173  * allow the unlink to occur.
4174  */
4175 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
4176 {
4177         struct btrfs_root *root = BTRFS_I(dir)->root;
4178
4179         /*
4180          * 1 for the possible orphan item
4181          * 1 for the dir item
4182          * 1 for the dir index
4183          * 1 for the inode ref
4184          * 1 for the inode
4185          */
4186         return btrfs_start_transaction_fallback_global_rsv(root, 5);
4187 }
4188
4189 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
4190 {
4191         struct btrfs_trans_handle *trans;
4192         struct inode *inode = d_inode(dentry);
4193         int ret;
4194
4195         trans = __unlink_start_trans(dir);
4196         if (IS_ERR(trans))
4197                 return PTR_ERR(trans);
4198
4199         btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4200                         0);
4201
4202         ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
4203                         BTRFS_I(d_inode(dentry)), dentry->d_name.name,
4204                         dentry->d_name.len);
4205         if (ret)
4206                 goto out;
4207
4208         if (inode->i_nlink == 0) {
4209                 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4210                 if (ret)
4211                         goto out;
4212         }
4213
4214 out:
4215         btrfs_end_transaction(trans);
4216         btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
4217         return ret;
4218 }
4219
4220 static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4221                                struct inode *dir, struct dentry *dentry)
4222 {
4223         struct btrfs_root *root = BTRFS_I(dir)->root;
4224         struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
4225         struct btrfs_path *path;
4226         struct extent_buffer *leaf;
4227         struct btrfs_dir_item *di;
4228         struct btrfs_key key;
4229         const char *name = dentry->d_name.name;
4230         int name_len = dentry->d_name.len;
4231         u64 index;
4232         int ret;
4233         u64 objectid;
4234         u64 dir_ino = btrfs_ino(BTRFS_I(dir));
4235
4236         if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
4237                 objectid = inode->root->root_key.objectid;
4238         } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4239                 objectid = inode->location.objectid;
4240         } else {
4241                 WARN_ON(1);
4242                 return -EINVAL;
4243         }
4244
4245         path = btrfs_alloc_path();
4246         if (!path)
4247                 return -ENOMEM;
4248
4249         di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4250                                    name, name_len, -1);
4251         if (IS_ERR_OR_NULL(di)) {
4252                 ret = di ? PTR_ERR(di) : -ENOENT;
4253                 goto out;
4254         }
4255
4256         leaf = path->nodes[0];
4257         btrfs_dir_item_key_to_cpu(leaf, di, &key);
4258         WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
4259         ret = btrfs_delete_one_dir_name(trans, root, path, di);
4260         if (ret) {
4261                 btrfs_abort_transaction(trans, ret);
4262                 goto out;
4263         }
4264         btrfs_release_path(path);
4265
4266         /*
4267          * This is a placeholder inode for a subvolume we didn't have a
4268          * reference to at the time of the snapshot creation.  In the meantime
4269          * we could have renamed the real subvol link into our snapshot, so
4270          * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
4271          * Instead simply lookup the dir_index_item for this entry so we can
4272          * remove it.  Otherwise we know we have a ref to the root and we can
4273          * call btrfs_del_root_ref, and it _shouldn't_ fail.
4274          */
4275         if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4276                 di = btrfs_search_dir_index_item(root, path, dir_ino,
4277                                                  name, name_len);
4278                 if (IS_ERR_OR_NULL(di)) {
4279                         if (!di)
4280                                 ret = -ENOENT;
4281                         else
4282                                 ret = PTR_ERR(di);
4283                         btrfs_abort_transaction(trans, ret);
4284                         goto out;
4285                 }
4286
4287                 leaf = path->nodes[0];
4288                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4289                 index = key.offset;
4290                 btrfs_release_path(path);
4291         } else {
4292                 ret = btrfs_del_root_ref(trans, objectid,
4293                                          root->root_key.objectid, dir_ino,
4294                                          &index, name, name_len);
4295                 if (ret) {
4296                         btrfs_abort_transaction(trans, ret);
4297                         goto out;
4298                 }
4299         }
4300
4301         ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index);
4302         if (ret) {
4303                 btrfs_abort_transaction(trans, ret);
4304                 goto out;
4305         }
4306
4307         btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
4308         inode_inc_iversion(dir);
4309         dir->i_mtime = dir->i_ctime = current_time(dir);
4310         ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir));
4311         if (ret)
4312                 btrfs_abort_transaction(trans, ret);
4313 out:
4314         btrfs_free_path(path);
4315         return ret;
4316 }
4317
4318 /*
4319  * Helper to check if the subvolume references other subvolumes or if it's
4320  * default.
4321  */
4322 static noinline int may_destroy_subvol(struct btrfs_root *root)
4323 {
4324         struct btrfs_fs_info *fs_info = root->fs_info;
4325         struct btrfs_path *path;
4326         struct btrfs_dir_item *di;
4327         struct btrfs_key key;
4328         u64 dir_id;
4329         int ret;
4330
4331         path = btrfs_alloc_path();
4332         if (!path)
4333                 return -ENOMEM;
4334
4335         /* Make sure this root isn't set as the default subvol */
4336         dir_id = btrfs_super_root_dir(fs_info->super_copy);
4337         di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
4338                                    dir_id, "default", 7, 0);
4339         if (di && !IS_ERR(di)) {
4340                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
4341                 if (key.objectid == root->root_key.objectid) {
4342                         ret = -EPERM;
4343                         btrfs_err(fs_info,
4344                                   "deleting default subvolume %llu is not allowed",
4345                                   key.objectid);
4346                         goto out;
4347                 }
4348                 btrfs_release_path(path);
4349         }
4350
4351         key.objectid = root->root_key.objectid;
4352         key.type = BTRFS_ROOT_REF_KEY;
4353         key.offset = (u64)-1;
4354
4355         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4356         if (ret < 0)
4357                 goto out;
4358         BUG_ON(ret == 0);
4359
4360         ret = 0;
4361         if (path->slots[0] > 0) {
4362                 path->slots[0]--;
4363                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
4364                 if (key.objectid == root->root_key.objectid &&
4365                     key.type == BTRFS_ROOT_REF_KEY)
4366                         ret = -ENOTEMPTY;
4367         }
4368 out:
4369         btrfs_free_path(path);
4370         return ret;
4371 }
4372
4373 /* Delete all dentries for inodes belonging to the root */
4374 static void btrfs_prune_dentries(struct btrfs_root *root)
4375 {
4376         struct btrfs_fs_info *fs_info = root->fs_info;
4377         struct rb_node *node;
4378         struct rb_node *prev;
4379         struct btrfs_inode *entry;
4380         struct inode *inode;
4381         u64 objectid = 0;
4382
4383         if (!BTRFS_FS_ERROR(fs_info))
4384                 WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4385
4386         spin_lock(&root->inode_lock);
4387 again:
4388         node = root->inode_tree.rb_node;
4389         prev = NULL;
4390         while (node) {
4391                 prev = node;
4392                 entry = rb_entry(node, struct btrfs_inode, rb_node);
4393
4394                 if (objectid < btrfs_ino(entry))
4395                         node = node->rb_left;
4396                 else if (objectid > btrfs_ino(entry))
4397                         node = node->rb_right;
4398                 else
4399                         break;
4400         }
4401         if (!node) {
4402                 while (prev) {
4403                         entry = rb_entry(prev, struct btrfs_inode, rb_node);
4404                         if (objectid <= btrfs_ino(entry)) {
4405                                 node = prev;
4406                                 break;
4407                         }
4408                         prev = rb_next(prev);
4409                 }
4410         }
4411         while (node) {
4412                 entry = rb_entry(node, struct btrfs_inode, rb_node);
4413                 objectid = btrfs_ino(entry) + 1;
4414                 inode = igrab(&entry->vfs_inode);
4415                 if (inode) {
4416                         spin_unlock(&root->inode_lock);
4417                         if (atomic_read(&inode->i_count) > 1)
4418                                 d_prune_aliases(inode);
4419                         /*
4420                          * btrfs_drop_inode will have it removed from the inode
4421                          * cache when its usage count hits zero.
4422                          */
4423                         iput(inode);
4424                         cond_resched();
4425                         spin_lock(&root->inode_lock);
4426                         goto again;
4427                 }
4428
4429                 if (cond_resched_lock(&root->inode_lock))
4430                         goto again;
4431
4432                 node = rb_next(node);
4433         }
4434         spin_unlock(&root->inode_lock);
4435 }
4436
4437 int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
4438 {
4439         struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
4440         struct btrfs_root *root = BTRFS_I(dir)->root;
4441         struct inode *inode = d_inode(dentry);
4442         struct btrfs_root *dest = BTRFS_I(inode)->root;
4443         struct btrfs_trans_handle *trans;
4444         struct btrfs_block_rsv block_rsv;
4445         u64 root_flags;
4446         int ret;
4447
4448         /*
4449          * Don't allow to delete a subvolume with send in progress. This is
4450          * inside the inode lock so the error handling that has to drop the bit
4451          * again is not run concurrently.
4452          */
4453         spin_lock(&dest->root_item_lock);
4454         if (dest->send_in_progress) {
4455                 spin_unlock(&dest->root_item_lock);
4456                 btrfs_warn(fs_info,
4457                            "attempt to delete subvolume %llu during send",
4458                            dest->root_key.objectid);
4459                 return -EPERM;
4460         }
4461         root_flags = btrfs_root_flags(&dest->root_item);
4462         btrfs_set_root_flags(&dest->root_item,
4463                              root_flags | BTRFS_ROOT_SUBVOL_DEAD);
4464         spin_unlock(&dest->root_item_lock);
4465
4466         down_write(&fs_info->subvol_sem);
4467
4468         ret = may_destroy_subvol(dest);
4469         if (ret)
4470                 goto out_up_write;
4471
4472         btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
4473         /*
4474          * One for dir inode,
4475          * two for dir entries,
4476          * two for root ref/backref.
4477          */
4478         ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
4479         if (ret)
4480                 goto out_up_write;
4481
4482         trans = btrfs_start_transaction(root, 0);
4483         if (IS_ERR(trans)) {
4484                 ret = PTR_ERR(trans);
4485                 goto out_release;
4486         }
4487         trans->block_rsv = &block_rsv;
4488         trans->bytes_reserved = block_rsv.size;
4489
4490         btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
4491
4492         ret = btrfs_unlink_subvol(trans, dir, dentry);
4493         if (ret) {
4494                 btrfs_abort_transaction(trans, ret);
4495                 goto out_end_trans;
4496         }
4497
4498         ret = btrfs_record_root_in_trans(trans, dest);
4499         if (ret) {
4500                 btrfs_abort_transaction(trans, ret);
4501                 goto out_end_trans;
4502         }
4503
4504         memset(&dest->root_item.drop_progress, 0,
4505                 sizeof(dest->root_item.drop_progress));
4506         btrfs_set_root_drop_level(&dest->root_item, 0);
4507         btrfs_set_root_refs(&dest->root_item, 0);
4508
4509         if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
4510                 ret = btrfs_insert_orphan_item(trans,
4511                                         fs_info->tree_root,
4512                                         dest->root_key.objectid);
4513                 if (ret) {
4514                         btrfs_abort_transaction(trans, ret);
4515                         goto out_end_trans;
4516                 }
4517         }
4518
4519         ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
4520                                   BTRFS_UUID_KEY_SUBVOL,
4521                                   dest->root_key.objectid);
4522         if (ret && ret != -ENOENT) {
4523                 btrfs_abort_transaction(trans, ret);
4524                 goto out_end_trans;
4525         }
4526         if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
4527                 ret = btrfs_uuid_tree_remove(trans,
4528                                           dest->root_item.received_uuid,
4529                                           BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4530                                           dest->root_key.objectid);
4531                 if (ret && ret != -ENOENT) {
4532                         btrfs_abort_transaction(trans, ret);
4533                         goto out_end_trans;
4534                 }
4535         }
4536
4537         free_anon_bdev(dest->anon_dev);
4538         dest->anon_dev = 0;
4539 out_end_trans:
4540         trans->block_rsv = NULL;
4541         trans->bytes_reserved = 0;
4542         ret = btrfs_end_transaction(trans);
4543         inode->i_flags |= S_DEAD;
4544 out_release:
4545         btrfs_subvolume_release_metadata(root, &block_rsv);
4546 out_up_write:
4547         up_write(&fs_info->subvol_sem);
4548         if (ret) {
4549                 spin_lock(&dest->root_item_lock);
4550                 root_flags = btrfs_root_flags(&dest->root_item);
4551                 btrfs_set_root_flags(&dest->root_item,
4552                                 root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
4553                 spin_unlock(&dest->root_item_lock);
4554         } else {
4555                 d_invalidate(dentry);
4556                 btrfs_prune_dentries(dest);
4557                 ASSERT(dest->send_in_progress == 0);
4558         }
4559
4560         return ret;
4561 }
4562
4563 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4564 {
4565         struct inode *inode = d_inode(dentry);
4566         int err = 0;
4567         struct btrfs_trans_handle *trans;
4568         u64 last_unlink_trans;
4569
4570         if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
4571                 return -ENOTEMPTY;
4572         if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID)
4573                 return btrfs_delete_subvolume(dir, dentry);
4574
4575         trans = __unlink_start_trans(dir);
4576         if (IS_ERR(trans))
4577                 return PTR_ERR(trans);
4578
4579         if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4580                 err = btrfs_unlink_subvol(trans, dir, dentry);
4581                 goto out;
4582         }
4583
4584         err = btrfs_orphan_add(trans, BTRFS_I(inode));
4585         if (err)
4586                 goto out;
4587
4588         last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
4589
4590         /* now the directory is empty */
4591         err = btrfs_unlink_inode(trans, BTRFS_I(dir),
4592                         BTRFS_I(d_inode(dentry)), dentry->d_name.name,
4593                         dentry->d_name.len);
4594         if (!err) {
4595                 btrfs_i_size_write(BTRFS_I(inode), 0);
4596                 /*
4597                  * Propagate the last_unlink_trans value of the deleted dir to
4598                  * its parent directory. This is to prevent an unrecoverable
4599                  * log tree in the case we do something like this:
4600                  * 1) create dir foo
4601                  * 2) create snapshot under dir foo
4602                  * 3) delete the snapshot
4603                  * 4) rmdir foo
4604                  * 5) mkdir foo
4605                  * 6) fsync foo or some file inside foo
4606                  */
4607                 if (last_unlink_trans >= trans->transid)
4608                         BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
4609         }
4610 out:
4611         btrfs_end_transaction(trans);
4612         btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
4613
4614         return err;
4615 }
4616
/*
 * Positive, non-error return value of btrfs_truncate_inode_items(): the
 * caller still needs to call truncate_block for the last bit of the
 * truncate.
 */
#define NEED_TRUNCATE_BLOCK 1
4622
4623 /*
4624  * Remove inode items from a given root.
4625  *
4626  * @trans:              A transaction handle.
4627  * @root:               The root from which to remove items.
4628  * @inode:              The inode whose items we want to remove.
4629  * @new_size:           The new i_size for the inode. This is only applicable when
4630  *                      @min_type is BTRFS_EXTENT_DATA_KEY, must be 0 otherwise.
4631  * @min_type:           The minimum key type to remove. All keys with a type
4632  *                      greater than this value are removed and all keys with
4633  *                      this type are removed only if their offset is >= @new_size.
4634  * @extents_found:      Output parameter that will contain the number of file
4635  *                      extent items that were removed or adjusted to the new
4636  *                      inode i_size. The caller is responsible for initializing
4637  *                      the counter. Also, it can be NULL if the caller does not
4638  *                      need this counter.
4639  *
 * Remove all keys associated with the inode from the given root that have a key
 * with a type greater than or equal to @min_type. When @min_type has a value of
 * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value
 * greater than or equal to @new_size. If a file extent item that starts before
 * @new_size and ends after it is found, its length is adjusted.
4645  *
4646  * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is
4647  * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block.
4648  */
4649 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
4650                                struct btrfs_root *root,
4651                                struct btrfs_inode *inode,
4652                                u64 new_size, u32 min_type,
4653                                u64 *extents_found)
4654 {
4655         struct btrfs_fs_info *fs_info = root->fs_info;
4656         struct btrfs_path *path;
4657         struct extent_buffer *leaf;
4658         struct btrfs_file_extent_item *fi;
4659         struct btrfs_key key;
4660         struct btrfs_key found_key;
4661         u64 extent_start = 0;
4662         u64 extent_num_bytes = 0;
4663         u64 extent_offset = 0;
4664         u64 item_end = 0;
4665         u64 last_size = new_size;
4666         u32 found_type = (u8)-1;
4667         int found_extent;
4668         int del_item;
4669         int pending_del_nr = 0;
4670         int pending_del_slot = 0;
4671         int extent_type = -1;
4672         int ret;
4673         u64 ino = btrfs_ino(inode);
4674         u64 bytes_deleted = 0;
4675         bool be_nice = false;
4676         bool should_throttle = false;
4677         const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
4678         struct extent_state *cached_state = NULL;
4679
4680         BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
4681
4682         /*
4683          * For non-free space inodes and non-shareable roots, we want to back
4684          * off from time to time.  This means all inodes in subvolume roots,
4685          * reloc roots, and data reloc roots.
4686          */
4687         if (!btrfs_is_free_space_inode(inode) &&
4688             test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
4689                 be_nice = true;
4690
4691         path = btrfs_alloc_path();
4692         if (!path)
4693                 return -ENOMEM;
4694         path->reada = READA_BACK;
4695
4696         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4697                 lock_extent_bits(&inode->io_tree, lock_start, (u64)-1,
4698                                  &cached_state);
4699
4700                 /*
4701                  * We want to drop from the next block forward in case this
4702                  * new size is not block aligned since we will be keeping the
4703                  * last block of the extent just the way it is.
4704                  */
4705                 btrfs_drop_extent_cache(inode, ALIGN(new_size,
4706                                         fs_info->sectorsize),
4707                                         (u64)-1, 0);
4708         }
4709
4710         /*
4711          * This function is also used to drop the items in the log tree before
4712          * we relog the inode, so if root != BTRFS_I(inode)->root, it means
4713          * it is used to drop the logged items. So we shouldn't kill the delayed
4714          * items.
4715          */
4716         if (min_type == 0 && root == inode->root)
4717                 btrfs_kill_delayed_inode_items(inode);
4718
4719         key.objectid = ino;
4720         key.offset = (u64)-1;
4721         key.type = (u8)-1;
4722
4723 search_again:
4724         /*
4725          * with a 16K leaf size and 128MB extents, you can actually queue
4726          * up a huge file in a single leaf.  Most of the time that
4727          * bytes_deleted is > 0, it will be huge by the time we get here
4728          */
4729         if (be_nice && bytes_deleted > SZ_32M &&
4730             btrfs_should_end_transaction(trans)) {
4731                 ret = -EAGAIN;
4732                 goto out;
4733         }
4734
4735         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
4736         if (ret < 0)
4737                 goto out;
4738
4739         if (ret > 0) {
4740                 ret = 0;
4741                 /* there are no items in the tree for us to truncate, we're
4742                  * done
4743                  */
4744                 if (path->slots[0] == 0)
4745                         goto out;
4746                 path->slots[0]--;
4747         }
4748
4749         while (1) {
4750                 u64 clear_start = 0, clear_len = 0;
4751
4752                 fi = NULL;
4753                 leaf = path->nodes[0];
4754                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4755                 found_type = found_key.type;
4756
4757                 if (found_key.objectid != ino)
4758                         break;
4759
4760                 if (found_type < min_type)
4761                         break;
4762
4763                 item_end = found_key.offset;
4764                 if (found_type == BTRFS_EXTENT_DATA_KEY) {
4765                         fi = btrfs_item_ptr(leaf, path->slots[0],
4766                                             struct btrfs_file_extent_item);
4767                         extent_type = btrfs_file_extent_type(leaf, fi);
4768                         if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4769                                 item_end +=
4770                                     btrfs_file_extent_num_bytes(leaf, fi);
4771
4772                                 trace_btrfs_truncate_show_fi_regular(
4773                                         inode, leaf, fi, found_key.offset);
4774                         } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
4775                                 item_end += btrfs_file_extent_ram_bytes(leaf,
4776                                                                         fi);
4777
4778                                 trace_btrfs_truncate_show_fi_inline(
4779                                         inode, leaf, fi, path->slots[0],
4780                                         found_key.offset);
4781                         }
4782                         item_end--;
4783                 }
4784                 if (found_type > min_type) {
4785                         del_item = 1;
4786                 } else {
4787                         if (item_end < new_size)
4788                                 break;
4789                         if (found_key.offset >= new_size)
4790                                 del_item = 1;
4791                         else
4792                                 del_item = 0;
4793                 }
4794                 found_extent = 0;
4795                 /* FIXME, shrink the extent if the ref count is only 1 */
4796                 if (found_type != BTRFS_EXTENT_DATA_KEY)
4797                         goto delete;
4798
4799                 if (extents_found != NULL)
4800                         (*extents_found)++;
4801
4802                 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4803                         u64 num_dec;
4804
4805                         clear_start = found_key.offset;
4806                         extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
4807                         if (!del_item) {
4808                                 u64 orig_num_bytes =
4809                                         btrfs_file_extent_num_bytes(leaf, fi);
4810                                 extent_num_bytes = ALIGN(new_size -
4811                                                 found_key.offset,
4812                                                 fs_info->sectorsize);
4813                                 clear_start = ALIGN(new_size, fs_info->sectorsize);
4814                                 btrfs_set_file_extent_num_bytes(leaf, fi,
4815                                                          extent_num_bytes);
4816                                 num_dec = (orig_num_bytes -
4817                                            extent_num_bytes);
4818                                 if (test_bit(BTRFS_ROOT_SHAREABLE,
4819                                              &root->state) &&
4820                                     extent_start != 0)
4821                                         inode_sub_bytes(&inode->vfs_inode,
4822                                                         num_dec);
4823                                 btrfs_mark_buffer_dirty(leaf);
4824                         } else {
4825                                 extent_num_bytes =
4826                                         btrfs_file_extent_disk_num_bytes(leaf,
4827                                                                          fi);
4828                                 extent_offset = found_key.offset -
4829                                         btrfs_file_extent_offset(leaf, fi);
4830
4831                                 /* FIXME blocksize != 4096 */
4832                                 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
4833                                 if (extent_start != 0) {
4834                                         found_extent = 1;
4835                                         if (test_bit(BTRFS_ROOT_SHAREABLE,
4836                                                      &root->state))
4837                                                 inode_sub_bytes(&inode->vfs_inode,
4838                                                                 num_dec);
4839                                 }
4840                         }
4841                         clear_len = num_dec;
4842                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
4843                         /*
4844                          * we can't truncate inline items that have had
4845                          * special encodings
4846                          */
4847                         if (!del_item &&
4848                             btrfs_file_extent_encryption(leaf, fi) == 0 &&
4849                             btrfs_file_extent_other_encoding(leaf, fi) == 0 &&
4850                             btrfs_file_extent_compression(leaf, fi) == 0) {
4851                                 u32 size = (u32)(new_size - found_key.offset);
4852
4853                                 btrfs_set_file_extent_ram_bytes(leaf, fi, size);
4854                                 size = btrfs_file_extent_calc_inline_size(size);
4855                                 btrfs_truncate_item(path, size, 1);
4856                         } else if (!del_item) {
4857                                 /*
4858                                  * We have to bail so the last_size is set to
4859                                  * just before this extent.
4860                                  */
4861                                 ret = NEED_TRUNCATE_BLOCK;
4862                                 break;
4863                         } else {
4864                                 /*
4865                                  * Inline extents are special, we just treat
4866                                  * them as a full sector worth in the file
4867                                  * extent tree just for simplicity sake.
4868                                  */
4869                                 clear_len = fs_info->sectorsize;
4870                         }
4871
4872                         if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
4873                                 inode_sub_bytes(&inode->vfs_inode,
4874                                                 item_end + 1 - new_size);
4875                 }
4876 delete:
4877                 /*
4878                  * We use btrfs_truncate_inode_items() to clean up log trees for
4879                  * multiple fsyncs, and in this case we don't want to clear the
4880                  * file extent range because it's just the log.
4881                  */
4882                 if (root == inode->root) {
4883                         ret = btrfs_inode_clear_file_extent_range(inode,
4884                                                   clear_start, clear_len);
4885                         if (ret) {
4886                                 btrfs_abort_transaction(trans, ret);
4887                                 break;
4888                         }
4889                 }
4890
4891                 if (del_item)
4892                         last_size = found_key.offset;
4893                 else
4894                         last_size = new_size;
4895                 if (del_item) {
4896                         if (!pending_del_nr) {
4897                                 /* no pending yet, add ourselves */
4898                                 pending_del_slot = path->slots[0];
4899                                 pending_del_nr = 1;
4900                         } else if (pending_del_nr &&
4901                                    path->slots[0] + 1 == pending_del_slot) {
4902                                 /* hop on the pending chunk */
4903                                 pending_del_nr++;
4904                                 pending_del_slot = path->slots[0];
4905                         } else {
4906                                 BUG();
4907                         }
4908                 } else {
4909                         break;
4910                 }
4911                 should_throttle = false;
4912
4913                 if (found_extent &&
4914                     root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4915                         struct btrfs_ref ref = { 0 };
4916
4917                         bytes_deleted += extent_num_bytes;
4918
4919                         btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
4920                                         extent_start, extent_num_bytes, 0);
4921                         btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
4922                                         ino, extent_offset,
4923                                         root->root_key.objectid, false);
4924                         ret = btrfs_free_extent(trans, &ref);
4925                         if (ret) {
4926                                 btrfs_abort_transaction(trans, ret);
4927                                 break;
4928                         }
4929                         if (be_nice) {
4930                                 if (btrfs_should_throttle_delayed_refs(trans))
4931                                         should_throttle = true;
4932                         }
4933                 }
4934
4935                 if (found_type == BTRFS_INODE_ITEM_KEY)
4936                         break;
4937
4938                 if (path->slots[0] == 0 ||
4939                     path->slots[0] != pending_del_slot ||
4940                     should_throttle) {
4941                         if (pending_del_nr) {
4942                                 ret = btrfs_del_items(trans, root, path,
4943                                                 pending_del_slot,
4944                                                 pending_del_nr);
4945                                 if (ret) {
4946                                         btrfs_abort_transaction(trans, ret);
4947                                         break;
4948                                 }
4949                                 pending_del_nr = 0;
4950                         }
4951                         btrfs_release_path(path);
4952
4953                         /*
4954                          * We can generate a lot of delayed refs, so we need to
4955                          * throttle every once and a while and make sure we're
4956                          * adding enough space to keep up with the work we are
4957                          * generating.  Since we hold a transaction here we
4958                          * can't flush, and we don't want to FLUSH_LIMIT because
4959                          * we could have generated too many delayed refs to
4960                          * actually allocate, so just bail if we're short and
4961                          * let the normal reservation dance happen higher up.
4962                          */
4963                         if (should_throttle) {
4964                                 ret = btrfs_delayed_refs_rsv_refill(fs_info,
4965                                                         BTRFS_RESERVE_NO_FLUSH);
4966                                 if (ret) {
4967                                         ret = -EAGAIN;
4968                                         break;
4969                                 }
4970                         }
4971                         goto search_again;
4972                 } else {
4973                         path->slots[0]--;
4974                 }
4975         }
4976 out:
4977         if (ret >= 0 && pending_del_nr) {
4978                 int err;
4979
4980                 err = btrfs_del_items(trans, root, path, pending_del_slot,
4981                                       pending_del_nr);
4982                 if (err) {
4983                         btrfs_abort_transaction(trans, err);
4984                         ret = err;
4985                 }
4986         }
4987         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4988                 ASSERT(last_size >= new_size);
4989                 if (!ret && last_size > new_size)
4990                         last_size = new_size;
4991                 btrfs_inode_safe_disk_i_size_write(inode, last_size);
4992                 unlock_extent_cached(&inode->io_tree, lock_start, (u64)-1,
4993                                      &cached_state);
4994         }
4995
4996         btrfs_free_path(path);
4997         return ret;
4998 }
4999
5000 /*
5001  * btrfs_truncate_block - read, zero a chunk and write a block
5002  * @inode - inode that we're zeroing
5003  * @from - the offset to start zeroing
5004  * @len - the length to zero, 0 to zero the entire range respective to the
5005  *      offset
5006  * @front - zero up to the offset instead of from the offset on
5007  *
5008  * This will find the block for the "from" offset and cow the block and zero the
5009  * part we want to zero.  This is used with truncate and hole punching.
5010  */
5011 int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
5012                          int front)
5013 {
5014         struct btrfs_fs_info *fs_info = inode->root->fs_info;
5015         struct address_space *mapping = inode->vfs_inode.i_mapping;
5016         struct extent_io_tree *io_tree = &inode->io_tree;
5017         struct btrfs_ordered_extent *ordered;
5018         struct extent_state *cached_state = NULL;
5019         struct extent_changeset *data_reserved = NULL;
5020         bool only_release_metadata = false;
5021         u32 blocksize = fs_info->sectorsize;
5022         pgoff_t index = from >> PAGE_SHIFT;
5023         unsigned offset = from & (blocksize - 1);
5024         struct page *page;
5025         gfp_t mask = btrfs_alloc_write_mask(mapping);
5026         size_t write_bytes = blocksize;
5027         int ret = 0;
5028         u64 block_start;
5029         u64 block_end;
5030
5031         if (IS_ALIGNED(offset, blocksize) &&
5032             (!len || IS_ALIGNED(len, blocksize)))
5033                 goto out;
5034
5035         block_start = round_down(from, blocksize);
5036         block_end = block_start + blocksize - 1;
5037
5038         ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
5039                                           blocksize);
5040         if (ret < 0) {
5041                 if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) {
5042                         /* For nocow case, no need to reserve data space */
5043                         only_release_metadata = true;
5044                 } else {
5045                         goto out;
5046                 }
5047         }
5048         ret = btrfs_delalloc_reserve_metadata(inode, blocksize);
5049         if (ret < 0) {
5050                 if (!only_release_metadata)
5051                         btrfs_free_reserved_data_space(inode, data_reserved,
5052                                                        block_start, blocksize);
5053                 goto out;
5054         }
5055 again:
5056         page = find_or_create_page(mapping, index, mask);
5057         if (!page) {
5058                 btrfs_delalloc_release_space(inode, data_reserved, block_start,
5059                                              blocksize, true);
5060                 btrfs_delalloc_release_extents(inode, blocksize);
5061                 ret = -ENOMEM;
5062                 goto out;
5063         }
5064         ret = set_page_extent_mapped(page);
5065         if (ret < 0)
5066                 goto out_unlock;
5067
5068         if (!PageUptodate(page)) {
5069                 ret = btrfs_readpage(NULL, page);
5070                 lock_page(page);
5071                 if (page->mapping != mapping) {
5072                         unlock_page(page);
5073                         put_page(page);
5074                         goto again;
5075                 }
5076                 if (!PageUptodate(page)) {
5077                         ret = -EIO;
5078                         goto out_unlock;
5079                 }
5080         }
5081         wait_on_page_writeback(page);
5082
5083         lock_extent_bits(io_tree, block_start, block_end, &cached_state);
5084
5085         ordered = btrfs_lookup_ordered_extent(inode, block_start);
5086         if (ordered) {
5087                 unlock_extent_cached(io_tree, block_start, block_end,
5088                                      &cached_state);
5089                 unlock_page(page);
5090                 put_page(page);
5091                 btrfs_start_ordered_extent(ordered, 1);
5092                 btrfs_put_ordered_extent(ordered);
5093                 goto again;
5094         }
5095
5096         clear_extent_bit(&inode->io_tree, block_start, block_end,
5097                          EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
5098                          0, 0, &cached_state);
5099
5100         ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
5101                                         &cached_state);
5102         if (ret) {
5103                 unlock_extent_cached(io_tree, block_start, block_end,
5104                                      &cached_state);
5105                 goto out_unlock;
5106         }
5107
5108         if (offset != blocksize) {
5109                 if (!len)
5110                         len = blocksize - offset;
5111                 if (front)
5112                         memzero_page(page, (block_start - page_offset(page)),
5113                                      offset);
5114                 else
5115                         memzero_page(page, (block_start - page_offset(page)) + offset,
5116                                      len);
5117                 flush_dcache_page(page);
5118         }
5119         btrfs_page_clear_checked(fs_info, page, block_start,
5120                                  block_end + 1 - block_start);
5121         btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
5122         unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
5123
5124         if (only_release_metadata)
5125                 set_extent_bit(&inode->io_tree, block_start, block_end,
5126                                EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL);
5127
5128 out_unlock:
5129         if (ret) {
5130                 if (only_release_metadata)
5131                         btrfs_delalloc_release_metadata(inode, blocksize, true);
5132                 else
5133                         btrfs_delalloc_release_space(inode, data_reserved,
5134                                         block_start, blocksize, true);
5135         }
5136         btrfs_delalloc_release_extents(inode, blocksize);
5137         unlock_page(page);
5138         put_page(page);
5139 out:
5140         if (only_release_metadata)
5141                 btrfs_check_nocow_unlock(inode);
5142         extent_changeset_free(data_reserved);
5143         return ret;
5144 }
5145
5146 static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
5147                              u64 offset, u64 len)
5148 {
5149         struct btrfs_fs_info *fs_info = root->fs_info;
5150         struct btrfs_trans_handle *trans;
5151         struct btrfs_drop_extents_args drop_args = { 0 };
5152         int ret;
5153
5154         /*
5155          * If NO_HOLES is enabled, we don't need to do anything.
5156          * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
5157          * or btrfs_update_inode() will be called, which guarantee that the next
5158          * fsync will know this inode was changed and needs to be logged.
5159          */
5160         if (btrfs_fs_incompat(fs_info, NO_HOLES))
5161                 return 0;
5162
5163         /*
5164          * 1 - for the one we're dropping
5165          * 1 - for the one we're adding
5166          * 1 - for updating the inode.
5167          */
5168         trans = btrfs_start_transaction(root, 3);
5169         if (IS_ERR(trans))
5170                 return PTR_ERR(trans);
5171
5172         drop_args.start = offset;
5173         drop_args.end = offset + len;
5174         drop_args.drop_cache = true;
5175
5176         ret = btrfs_drop_extents(trans, root, inode, &drop_args);
5177         if (ret) {
5178                 btrfs_abort_transaction(trans, ret);
5179                 btrfs_end_transaction(trans);
5180                 return ret;
5181         }
5182
5183         ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
5184                         offset, 0, 0, len, 0, len, 0, 0, 0);
5185         if (ret) {
5186                 btrfs_abort_transaction(trans, ret);
5187         } else {
5188                 btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
5189                 btrfs_update_inode(trans, root, inode);
5190         }
5191         btrfs_end_transaction(trans);
5192         return ret;
5193 }
5194
5195 /*
5196  * This function puts in dummy file extents for the area we're creating a hole
5197  * for.  So if we are truncating this file to a larger size we need to insert
5198  * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
5199  * the range between oldsize and size
5200  */
5201 int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
5202 {
5203         struct btrfs_root *root = inode->root;
5204         struct btrfs_fs_info *fs_info = root->fs_info;
5205         struct extent_io_tree *io_tree = &inode->io_tree;
5206         struct extent_map *em = NULL;
5207         struct extent_state *cached_state = NULL;
5208         struct extent_map_tree *em_tree = &inode->extent_tree;
5209         u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
5210         u64 block_end = ALIGN(size, fs_info->sectorsize);
5211         u64 last_byte;
5212         u64 cur_offset;
5213         u64 hole_size;
5214         int err = 0;
5215
5216         /*
5217          * If our size started in the middle of a block we need to zero out the
5218          * rest of the block before we expand the i_size, otherwise we could
5219          * expose stale data.
5220          */
5221         err = btrfs_truncate_block(inode, oldsize, 0, 0);
5222         if (err)
5223                 return err;
5224
5225         if (size <= hole_start)
5226                 return 0;
5227
5228         btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
5229                                            &cached_state);
5230         cur_offset = hole_start;
5231         while (1) {
5232                 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
5233                                       block_end - cur_offset);
5234                 if (IS_ERR(em)) {
5235                         err = PTR_ERR(em);
5236                         em = NULL;
5237                         break;
5238                 }
5239                 last_byte = min(extent_map_end(em), block_end);
5240                 last_byte = ALIGN(last_byte, fs_info->sectorsize);
5241                 hole_size = last_byte - cur_offset;
5242
5243                 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
5244                         struct extent_map *hole_em;
5245
5246                         err = maybe_insert_hole(root, inode, cur_offset,
5247                                                 hole_size);
5248                         if (err)
5249                                 break;
5250
5251                         err = btrfs_inode_set_file_extent_range(inode,
5252                                                         cur_offset, hole_size);
5253                         if (err)
5254                                 break;
5255
5256                         btrfs_drop_extent_cache(inode, cur_offset,
5257                                                 cur_offset + hole_size - 1, 0);
5258                         hole_em = alloc_extent_map();
5259                         if (!hole_em) {
5260                                 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5261                                         &inode->runtime_flags);
5262                                 goto next;
5263                         }
5264                         hole_em->start = cur_offset;
5265                         hole_em->len = hole_size;
5266                         hole_em->orig_start = cur_offset;
5267
5268                         hole_em->block_start = EXTENT_MAP_HOLE;
5269                         hole_em->block_len = 0;
5270                         hole_em->orig_block_len = 0;
5271                         hole_em->ram_bytes = hole_size;
5272                         hole_em->compress_type = BTRFS_COMPRESS_NONE;
5273                         hole_em->generation = fs_info->generation;
5274
5275                         while (1) {
5276                                 write_lock(&em_tree->lock);
5277                                 err = add_extent_mapping(em_tree, hole_em, 1);
5278                                 write_unlock(&em_tree->lock);
5279                                 if (err != -EEXIST)
5280                                         break;
5281                                 btrfs_drop_extent_cache(inode, cur_offset,
5282                                                         cur_offset +
5283                                                         hole_size - 1, 0);
5284                         }
5285                         free_extent_map(hole_em);
5286                 } else {
5287                         err = btrfs_inode_set_file_extent_range(inode,
5288                                                         cur_offset, hole_size);
5289                         if (err)
5290                                 break;
5291                 }
5292 next:
5293                 free_extent_map(em);
5294                 em = NULL;
5295                 cur_offset = last_byte;
5296                 if (cur_offset >= block_end)
5297                         break;
5298         }
5299         free_extent_map(em);
5300         unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state);
5301         return err;
5302 }
5303
5304 static int btrfs_setsize(struct inode *inode, struct iattr *attr)
5305 {
5306         struct btrfs_root *root = BTRFS_I(inode)->root;
5307         struct btrfs_trans_handle *trans;
5308         loff_t oldsize = i_size_read(inode);
5309         loff_t newsize = attr->ia_size;
5310         int mask = attr->ia_valid;
5311         int ret;
5312
5313         /*
5314          * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
5315          * special case where we need to update the times despite not having
5316          * these flags set.  For all other operations the VFS set these flags
5317          * explicitly if it wants a timestamp update.
5318          */
5319         if (newsize != oldsize) {
5320                 inode_inc_iversion(inode);
5321                 if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
5322                         inode->i_ctime = inode->i_mtime =
5323                                 current_time(inode);
5324         }
5325
5326         if (newsize > oldsize) {
5327                 /*
5328                  * Don't do an expanding truncate while snapshotting is ongoing.
5329                  * This is to ensure the snapshot captures a fully consistent
5330                  * state of this file - if the snapshot captures this expanding
5331                  * truncation, it must capture all writes that happened before
5332                  * this truncation.
5333                  */
5334                 btrfs_drew_write_lock(&root->snapshot_lock);
5335                 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
5336                 if (ret) {
5337                         btrfs_drew_write_unlock(&root->snapshot_lock);
5338                         return ret;
5339                 }
5340
5341                 trans = btrfs_start_transaction(root, 1);
5342                 if (IS_ERR(trans)) {
5343                         btrfs_drew_write_unlock(&root->snapshot_lock);
5344                         return PTR_ERR(trans);
5345                 }
5346
5347                 i_size_write(inode, newsize);
5348                 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
5349                 pagecache_isize_extended(inode, oldsize, newsize);
5350                 ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
5351                 btrfs_drew_write_unlock(&root->snapshot_lock);
5352                 btrfs_end_transaction(trans);
5353         } else {
5354                 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5355
5356                 if (btrfs_is_zoned(fs_info)) {
5357                         ret = btrfs_wait_ordered_range(inode,
5358                                         ALIGN(newsize, fs_info->sectorsize),
5359                                         (u64)-1);
5360                         if (ret)
5361                                 return ret;
5362                 }
5363
5364                 /*
5365                  * We're truncating a file that used to have good data down to
5366                  * zero. Make sure any new writes to the file get on disk
5367                  * on close.
5368                  */
5369                 if (newsize == 0)
5370                         set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
5371                                 &BTRFS_I(inode)->runtime_flags);
5372
5373                 truncate_setsize(inode, newsize);
5374
5375                 inode_dio_wait(inode);
5376
5377                 ret = btrfs_truncate(inode, newsize == oldsize);
5378                 if (ret && inode->i_nlink) {
5379                         int err;
5380
5381                         /*
5382                          * Truncate failed, so fix up the in-memory size. We
5383                          * adjusted disk_i_size down as we removed extents, so
5384                          * wait for disk_i_size to be stable and then update the
5385                          * in-memory size to match.
5386                          */
5387                         err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
5388                         if (err)
5389                                 return err;
5390                         i_size_write(inode, BTRFS_I(inode)->disk_i_size);
5391                 }
5392         }
5393
5394         return ret;
5395 }
5396
5397 static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
5398                          struct iattr *attr)
5399 {
5400         struct inode *inode = d_inode(dentry);
5401         struct btrfs_root *root = BTRFS_I(inode)->root;
5402         int err;
5403
5404         if (btrfs_root_readonly(root))
5405                 return -EROFS;
5406
5407         err = setattr_prepare(mnt_userns, dentry, attr);
5408         if (err)
5409                 return err;
5410
5411         if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
5412                 err = btrfs_setsize(inode, attr);
5413                 if (err)
5414                         return err;
5415         }
5416
5417         if (attr->ia_valid) {
5418                 setattr_copy(mnt_userns, inode, attr);
5419                 inode_inc_iversion(inode);
5420                 err = btrfs_dirty_inode(inode);
5421
5422                 if (!err && attr->ia_valid & ATTR_MODE)
5423                         err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
5424         }
5425
5426         return err;
5427 }
5428
5429 /*
5430  * While truncating the inode pages during eviction, we get the VFS calling
5431  * btrfs_invalidatepage() against each page of the inode. This is slow because
5432  * the calls to btrfs_invalidatepage() result in a huge amount of calls to
5433  * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
5434  * extent_state structures over and over, wasting lots of time.
5435  *
5436  * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
5437  * those expensive operations on a per page basis and do only the ordered io
5438  * finishing, while we release here the extent_map and extent_state structures,
5439  * without the excessive merging and splitting.
5440  */
5441 static void evict_inode_truncate_pages(struct inode *inode)
5442 {
5443         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5444         struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
5445         struct rb_node *node;
5446
5447         ASSERT(inode->i_state & I_FREEING);
5448         truncate_inode_pages_final(&inode->i_data);
5449
5450         write_lock(&map_tree->lock);
5451         while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
5452                 struct extent_map *em;
5453
5454                 node = rb_first_cached(&map_tree->map);
5455                 em = rb_entry(node, struct extent_map, rb_node);
5456                 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
5457                 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
5458                 remove_extent_mapping(map_tree, em);
5459                 free_extent_map(em);
5460                 if (need_resched()) {
5461                         write_unlock(&map_tree->lock);
5462                         cond_resched();
5463                         write_lock(&map_tree->lock);
5464                 }
5465         }
5466         write_unlock(&map_tree->lock);
5467
5468         /*
5469          * Keep looping until we have no more ranges in the io tree.
5470          * We can have ongoing bios started by readahead that have
5471          * their endio callback (extent_io.c:end_bio_extent_readpage)
5472          * still in progress (unlocked the pages in the bio but did not yet
5473          * unlocked the ranges in the io tree). Therefore this means some
5474          * ranges can still be locked and eviction started because before
5475          * submitting those bios, which are executed by a separate task (work
5476          * queue kthread), inode references (inode->i_count) were not taken
5477          * (which would be dropped in the end io callback of each bio).
5478          * Therefore here we effectively end up waiting for those bios and
5479          * anyone else holding locked ranges without having bumped the inode's
5480          * reference count - if we don't do it, when they access the inode's
5481          * io_tree to unlock a range it may be too late, leading to an
5482          * use-after-free issue.
5483          */
5484         spin_lock(&io_tree->lock);
5485         while (!RB_EMPTY_ROOT(&io_tree->state)) {
5486                 struct extent_state *state;
5487                 struct extent_state *cached_state = NULL;
5488                 u64 start;
5489                 u64 end;
5490                 unsigned state_flags;
5491
5492                 node = rb_first(&io_tree->state);
5493                 state = rb_entry(node, struct extent_state, rb_node);
5494                 start = state->start;
5495                 end = state->end;
5496                 state_flags = state->state;
5497                 spin_unlock(&io_tree->lock);
5498
5499                 lock_extent_bits(io_tree, start, end, &cached_state);
5500
5501                 /*
5502                  * If still has DELALLOC flag, the extent didn't reach disk,
5503                  * and its reserved space won't be freed by delayed_ref.
5504                  * So we need to free its reserved space here.
5505                  * (Refer to comment in btrfs_invalidatepage, case 2)
5506                  *
5507                  * Note, end is the bytenr of last byte, so we need + 1 here.
5508                  */
5509                 if (state_flags & EXTENT_DELALLOC)
5510                         btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
5511                                                end - start + 1);
5512
5513                 clear_extent_bit(io_tree, start, end,
5514                                  EXTENT_LOCKED | EXTENT_DELALLOC |
5515                                  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
5516                                  &cached_state);
5517
5518                 cond_resched();
5519                 spin_lock(&io_tree->lock);
5520         }
5521         spin_unlock(&io_tree->lock);
5522 }
5523
5524 static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
5525                                                         struct btrfs_block_rsv *rsv)
5526 {
5527         struct btrfs_fs_info *fs_info = root->fs_info;
5528         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5529         struct btrfs_trans_handle *trans;
5530         u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1);
5531         int ret;
5532
5533         /*
5534          * Eviction should be taking place at some place safe because of our
5535          * delayed iputs.  However the normal flushing code will run delayed
5536          * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
5537          *
5538          * We reserve the delayed_refs_extra here again because we can't use
5539          * btrfs_start_transaction(root, 0) for the same deadlocky reason as
5540          * above.  We reserve our extra bit here because we generate a ton of
5541          * delayed refs activity by truncating.
5542          *
5543          * If we cannot make our reservation we'll attempt to steal from the
5544          * global reserve, because we really want to be able to free up space.
5545          */
5546         ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra,
5547                                      BTRFS_RESERVE_FLUSH_EVICT);
5548         if (ret) {
5549                 /*
5550                  * Try to steal from the global reserve if there is space for
5551                  * it.
5552                  */
5553                 if (btrfs_check_space_for_delayed_refs(fs_info) ||
5554                     btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) {
5555                         btrfs_warn(fs_info,
5556                                    "could not allocate space for delete; will truncate on mount");
5557                         return ERR_PTR(-ENOSPC);
5558                 }
5559                 delayed_refs_extra = 0;
5560         }
5561
5562         trans = btrfs_join_transaction(root);
5563         if (IS_ERR(trans))
5564                 return trans;
5565
5566         if (delayed_refs_extra) {
5567                 trans->block_rsv = &fs_info->trans_block_rsv;
5568                 trans->bytes_reserved = delayed_refs_extra;
5569                 btrfs_block_rsv_migrate(rsv, trans->block_rsv,
5570                                         delayed_refs_extra, 1);
5571         }
5572         return trans;
5573 }
5574
5575 void btrfs_evict_inode(struct inode *inode)
5576 {
5577         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5578         struct btrfs_trans_handle *trans;
5579         struct btrfs_root *root = BTRFS_I(inode)->root;
5580         struct btrfs_block_rsv *rsv;
5581         int ret;
5582
5583         trace_btrfs_inode_evict(inode);
5584
5585         if (!root) {
5586                 fsverity_cleanup_inode(inode);
5587                 clear_inode(inode);
5588                 return;
5589         }
5590
5591         evict_inode_truncate_pages(inode);
5592
5593         if (inode->i_nlink &&
5594             ((btrfs_root_refs(&root->root_item) != 0 &&
5595               root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
5596              btrfs_is_free_space_inode(BTRFS_I(inode))))
5597                 goto no_delete;
5598
5599         if (is_bad_inode(inode))
5600                 goto no_delete;
5601
5602         btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
5603
5604         if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
5605                 goto no_delete;
5606
5607         if (inode->i_nlink > 0) {
5608                 BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
5609                        root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
5610                 goto no_delete;
5611         }
5612
5613         ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
5614         if (ret)
5615                 goto no_delete;
5616
5617         rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
5618         if (!rsv)
5619                 goto no_delete;
5620         rsv->size = btrfs_calc_metadata_size(fs_info, 1);
5621         rsv->failfast = 1;
5622
5623         btrfs_i_size_write(BTRFS_I(inode), 0);
5624
5625         while (1) {
5626                 trans = evict_refill_and_join(root, rsv);
5627                 if (IS_ERR(trans))
5628                         goto free_rsv;
5629
5630                 trans->block_rsv = rsv;
5631
5632                 ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
5633                                                  0, 0, NULL);
5634                 trans->block_rsv = &fs_info->trans_block_rsv;
5635                 btrfs_end_transaction(trans);
5636                 btrfs_btree_balance_dirty(fs_info);
5637                 if (ret && ret != -ENOSPC && ret != -EAGAIN)
5638                         goto free_rsv;
5639                 else if (!ret)
5640                         break;
5641         }
5642
5643         /*
5644          * Errors here aren't a big deal, it just means we leave orphan items in
5645          * the tree. They will be cleaned up on the next mount. If the inode
5646          * number gets reused, cleanup deletes the orphan item without doing
5647          * anything, and unlink reuses the existing orphan item.
5648          *
5649          * If it turns out that we are dropping too many of these, we might want
5650          * to add a mechanism for retrying these after a commit.
5651          */
5652         trans = evict_refill_and_join(root, rsv);
5653         if (!IS_ERR(trans)) {
5654                 trans->block_rsv = rsv;
5655                 btrfs_orphan_del(trans, BTRFS_I(inode));
5656                 trans->block_rsv = &fs_info->trans_block_rsv;
5657                 btrfs_end_transaction(trans);
5658         }
5659
5660 free_rsv:
5661         btrfs_free_block_rsv(fs_info, rsv);
5662 no_delete:
5663         /*
5664          * If we didn't successfully delete, the orphan item will still be in
5665          * the tree and we'll retry on the next mount. Again, we might also want
5666          * to retry these periodically in the future.
5667          */
5668         btrfs_remove_delayed_node(BTRFS_I(inode));
5669         fsverity_cleanup_inode(inode);
5670         clear_inode(inode);
5671 }
5672
5673 /*
5674  * Return the key found in the dir entry in the location pointer, fill @type
5675  * with BTRFS_FT_*, and return 0.
5676  *
5677  * If no dir entries were found, returns -ENOENT.
5678  * If found a corrupted location in dir entry, returns -EUCLEAN.
5679  */
5680 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
5681                                struct btrfs_key *location, u8 *type)
5682 {
5683         const char *name = dentry->d_name.name;
5684         int namelen = dentry->d_name.len;
5685         struct btrfs_dir_item *di;
5686         struct btrfs_path *path;
5687         struct btrfs_root *root = BTRFS_I(dir)->root;
5688         int ret = 0;
5689
5690         path = btrfs_alloc_path();
5691         if (!path)
5692                 return -ENOMEM;
5693
5694         di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
5695                         name, namelen, 0);
5696         if (IS_ERR_OR_NULL(di)) {
5697                 ret = di ? PTR_ERR(di) : -ENOENT;
5698                 goto out;
5699         }
5700
5701         btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
5702         if (location->type != BTRFS_INODE_ITEM_KEY &&
5703             location->type != BTRFS_ROOT_ITEM_KEY) {
5704                 ret = -EUCLEAN;
5705                 btrfs_warn(root->fs_info,
5706 "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
5707                            __func__, name, btrfs_ino(BTRFS_I(dir)),
5708                            location->objectid, location->type, location->offset);
5709         }
5710         if (!ret)
5711                 *type = btrfs_dir_type(path->nodes[0], di);
5712 out:
5713         btrfs_free_path(path);
5714         return ret;
5715 }
5716
5717 /*
5718  * when we hit a tree root in a directory, the btrfs part of the inode
5719  * needs to be changed to reflect the root directory of the tree root.  This
5720  * is kind of like crossing a mount point.
5721  */
5722 static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
5723                                     struct inode *dir,
5724                                     struct dentry *dentry,
5725                                     struct btrfs_key *location,
5726                                     struct btrfs_root **sub_root)
5727 {
5728         struct btrfs_path *path;
5729         struct btrfs_root *new_root;
5730         struct btrfs_root_ref *ref;
5731         struct extent_buffer *leaf;
5732         struct btrfs_key key;
5733         int ret;
5734         int err = 0;
5735
5736         path = btrfs_alloc_path();
5737         if (!path) {
5738                 err = -ENOMEM;
5739                 goto out;
5740         }
5741
5742         err = -ENOENT;
5743         key.objectid = BTRFS_I(dir)->root->root_key.objectid;
5744         key.type = BTRFS_ROOT_REF_KEY;
5745         key.offset = location->objectid;
5746
5747         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
5748         if (ret) {
5749                 if (ret < 0)
5750                         err = ret;
5751                 goto out;
5752         }
5753
5754         leaf = path->nodes[0];
5755         ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
5756         if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) ||
5757             btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
5758                 goto out;
5759
5760         ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
5761                                    (unsigned long)(ref + 1),
5762                                    dentry->d_name.len);
5763         if (ret)
5764                 goto out;
5765
5766         btrfs_release_path(path);
5767
5768         new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
5769         if (IS_ERR(new_root)) {
5770                 err = PTR_ERR(new_root);
5771                 goto out;
5772         }
5773
5774         *sub_root = new_root;
5775         location->objectid = btrfs_root_dirid(&new_root->root_item);
5776         location->type = BTRFS_INODE_ITEM_KEY;
5777         location->offset = 0;
5778         err = 0;
5779 out:
5780         btrfs_free_path(path);
5781         return err;
5782 }
5783
5784 static void inode_tree_add(struct inode *inode)
5785 {
5786         struct btrfs_root *root = BTRFS_I(inode)->root;
5787         struct btrfs_inode *entry;
5788         struct rb_node **p;
5789         struct rb_node *parent;
5790         struct rb_node *new = &BTRFS_I(inode)->rb_node;
5791         u64 ino = btrfs_ino(BTRFS_I(inode));
5792
5793         if (inode_unhashed(inode))
5794                 return;
5795         parent = NULL;
5796         spin_lock(&root->inode_lock);
5797         p = &root->inode_tree.rb_node;
5798         while (*p) {
5799                 parent = *p;
5800                 entry = rb_entry(parent, struct btrfs_inode, rb_node);
5801
5802                 if (ino < btrfs_ino(entry))
5803                         p = &parent->rb_left;
5804                 else if (ino > btrfs_ino(entry))
5805                         p = &parent->rb_right;
5806                 else {
5807                         WARN_ON(!(entry->vfs_inode.i_state &
5808                                   (I_WILL_FREE | I_FREEING)));
5809                         rb_replace_node(parent, new, &root->inode_tree);
5810                         RB_CLEAR_NODE(parent);
5811                         spin_unlock(&root->inode_lock);
5812                         return;
5813                 }
5814         }
5815         rb_link_node(new, parent, p);
5816         rb_insert_color(new, &root->inode_tree);
5817         spin_unlock(&root->inode_lock);
5818 }
5819
5820 static void inode_tree_del(struct btrfs_inode *inode)
5821 {
5822         struct btrfs_root *root = inode->root;
5823         int empty = 0;
5824
5825         spin_lock(&root->inode_lock);
5826         if (!RB_EMPTY_NODE(&inode->rb_node)) {
5827                 rb_erase(&inode->rb_node, &root->inode_tree);
5828                 RB_CLEAR_NODE(&inode->rb_node);
5829                 empty = RB_EMPTY_ROOT(&root->inode_tree);
5830         }
5831         spin_unlock(&root->inode_lock);
5832
5833         if (empty && btrfs_root_refs(&root->root_item) == 0) {
5834                 spin_lock(&root->inode_lock);
5835                 empty = RB_EMPTY_ROOT(&root->inode_tree);
5836                 spin_unlock(&root->inode_lock);
5837                 if (empty)
5838                         btrfs_add_dead_root(root);
5839         }
5840 }
5841
5842
5843 static int btrfs_init_locked_inode(struct inode *inode, void *p)
5844 {
5845         struct btrfs_iget_args *args = p;
5846
5847         inode->i_ino = args->ino;
5848         BTRFS_I(inode)->location.objectid = args->ino;
5849         BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
5850         BTRFS_I(inode)->location.offset = 0;
5851         BTRFS_I(inode)->root = btrfs_grab_root(args->root);
5852         BUG_ON(args->root && !BTRFS_I(inode)->root);
5853         return 0;
5854 }
5855
5856 static int btrfs_find_actor(struct inode *inode, void *opaque)
5857 {
5858         struct btrfs_iget_args *args = opaque;
5859
5860         return args->ino == BTRFS_I(inode)->location.objectid &&
5861                 args->root == BTRFS_I(inode)->root;
5862 }
5863
5864 static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
5865                                        struct btrfs_root *root)
5866 {
5867         struct inode *inode;
5868         struct btrfs_iget_args args;
5869         unsigned long hashval = btrfs_inode_hash(ino, root);
5870
5871         args.ino = ino;
5872         args.root = root;
5873
5874         inode = iget5_locked(s, hashval, btrfs_find_actor,
5875                              btrfs_init_locked_inode,
5876                              (void *)&args);
5877         return inode;
5878 }
5879
5880 /*
5881  * Get an inode object given its inode number and corresponding root.
5882  * Path can be preallocated to prevent recursing back to iget through
5883  * allocator. NULL is also valid but may require an additional allocation
5884  * later.
5885  */
5886 struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
5887                               struct btrfs_root *root, struct btrfs_path *path)
5888 {
5889         struct inode *inode;
5890
5891         inode = btrfs_iget_locked(s, ino, root);
5892         if (!inode)
5893                 return ERR_PTR(-ENOMEM);
5894
5895         if (inode->i_state & I_NEW) {
5896                 int ret;
5897
5898                 ret = btrfs_read_locked_inode(inode, path);
5899                 if (!ret) {
5900                         inode_tree_add(inode);
5901                         unlock_new_inode(inode);
5902                 } else {
5903                         iget_failed(inode);
5904                         /*
5905                          * ret > 0 can come from btrfs_search_slot called by
5906                          * btrfs_read_locked_inode, this means the inode item
5907                          * was not found.
5908                          */
5909                         if (ret > 0)
5910                                 ret = -ENOENT;
5911                         inode = ERR_PTR(ret);
5912                 }
5913         }
5914
5915         return inode;
5916 }
5917
5918 struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
5919 {
5920         return btrfs_iget_path(s, ino, root, NULL);
5921 }
5922
5923 static struct inode *new_simple_dir(struct super_block *s,
5924                                     struct btrfs_key *key,
5925                                     struct btrfs_root *root)
5926 {
5927         struct inode *inode = new_inode(s);
5928
5929         if (!inode)
5930                 return ERR_PTR(-ENOMEM);
5931
5932         BTRFS_I(inode)->root = btrfs_grab_root(root);
5933         memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
5934         set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
5935
5936         inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
5937         /*
5938          * We only need lookup, the rest is read-only and there's no inode
5939          * associated with the dentry
5940          */
5941         inode->i_op = &simple_dir_inode_operations;
5942         inode->i_opflags &= ~IOP_XATTR;
5943         inode->i_fop = &simple_dir_operations;
5944         inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5945         inode->i_mtime = current_time(inode);
5946         inode->i_atime = inode->i_mtime;
5947         inode->i_ctime = inode->i_mtime;
5948         BTRFS_I(inode)->i_otime = inode->i_mtime;
5949
5950         return inode;
5951 }
5952
5953 static inline u8 btrfs_inode_type(struct inode *inode)
5954 {
5955         /*
5956          * Compile-time asserts that generic FT_* types still match
5957          * BTRFS_FT_* types
5958          */
5959         BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN);
5960         BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE);
5961         BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR);
5962         BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV);
5963         BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV);
5964         BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO);
5965         BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK);
5966         BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK);
5967
5968         return fs_umode_to_ftype(inode->i_mode);
5969 }
5970
5971 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5972 {
5973         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
5974         struct inode *inode;
5975         struct btrfs_root *root = BTRFS_I(dir)->root;
5976         struct btrfs_root *sub_root = root;
5977         struct btrfs_key location;
5978         u8 di_type = 0;
5979         int ret = 0;
5980
5981         if (dentry->d_name.len > BTRFS_NAME_LEN)
5982                 return ERR_PTR(-ENAMETOOLONG);
5983
5984         ret = btrfs_inode_by_name(dir, dentry, &location, &di_type);
5985         if (ret < 0)
5986                 return ERR_PTR(ret);
5987
5988         if (location.type == BTRFS_INODE_ITEM_KEY) {
5989                 inode = btrfs_iget(dir->i_sb, location.objectid, root);
5990                 if (IS_ERR(inode))
5991                         return inode;
5992
5993                 /* Do extra check against inode mode with di_type */
5994                 if (btrfs_inode_type(inode) != di_type) {
5995                         btrfs_crit(fs_info,
5996 "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
5997                                   inode->i_mode, btrfs_inode_type(inode),
5998                                   di_type);
5999                         iput(inode);
6000                         return ERR_PTR(-EUCLEAN);
6001                 }
6002                 return inode;
6003         }
6004
6005         ret = fixup_tree_root_location(fs_info, dir, dentry,
6006                                        &location, &sub_root);
6007         if (ret < 0) {
6008                 if (ret != -ENOENT)
6009                         inode = ERR_PTR(ret);
6010                 else
6011                         inode = new_simple_dir(dir->i_sb, &location, sub_root);
6012         } else {
6013                 inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
6014         }
6015         if (root != sub_root)
6016                 btrfs_put_root(sub_root);
6017
6018         if (!IS_ERR(inode) && root != sub_root) {
6019                 down_read(&fs_info->cleanup_work_sem);
6020                 if (!sb_rdonly(inode->i_sb))
6021                         ret = btrfs_orphan_cleanup(sub_root);
6022                 up_read(&fs_info->cleanup_work_sem);
6023                 if (ret) {
6024                         iput(inode);
6025                         inode = ERR_PTR(ret);
6026                 }
6027         }
6028
6029         return inode;
6030 }
6031
6032 static int btrfs_dentry_delete(const struct dentry *dentry)
6033 {
6034         struct btrfs_root *root;
6035         struct inode *inode = d_inode(dentry);
6036
6037         if (!inode && !IS_ROOT(dentry))
6038                 inode = d_inode(dentry->d_parent);
6039
6040         if (inode) {
6041                 root = BTRFS_I(inode)->root;
6042                 if (btrfs_root_refs(&root->root_item) == 0)
6043                         return 1;
6044
6045                 if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
6046                         return 1;
6047         }
6048         return 0;
6049 }
6050
6051 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
6052                                    unsigned int flags)
6053 {
6054         struct inode *inode = btrfs_lookup_dentry(dir, dentry);
6055
6056         if (inode == ERR_PTR(-ENOENT))
6057                 inode = NULL;
6058         return d_splice_alias(inode, dentry);
6059 }
6060
6061 /*
6062  * All this infrastructure exists because dir_emit can fault, and we are holding
6063  * the tree lock when doing readdir.  For now just allocate a buffer and copy
6064  * our information into that, and then dir_emit from the buffer.  This is
6065  * similar to what NFS does, only we don't keep the buffer around in pagecache
6066  * because I'm afraid I'll mess that up.  Long term we need to make filldir do
6067  * copy_to_user_inatomic so we don't have to worry about page faulting under the
6068  * tree lock.
6069  */
6070 static int btrfs_opendir(struct inode *inode, struct file *file)
6071 {
6072         struct btrfs_file_private *private;
6073
6074         private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
6075         if (!private)
6076                 return -ENOMEM;
6077         private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
6078         if (!private->filldir_buf) {
6079                 kfree(private);
6080                 return -ENOMEM;
6081         }
6082         file->private_data = private;
6083         return 0;
6084 }
6085
6086 struct dir_entry {
6087         u64 ino;
6088         u64 offset;
6089         unsigned type;
6090         int name_len;
6091 };
6092
6093 static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
6094 {
6095         while (entries--) {
6096                 struct dir_entry *entry = addr;
6097                 char *name = (char *)(entry + 1);
6098
6099                 ctx->pos = get_unaligned(&entry->offset);
6100                 if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
6101                                          get_unaligned(&entry->ino),
6102                                          get_unaligned(&entry->type)))
6103                         return 1;
6104                 addr += sizeof(struct dir_entry) +
6105                         get_unaligned(&entry->name_len);
6106                 ctx->pos++;
6107         }
6108         return 0;
6109 }
6110
6111 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
6112 {
6113         struct inode *inode = file_inode(file);
6114         struct btrfs_root *root = BTRFS_I(inode)->root;
6115         struct btrfs_file_private *private = file->private_data;
6116         struct btrfs_dir_item *di;
6117         struct btrfs_key key;
6118         struct btrfs_key found_key;
6119         struct btrfs_path *path;
6120         void *addr;
6121         struct list_head ins_list;
6122         struct list_head del_list;
6123         int ret;
6124         struct extent_buffer *leaf;
6125         int slot;
6126         char *name_ptr;
6127         int name_len;
6128         int entries = 0;
6129         int total_len = 0;
6130         bool put = false;
6131         struct btrfs_key location;
6132
6133         if (!dir_emit_dots(file, ctx))
6134                 return 0;
6135
6136         path = btrfs_alloc_path();
6137         if (!path)
6138                 return -ENOMEM;
6139
6140         addr = private->filldir_buf;
6141         path->reada = READA_FORWARD;
6142
6143         INIT_LIST_HEAD(&ins_list);
6144         INIT_LIST_HEAD(&del_list);
6145         put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list);
6146
6147 again:
6148         key.type = BTRFS_DIR_INDEX_KEY;
6149         key.offset = ctx->pos;
6150         key.objectid = btrfs_ino(BTRFS_I(inode));
6151
6152         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6153         if (ret < 0)
6154                 goto err;
6155
6156         while (1) {
6157                 struct dir_entry *entry;
6158
6159                 leaf = path->nodes[0];
6160                 slot = path->slots[0];
6161                 if (slot >= btrfs_header_nritems(leaf)) {
6162                         ret = btrfs_next_leaf(root, path);
6163                         if (ret < 0)
6164                                 goto err;
6165                         else if (ret > 0)
6166                                 break;
6167                         continue;
6168                 }
6169
6170                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
6171
6172                 if (found_key.objectid != key.objectid)
6173                         break;
6174                 if (found_key.type != BTRFS_DIR_INDEX_KEY)
6175                         break;
6176                 if (found_key.offset < ctx->pos)
6177                         goto next;
6178                 if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
6179                         goto next;
6180                 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
6181                 name_len = btrfs_dir_name_len(leaf, di);
6182                 if ((total_len + sizeof(struct dir_entry) + name_len) >=
6183                     PAGE_SIZE) {
6184                         btrfs_release_path(path);
6185                         ret = btrfs_filldir(private->filldir_buf, entries, ctx);
6186                         if (ret)
6187                                 goto nopos;
6188                         addr = private->filldir_buf;
6189                         entries = 0;
6190                         total_len = 0;
6191                         goto again;
6192                 }
6193
6194                 entry = addr;
6195                 put_unaligned(name_len, &entry->name_len);
6196                 name_ptr = (char *)(entry + 1);
6197                 read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
6198                                    name_len);
6199                 put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)),
6200                                 &entry->type);
6201                 btrfs_dir_item_key_to_cpu(leaf, di, &location);
6202                 put_unaligned(location.objectid, &entry->ino);
6203                 put_unaligned(found_key.offset, &entry->offset);
6204                 entries++;
6205                 addr += sizeof(struct dir_entry) + name_len;
6206                 total_len += sizeof(struct dir_entry) + name_len;
6207 next:
6208                 path->slots[0]++;
6209         }
6210         btrfs_release_path(path);
6211
6212         ret = btrfs_filldir(private->filldir_buf, entries, ctx);
6213         if (ret)
6214                 goto nopos;
6215
6216         ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
6217         if (ret)
6218                 goto nopos;
6219
6220         /*
6221          * Stop new entries from being returned after we return the last
6222          * entry.
6223          *
6224          * New directory entries are assigned a strictly increasing
6225          * offset.  This means that new entries created during readdir
6226          * are *guaranteed* to be seen in the future by that readdir.
6227          * This has broken buggy programs which operate on names as
6228          * they're returned by readdir.  Until we re-use freed offsets
6229          * we have this hack to stop new entries from being returned
6230          * under the assumption that they'll never reach this huge
6231          * offset.
6232          *
6233          * This is being careful not to overflow 32bit loff_t unless the
6234          * last entry requires it because doing so has broken 32bit apps
6235          * in the past.
6236          */
6237         if (ctx->pos >= INT_MAX)
6238                 ctx->pos = LLONG_MAX;
6239         else
6240                 ctx->pos = INT_MAX;
6241 nopos:
6242         ret = 0;
6243 err:
6244         if (put)
6245                 btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
6246         btrfs_free_path(path);
6247         return ret;
6248 }
6249
6250 /*
6251  * This is somewhat expensive, updating the tree every time the
6252  * inode changes.  But, it is most likely to find the inode in cache.
6253  * FIXME, needs more benchmarking...there are no reasons other than performance
6254  * to keep or drop this code.
6255  */
6256 static int btrfs_dirty_inode(struct inode *inode)
6257 {
6258         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
6259         struct btrfs_root *root = BTRFS_I(inode)->root;
6260         struct btrfs_trans_handle *trans;
6261         int ret;
6262
6263         if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
6264                 return 0;
6265
6266         trans = btrfs_join_transaction(root);
6267         if (IS_ERR(trans))
6268                 return PTR_ERR(trans);
6269
6270         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
6271         if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
6272                 /* whoops, lets try again with the full transaction */
6273                 btrfs_end_transaction(trans);
6274                 trans = btrfs_start_transaction(root, 1);
6275                 if (IS_ERR(trans))
6276                         return PTR_ERR(trans);
6277
6278                 ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
6279         }
6280         btrfs_end_transaction(trans);
6281         if (BTRFS_I(inode)->delayed_node)
6282                 btrfs_balance_delayed_items(fs_info);
6283
6284         return ret;
6285 }
6286
6287 /*
6288  * This is a copy of file_update_time.  We need this so we can return error on
6289  * ENOSPC for updating the inode in the case of file write and mmap writes.
6290  */
6291 static int btrfs_update_time(struct inode *inode, struct timespec64 *now,
6292                              int flags)
6293 {
6294         struct btrfs_root *root = BTRFS_I(inode)->root;
6295         bool dirty = flags & ~S_VERSION;
6296
6297         if (btrfs_root_readonly(root))
6298                 return -EROFS;
6299
6300         if (flags & S_VERSION)
6301                 dirty |= inode_maybe_inc_iversion(inode, dirty);
6302         if (flags & S_CTIME)
6303                 inode->i_ctime = *now;
6304         if (flags & S_MTIME)
6305                 inode->i_mtime = *now;
6306         if (flags & S_ATIME)
6307                 inode->i_atime = *now;
6308         return dirty ? btrfs_dirty_inode(inode) : 0;
6309 }
6310
6311 /*
6312  * find the highest existing sequence number in a directory
6313  * and then set the in-memory index_cnt variable to reflect
6314  * free sequence numbers
6315  */
6316 static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
6317 {
6318         struct btrfs_root *root = inode->root;
6319         struct btrfs_key key, found_key;
6320         struct btrfs_path *path;
6321         struct extent_buffer *leaf;
6322         int ret;
6323
6324         key.objectid = btrfs_ino(inode);
6325         key.type = BTRFS_DIR_INDEX_KEY;
6326         key.offset = (u64)-1;
6327
6328         path = btrfs_alloc_path();
6329         if (!path)
6330                 return -ENOMEM;
6331
6332         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6333         if (ret < 0)
6334                 goto out;
6335         /* FIXME: we should be able to handle this */
6336         if (ret == 0)
6337                 goto out;
6338         ret = 0;
6339
6340         /*
6341          * MAGIC NUMBER EXPLANATION:
6342          * since we search a directory based on f_pos we have to start at 2
6343          * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
6344          * else has to start at 2
6345          */
6346         if (path->slots[0] == 0) {
6347                 inode->index_cnt = 2;
6348                 goto out;
6349         }
6350
6351         path->slots[0]--;
6352
6353         leaf = path->nodes[0];
6354         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6355
6356         if (found_key.objectid != btrfs_ino(inode) ||
6357             found_key.type != BTRFS_DIR_INDEX_KEY) {
6358                 inode->index_cnt = 2;
6359                 goto out;
6360         }
6361
6362         inode->index_cnt = found_key.offset + 1;
6363 out:
6364         btrfs_free_path(path);
6365         return ret;
6366 }
6367
6368 /*
6369  * helper to find a free sequence number in a given directory.  This current
6370  * code is very simple, later versions will do smarter things in the btree
6371  */
6372 int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
6373 {
6374         int ret = 0;
6375
6376         if (dir->index_cnt == (u64)-1) {
6377                 ret = btrfs_inode_delayed_dir_index_count(dir);
6378                 if (ret) {
6379                         ret = btrfs_set_inode_index_count(dir);
6380                         if (ret)
6381                                 return ret;
6382                 }
6383         }
6384
6385         *index = dir->index_cnt;
6386         dir->index_cnt++;
6387
6388         return ret;
6389 }
6390
6391 static int btrfs_insert_inode_locked(struct inode *inode)
6392 {
6393         struct btrfs_iget_args args;
6394
6395         args.ino = BTRFS_I(inode)->location.objectid;
6396         args.root = BTRFS_I(inode)->root;
6397
6398         return insert_inode_locked4(inode,
6399                    btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
6400                    btrfs_find_actor, &args);
6401 }
6402
6403 /*
6404  * Inherit flags from the parent inode.
6405  *
6406  * Currently only the compression flags and the cow flags are inherited.
6407  */
6408 static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
6409 {
6410         unsigned int flags;
6411
6412         if (!dir)
6413                 return;
6414
6415         flags = BTRFS_I(dir)->flags;
6416
6417         if (flags & BTRFS_INODE_NOCOMPRESS) {
6418                 BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
6419                 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
6420         } else if (flags & BTRFS_INODE_COMPRESS) {
6421                 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
6422                 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
6423         }
6424
6425         if (flags & BTRFS_INODE_NODATACOW) {
6426                 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
6427                 if (S_ISREG(inode->i_mode))
6428                         BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6429         }
6430
6431         btrfs_sync_inode_flags_to_i_flags(inode);
6432 }
6433
/*
 * Create a new in-memory inode numbered @objectid and insert its items into
 * @root.
 *
 * One batch insertion adds the inode item, keyed
 * (@objectid, BTRFS_INODE_ITEM_KEY, 0), and - only when @name is non-NULL -
 * an inode ref item keyed (@objectid, BTRFS_INODE_REF_KEY, @ref_objectid)
 * that stores @name and *@index.  A NULL @name sets the link count to 0
 * before the inode item is filled in (the O_TMPFILE case).
 *
 * @trans:        transaction handle; trans->transid becomes the generation
 * @root:         root that receives the new items
 * @mnt_userns:   user namespace handed to inode_init_owner()
 * @dir:          parent directory; the local checks tolerate NULL, but it is
 *                also passed unchecked to btrfs_inherit_iflags() and
 *                btrfs_inode_inherit_props() - TODO confirm they accept NULL
 * @name:         name stored in the inode ref, or NULL for no ref
 * @name_len:     length of @name in bytes
 * @ref_objectid: key offset of the inode ref item
 * @objectid:     inode number to assign
 * @mode:         mode handed to inode_init_owner()
 * @index:        always dereferenced; set by btrfs_set_inode_index() when
 *                both @dir and @name are given, set to 0 when only @dir is
 *
 * Return: the new inode, or ERR_PTR(): -ENOMEM when the path or the inode
 * cannot be allocated, otherwise the error of the failing helper.  A failure
 * of btrfs_inode_inherit_props() is logged and the inode is still returned.
 */
6434 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
6435                                      struct btrfs_root *root,
6436                                      struct user_namespace *mnt_userns,
6437                                      struct inode *dir,
6438                                      const char *name, int name_len,
6439                                      u64 ref_objectid, u64 objectid,
6440                                      umode_t mode, u64 *index)
6441 {
6442         struct btrfs_fs_info *fs_info = root->fs_info;
6443         struct inode *inode;
6444         struct btrfs_inode_item *inode_item;
6445         struct btrfs_key *location;
6446         struct btrfs_path *path;
6447         struct btrfs_inode_ref *ref;
6448         struct btrfs_key key[2];
6449         u32 sizes[2];
6450         struct btrfs_item_batch batch;
6451         unsigned long ptr;
6452         unsigned int nofs_flag;
6453         int ret;
6454
6455         path = btrfs_alloc_path();
6456         if (!path)
6457                 return ERR_PTR(-ENOMEM);
6458
             /* new_inode() runs inside a memalloc_nofs_save()/restore() section. */
6459         nofs_flag = memalloc_nofs_save();
6460         inode = new_inode(fs_info->sb);
6461         memalloc_nofs_restore(nofs_flag);
6462         if (!inode) {
6463                 btrfs_free_path(path);
6464                 return ERR_PTR(-ENOMEM);
6465         }
6466
6467         /*
6468          * O_TMPFILE, set link count to 0, so that after this point,
6469          * we fill in an inode item with the correct link count.
6470          */
6471         if (!name)
6472                 set_nlink(inode, 0);
6473
6474         /*
6475          * we have to initialize this early, so we can reclaim the inode
6476          * number if we fail afterwards in this function.
6477          */
6478         inode->i_ino = objectid;
6479
6480         if (dir && name) {
6481                 trace_btrfs_inode_request(dir);
6482
6483                 ret = btrfs_set_inode_index(BTRFS_I(dir), index);
6484                 if (ret) {
6485                         btrfs_free_path(path);
6486                         iput(inode);
6487                         return ERR_PTR(ret);
6488                 }
6489         } else if (dir) {
6490                 *index = 0;
6491         }
6492         /*
6493          * index_cnt is ignored for everything but a dir,
6494          * btrfs_set_inode_index_count has an explanation for the magic
6495          * number
6496          */
6497         BTRFS_I(inode)->index_cnt = 2;
6498         BTRFS_I(inode)->dir_index = *index;
6499         BTRFS_I(inode)->root = btrfs_grab_root(root);
6500         BTRFS_I(inode)->generation = trans->transid;
6501         inode->i_generation = BTRFS_I(inode)->generation;
6502
6503         /*
6504          * We could have gotten an inode number from somebody who was fsynced
6505          * and then removed in this same transaction, so let's just set full
6506          * sync since it will be a full sync anyway and this will blow away the
6507          * old info in the log.
6508          */
6509         set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
6510
             /* Slot 0 of the batch: the inode item itself. */
6511         key[0].objectid = objectid;
6512         key[0].type = BTRFS_INODE_ITEM_KEY;
6513         key[0].offset = 0;
6514
6515         sizes[0] = sizeof(struct btrfs_inode_item);
6516
6517         if (name) {
6518                 /*
6519                  * Start new inodes with an inode_ref. This is slightly more
6520                  * efficient for small numbers of hard links since they will
6521                  * be packed into one item. Extended refs will kick in if we
6522                  * add more hard links than can fit in the ref item.
6523                  */
6524                 key[1].objectid = objectid;
6525                 key[1].type = BTRFS_INODE_REF_KEY;
6526                 key[1].offset = ref_objectid;
6527
6528                 sizes[1] = name_len + sizeof(*ref);
6529         }
6530
6531         location = &BTRFS_I(inode)->location;
6532         location->objectid = objectid;
6533         location->offset = 0;
6534         location->type = BTRFS_INODE_ITEM_KEY;
6535
             /*
              * Until this insert succeeds a plain iput() releases the inode;
              * later failures go through fail_unlock and discard_new_inode().
              */
6536         ret = btrfs_insert_inode_locked(inode);
6537         if (ret < 0) {
6538                 iput(inode);
6539                 goto fail;
6540         }
6541
             /* One item (inode item) or two (plus the inode ref) in one call. */
6542         batch.keys = &key[0];
6543         batch.data_sizes = &sizes[0];
6544         batch.total_data_size = sizes[0] + (name ? sizes[1] : 0);
6545         batch.nr = name ? 2 : 1;
6546         ret = btrfs_insert_empty_items(trans, root, path, &batch);
6547         if (ret != 0)
6548                 goto fail_unlock;
6549
6550         inode_init_owner(mnt_userns, inode, dir, mode);
6551         inode_set_bytes(inode, 0);
6552
             /* All four timestamps start out as the same current_time() value. */
6553         inode->i_mtime = current_time(inode);
6554         inode->i_atime = inode->i_mtime;
6555         inode->i_ctime = inode->i_mtime;
6556         BTRFS_I(inode)->i_otime = inode->i_mtime;
6557
             /* Zero the new inode item, then fill it from the in-memory inode. */
6558         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
6559                                   struct btrfs_inode_item);
6560         memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
6561                              sizeof(*inode_item));
6562         fill_inode_item(trans, path->nodes[0], inode_item, inode);
6563
6564         if (name) {
                     /* The ref sits in the next slot; the name follows its header. */
6565                 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
6566                                      struct btrfs_inode_ref);
6567                 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
6568                 btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
6569                 ptr = (unsigned long)(ref + 1);
6570                 write_extent_buffer(path->nodes[0], name, ptr, name_len);
6571         }
6572
6573         btrfs_mark_buffer_dirty(path->nodes[0]);
6574         btrfs_free_path(path);
6575
6576         btrfs_inherit_iflags(inode, dir);
6577
             /* Mount options force nodatasum/nodatacow on new regular files. */
6578         if (S_ISREG(mode)) {
6579                 if (btrfs_test_opt(fs_info, NODATASUM))
6580                         BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6581                 if (btrfs_test_opt(fs_info, NODATACOW))
6582                         BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
6583                                 BTRFS_INODE_NODATASUM;
6584         }
6585
6586         inode_tree_add(inode);
6587
6588         trace_btrfs_inode_new(inode);
6589         btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
6590
6591         btrfs_update_root_times(trans, root);
6592
             /* A property inheritance failure is only logged, not returned. */
6593         ret = btrfs_inode_inherit_props(trans, inode, dir);
6594         if (ret)
6595                 btrfs_err(fs_info,
6596                           "error inheriting props for ino %llu (root %llu): %d",
6597                         btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret);
6598
6599         return inode;
6600
6601 fail_unlock:
6602         discard_new_inode(inode);
6603 fail:
             /*
              * Give the directory index back - presumably the one handed out by
              * btrfs_set_inode_index() above; confirm against that helper.
              */
6604         if (dir && name)
6605                 BTRFS_I(dir)->index_cnt--;
6606         btrfs_free_path(path);
6607         return ERR_PTR(ret);
6608 }
6609
6610 /*
6611  * utility function to add 'inode' into 'parent_inode' with
6612  * a give name and a given sequence number.
6613  * if 'add_backref' is true, also insert a backref from the
6614  * inode to the parent directory.
6615  */
6616 int btrfs_add_link(struct btrfs_trans_handle *trans,
6617                    struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
6618                    const char *name, int name_len, int add_backref, u64 index)
6619 {
6620         int ret = 0;
6621         struct btrfs_key key;
6622         struct btrfs_root *root = parent_inode->root;
6623         u64 ino = btrfs_ino(inode);
6624         u64 parent_ino = btrfs_ino(parent_inode);
6625
6626         if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6627                 memcpy(&key, &inode->root->root_key, sizeof(key));
6628         } else {
6629                 key.objectid = ino;
6630                 key.type = BTRFS_INODE_ITEM_KEY;
6631                 key.offset = 0;
6632         }
6633
6634         if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6635                 ret = btrfs_add_root_ref(trans, key.objectid,
6636                                          root->root_key.objectid, parent_ino,
6637                                          index, name, name_len);
6638         } else if (add_backref) {
6639                 ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
6640                                              parent_ino, index);
6641         }
6642
6643         /* Nothing to clean up yet */
6644         if (ret)
6645                 return ret;
6646
6647         ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key,
6648                                     btrfs_inode_type(&inode->vfs_inode), index);
6649         if (ret == -EEXIST || ret == -EOVERFLOW)
6650                 goto fail_dir_item;
6651         else if (ret) {
6652                 btrfs_abort_transaction(trans, ret);
6653                 return ret;
6654         }
6655
6656         btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
6657                            name_len * 2);
6658         inode_inc_iversion(&parent_inode->vfs_inode);
6659         /*
6660          * If we are replaying a log tree, we do not want to update the mtime
6661          * and ctime of the parent directory with the current time, since the
6662          * log replay procedure is responsible for setting them to their correct
6663          * values (the ones it had when the fsync was done).
6664          */
6665         if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
6666                 struct timespec64 now = current_time(&parent_inode->vfs_inode);
6667
6668                 parent_inode->vfs_inode.i_mtime = now;
6669                 parent_inode->vfs_inode.i_ctime = now;
6670         }
6671         ret = btrfs_update_inode(trans, root, parent_inode);
6672         if (ret)
6673                 btrfs_abort_transaction(trans, ret);
6674         return ret;
6675
6676 fail_dir_item:
6677         if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6678                 u64 local_index;
6679                 int err;
6680                 err = btrfs_del_root_ref(trans, key.objectid,
6681                                          root->root_key.objectid, parent_ino,
6682                                          &local_index, name, name_len);
6683                 if (err)
6684                         btrfs_abort_transaction(trans, err);
6685         } else if (add_backref) {
6686                 u64 local_index;
6687                 int err;
6688
6689                 err = btrfs_del_inode_ref(trans, root, name, name_len,
6690                                           ino, parent_ino, &local_index);
6691                 if (err)
6692                         btrfs_abort_transaction(trans, err);
6693         }
6694
6695         /* Return the original error code */
6696         return ret;
6697 }
6698
6699 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
6700                             struct btrfs_inode *dir, struct dentry *dentry,
6701                             struct btrfs_inode *inode, int backref, u64 index)
6702 {
6703         int err = btrfs_add_link(trans, dir, inode,
6704                                  dentry->d_name.name, dentry->d_name.len,
6705                                  backref, index);
6706         if (err > 0)
6707                 err = -EEXIST;
6708         return err;
6709 }
6710
6711 static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
6712                        struct dentry *dentry, umode_t mode, dev_t rdev)
6713 {
6714         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6715         struct btrfs_trans_handle *trans;
6716         struct btrfs_root *root = BTRFS_I(dir)->root;
6717         struct inode *inode = NULL;
6718         int err;
6719         u64 objectid;
6720         u64 index = 0;
6721
6722         /*
6723          * 2 for inode item and ref
6724          * 2 for dir items
6725          * 1 for xattr if selinux is on
6726          */
6727         trans = btrfs_start_transaction(root, 5);
6728         if (IS_ERR(trans))
6729                 return PTR_ERR(trans);
6730
6731         err = btrfs_get_free_objectid(root, &objectid);
6732         if (err)
6733                 goto out_unlock;
6734
6735         inode = btrfs_new_inode(trans, root, mnt_userns, dir,
6736                         dentry->d_name.name, dentry->d_name.len,
6737                         btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
6738         if (IS_ERR(inode)) {
6739                 err = PTR_ERR(inode);
6740                 inode = NULL;
6741                 goto out_unlock;
6742         }
6743
6744         /*
6745         * If the active LSM wants to access the inode during
6746         * d_instantiate it needs these. Smack checks to see
6747         * if the filesystem supports xattrs by looking at the
6748         * ops vector.
6749         */
6750         inode->i_op = &btrfs_special_inode_operations;
6751         init_special_inode(inode, inode->i_mode, rdev);
6752
6753         err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6754         if (err)
6755                 goto out_unlock;
6756
6757         err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
6758                         0, index);
6759         if (err)
6760                 goto out_unlock;
6761
6762         btrfs_update_inode(trans, root, BTRFS_I(inode));
6763         d_instantiate_new(dentry, inode);
6764
6765 out_unlock:
6766         btrfs_end_transaction(trans);
6767         btrfs_btree_balance_dirty(fs_info);
6768         if (err && inode) {
6769                 inode_dec_link_count(inode);
6770                 discard_new_inode(inode);
6771         }
6772         return err;
6773 }
6774
6775 static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
6776                         struct dentry *dentry, umode_t mode, bool excl)
6777 {
6778         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6779         struct btrfs_trans_handle *trans;
6780         struct btrfs_root *root = BTRFS_I(dir)->root;
6781         struct inode *inode = NULL;
6782         int err;
6783         u64 objectid;
6784         u64 index = 0;
6785
6786         /*
6787          * 2 for inode item and ref
6788          * 2 for dir items
6789          * 1 for xattr if selinux is on
6790          */
6791         trans = btrfs_start_transaction(root, 5);
6792         if (IS_ERR(trans))
6793                 return PTR_ERR(trans);
6794
6795         err = btrfs_get_free_objectid(root, &objectid);
6796         if (err)
6797                 goto out_unlock;
6798
6799         inode = btrfs_new_inode(trans, root, mnt_userns, dir,
6800                         dentry->d_name.name, dentry->d_name.len,
6801                         btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
6802         if (IS_ERR(inode)) {
6803                 err = PTR_ERR(inode);
6804                 inode = NULL;
6805                 goto out_unlock;
6806         }
6807         /*
6808         * If the active LSM wants to access the inode during
6809         * d_instantiate it needs these. Smack checks to see
6810         * if the filesystem supports xattrs by looking at the
6811         * ops vector.
6812         */
6813         inode->i_fop = &btrfs_file_operations;
6814         inode->i_op = &btrfs_file_inode_operations;
6815         inode->i_mapping->a_ops = &btrfs_aops;
6816
6817         err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6818         if (err)
6819                 goto out_unlock;
6820
6821         err = btrfs_update_inode(trans, root, BTRFS_I(inode));
6822         if (err)
6823                 goto out_unlock;
6824
6825         err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
6826                         0, index);
6827         if (err)
6828                 goto out_unlock;
6829
6830         d_instantiate_new(dentry, inode);
6831
6832 out_unlock:
6833         btrfs_end_transaction(trans);
6834         if (err && inode) {
6835                 inode_dec_link_count(inode);
6836                 discard_new_inode(inode);
6837         }
6838         btrfs_btree_balance_dirty(fs_info);
6839         return err;
6840 }
6841
6842 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6843                       struct dentry *dentry)
6844 {
6845         struct btrfs_trans_handle *trans = NULL;
6846         struct btrfs_root *root = BTRFS_I(dir)->root;
6847         struct inode *inode = d_inode(old_dentry);
6848         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
6849         u64 index;
6850         int err;
6851         int drop_inode = 0;
6852
6853         /* do not allow sys_link's with other subvols of the same device */
6854         if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
6855                 return -EXDEV;
6856
6857         if (inode->i_nlink >= BTRFS_LINK_MAX)
6858                 return -EMLINK;
6859
6860         err = btrfs_set_inode_index(BTRFS_I(dir), &index);
6861         if (err)
6862                 goto fail;
6863
6864         /*
6865          * 2 items for inode and inode ref
6866          * 2 items for dir items
6867          * 1 item for parent inode
6868          * 1 item for orphan item deletion if O_TMPFILE
6869          */
6870         trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
6871         if (IS_ERR(trans)) {
6872                 err = PTR_ERR(trans);
6873                 trans = NULL;
6874                 goto fail;
6875         }
6876
6877         /* There are several dir indexes for this inode, clear the cache. */
6878         BTRFS_I(inode)->dir_index = 0ULL;
6879         inc_nlink(inode);
6880         inode_inc_iversion(inode);
6881         inode->i_ctime = current_time(inode);
6882         ihold(inode);
6883         set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
6884
6885         err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
6886                         1, index);
6887
6888         if (err) {
6889                 drop_inode = 1;
6890         } else {
6891                 struct dentry *parent = dentry->d_parent;
6892
6893                 err = btrfs_update_inode(trans, root, BTRFS_I(inode));
6894                 if (err)
6895                         goto fail;
6896                 if (inode->i_nlink == 1) {
6897                         /*
6898                          * If new hard link count is 1, it's a file created
6899                          * with open(2) O_TMPFILE flag.
6900                          */
6901                         err = btrfs_orphan_del(trans, BTRFS_I(inode));
6902                         if (err)
6903                                 goto fail;
6904                 }
6905                 d_instantiate(dentry, inode);
6906                 btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
6907         }
6908
6909 fail:
6910         if (trans)
6911                 btrfs_end_transaction(trans);
6912         if (drop_inode) {
6913                 inode_dec_link_count(inode);
6914                 iput(inode);
6915         }
6916         btrfs_btree_balance_dirty(fs_info);
6917         return err;
6918 }
6919
6920 static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
6921                        struct dentry *dentry, umode_t mode)
6922 {
6923         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6924         struct inode *inode = NULL;
6925         struct btrfs_trans_handle *trans;
6926         struct btrfs_root *root = BTRFS_I(dir)->root;
6927         int err = 0;
6928         u64 objectid = 0;
6929         u64 index = 0;
6930
6931         /*
6932          * 2 items for inode and ref
6933          * 2 items for dir items
6934          * 1 for xattr if selinux is on
6935          */
6936         trans = btrfs_start_transaction(root, 5);
6937         if (IS_ERR(trans))
6938                 return PTR_ERR(trans);
6939
6940         err = btrfs_get_free_objectid(root, &objectid);
6941         if (err)
6942                 goto out_fail;
6943
6944         inode = btrfs_new_inode(trans, root, mnt_userns, dir,
6945                         dentry->d_name.name, dentry->d_name.len,
6946                         btrfs_ino(BTRFS_I(dir)), objectid,
6947                         S_IFDIR | mode, &index);
6948         if (IS_ERR(inode)) {
6949                 err = PTR_ERR(inode);
6950                 inode = NULL;
6951                 goto out_fail;
6952         }
6953
6954         /* these must be set before we unlock the inode */
6955         inode->i_op = &btrfs_dir_inode_operations;
6956         inode->i_fop = &btrfs_dir_file_operations;
6957
6958         err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6959         if (err)
6960                 goto out_fail;
6961
6962         btrfs_i_size_write(BTRFS_I(inode), 0);
6963         err = btrfs_update_inode(trans, root, BTRFS_I(inode));
6964         if (err)
6965                 goto out_fail;
6966
6967         err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
6968                         dentry->d_name.name,
6969                         dentry->d_name.len, 0, index);
6970         if (err)
6971                 goto out_fail;
6972
6973         d_instantiate_new(dentry, inode);
6974
6975 out_fail:
6976         btrfs_end_transaction(trans);
6977         if (err && inode) {
6978                 inode_dec_link_count(inode);
6979                 discard_new_inode(inode);
6980         }
6981         btrfs_btree_balance_dirty(fs_info);
6982         return err;
6983 }
6984
6985 static noinline int uncompress_inline(struct btrfs_path *path,
6986                                       struct page *page,
6987                                       size_t pg_offset, u64 extent_offset,
6988                                       struct btrfs_file_extent_item *item)
6989 {
6990         int ret;
6991         struct extent_buffer *leaf = path->nodes[0];
6992         char *tmp;
6993         size_t max_size;
6994         unsigned long inline_size;
6995         unsigned long ptr;
6996         int compress_type;
6997
6998         WARN_ON(pg_offset != 0);
6999         compress_type = btrfs_file_extent_compression(leaf, item);
7000         max_size = btrfs_file_extent_ram_bytes(leaf, item);
7001         inline_size = btrfs_file_extent_inline_item_len(leaf,
7002                                         btrfs_item_nr(path->slots[0]));
7003         tmp = kmalloc(inline_size, GFP_NOFS);
7004         if (!tmp)
7005                 return -ENOMEM;
7006         ptr = btrfs_file_extent_inline_start(item);
7007
7008         read_extent_buffer(leaf, tmp, ptr, inline_size);
7009
7010         max_size = min_t(unsigned long, PAGE_SIZE, max_size);
7011         ret = btrfs_decompress(compress_type, tmp, page,
7012                                extent_offset, inline_size, max_size);
7013
7014         /*
7015          * decompression code contains a memset to fill in any space between the end
7016          * of the uncompressed data and the end of max_size in case the decompressed
7017          * data ends up shorter than ram_bytes.  That doesn't cover the hole between
7018          * the end of an inline extent and the beginning of the next block, so we
7019          * cover that region here.
7020          */
7021
7022         if (max_size + pg_offset < PAGE_SIZE)
7023                 memzero_page(page,  pg_offset + max_size,
7024                              PAGE_SIZE - max_size - pg_offset);
7025         kfree(tmp);
7026         return ret;
7027 }
7028
7029 /**
7030  * btrfs_get_extent - Lookup the first extent overlapping a range in a file.
7031  * @inode:      file to search in
7032  * @page:       page to read extent data into if the extent is inline
7033  * @pg_offset:  offset into @page to copy to
7034  * @start:      file offset
7035  * @len:        length of range starting at @start
7036  *
7037  * This returns the first &struct extent_map which overlaps with the given
7038  * range, reading it from the B-tree and caching it if necessary. Note that
7039  * there may be more extents which overlap the given range after the returned
7040  * extent_map.
7041  *
7042  * If @page is not NULL and the extent is inline, this also reads the extent
7043  * data directly into the page and marks the extent up to date in the io_tree.
7044  *
      * If @page is NULL and the extent is inline, the extent_map is returned as
      * filled from the item but is not inserted into the inode's extent map tree.
      *
      * When no extent item covers @start, a hole extent_map is returned: it runs
      * up to the next extent item that begins inside the range, or covers the
      * whole range if there is none.
      *
7045  * Return: ERR_PTR on error, non-NULL extent_map on success.
7046  */
7047 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
7048                                     struct page *page, size_t pg_offset,
7049                                     u64 start, u64 len)
7050 {
7051         struct btrfs_fs_info *fs_info = inode->root->fs_info;
7052         int ret = 0;
7053         u64 extent_start = 0;
7054         u64 extent_end = 0;
7055         u64 objectid = btrfs_ino(inode);
7056         int extent_type = -1;
7057         struct btrfs_path *path = NULL;
7058         struct btrfs_root *root = inode->root;
7059         struct btrfs_file_extent_item *item;
7060         struct extent_buffer *leaf;
7061         struct btrfs_key found_key;
7062         struct extent_map *em = NULL;
7063         struct extent_map_tree *em_tree = &inode->extent_tree;
7064         struct extent_io_tree *io_tree = &inode->io_tree;
7065
             /* Look for an already cached extent map first. */
7066         read_lock(&em_tree->lock);
7067         em = lookup_extent_mapping(em_tree, start, len);
7068         read_unlock(&em_tree->lock);
7069
             /*
              * Use the cached map only if it contains @start.  A cached inline
              * map is also dropped when a page was passed in, so that the code
              * below gets to copy the inline data into that page.
              */
7070         if (em) {
7071                 if (em->start > start || em->start + em->len <= start)
7072                         free_extent_map(em);
7073                 else if (em->block_start == EXTENT_MAP_INLINE && page)
7074                         free_extent_map(em);
7075                 else
7076                         goto out;
7077         }
7078         em = alloc_extent_map();
7079         if (!em) {
7080                 ret = -ENOMEM;
7081                 goto out;
7082         }
7083         em->start = EXTENT_MAP_HOLE;
7084         em->orig_start = EXTENT_MAP_HOLE;
7085         em->len = (u64)-1;
7086         em->block_len = (u64)-1;
7087
7088         path = btrfs_alloc_path();
7089         if (!path) {
7090                 ret = -ENOMEM;
7091                 goto out;
7092         }
7093
7094         /* Chances are we'll be called again, so go ahead and do readahead */
7095         path->reada = READA_FORWARD;
7096
7097         /*
7098          * The same explanation in load_free_space_cache applies here as well,
7099          * we only read when we're loading the free space cache, and at that
7100          * point the commit_root has everything we need.
7101          */
7102         if (btrfs_is_free_space_inode(inode)) {
7103                 path->search_commit_root = 1;
7104                 path->skip_locking = 1;
7105         }
7106
7107         ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
7108         if (ret < 0) {
7109                 goto out;
7110         } else if (ret > 0) {
                     /*
                      * A positive return is handled by stepping back one slot;
                      * with no earlier slot in this leaf the range is a hole.
                      */
7111                 if (path->slots[0] == 0)
7112                         goto not_found;
7113                 path->slots[0]--;
7114                 ret = 0;
7115         }
7116
7117         leaf = path->nodes[0];
7118         item = btrfs_item_ptr(leaf, path->slots[0],
7119                               struct btrfs_file_extent_item);
7120         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7121         if (found_key.objectid != objectid ||
7122             found_key.type != BTRFS_EXTENT_DATA_KEY) {
7123                 /*
7124                  * If we backup past the first extent we want to move forward
7125                  * and see if there is an extent in front of us, otherwise we'll
7126                  * say there is a hole for our whole search range which can
7127                  * cause problems.
7128                  */
7129                 extent_end = start;
7130                 goto next;
7131         }
7132
7133         extent_type = btrfs_file_extent_type(leaf, item);
7134         extent_start = found_key.offset;
7135         extent_end = btrfs_file_extent_end(path);
7136         if (extent_type == BTRFS_FILE_EXTENT_REG ||
7137             extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
7138                 /* Only regular file could have regular/prealloc extent */
7139                 if (!S_ISREG(inode->vfs_inode.i_mode)) {
7140                         ret = -EUCLEAN;
7141                         btrfs_crit(fs_info,
7142                 "regular/prealloc extent found for non-regular inode %llu",
7143                                    btrfs_ino(inode));
7144                         goto out;
7145                 }
7146                 trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
7147                                                        extent_start);
7148         } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
7149                 trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
7150                                                       path->slots[0],
7151                                                       extent_start);
7152         }
7153 next:
             /*
              * The item looked at ends at or before @start: walk forward, one
              * slot per pass, to the next EXTENT_DATA item of this inode.  If
              * that item begins inside the requested range, the gap from
              * @start up to its offset is returned as a hole.
              */
7154         if (start >= extent_end) {
7155                 path->slots[0]++;
7156                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
7157                         ret = btrfs_next_leaf(root, path);
7158                         if (ret < 0)
7159                                 goto out;
7160                         else if (ret > 0)
7161                                 goto not_found;
7162
7163                         leaf = path->nodes[0];
7164                 }
7165                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7166                 if (found_key.objectid != objectid ||
7167                     found_key.type != BTRFS_EXTENT_DATA_KEY)
7168                         goto not_found;
7169                 if (start + len <= found_key.offset)
7170                         goto not_found;
7171                 if (start > found_key.offset)
7172                         goto next;
7173
7174                 /* New extent overlaps with existing one */
7175                 em->start = start;
7176                 em->orig_start = start;
7177                 em->len = found_key.offset - start;
7178                 em->block_start = EXTENT_MAP_HOLE;
7179                 goto insert;
7180         }
7181
             /* The item covers @start: turn it into an extent map. */
7182         btrfs_extent_item_to_extent_map(inode, path, item, !page, em);
7183
7184         if (extent_type == BTRFS_FILE_EXTENT_REG ||
7185             extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
7186                 goto insert;
7187         } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
7188                 unsigned long ptr;
7189                 char *map;
7190                 size_t size;
7191                 size_t extent_offset;
7192                 size_t copy_size;
7193
                     /*
                      * With no page there is nowhere to copy the inline data to;
                      * the map is returned without being added to the tree.
                      */
7194                 if (!page)
7195                         goto out;
7196
                     /*
                      * Narrow the map to the part of the inline extent that
                      * lands in this page, with its length rounded up to the
                      * sector size.
                      */
7197                 size = btrfs_file_extent_ram_bytes(leaf, item);
7198                 extent_offset = page_offset(page) + pg_offset - extent_start;
7199                 copy_size = min_t(u64, PAGE_SIZE - pg_offset,
7200                                   size - extent_offset);
7201                 em->start = extent_start + extent_offset;
7202                 em->len = ALIGN(copy_size, fs_info->sectorsize);
7203                 em->orig_block_len = em->len;
7204                 em->orig_start = em->start;
7205                 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
7206
                     /*
                      * Fill the page only if it is not already up to date:
                      * decompress, or copy the bytes and zero the rest of the
                      * page.
                      */
7207                 if (!PageUptodate(page)) {
7208                         if (btrfs_file_extent_compression(leaf, item) !=
7209                             BTRFS_COMPRESS_NONE) {
7210                                 ret = uncompress_inline(path, page, pg_offset,
7211                                                         extent_offset, item);
7212                                 if (ret)
7213                                         goto out;
7214                         } else {
7215                                 map = kmap_local_page(page);
7216                                 read_extent_buffer(leaf, map + pg_offset, ptr,
7217                                                    copy_size);
7218                                 if (pg_offset + copy_size < PAGE_SIZE) {
7219                                         memset(map + pg_offset + copy_size, 0,
7220                                                PAGE_SIZE - pg_offset -
7221                                                copy_size);
7222                                 }
7223                                 kunmap_local(map);
7224                         }
7225                         flush_dcache_page(page);
7226                 }
7227                 set_extent_uptodate(io_tree, em->start,
7228                                     extent_map_end(em) - 1, NULL, GFP_NOFS);
7229                 goto insert;
7230         }
7231 not_found:
             /* Nothing overlaps: report a hole of @len bytes starting at @start. */
7232         em->start = start;
7233         em->orig_start = start;
7234         em->len = len;
7235         em->block_start = EXTENT_MAP_HOLE;
7236 insert:
7237         ret = 0;
7238         btrfs_release_path(path);
             /* The map must contain @start; anything else is reported as -EIO. */
7239         if (em->start > start || extent_map_end(em) <= start) {
7240                 btrfs_err(fs_info,
7241                           "bad extent! em: [%llu %llu] passed [%llu %llu]",
7242                           em->start, em->len, start, len);
7243                 ret = -EIO;
7244                 goto out;
7245         }
7246
             /* &em is passed by reference, so the call may replace the map. */
7247         write_lock(&em_tree->lock);
7248         ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
7249         write_unlock(&em_tree->lock);
7250 out:
7251         btrfs_free_path(path);
7252
7253         trace_btrfs_get_extent(root, inode, em);
7254
7255         if (ret) {
7256                 free_extent_map(em);
7257                 return ERR_PTR(ret);
7258         }
7259         return em;
7260 }
7261
7262 struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
7263                                            u64 start, u64 len)
7264 {
7265         struct extent_map *em;
7266         struct extent_map *hole_em = NULL;
7267         u64 delalloc_start = start;
7268         u64 end;
7269         u64 delalloc_len;
7270         u64 delalloc_end;
7271         int err = 0;
7272
7273         em = btrfs_get_extent(inode, NULL, 0, start, len);
7274         if (IS_ERR(em))
7275                 return em;
7276         /*
7277          * If our em maps to:
7278          * - a hole or
7279          * - a pre-alloc extent,
7280          * there might actually be delalloc bytes behind it.
7281          */
7282         if (em->block_start != EXTENT_MAP_HOLE &&
7283             !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7284                 return em;
7285         else
7286                 hole_em = em;
7287
7288         /* check to see if we've wrapped (len == -1 or similar) */
7289         end = start + len;
7290         if (end < start)
7291                 end = (u64)-1;
7292         else
7293                 end -= 1;
7294
7295         em = NULL;
7296
7297         /* ok, we didn't find anything, lets look for delalloc */
7298         delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
7299                                  end, len, EXTENT_DELALLOC, 1);
7300         delalloc_end = delalloc_start + delalloc_len;
7301         if (delalloc_end < delalloc_start)
7302                 delalloc_end = (u64)-1;
7303
7304         /*
7305          * We didn't find anything useful, return the original results from
7306          * get_extent()
7307          */
7308         if (delalloc_start > end || delalloc_end <= start) {
7309                 em = hole_em;
7310                 hole_em = NULL;
7311                 goto out;
7312         }
7313
7314         /*
7315          * Adjust the delalloc_start to make sure it doesn't go backwards from
7316          * the start they passed in
7317          */
7318         delalloc_start = max(start, delalloc_start);
7319         delalloc_len = delalloc_end - delalloc_start;
7320
7321         if (delalloc_len > 0) {
7322                 u64 hole_start;
7323                 u64 hole_len;
7324                 const u64 hole_end = extent_map_end(hole_em);
7325
7326                 em = alloc_extent_map();
7327                 if (!em) {
7328                         err = -ENOMEM;
7329                         goto out;
7330                 }
7331
7332                 ASSERT(hole_em);
7333                 /*
7334                  * When btrfs_get_extent can't find anything it returns one
7335                  * huge hole
7336                  *
7337                  * Make sure what it found really fits our range, and adjust to
7338                  * make sure it is based on the start from the caller
7339                  */
7340                 if (hole_end <= start || hole_em->start > end) {
7341                        free_extent_map(hole_em);
7342                        hole_em = NULL;
7343                 } else {
7344                        hole_start = max(hole_em->start, start);
7345                        hole_len = hole_end - hole_start;
7346                 }
7347
7348                 if (hole_em && delalloc_start > hole_start) {
7349                         /*
7350                          * Our hole starts before our delalloc, so we have to
7351                          * return just the parts of the hole that go until the
7352                          * delalloc starts
7353                          */
7354                         em->len = min(hole_len, delalloc_start - hole_start);
7355                         em->start = hole_start;
7356                         em->orig_start = hole_start;
7357                         /*
7358                          * Don't adjust block start at all, it is fixed at
7359                          * EXTENT_MAP_HOLE
7360                          */
7361                         em->block_start = hole_em->block_start;
7362                         em->block_len = hole_len;
7363                         if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
7364                                 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
7365                 } else {
7366                         /*
7367                          * Hole is out of passed range or it starts after
7368                          * delalloc range
7369                          */
7370                         em->start = delalloc_start;
7371                         em->len = delalloc_len;
7372                         em->orig_start = delalloc_start;
7373                         em->block_start = EXTENT_MAP_DELALLOC;
7374                         em->block_len = delalloc_len;
7375                 }
7376         } else {
7377                 return hole_em;
7378         }
7379 out:
7380
7381         free_extent_map(hole_em);
7382         if (err) {
7383                 free_extent_map(em);
7384                 return ERR_PTR(err);
7385         }
7386         return em;
7387 }
7388
7389 static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
7390                                                   const u64 start,
7391                                                   const u64 len,
7392                                                   const u64 orig_start,
7393                                                   const u64 block_start,
7394                                                   const u64 block_len,
7395                                                   const u64 orig_block_len,
7396                                                   const u64 ram_bytes,
7397                                                   const int type)
7398 {
7399         struct extent_map *em = NULL;
7400         int ret;
7401
7402         if (type != BTRFS_ORDERED_NOCOW) {
7403                 em = create_io_em(inode, start, len, orig_start, block_start,
7404                                   block_len, orig_block_len, ram_bytes,
7405                                   BTRFS_COMPRESS_NONE, /* compress_type */
7406                                   type);
7407                 if (IS_ERR(em))
7408                         goto out;
7409         }
7410         ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len,
7411                                            block_len, type);
7412         if (ret) {
7413                 if (em) {
7414                         free_extent_map(em);
7415                         btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
7416                 }
7417                 em = ERR_PTR(ret);
7418         }
7419  out:
7420
7421         return em;
7422 }
7423
7424 static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
7425                                                   u64 start, u64 len)
7426 {
7427         struct btrfs_root *root = inode->root;
7428         struct btrfs_fs_info *fs_info = root->fs_info;
7429         struct extent_map *em;
7430         struct btrfs_key ins;
7431         u64 alloc_hint;
7432         int ret;
7433
7434         alloc_hint = get_extent_allocation_hint(inode, start, len);
7435         ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
7436                                    0, alloc_hint, &ins, 1, 1);
7437         if (ret)
7438                 return ERR_PTR(ret);
7439
7440         em = btrfs_create_dio_extent(inode, start, ins.offset, start,
7441                                      ins.objectid, ins.offset, ins.offset,
7442                                      ins.offset, BTRFS_ORDERED_REGULAR);
7443         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
7444         if (IS_ERR(em))
7445                 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
7446                                            1);
7447
7448         return em;
7449 }
7450
7451 static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
7452 {
7453         struct btrfs_block_group *block_group;
7454         bool readonly = false;
7455
7456         block_group = btrfs_lookup_block_group(fs_info, bytenr);
7457         if (!block_group || block_group->ro)
7458                 readonly = true;
7459         if (block_group)
7460                 btrfs_put_block_group(block_group);
7461         return readonly;
7462 }
7463
7464 /*
7465  * Check if we can do nocow write into the range [@offset, @offset + @len)
7466  *
7467  * @offset:     File offset
7468  * @len:        The length to write, will be updated to the nocow writeable
7469  *              range
7470  * @orig_start: (optional) Return the original file offset of the file extent
7471  * @orig_block_len: (optional) Return the original on-disk length of the file
      *              extent
7472  * @ram_bytes:  (optional) Return the ram_bytes of the file extent
7473  * @strict:     if true, omit optimizations that might force us into unnecessary
7474  *              cow. e.g., don't trust generation number.
7475  *
      * @orig_start, @orig_block_len and @ram_bytes are written together, and only
      * when @orig_start is non-NULL, so a caller passing @orig_start must pass the
      * other two as well. They may already have been written when 0 or -EAGAIN is
      * returned.
      *
7476  * Return:
7477  * >0   and update @len if we can do nocow write
7478  *  0   if we can't do nocow write
7479  * <0   if error happened: -ENOMEM if no path could be allocated, the error of
      *      the file extent lookup, or -EAGAIN if the inode is not NODATACOW and
      *      EXTENT_DELALLOC is found in the range of a PREALLOC extent
7480  *
7481  * NOTE: This only checks the file extents, caller is responsible to wait for
7482  *       any ordered extents.
7483  */
7484 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
7485                               u64 *orig_start, u64 *orig_block_len,
7486                               u64 *ram_bytes, bool strict)
7487 {
7488         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7489         struct btrfs_path *path;
7490         int ret;
7491         struct extent_buffer *leaf;
7492         struct btrfs_root *root = BTRFS_I(inode)->root;
7493         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7494         struct btrfs_file_extent_item *fi;
7495         struct btrfs_key key;
7496         u64 disk_bytenr;
7497         u64 backref_offset;
7498         u64 extent_end;
7499         u64 num_bytes;
7500         int slot;
7501         int found_type;
7502         bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
7503
7504         path = btrfs_alloc_path();
7505         if (!path)
7506                 return -ENOMEM;
7507
7508         ret = btrfs_lookup_file_extent(NULL, root, path,
7509                         btrfs_ino(BTRFS_I(inode)), offset, 0);
7510         if (ret < 0)
7511                 goto out;
7512
7513         slot = path->slots[0];
7514         if (ret == 1) {
7515                 if (slot == 0) {
7516                         /* can't find the item, must cow */
7517                         ret = 0;
7518                         goto out;
7519                 }
                     /* Otherwise examine the item just before the returned slot. */
7520                 slot--;
7521         }
             /*
              * From here on every 'goto out' reports "must cow" (0) unless ret is
              * explicitly set again first.
              */
7522         ret = 0;
7523         leaf = path->nodes[0];
7524         btrfs_item_key_to_cpu(leaf, &key, slot);
7525         if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
7526             key.type != BTRFS_EXTENT_DATA_KEY) {
7527                 /* not our file or wrong item type, must cow */
7528                 goto out;
7529         }
7530
7531         if (key.offset > offset) {
7532                 /* Wrong offset, must cow */
7533                 goto out;
7534         }
7535
7536         fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
7537         found_type = btrfs_file_extent_type(leaf, fi);
7538         if (found_type != BTRFS_FILE_EXTENT_REG &&
7539             found_type != BTRFS_FILE_EXTENT_PREALLOC) {
7540                 /* not a regular extent, must cow */
7541                 goto out;
7542         }
7543
             /* Without NODATACOW on the inode only PREALLOC extents qualify. */
7544         if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
7545                 goto out;
7546
             /* The extent found must actually reach @offset. */
7547         extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
7548         if (extent_end <= offset)
7549                 goto out;
7550
             /* A disk_bytenr of 0 is never written in place (presumably a hole). */
7551         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
7552         if (disk_bytenr == 0)
7553                 goto out;
7554
7555         if (btrfs_file_extent_compression(leaf, fi) ||
7556             btrfs_file_extent_encryption(leaf, fi) ||
7557             btrfs_file_extent_other_encoding(leaf, fi))
7558                 goto out;
7559
7560         /*
7561          * Do the same check as in btrfs_cross_ref_exist but without the
7562          * unnecessary search.
7563          */
7564         if (!strict &&
7565             (btrfs_file_extent_generation(leaf, fi) <=
7566              btrfs_root_last_snapshot(&root->root_item)))
7567                 goto out;
7568
7569         backref_offset = btrfs_file_extent_offset(leaf, fi);
7570
             /* All three out parameters are gated on @orig_start alone. */
7571         if (orig_start) {
7572                 *orig_start = key.offset - backref_offset;
7573                 *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
7574                 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
7575         }
7576
7577         if (btrfs_extent_readonly(fs_info, disk_bytenr))
7578                 goto out;
7579
             /* Clamp the writeable length to the end of this file extent. */
7580         num_bytes = min(offset + *len, extent_end) - offset;
7581         if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7582                 u64 range_end;
7583
                     /*
                      * PREALLOC extent on an inode without NODATACOW: give up
                      * with -EAGAIN if test_range_bit() reports EXTENT_DELALLOC
                      * in the sector-aligned range.
                      */
7584                 range_end = round_up(offset + num_bytes,
7585                                      root->fs_info->sectorsize) - 1;
7586                 ret = test_range_bit(io_tree, offset, range_end,
7587                                      EXTENT_DELALLOC, 0, NULL);
7588                 if (ret) {
7589                         ret = -EAGAIN;
7590                         goto out;
7591                 }
7592         }
7593
             /* leaf and fi are not used past this point. */
7594         btrfs_release_path(path);
7595
7596         /*
7597          * look for other files referencing this extent, if we
7598          * find any we must cow
7599          */
7600
7601         ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)),
7602                                     key.offset - backref_offset, disk_bytenr,
7603                                     strict);
             /* Any non-zero result, errors included, is turned into "must cow". */
7604         if (ret) {
7605                 ret = 0;
7606                 goto out;
7607         }
7608
7609         /*
7610          * adjust disk_bytenr and num_bytes to cover just the bytes
7611          * in this extent we are about to write.  If there
7612          * are any csums in that range we have to cow in order
7613          * to keep the csums correct
7614          */
7615         disk_bytenr += backref_offset;
7616         disk_bytenr += offset - key.offset;
             /* ret is still 0 here, so bailing out reports "must cow". */
7617         if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes))
7618                 goto out;
7619         /*
7620          * all of the above have passed, it is safe to overwrite this extent
7621          * without cow
7622          */
7623         *len = num_bytes;
7624         ret = 1;
7625 out:
7626         btrfs_free_path(path);
7627         return ret;
7628 }
7629
/*
 * Lock the io tree range [@lockstart, @lockend] of @inode for a direct IO and
 * make sure nothing conflicting is in flight for it.
 *
 * Each iteration locks the range and then looks for an ordered extent in it
 * and, when @writing, for pages in the page cache. If neither is found the
 * function returns with the range still locked. Otherwise the range is
 * unlocked again and either the ordered extent is started via
 * btrfs_start_ordered_extent() and the loop retries, or -ENOTBLK is returned.
 *
 * @cached_state: handed to lock_extent_bits() and unlock_extent_cached()
 * @writing:      true for a DIO write, false for a DIO read
 *
 * Return: 0 with the range locked, or -ENOTBLK with the range unlocked.
 */
7630 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7631                               struct extent_state **cached_state, bool writing)
7632 {
7633         struct btrfs_ordered_extent *ordered;
7634         int ret = 0;
7635
7636         while (1) {
7637                 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7638                                  cached_state);
7639                 /*
7640                  * We're concerned with the entire range that we're going to be
7641                  * doing DIO to, so we need to make sure there's no ordered
7642                  * extents in this range.
7643                  */
7644                 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
7645                                                      lockend - lockstart + 1);
7646
7647                 /*
7648                  * We need to make sure there are no buffered pages in this
7649                  * range either, we could have raced between the invalidate in
7650                  * generic_file_direct_write and locking the extent.  The
7651                  * invalidate needs to happen so that reads after a write do not
7652                  * get stale data.
7653                  */
                     /* Success: leave the loop with the range locked and ret == 0. */
7654                 if (!ordered &&
7655                     (!writing || !filemap_range_has_page(inode->i_mapping,
7656                                                          lockstart, lockend)))
7657                         break;
7658
                     /* Something is in the way: drop the lock before dealing with it. */
7659                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7660                                      cached_state);
7661
7662                 if (ordered) {
7663                         /*
7664                          * If we are doing a DIO read and the ordered extent we
7665                          * found is for a buffered write, we can not wait for it
7666                          * to complete and retry, because if we do so we can
7667                          * deadlock with concurrent buffered writes on page
7668                          * locks. This happens only if our DIO read covers more
7669                          * than one extent map, if at this point has already
7670                          * created an ordered extent for a previous extent map
7671                          * and locked its range in the inode's io tree, and a
7672                          * concurrent write against that previous extent map's
7673                          * range and this range started (we unlock the ranges
7674                          * in the io tree only when the bios complete and
7675                          * buffered writes always lock pages before attempting
7676                          * to lock range in the io tree).
7677                          */
7678                         if (writing ||
7679                             test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
7680                                 btrfs_start_ordered_extent(ordered, 1);
7681                         else
7682                                 ret = -ENOTBLK;
7683                         btrfs_put_ordered_extent(ordered);
7684                 } else {
7685                         /*
7686                          * We could trigger writeback for this range (and wait
7687                          * for it to complete) and then invalidate the pages for
7688                          * this range (through invalidate_inode_pages2_range()),
7689                          * but that can lead us to a deadlock with a concurrent
7690                          * call to readahead (a buffered read or a defrag call
7691                          * triggered a readahead) on a page lock due to an
7692                          * ordered dio extent we created before but did not have
7693                          * yet a corresponding bio submitted (whence it can not
7694                          * complete), which makes readahead wait for that
7695                          * ordered extent to complete while holding a lock on
7696                          * that page.
7697                          */
7698                         ret = -ENOTBLK;
7699                 }
7700
                     /* Failure: the range was already unlocked above. */
7701                 if (ret)
7702                         break;
7703
7704                 cond_resched();
7705         }
7706
7707         return ret;
7708 }
7709
7710 /* The callers of this must take lock_extent() */
7711 static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
7712                                        u64 len, u64 orig_start, u64 block_start,
7713                                        u64 block_len, u64 orig_block_len,
7714                                        u64 ram_bytes, int compress_type,
7715                                        int type)
7716 {
7717         struct extent_map_tree *em_tree;
7718         struct extent_map *em;
7719         int ret;
7720
7721         ASSERT(type == BTRFS_ORDERED_PREALLOC ||
7722                type == BTRFS_ORDERED_COMPRESSED ||
7723                type == BTRFS_ORDERED_NOCOW ||
7724                type == BTRFS_ORDERED_REGULAR);
7725
7726         em_tree = &inode->extent_tree;
7727         em = alloc_extent_map();
7728         if (!em)
7729                 return ERR_PTR(-ENOMEM);
7730
7731         em->start = start;
7732         em->orig_start = orig_start;
7733         em->len = len;
7734         em->block_len = block_len;
7735         em->block_start = block_start;
7736         em->orig_block_len = orig_block_len;
7737         em->ram_bytes = ram_bytes;
7738         em->generation = -1;
7739         set_bit(EXTENT_FLAG_PINNED, &em->flags);
7740         if (type == BTRFS_ORDERED_PREALLOC) {
7741                 set_bit(EXTENT_FLAG_FILLING, &em->flags);
7742         } else if (type == BTRFS_ORDERED_COMPRESSED) {
7743                 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
7744                 em->compress_type = compress_type;
7745         }
7746
7747         do {
7748                 btrfs_drop_extent_cache(inode, em->start,
7749                                         em->start + em->len - 1, 0);
7750                 write_lock(&em_tree->lock);
7751                 ret = add_extent_mapping(em_tree, em, 1);
7752                 write_unlock(&em_tree->lock);
7753                 /*
7754                  * The caller has taken lock_extent(), who could race with us
7755                  * to add em?
7756                  */
7757         } while (ret == -EEXIST);
7758
7759         if (ret) {
7760                 free_extent_map(em);
7761                 return ERR_PTR(ret);
7762         }
7763
7764         /* em got 2 refs now, callers needs to do free_extent_map once. */
7765         return em;
7766 }
7767
7768
7769 static int btrfs_get_blocks_direct_write(struct extent_map **map,
7770                                          struct inode *inode,
7771                                          struct btrfs_dio_data *dio_data,
7772                                          u64 start, u64 len)
7773 {
7774         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7775         struct extent_map *em = *map;
7776         int ret = 0;
7777
7778         /*
7779          * We don't allocate a new extent in the following cases
7780          *
7781          * 1) The inode is marked as NODATACOW. In this case we'll just use the
7782          * existing extent.
7783          * 2) The extent is marked as PREALLOC. We're good to go here and can
7784          * just use the extent.
7785          *
7786          */
7787         if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
7788             ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
7789              em->block_start != EXTENT_MAP_HOLE)) {
7790                 int type;
7791                 u64 block_start, orig_start, orig_block_len, ram_bytes;
7792
7793                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7794                         type = BTRFS_ORDERED_PREALLOC;
7795                 else
7796                         type = BTRFS_ORDERED_NOCOW;
7797                 len = min(len, em->len - (start - em->start));
7798                 block_start = em->block_start + (start - em->start);
7799
7800                 if (can_nocow_extent(inode, start, &len, &orig_start,
7801                                      &orig_block_len, &ram_bytes, false) == 1 &&
7802                     btrfs_inc_nocow_writers(fs_info, block_start)) {
7803                         struct extent_map *em2;
7804
7805                         em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
7806                                                       orig_start, block_start,
7807                                                       len, orig_block_len,
7808                                                       ram_bytes, type);
7809                         btrfs_dec_nocow_writers(fs_info, block_start);
7810                         if (type == BTRFS_ORDERED_PREALLOC) {
7811                                 free_extent_map(em);
7812                                 *map = em = em2;
7813                         }
7814
7815                         if (em2 && IS_ERR(em2)) {
7816                                 ret = PTR_ERR(em2);
7817                                 goto out;
7818                         }
7819                         /*
7820                          * For inode marked NODATACOW or extent marked PREALLOC,
7821                          * use the existing or preallocated extent, so does not
7822                          * need to adjust btrfs_space_info's bytes_may_use.
7823                          */
7824                         btrfs_free_reserved_data_space_noquota(fs_info, len);
7825                         goto skip_cow;
7826                 }
7827         }
7828
7829         /* this will cow the extent */
7830         free_extent_map(em);
7831         *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
7832         if (IS_ERR(em)) {
7833                 ret = PTR_ERR(em);
7834                 goto out;
7835         }
7836
7837         len = min(len, em->len - (start - em->start));
7838
7839 skip_cow:
7840         /*
7841          * Need to update the i_size under the extent lock so buffered
7842          * readers will get the updated i_size when we unlock.
7843          */
7844         if (start + len > i_size_read(inode))
7845                 i_size_write(inode, start + len);
7846
7847         dio_data->reserve -= len;
7848 out:
7849         return ret;
7850 }
7851
/*
 * Map one chunk of a direct IO request and describe it in @iomap.
 *
 * @inode:  inode the IO is done on
 * @start:  file offset of the chunk
 * @length: number of bytes requested
 * @flags:  IOMAP_* flags; only IOMAP_WRITE is looked at here
 * @iomap:  filled with the mapping on success; ->private is set to a newly
 *          allocated struct btrfs_dio_data
 * @srcmap: not used by this function
 *
 * For a write, data space for @length rounded up to the sector size is
 * reserved first. The range is then locked in the inode's io tree and the
 * extent map covering @start is looked up. A read is limited to one sector per
 * call. Compressed and inline extents are rejected with -ENOTBLK.
 *
 * On success a write has its whole locked range unlocked before returning,
 * while a read only gets the part past the mapped length unlocked here.
 *
 * Return: 0 on success, -ENOTBLK when the IO has to fall back to buffered IO,
 * or another negative errno. Once the btrfs_dio_data exists, a failure unlocks
 * the range if it was locked and releases the reservation and the
 * btrfs_dio_data.
 *
 * NOTE(review): after such a failure iomap->private still points at the freed
 * btrfs_dio_data; presumably ->iomap_end is not called in that case - confirm.
 */
7852 static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
7853                 loff_t length, unsigned int flags, struct iomap *iomap,
7854                 struct iomap *srcmap)
7855 {
7856         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7857         struct extent_map *em;
7858         struct extent_state *cached_state = NULL;
7859         struct btrfs_dio_data *dio_data = NULL;
7860         u64 lockstart, lockend;
7861         const bool write = !!(flags & IOMAP_WRITE);
7862         int ret = 0;
7863         u64 len = length;
7864         bool unlock_extents = false;
7865
             /* Reads lock and map at most one sector per call. */
7866         if (!write)
7867                 len = min_t(u64, len, fs_info->sectorsize);
7868
7869         lockstart = start;
7870         lockend = start + len - 1;
7871
7872         /*
7873          * The generic stuff only does filemap_write_and_wait_range, which
7874          * isn't enough if we've written compressed pages to this area, so we
7875          * need to flush the dirty pages again to make absolutely sure that any
7876          * outstanding dirty pages are on disk.
7877          */
7878         if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
7879                      &BTRFS_I(inode)->runtime_flags)) {
7880                 ret = filemap_fdatawrite_range(inode->i_mapping, start,
7881                                                start + length - 1);
7882                 if (ret)
7883                         return ret;
7884         }
7885
7886         dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
7887         if (!dio_data)
7888                 return -ENOMEM;
7889
7890         dio_data->length = length;
7891         if (write) {
                     /* The reservation covers the full request, not just len. */
7892                 dio_data->reserve = round_up(length, fs_info->sectorsize);
7893                 ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
7894                                 &dio_data->data_reserved,
7895                                 start, dio_data->reserve);
7896                 if (ret) {
7897                         extent_changeset_free(dio_data->data_reserved);
7898                         kfree(dio_data);
7899                         return ret;
7900                 }
7901         }
7902         iomap->private = dio_data;
7903
7904
7905         /*
7906          * If this errors out it's because we couldn't invalidate pagecache for
7907          * this range and we need to fallback to buffered.
7908          */
7909         if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
7910                 ret = -ENOTBLK;
7911                 goto err;
7912         }
7913
7914         em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
7915         if (IS_ERR(em)) {
7916                 ret = PTR_ERR(em);
7917                 goto unlock_err;
7918         }
7919
7920         /*
7921          * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
7922          * io.  INLINE is special, and we could probably kludge it in here, but
7923          * it's still buffered so for safety lets just fall back to the generic
7924          * buffered path.
7925          *
7926          * For COMPRESSED we _have_ to read the entire extent in so we can
7927          * decompress it, so there will be buffering required no matter what we
7928          * do, so go ahead and fallback to buffered.
7929          *
7930          * We return -ENOTBLK because that's what makes DIO go ahead and go back
7931          * to buffered IO.  Don't blame me, this is the price we pay for using
7932          * the generic code.
7933          */
7934         if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
7935             em->block_start == EXTENT_MAP_INLINE) {
7936                 free_extent_map(em);
7937                 ret = -ENOTBLK;
7938                 goto unlock_err;
7939         }
7940
             /* Do not map past the end of the extent map that was found. */
7941         len = min(len, em->len - (start - em->start));
7942         if (write) {
                     /* em is not released here if this call fails. */
7943                 ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
7944                                                     start, len);
7945                 if (ret < 0)
7946                         goto unlock_err;
7947                 unlock_extents = true;
7948                 /* Recalc len in case the new em is smaller than requested */
7949                 len = min(len, em->len - (start - em->start));
7950         } else {
7951                 /*
7952                  * We need to unlock only the end area that we aren't using.
7953                  * The rest is going to be unlocked by the endio routine.
7954                  */
7955                 lockstart = start + len;
7956                 if (lockstart < lockend)
7957                         unlock_extents = true;
7958         }
7959
7960         if (unlock_extents)
7961                 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
7962                                      lockstart, lockend, &cached_state);
7963         else
7964                 free_extent_state(cached_state);
7965
7966         /*
7967          * Translate extent map information to iomap.
7968          * We trim the extents (and move the addr) even though iomap code does
7969          * that, since we have locked only the parts we are performing I/O in.
7970          */
7971         if ((em->block_start == EXTENT_MAP_HOLE) ||
7972             (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
7973                 iomap->addr = IOMAP_NULL_ADDR;
7974                 iomap->type = IOMAP_HOLE;
7975         } else {
7976                 iomap->addr = em->block_start + (start - em->start);
7977                 iomap->type = IOMAP_MAPPED;
7978         }
7979         iomap->offset = start;
7980         iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
7981         iomap->length = len;
7982
7983         if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start))
7984                 iomap->flags |= IOMAP_F_ZONE_APPEND;
7985
7986         free_extent_map(em);
7987
7988         return 0;
7989
7990 unlock_err:
7991         unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7992                              &cached_state);
7993 err:
             /* For reads dio_data->reserve is still 0 from kzalloc(). */
7994         if (dio_data) {
7995                 btrfs_delalloc_release_space(BTRFS_I(inode),
7996                                 dio_data->data_reserved, start,
7997                                 dio_data->reserve, true);
7998                 btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve);
7999                 extent_changeset_free(dio_data->data_reserved);
8000                 kfree(dio_data);
8001         }
8002         return ret;
8003 }
8004
8005 static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
8006                 ssize_t written, unsigned int flags, struct iomap *iomap)
8007 {
8008         int ret = 0;
8009         struct btrfs_dio_data *dio_data = iomap->private;
8010         size_t submitted = dio_data->submitted;
8011         const bool write = !!(flags & IOMAP_WRITE);
8012
8013         if (!write && (iomap->type == IOMAP_HOLE)) {
8014                 /* If reading from a hole, unlock and return */
8015                 unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
8016                 goto out;
8017         }
8018
8019         if (submitted < length) {
8020                 pos += submitted;
8021                 length -= submitted;
8022                 if (write)
8023                         __endio_write_update_ordered(BTRFS_I(inode), pos,
8024                                         length, false);
8025                 else
8026                         unlock_extent(&BTRFS_I(inode)->io_tree, pos,
8027                                       pos + length - 1);
8028                 ret = -ENOTBLK;
8029         }
8030
8031         if (write) {
8032                 if (dio_data->reserve)
8033                         btrfs_delalloc_release_space(BTRFS_I(inode),
8034                                         dio_data->data_reserved, pos,
8035                                         dio_data->reserve, true);
8036                 btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length);
8037                 extent_changeset_free(dio_data->data_reserved);
8038         }
8039 out:
8040         kfree(dio_data);
8041         iomap->private = NULL;
8042
8043         return ret;
8044 }
8045
8046 static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
8047 {
8048         /*
8049          * This implies a barrier so that stores to dio_bio->bi_status before
8050          * this and loads of dio_bio->bi_status after this are fully ordered.
8051          */
8052         if (!refcount_dec_and_test(&dip->refs))
8053                 return;
8054
8055         if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) {
8056                 __endio_write_update_ordered(BTRFS_I(dip->inode),
8057                                              dip->file_offset,
8058                                              dip->bytes,
8059                                              !dip->dio_bio->bi_status);
8060         } else {
8061                 unlock_extent(&BTRFS_I(dip->inode)->io_tree,
8062                               dip->file_offset,
8063                               dip->file_offset + dip->bytes - 1);
8064         }
8065
8066         bio_endio(dip->dio_bio);
8067         kfree(dip);
8068 }
8069
8070 static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio,
8071                                           int mirror_num,
8072                                           unsigned long bio_flags)
8073 {
8074         struct btrfs_dio_private *dip = bio->bi_private;
8075         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8076         blk_status_t ret;
8077
8078         BUG_ON(bio_op(bio) == REQ_OP_WRITE);
8079
8080         ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
8081         if (ret)
8082                 return ret;
8083
8084         refcount_inc(&dip->refs);
8085         ret = btrfs_map_bio(fs_info, bio, mirror_num);
8086         if (ret)
8087                 refcount_dec(&dip->refs);
8088         return ret;
8089 }
8090
8091 static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
8092                                              struct btrfs_bio *bbio,
8093                                              const bool uptodate)
8094 {
8095         struct inode *inode = dip->inode;
8096         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
8097         const u32 sectorsize = fs_info->sectorsize;
8098         struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
8099         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
8100         const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
8101         struct bio_vec bvec;
8102         struct bvec_iter iter;
8103         const u64 orig_file_offset = dip->file_offset;
8104         u64 start = orig_file_offset;
8105         u32 bio_offset = 0;
8106         blk_status_t err = BLK_STS_OK;
8107
8108         __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) {
8109                 unsigned int i, nr_sectors, pgoff;
8110
8111                 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
8112                 pgoff = bvec.bv_offset;
8113                 for (i = 0; i < nr_sectors; i++) {
8114                         ASSERT(pgoff < PAGE_SIZE);
8115                         if (uptodate &&
8116                             (!csum || !check_data_csum(inode, bbio,
8117                                                        bio_offset, bvec.bv_page,
8118                                                        pgoff, start))) {
8119                                 clean_io_failure(fs_info, failure_tree, io_tree,
8120                                                  start, bvec.bv_page,
8121                                                  btrfs_ino(BTRFS_I(inode)),
8122                                                  pgoff);
8123                         } else {
8124                                 int ret;
8125
8126                                 ASSERT((start - orig_file_offset) < UINT_MAX);
8127                                 ret = btrfs_repair_one_sector(inode,
8128                                                 &bbio->bio,
8129                                                 start - orig_file_offset,
8130                                                 bvec.bv_page, pgoff,
8131                                                 start, bbio->mirror_num,
8132                                                 submit_dio_repair_bio);
8133                                 if (ret)
8134                                         err = errno_to_blk_status(ret);
8135                         }
8136                         start += sectorsize;
8137                         ASSERT(bio_offset + sectorsize > bio_offset);
8138                         bio_offset += sectorsize;
8139                         pgoff += sectorsize;
8140                 }
8141         }
8142         return err;
8143 }
8144
8145 static void __endio_write_update_ordered(struct btrfs_inode *inode,
8146                                          const u64 offset, const u64 bytes,
8147                                          const bool uptodate)
8148 {
8149         btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes,
8150                                        finish_ordered_fn, uptodate);
8151 }
8152
8153 static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
8154                                                      struct bio *bio,
8155                                                      u64 dio_file_offset)
8156 {
8157         return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, 1);
8158 }
8159
8160 static void btrfs_end_dio_bio(struct bio *bio)
8161 {
8162         struct btrfs_dio_private *dip = bio->bi_private;
8163         blk_status_t err = bio->bi_status;
8164
8165         if (err)
8166                 btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
8167                            "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d",
8168                            btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio),
8169                            bio->bi_opf, bio->bi_iter.bi_sector,
8170                            bio->bi_iter.bi_size, err);
8171
8172         if (bio_op(bio) == REQ_OP_READ)
8173                 err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err);
8174
8175         if (err)
8176                 dip->dio_bio->bi_status = err;
8177
8178         btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio);
8179
8180         bio_put(bio);
8181         btrfs_dio_private_put(dip);
8182 }
8183
8184 static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
8185                 struct inode *inode, u64 file_offset, int async_submit)
8186 {
8187         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8188         struct btrfs_dio_private *dip = bio->bi_private;
8189         bool write = btrfs_op(bio) == BTRFS_MAP_WRITE;
8190         blk_status_t ret;
8191
8192         /* Check btrfs_submit_bio_hook() for rules about async submit. */
8193         if (async_submit)
8194                 async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
8195
8196         if (!write) {
8197                 ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
8198                 if (ret)
8199                         goto err;
8200         }
8201
8202         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
8203                 goto map;
8204
8205         if (write && async_submit) {
8206                 ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset,
8207                                           btrfs_submit_bio_start_direct_io);
8208                 goto err;
8209         } else if (write) {
8210                 /*
8211                  * If we aren't doing async submit, calculate the csum of the
8212                  * bio now.
8213                  */
8214                 ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, 1);
8215                 if (ret)
8216                         goto err;
8217         } else {
8218                 u64 csum_offset;
8219
8220                 csum_offset = file_offset - dip->file_offset;
8221                 csum_offset >>= fs_info->sectorsize_bits;
8222                 csum_offset *= fs_info->csum_size;
8223                 btrfs_bio(bio)->csum = dip->csums + csum_offset;
8224         }
8225 map:
8226         ret = btrfs_map_bio(fs_info, bio, 0);
8227 err:
8228         return ret;
8229 }
8230
8231 /*
8232  * If this succeeds, the btrfs_dio_private is responsible for cleaning up locked
8233  * or ordered extents whether or not we submit any bios.
8234  */
8235 static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
8236                                                           struct inode *inode,
8237                                                           loff_t file_offset)
8238 {
8239         const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
8240         const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
8241         size_t dip_size;
8242         struct btrfs_dio_private *dip;
8243
8244         dip_size = sizeof(*dip);
8245         if (!write && csum) {
8246                 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8247                 size_t nblocks;
8248
8249                 nblocks = dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits;
8250                 dip_size += fs_info->csum_size * nblocks;
8251         }
8252
8253         dip = kzalloc(dip_size, GFP_NOFS);
8254         if (!dip)
8255                 return NULL;
8256
8257         dip->inode = inode;
8258         dip->file_offset = file_offset;
8259         dip->bytes = dio_bio->bi_iter.bi_size;
8260         dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9;
8261         dip->dio_bio = dio_bio;
8262         refcount_set(&dip->refs, 1);
8263         return dip;
8264 }
8265
8266 static void btrfs_submit_direct(const struct iomap_iter *iter,
8267                 struct bio *dio_bio, loff_t file_offset)
8268 {
8269         struct inode *inode = iter->inode;
8270         const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
8271         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8272         const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
8273                              BTRFS_BLOCK_GROUP_RAID56_MASK);
8274         struct btrfs_dio_private *dip;
8275         struct bio *bio;
8276         u64 start_sector;
8277         int async_submit = 0;
8278         u64 submit_len;
8279         u64 clone_offset = 0;
8280         u64 clone_len;
8281         u64 logical;
8282         int ret;
8283         blk_status_t status;
8284         struct btrfs_io_geometry geom;
8285         struct btrfs_dio_data *dio_data = iter->iomap.private;
8286         struct extent_map *em = NULL;
8287
8288         dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
8289         if (!dip) {
8290                 if (!write) {
8291                         unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
8292                                 file_offset + dio_bio->bi_iter.bi_size - 1);
8293                 }
8294                 dio_bio->bi_status = BLK_STS_RESOURCE;
8295                 bio_endio(dio_bio);
8296                 return;
8297         }
8298
8299         if (!write) {
8300                 /*
8301                  * Load the csums up front to reduce csum tree searches and
8302                  * contention when submitting bios.
8303                  *
8304                  * If we have csums disabled this will do nothing.
8305                  */
8306                 status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums);
8307                 if (status != BLK_STS_OK)
8308                         goto out_err;
8309         }
8310
8311         start_sector = dio_bio->bi_iter.bi_sector;
8312         submit_len = dio_bio->bi_iter.bi_size;
8313
8314         do {
8315                 logical = start_sector << 9;
8316                 em = btrfs_get_chunk_map(fs_info, logical, submit_len);
8317                 if (IS_ERR(em)) {
8318                         status = errno_to_blk_status(PTR_ERR(em));
8319                         em = NULL;
8320                         goto out_err_em;
8321                 }
8322                 ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio),
8323                                             logical, &geom);
8324                 if (ret) {
8325                         status = errno_to_blk_status(ret);
8326                         goto out_err_em;
8327                 }
8328
8329                 clone_len = min(submit_len, geom.len);
8330                 ASSERT(clone_len <= UINT_MAX);
8331
8332                 /*
8333                  * This will never fail as it's passing GPF_NOFS and
8334                  * the allocation is backed by btrfs_bioset.
8335                  */
8336                 bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
8337                 bio->bi_private = dip;
8338                 bio->bi_end_io = btrfs_end_dio_bio;
8339
8340                 if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
8341                         status = extract_ordered_extent(BTRFS_I(inode), bio,
8342                                                         file_offset);
8343                         if (status) {
8344                                 bio_put(bio);
8345                                 goto out_err;
8346                         }
8347                 }
8348
8349                 ASSERT(submit_len >= clone_len);
8350                 submit_len -= clone_len;
8351
8352                 /*
8353                  * Increase the count before we submit the bio so we know
8354                  * the end IO handler won't happen before we increase the
8355                  * count. Otherwise, the dip might get freed before we're
8356                  * done setting it up.
8357                  *
8358                  * We transfer the initial reference to the last bio, so we
8359                  * don't need to increment the reference count for the last one.
8360                  */
8361                 if (submit_len > 0) {
8362                         refcount_inc(&dip->refs);
8363                         /*
8364                          * If we are submitting more than one bio, submit them
8365                          * all asynchronously. The exception is RAID 5 or 6, as
8366                          * asynchronous checksums make it difficult to collect
8367                          * full stripe writes.
8368                          */
8369                         if (!raid56)
8370                                 async_submit = 1;
8371                 }
8372
8373                 status = btrfs_submit_dio_bio(bio, inode, file_offset,
8374                                                 async_submit);
8375                 if (status) {
8376                         bio_put(bio);
8377                         if (submit_len > 0)
8378                                 refcount_dec(&dip->refs);
8379                         goto out_err_em;
8380                 }
8381
8382                 dio_data->submitted += clone_len;
8383                 clone_offset += clone_len;
8384                 start_sector += clone_len >> 9;
8385                 file_offset += clone_len;
8386
8387                 free_extent_map(em);
8388         } while (submit_len > 0);
8389         return;
8390
8391 out_err_em:
8392         free_extent_map(em);
8393 out_err:
8394         dip->dio_bio->bi_status = status;
8395         btrfs_dio_private_put(dip);
8396 }
8397
8398 const struct iomap_ops btrfs_dio_iomap_ops = {
8399         .iomap_begin            = btrfs_dio_iomap_begin,
8400         .iomap_end              = btrfs_dio_iomap_end,
8401 };
8402
8403 const struct iomap_dio_ops btrfs_dio_ops = {
8404         .submit_io              = btrfs_submit_direct,
8405 };
8406
8407 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
8408                         u64 start, u64 len)
8409 {
8410         int     ret;
8411
8412         ret = fiemap_prep(inode, fieinfo, start, &len, 0);
8413         if (ret)
8414                 return ret;
8415
8416         return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
8417 }
8418
8419 int btrfs_readpage(struct file *file, struct page *page)
8420 {
8421         struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
8422         u64 start = page_offset(page);
8423         u64 end = start + PAGE_SIZE - 1;
8424         struct btrfs_bio_ctrl bio_ctrl = { 0 };
8425         int ret;
8426
8427         btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
8428
8429         ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
8430         if (bio_ctrl.bio)
8431                 ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags);
8432         return ret;
8433 }
8434
8435 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
8436 {
8437         struct inode *inode = page->mapping->host;
8438         int ret;
8439
8440         if (current->flags & PF_MEMALLOC) {
8441                 redirty_page_for_writepage(wbc, page);
8442                 unlock_page(page);
8443                 return 0;
8444         }
8445
8446         /*
8447          * If we are under memory pressure we will call this directly from the
8448          * VM, we need to make sure we have the inode referenced for the ordered
8449          * extent.  If not just return like we didn't do anything.
8450          */
8451         if (!igrab(inode)) {
8452                 redirty_page_for_writepage(wbc, page);
8453                 return AOP_WRITEPAGE_ACTIVATE;
8454         }
8455         ret = extent_write_full_page(page, wbc);
8456         btrfs_add_delayed_iput(inode);
8457         return ret;
8458 }
8459
/* ->writepages handler: forwards directly to extent_writepages(). */
static int btrfs_writepages(struct address_space *mapping,
			    struct writeback_control *wbc)
{
	int ret = extent_writepages(mapping, wbc);

	return ret;
}
8465
/* ->readahead handler: forwards directly to extent_readahead(). */
static void btrfs_readahead(struct readahead_control *rac)
{
	extent_readahead(rac);
}
8470
8471 /*
8472  * For releasepage() and invalidatepage() we have a race window where
8473  * end_page_writeback() is called but the subpage spinlock is not yet released.
8474  * If we continue to release/invalidate the page, we could cause use-after-free
8475  * for subpage spinlock.  So this function is to spin and wait for subpage
8476  * spinlock.
8477  */
8478 static void wait_subpage_spinlock(struct page *page)
8479 {
8480         struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
8481         struct btrfs_subpage *subpage;
8482
8483         if (fs_info->sectorsize == PAGE_SIZE)
8484                 return;
8485
8486         ASSERT(PagePrivate(page) && page->private);
8487         subpage = (struct btrfs_subpage *)page->private;
8488
8489         /*
8490          * This may look insane as we just acquire the spinlock and release it,
8491          * without doing anything.  But we just want to make sure no one is
8492          * still holding the subpage spinlock.
8493          * And since the page is not dirty nor writeback, and we have page
8494          * locked, the only possible way to hold a spinlock is from the endio
8495          * function to clear page writeback.
8496          *
8497          * Here we just acquire the spinlock so that all existing callers
8498          * should exit and we're safe to release/invalidate the page.
8499          */
8500         spin_lock_irq(&subpage->lock);
8501         spin_unlock_irq(&subpage->lock);
8502 }
8503
8504 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
8505 {
8506         int ret = try_release_extent_mapping(page, gfp_flags);
8507
8508         if (ret == 1) {
8509                 wait_subpage_spinlock(page);
8510                 clear_page_extent_mapped(page);
8511         }
8512         return ret;
8513 }
8514
8515 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
8516 {
8517         if (PageWriteback(page) || PageDirty(page))
8518                 return 0;
8519         return __btrfs_releasepage(page, gfp_flags);
8520 }
8521
8522 #ifdef CONFIG_MIGRATION
8523 static int btrfs_migratepage(struct address_space *mapping,
8524                              struct page *newpage, struct page *page,
8525                              enum migrate_mode mode)
8526 {
8527         int ret;
8528
8529         ret = migrate_page_move_mapping(mapping, newpage, page, 0);
8530         if (ret != MIGRATEPAGE_SUCCESS)
8531                 return ret;
8532
8533         if (page_has_private(page))
8534                 attach_page_private(newpage, detach_page_private(page));
8535
8536         if (PageOrdered(page)) {
8537                 ClearPageOrdered(page);
8538                 SetPageOrdered(newpage);
8539         }
8540
8541         if (mode != MIGRATE_SYNC_NO_COPY)
8542                 migrate_page_copy(newpage, page);
8543         else
8544                 migrate_page_states(newpage, page);
8545         return MIGRATEPAGE_SUCCESS;
8546 }
8547 #endif
8548
8549 static void btrfs_invalidatepage(struct page *page, unsigned int offset,
8550                                  unsigned int length)
8551 {
8552         struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
8553         struct btrfs_fs_info *fs_info = inode->root->fs_info;
8554         struct extent_io_tree *tree = &inode->io_tree;
8555         struct extent_state *cached_state = NULL;
8556         u64 page_start = page_offset(page);
8557         u64 page_end = page_start + PAGE_SIZE - 1;
8558         u64 cur;
8559         int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
8560
8561         /*
8562          * We have page locked so no new ordered extent can be created on this
8563          * page, nor bio can be submitted for this page.
8564          *
8565          * But already submitted bio can still be finished on this page.
8566          * Furthermore, endio function won't skip page which has Ordered
8567          * (Private2) already cleared, so it's possible for endio and
8568          * invalidatepage to do the same ordered extent accounting twice
8569          * on one page.
8570          *
8571          * So here we wait for any submitted bios to finish, so that we won't
8572          * do double ordered extent accounting on the same page.
8573          */
8574         wait_on_page_writeback(page);
8575         wait_subpage_spinlock(page);
8576
8577         /*
8578          * For subpage case, we have call sites like
8579          * btrfs_punch_hole_lock_range() which passes range not aligned to
8580          * sectorsize.
8581          * If the range doesn't cover the full page, we don't need to and
8582          * shouldn't clear page extent mapped, as page->private can still
8583          * record subpage dirty bits for other part of the range.
8584          *
8585          * For cases that can invalidate the full even the range doesn't
8586          * cover the full page, like invalidating the last page, we're
8587          * still safe to wait for ordered extent to finish.
8588          */
8589         if (!(offset == 0 && length == PAGE_SIZE)) {
8590                 btrfs_releasepage(page, GFP_NOFS);
8591                 return;
8592         }
8593
8594         if (!inode_evicting)
8595                 lock_extent_bits(tree, page_start, page_end, &cached_state);
8596
8597         cur = page_start;
8598         while (cur < page_end) {
8599                 struct btrfs_ordered_extent *ordered;
8600                 bool delete_states;
8601                 u64 range_end;
8602                 u32 range_len;
8603
8604                 ordered = btrfs_lookup_first_ordered_range(inode, cur,
8605                                                            page_end + 1 - cur);
8606                 if (!ordered) {
8607                         range_end = page_end;
8608                         /*
8609                          * No ordered extent covering this range, we are safe
8610                          * to delete all extent states in the range.
8611                          */
8612                         delete_states = true;
8613                         goto next;
8614                 }
8615                 if (ordered->file_offset > cur) {
8616                         /*
8617                          * There is a range between [cur, oe->file_offset) not
8618                          * covered by any ordered extent.
8619                          * We are safe to delete all extent states, and handle
8620                          * the ordered extent in the next iteration.
8621                          */
8622                         range_end = ordered->file_offset - 1;
8623                         delete_states = true;
8624                         goto next;
8625                 }
8626
8627                 range_end = min(ordered->file_offset + ordered->num_bytes - 1,
8628                                 page_end);
8629                 ASSERT(range_end + 1 - cur < U32_MAX);
8630                 range_len = range_end + 1 - cur;
8631                 if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) {
8632                         /*
8633                          * If Ordered (Private2) is cleared, it means endio has
8634                          * already been executed for the range.
8635                          * We can't delete the extent states as
8636                          * btrfs_finish_ordered_io() may still use some of them.
8637                          */
8638                         delete_states = false;
8639                         goto next;
8640                 }
8641                 btrfs_page_clear_ordered(fs_info, page, cur, range_len);
8642
8643                 /*
8644                  * IO on this page will never be started, so we need to account
8645                  * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
8646                  * here, must leave that up for the ordered extent completion.
8647                  *
8648                  * This will also unlock the range for incoming
8649                  * btrfs_finish_ordered_io().
8650                  */
8651                 if (!inode_evicting)
8652                         clear_extent_bit(tree, cur, range_end,
8653                                          EXTENT_DELALLOC |
8654                                          EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
8655                                          EXTENT_DEFRAG, 1, 0, &cached_state);
8656
8657                 spin_lock_irq(&inode->ordered_tree.lock);
8658                 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
8659                 ordered->truncated_len = min(ordered->truncated_len,
8660                                              cur - ordered->file_offset);
8661                 spin_unlock_irq(&inode->ordered_tree.lock);
8662
8663                 if (btrfs_dec_test_ordered_pending(inode, &ordered,
8664                                                    cur, range_end + 1 - cur)) {
8665                         btrfs_finish_ordered_io(ordered);
8666                         /*
8667                          * The ordered extent has finished, now we're again
8668                          * safe to delete all extent states of the range.
8669                          */
8670                         delete_states = true;
8671                 } else {
8672                         /*
8673                          * btrfs_finish_ordered_io() will get executed by endio
8674                          * of other pages, thus we can't delete extent states
8675                          * anymore
8676                          */
8677                         delete_states = false;
8678                 }
8679 next:
8680                 if (ordered)
8681                         btrfs_put_ordered_extent(ordered);
8682                 /*
8683                  * Qgroup reserved space handler
8684                  * Sector(s) here will be either:
8685                  *
8686                  * 1) Already written to disk or bio already finished
8687                  *    Then its QGROUP_RESERVED bit in io_tree is already cleared.
8688                  *    Qgroup will be handled by its qgroup_record then.
8689                  *    btrfs_qgroup_free_data() call will do nothing here.
8690                  *
8691                  * 2) Not written to disk yet
8692                  *    Then btrfs_qgroup_free_data() call will clear the
8693                  *    QGROUP_RESERVED bit of its io_tree, and free the qgroup
8694                  *    reserved data space.
8695                  *    Since the IO will never happen for this page.
8696                  */
8697                 btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur);
8698                 if (!inode_evicting) {
8699                         clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
8700                                  EXTENT_DELALLOC | EXTENT_UPTODATE |
8701                                  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1,
8702                                  delete_states, &cached_state);
8703                 }
8704                 cur = range_end + 1;
8705         }
8706         /*
8707          * We have iterated through all ordered extents of the page, the page
8708          * should not have Ordered (Private2) anymore, or the above iteration
8709          * did something wrong.
8710          */
8711         ASSERT(!PageOrdered(page));
8712         btrfs_page_clear_checked(fs_info, page, page_offset(page), PAGE_SIZE);
8713         if (!inode_evicting)
8714                 __btrfs_releasepage(page, GFP_NOFS);
8715         clear_page_extent_mapped(page);
8716 }
8717
8718 /*
8719  * btrfs_page_mkwrite() is not allowed to change the file size as it gets
8720  * called from a page fault handler when a page is first dirtied. Hence we must
8721  * be careful to check for EOF conditions here. We set the page up correctly
8722  * for a written page which means we get ENOSPC checking when writing into
8723  * holes and correct delalloc and unwritten extent mapping on filesystems that
8724  * support these features.
8725  *
8726  * We are not allowed to take the i_mutex here so we have to play games to
8727  * protect against truncate races as the page could now be beyond EOF.  Because
8728  * truncate_setsize() writes the inode size before removing pages, once we have
8729  * the page lock we can determine safely if the page is beyond EOF. If it is not
8730  * beyond EOF, then the page is guaranteed safe against truncation until we
8731  * unlock the page.
8732  */
8733 vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
8734 {
8735         struct page *page = vmf->page;
8736         struct inode *inode = file_inode(vmf->vma->vm_file);
8737         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8738         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
8739         struct btrfs_ordered_extent *ordered;
8740         struct extent_state *cached_state = NULL;
8741         struct extent_changeset *data_reserved = NULL;
8742         unsigned long zero_start;
8743         loff_t size;
8744         vm_fault_t ret;
8745         int ret2;
8746         int reserved = 0;
8747         u64 reserved_space;
8748         u64 page_start;
8749         u64 page_end;
8750         u64 end;
8751
8752         reserved_space = PAGE_SIZE;
8753
8754         sb_start_pagefault(inode->i_sb);
8755         page_start = page_offset(page);
8756         page_end = page_start + PAGE_SIZE - 1;
8757         end = page_end;
8758
8759         /*
8760          * Reserving delalloc space after obtaining the page lock can lead to
8761          * deadlock. For example, if a dirty page is locked by this function
8762          * and the call to btrfs_delalloc_reserve_space() ends up triggering
8763          * dirty page write out, then the btrfs_writepage() function could
8764          * end up waiting indefinitely to get a lock on the page currently
8765          * being processed by btrfs_page_mkwrite() function.
8766          */
8767         ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
8768                                             page_start, reserved_space);
8769         if (!ret2) {
8770                 ret2 = file_update_time(vmf->vma->vm_file);
8771                 reserved = 1;
8772         }
8773         if (ret2) {
8774                 ret = vmf_error(ret2);
8775                 if (reserved)
8776                         goto out;
8777                 goto out_noreserve;
8778         }
8779
8780         ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
8781 again:
8782         down_read(&BTRFS_I(inode)->i_mmap_lock);
8783         lock_page(page);
8784         size = i_size_read(inode);
8785
8786         if ((page->mapping != inode->i_mapping) ||
8787             (page_start >= size)) {
8788                 /* page got truncated out from underneath us */
8789                 goto out_unlock;
8790         }
8791         wait_on_page_writeback(page);
8792
8793         lock_extent_bits(io_tree, page_start, page_end, &cached_state);
8794         ret2 = set_page_extent_mapped(page);
8795         if (ret2 < 0) {
8796                 ret = vmf_error(ret2);
8797                 unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
8798                 goto out_unlock;
8799         }
8800
8801         /*
8802          * we can't set the delalloc bits if there are pending ordered
8803          * extents.  Drop our locks and wait for them to finish
8804          */
8805         ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
8806                         PAGE_SIZE);
8807         if (ordered) {
8808                 unlock_extent_cached(io_tree, page_start, page_end,
8809                                      &cached_state);
8810                 unlock_page(page);
8811                 up_read(&BTRFS_I(inode)->i_mmap_lock);
8812                 btrfs_start_ordered_extent(ordered, 1);
8813                 btrfs_put_ordered_extent(ordered);
8814                 goto again;
8815         }
8816
8817         if (page->index == ((size - 1) >> PAGE_SHIFT)) {
8818                 reserved_space = round_up(size - page_start,
8819                                           fs_info->sectorsize);
8820                 if (reserved_space < PAGE_SIZE) {
8821                         end = page_start + reserved_space - 1;
8822                         btrfs_delalloc_release_space(BTRFS_I(inode),
8823                                         data_reserved, page_start,
8824                                         PAGE_SIZE - reserved_space, true);
8825                 }
8826         }
8827
8828         /*
8829          * page_mkwrite gets called when the page is firstly dirtied after it's
8830          * faulted in, but write(2) could also dirty a page and set delalloc
8831          * bits, thus in this case for space account reason, we still need to
8832          * clear any delalloc bits within this page range since we have to
8833          * reserve data&meta space before lock_page() (see above comments).
8834          */
8835         clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
8836                           EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
8837                           EXTENT_DEFRAG, 0, 0, &cached_state);
8838
8839         ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
8840                                         &cached_state);
8841         if (ret2) {
8842                 unlock_extent_cached(io_tree, page_start, page_end,
8843                                      &cached_state);
8844                 ret = VM_FAULT_SIGBUS;
8845                 goto out_unlock;
8846         }
8847
8848         /* page is wholly or partially inside EOF */
8849         if (page_start + PAGE_SIZE > size)
8850                 zero_start = offset_in_page(size);
8851         else
8852                 zero_start = PAGE_SIZE;
8853
8854         if (zero_start != PAGE_SIZE) {
8855                 memzero_page(page, zero_start, PAGE_SIZE - zero_start);
8856                 flush_dcache_page(page);
8857         }
8858         btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
8859         btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
8860         btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
8861
8862         btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
8863
8864         unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
8865         up_read(&BTRFS_I(inode)->i_mmap_lock);
8866
8867         btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
8868         sb_end_pagefault(inode->i_sb);
8869         extent_changeset_free(data_reserved);
8870         return VM_FAULT_LOCKED;
8871
8872 out_unlock:
8873         unlock_page(page);
8874         up_read(&BTRFS_I(inode)->i_mmap_lock);
8875 out:
8876         btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
8877         btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
8878                                      reserved_space, (ret != 0));
8879 out_noreserve:
8880         sb_end_pagefault(inode->i_sb);
8881         extent_changeset_free(data_reserved);
8882         return ret;
8883 }
8884
8885 static int btrfs_truncate(struct inode *inode, bool skip_writeback)
8886 {
8887         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8888         struct btrfs_root *root = BTRFS_I(inode)->root;
8889         struct btrfs_block_rsv *rsv;
8890         int ret;
8891         struct btrfs_trans_handle *trans;
8892         u64 mask = fs_info->sectorsize - 1;
8893         u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
8894         u64 extents_found = 0;
8895
8896         if (!skip_writeback) {
8897                 ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
8898                                                (u64)-1);
8899                 if (ret)
8900                         return ret;
8901         }
8902
8903         /*
8904          * Yes ladies and gentlemen, this is indeed ugly.  We have a couple of
8905          * things going on here:
8906          *
8907          * 1) We need to reserve space to update our inode.
8908          *
8909          * 2) We need to have something to cache all the space that is going to
8910          * be free'd up by the truncate operation, but also have some slack
8911          * space reserved in case it uses space during the truncate (thank you
8912          * very much snapshotting).
8913          *
8914          * And we need these to be separate.  The fact is we can use a lot of
8915          * space doing the truncate, and we have no earthly idea how much space
8916          * we will use, so we need the truncate reservation to be separate so it
8917          * doesn't end up using space reserved for updating the inode.  We also
8918          * need to be able to stop the transaction and start a new one, which
8919          * means we need to be able to update the inode several times, and we
8920          * have no idea of knowing how many times that will be, so we can't just
8921          * reserve 1 item for the entirety of the operation, so that has to be
8922          * done separately as well.
8923          *
8924          * So that leaves us with
8925          *
8926          * 1) rsv - for the truncate reservation, which we will steal from the
8927          * transaction reservation.
8928          * 2) fs_info->trans_block_rsv - this will have 1 items worth left for
8929          * updating the inode.
8930          */
8931         rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
8932         if (!rsv)
8933                 return -ENOMEM;
8934         rsv->size = min_size;
8935         rsv->failfast = 1;
8936
8937         /*
8938          * 1 for the truncate slack space
8939          * 1 for updating the inode.
8940          */
8941         trans = btrfs_start_transaction(root, 2);
8942         if (IS_ERR(trans)) {
8943                 ret = PTR_ERR(trans);
8944                 goto out;
8945         }
8946
8947         /* Migrate the slack space for the truncate to our reserve */
8948         ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
8949                                       min_size, false);
8950         BUG_ON(ret);
8951
8952         trans->block_rsv = rsv;
8953
8954         while (1) {
8955                 ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
8956                                                  inode->i_size,
8957                                                  BTRFS_EXTENT_DATA_KEY,
8958                                                  &extents_found);
8959                 trans->block_rsv = &fs_info->trans_block_rsv;
8960                 if (ret != -ENOSPC && ret != -EAGAIN)
8961                         break;
8962
8963                 ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
8964                 if (ret)
8965                         break;
8966
8967                 btrfs_end_transaction(trans);
8968                 btrfs_btree_balance_dirty(fs_info);
8969
8970                 trans = btrfs_start_transaction(root, 2);
8971                 if (IS_ERR(trans)) {
8972                         ret = PTR_ERR(trans);
8973                         trans = NULL;
8974                         break;
8975                 }
8976
8977                 btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
8978                 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
8979                                               rsv, min_size, false);
8980                 BUG_ON(ret);    /* shouldn't happen */
8981                 trans->block_rsv = rsv;
8982         }
8983
8984         /*
8985          * We can't call btrfs_truncate_block inside a trans handle as we could
8986          * deadlock with freeze, if we got NEED_TRUNCATE_BLOCK then we know
8987          * we've truncated everything except the last little bit, and can do
8988          * btrfs_truncate_block and then update the disk_i_size.
8989          */
8990         if (ret == NEED_TRUNCATE_BLOCK) {
8991                 btrfs_end_transaction(trans);
8992                 btrfs_btree_balance_dirty(fs_info);
8993
8994                 ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
8995                 if (ret)
8996                         goto out;
8997                 trans = btrfs_start_transaction(root, 1);
8998                 if (IS_ERR(trans)) {
8999                         ret = PTR_ERR(trans);
9000                         goto out;
9001                 }
9002                 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
9003         }
9004
9005         if (trans) {
9006                 int ret2;
9007
9008                 trans->block_rsv = &fs_info->trans_block_rsv;
9009                 ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode));
9010                 if (ret2 && !ret)
9011                         ret = ret2;
9012
9013                 ret2 = btrfs_end_transaction(trans);
9014                 if (ret2 && !ret)
9015                         ret = ret2;
9016                 btrfs_btree_balance_dirty(fs_info);
9017         }
9018 out:
9019         btrfs_free_block_rsv(fs_info, rsv);
9020         /*
9021          * So if we truncate and then write and fsync we normally would just
9022          * write the extents that changed, which is a problem if we need to
9023          * first truncate that entire inode.  So set this flag so we write out
9024          * all of the extents in the inode to the sync log so we're completely
9025          * safe.
9026          *
9027          * If no extents were dropped or trimmed we don't need to force the next
9028          * fsync to truncate all the inode's items from the log and re-log them
9029          * all. This means the truncate operation did not change the file size,
9030          * or changed it to a smaller size but there was only an implicit hole
9031          * between the old i_size and the new i_size, and there were no prealloc
9032          * extents beyond i_size to drop.
9033          */
9034         if (extents_found > 0)
9035                 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
9036
9037         return ret;
9038 }
9039
9040 /*
9041  * create a new subvolume directory/inode (helper for the ioctl).
9042  */
9043 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
9044                              struct btrfs_root *new_root,
9045                              struct btrfs_root *parent_root,
9046                              struct user_namespace *mnt_userns)
9047 {
9048         struct inode *inode;
9049         int err;
9050         u64 index = 0;
9051         u64 ino;
9052
9053         err = btrfs_get_free_objectid(new_root, &ino);
9054         if (err < 0)
9055                 return err;
9056
9057         inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2,
9058                                 ino, ino,
9059                                 S_IFDIR | (~current_umask() & S_IRWXUGO),
9060                                 &index);
9061         if (IS_ERR(inode))
9062                 return PTR_ERR(inode);
9063         inode->i_op = &btrfs_dir_inode_operations;
9064         inode->i_fop = &btrfs_dir_file_operations;
9065
9066         set_nlink(inode, 1);
9067         btrfs_i_size_write(BTRFS_I(inode), 0);
9068         unlock_new_inode(inode);
9069
9070         err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
9071         if (err)
9072                 btrfs_err(new_root->fs_info,
9073                           "error inheriting subvolume %llu properties: %d",
9074                           new_root->root_key.objectid, err);
9075
9076         err = btrfs_update_inode(trans, new_root, BTRFS_I(inode));
9077
9078         iput(inode);
9079         return err;
9080 }
9081
9082 struct inode *btrfs_alloc_inode(struct super_block *sb)
9083 {
9084         struct btrfs_fs_info *fs_info = btrfs_sb(sb);
9085         struct btrfs_inode *ei;
9086         struct inode *inode;
9087
9088         ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL);
9089         if (!ei)
9090                 return NULL;
9091
9092         ei->root = NULL;
9093         ei->generation = 0;
9094         ei->last_trans = 0;
9095         ei->last_sub_trans = 0;
9096         ei->logged_trans = 0;
9097         ei->delalloc_bytes = 0;
9098         ei->new_delalloc_bytes = 0;
9099         ei->defrag_bytes = 0;
9100         ei->disk_i_size = 0;
9101         ei->flags = 0;
9102         ei->ro_flags = 0;
9103         ei->csum_bytes = 0;
9104         ei->index_cnt = (u64)-1;
9105         ei->dir_index = 0;
9106         ei->last_unlink_trans = 0;
9107         ei->last_reflink_trans = 0;
9108         ei->last_log_commit = 0;
9109
9110         spin_lock_init(&ei->lock);
9111         ei->outstanding_extents = 0;
9112         if (sb->s_magic != BTRFS_TEST_MAGIC)
9113                 btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
9114                                               BTRFS_BLOCK_RSV_DELALLOC);
9115         ei->runtime_flags = 0;
9116         ei->prop_compress = BTRFS_COMPRESS_NONE;
9117         ei->defrag_compress = BTRFS_COMPRESS_NONE;
9118
9119         ei->delayed_node = NULL;
9120
9121         ei->i_otime.tv_sec = 0;
9122         ei->i_otime.tv_nsec = 0;
9123
9124         inode = &ei->vfs_inode;
9125         extent_map_tree_init(&ei->extent_tree);
9126         extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
9127         extent_io_tree_init(fs_info, &ei->io_failure_tree,
9128                             IO_TREE_INODE_IO_FAILURE, inode);
9129         extent_io_tree_init(fs_info, &ei->file_extent_tree,
9130                             IO_TREE_INODE_FILE_EXTENT, inode);
9131         ei->io_tree.track_uptodate = true;
9132         ei->io_failure_tree.track_uptodate = true;
9133         atomic_set(&ei->sync_writers, 0);
9134         mutex_init(&ei->log_mutex);
9135         btrfs_ordered_inode_tree_init(&ei->ordered_tree);
9136         INIT_LIST_HEAD(&ei->delalloc_inodes);
9137         INIT_LIST_HEAD(&ei->delayed_iput);
9138         RB_CLEAR_NODE(&ei->rb_node);
9139         init_rwsem(&ei->i_mmap_lock);
9140
9141         return inode;
9142 }
9143
9144 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
9145 void btrfs_test_destroy_inode(struct inode *inode)
9146 {
9147         btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
9148         kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
9149 }
9150 #endif
9151
9152 void btrfs_free_inode(struct inode *inode)
9153 {
9154         kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
9155 }
9156
9157 void btrfs_destroy_inode(struct inode *vfs_inode)
9158 {
9159         struct btrfs_ordered_extent *ordered;
9160         struct btrfs_inode *inode = BTRFS_I(vfs_inode);
9161         struct btrfs_root *root = inode->root;
9162
9163         WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
9164         WARN_ON(vfs_inode->i_data.nrpages);
9165         WARN_ON(inode->block_rsv.reserved);
9166         WARN_ON(inode->block_rsv.size);
9167         WARN_ON(inode->outstanding_extents);
9168         if (!S_ISDIR(vfs_inode->i_mode)) {
9169                 WARN_ON(inode->delalloc_bytes);
9170                 WARN_ON(inode->new_delalloc_bytes);
9171         }
9172         WARN_ON(inode->csum_bytes);
9173         WARN_ON(inode->defrag_bytes);
9174
9175         /*
9176          * This can happen where we create an inode, but somebody else also
9177          * created the same inode and we need to destroy the one we already
9178          * created.
9179          */
9180         if (!root)
9181                 return;
9182
9183         while (1) {
9184                 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
9185                 if (!ordered)
9186                         break;
9187                 else {
9188                         btrfs_err(root->fs_info,
9189                                   "found ordered extent %llu %llu on inode cleanup",
9190                                   ordered->file_offset, ordered->num_bytes);
9191                         btrfs_remove_ordered_extent(inode, ordered);
9192                         btrfs_put_ordered_extent(ordered);
9193                         btrfs_put_ordered_extent(ordered);
9194                 }
9195         }
9196         btrfs_qgroup_check_reserved_leak(inode);
9197         inode_tree_del(inode);
9198         btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
9199         btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
9200         btrfs_put_root(inode->root);
9201 }
9202
9203 int btrfs_drop_inode(struct inode *inode)
9204 {
9205         struct btrfs_root *root = BTRFS_I(inode)->root;
9206
9207         if (root == NULL)
9208                 return 1;
9209
9210         /* the snap/subvol tree is on deleting */
9211         if (btrfs_root_refs(&root->root_item) == 0)
9212                 return 1;
9213         else
9214                 return generic_drop_inode(inode);
9215 }
9216
9217 static void init_once(void *foo)
9218 {
9219         struct btrfs_inode *ei = (struct btrfs_inode *) foo;
9220
9221         inode_init_once(&ei->vfs_inode);
9222 }
9223
9224 void __cold btrfs_destroy_cachep(void)
9225 {
9226         /*
9227          * Make sure all delayed rcu free inodes are flushed before we
9228          * destroy cache.
9229          */
9230         rcu_barrier();
9231         kmem_cache_destroy(btrfs_inode_cachep);
9232         kmem_cache_destroy(btrfs_trans_handle_cachep);
9233         kmem_cache_destroy(btrfs_path_cachep);
9234         kmem_cache_destroy(btrfs_free_space_cachep);
9235         kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
9236 }
9237
9238 int __init btrfs_init_cachep(void)
9239 {
9240         btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
9241                         sizeof(struct btrfs_inode), 0,
9242                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
9243                         init_once);
9244         if (!btrfs_inode_cachep)
9245                 goto fail;
9246
9247         btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
9248                         sizeof(struct btrfs_trans_handle), 0,
9249                         SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
9250         if (!btrfs_trans_handle_cachep)
9251                 goto fail;
9252
9253         btrfs_path_cachep = kmem_cache_create("btrfs_path",
9254                         sizeof(struct btrfs_path), 0,
9255                         SLAB_MEM_SPREAD, NULL);
9256         if (!btrfs_path_cachep)
9257                 goto fail;
9258
9259         btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
9260                         sizeof(struct btrfs_free_space), 0,
9261                         SLAB_MEM_SPREAD, NULL);
9262         if (!btrfs_free_space_cachep)
9263                 goto fail;
9264
9265         btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
9266                                                         PAGE_SIZE, PAGE_SIZE,
9267                                                         SLAB_MEM_SPREAD, NULL);
9268         if (!btrfs_free_space_bitmap_cachep)
9269                 goto fail;
9270
9271         return 0;
9272 fail:
9273         btrfs_destroy_cachep();
9274         return -ENOMEM;
9275 }
9276
9277 static int btrfs_getattr(struct user_namespace *mnt_userns,
9278                          const struct path *path, struct kstat *stat,
9279                          u32 request_mask, unsigned int flags)
9280 {
9281         u64 delalloc_bytes;
9282         u64 inode_bytes;
9283         struct inode *inode = d_inode(path->dentry);
9284         u32 blocksize = inode->i_sb->s_blocksize;
9285         u32 bi_flags = BTRFS_I(inode)->flags;
9286         u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
9287
9288         stat->result_mask |= STATX_BTIME;
9289         stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
9290         stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
9291         if (bi_flags & BTRFS_INODE_APPEND)
9292                 stat->attributes |= STATX_ATTR_APPEND;
9293         if (bi_flags & BTRFS_INODE_COMPRESS)
9294                 stat->attributes |= STATX_ATTR_COMPRESSED;
9295         if (bi_flags & BTRFS_INODE_IMMUTABLE)
9296                 stat->attributes |= STATX_ATTR_IMMUTABLE;
9297         if (bi_flags & BTRFS_INODE_NODUMP)
9298                 stat->attributes |= STATX_ATTR_NODUMP;
9299         if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
9300                 stat->attributes |= STATX_ATTR_VERITY;
9301
9302         stat->attributes_mask |= (STATX_ATTR_APPEND |
9303                                   STATX_ATTR_COMPRESSED |
9304                                   STATX_ATTR_IMMUTABLE |
9305                                   STATX_ATTR_NODUMP);
9306
9307         generic_fillattr(mnt_userns, inode, stat);
9308         stat->dev = BTRFS_I(inode)->root->anon_dev;
9309
9310         spin_lock(&BTRFS_I(inode)->lock);
9311         delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
9312         inode_bytes = inode_get_bytes(inode);
9313         spin_unlock(&BTRFS_I(inode)->lock);
9314         stat->blocks = (ALIGN(inode_bytes, blocksize) +
9315                         ALIGN(delalloc_bytes, blocksize)) >> 9;
9316         return 0;
9317 }
9318
9319 static int btrfs_rename_exchange(struct inode *old_dir,
9320                               struct dentry *old_dentry,
9321                               struct inode *new_dir,
9322                               struct dentry *new_dentry)
9323 {
9324         struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
9325         struct btrfs_trans_handle *trans;
9326         struct btrfs_root *root = BTRFS_I(old_dir)->root;
9327         struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9328         struct inode *new_inode = new_dentry->d_inode;
9329         struct inode *old_inode = old_dentry->d_inode;
9330         struct timespec64 ctime = current_time(old_inode);
9331         u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
9332         u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
9333         u64 old_idx = 0;
9334         u64 new_idx = 0;
9335         int ret;
9336         int ret2;
9337         bool root_log_pinned = false;
9338         bool dest_log_pinned = false;
9339         bool need_abort = false;
9340
9341         /*
9342          * For non-subvolumes allow exchange only within one subvolume, in the
9343          * same inode namespace. Two subvolumes (represented as directory) can
9344          * be exchanged as they're a logical link and have a fixed inode number.
9345          */
9346         if (root != dest &&
9347             (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
9348              new_ino != BTRFS_FIRST_FREE_OBJECTID))
9349                 return -EXDEV;
9350
9351         /* close the race window with snapshot create/destroy ioctl */
9352         if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
9353             new_ino == BTRFS_FIRST_FREE_OBJECTID)
9354                 down_read(&fs_info->subvol_sem);
9355
9356         /*
9357          * We want to reserve the absolute worst case amount of items.  So if
9358          * both inodes are subvols and we need to unlink them then that would
9359          * require 4 item modifications, but if they are both normal inodes it
9360          * would require 5 item modifications, so we'll assume their normal
9361          * inodes.  So 5 * 2 is 10, plus 2 for the new links, so 12 total items
9362          * should cover the worst case number of items we'll modify.
9363          */
9364         trans = btrfs_start_transaction(root, 12);
9365         if (IS_ERR(trans)) {
9366                 ret = PTR_ERR(trans);
9367                 goto out_notrans;
9368         }
9369
9370         if (dest != root) {
9371                 ret = btrfs_record_root_in_trans(trans, dest);
9372                 if (ret)
9373                         goto out_fail;
9374         }
9375
9376         /*
9377          * We need to find a free sequence number both in the source and
9378          * in the destination directory for the exchange.
9379          */
9380         ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
9381         if (ret)
9382                 goto out_fail;
9383         ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
9384         if (ret)
9385                 goto out_fail;
9386
9387         BTRFS_I(old_inode)->dir_index = 0ULL;
9388         BTRFS_I(new_inode)->dir_index = 0ULL;
9389
9390         /* Reference for the source. */
9391         if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9392                 /* force full log commit if subvolume involved. */
9393                 btrfs_set_log_full_commit(trans);
9394         } else {
9395                 ret = btrfs_insert_inode_ref(trans, dest,
9396                                              new_dentry->d_name.name,
9397                                              new_dentry->d_name.len,
9398                                              old_ino,
9399                                              btrfs_ino(BTRFS_I(new_dir)),
9400                                              old_idx);
9401                 if (ret)
9402                         goto out_fail;
9403                 need_abort = true;
9404         }
9405
9406         /* And now for the dest. */
9407         if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
9408                 /* force full log commit if subvolume involved. */
9409                 btrfs_set_log_full_commit(trans);
9410         } else {
9411                 ret = btrfs_insert_inode_ref(trans, root,
9412                                              old_dentry->d_name.name,
9413                                              old_dentry->d_name.len,
9414                                              new_ino,
9415                                              btrfs_ino(BTRFS_I(old_dir)),
9416                                              new_idx);
9417                 if (ret) {
9418                         if (need_abort)
9419                                 btrfs_abort_transaction(trans, ret);
9420                         goto out_fail;
9421                 }
9422         }
9423
9424         /* Update inode version and ctime/mtime. */
9425         inode_inc_iversion(old_dir);
9426         inode_inc_iversion(new_dir);
9427         inode_inc_iversion(old_inode);
9428         inode_inc_iversion(new_inode);
9429         old_dir->i_ctime = old_dir->i_mtime = ctime;
9430         new_dir->i_ctime = new_dir->i_mtime = ctime;
9431         old_inode->i_ctime = ctime;
9432         new_inode->i_ctime = ctime;
9433
9434         if (old_dentry->d_parent != new_dentry->d_parent) {
9435                 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
9436                                 BTRFS_I(old_inode), 1);
9437                 btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
9438                                 BTRFS_I(new_inode), 1);
9439         }
9440
9441         /*
9442          * Now pin the logs of the roots. We do it to ensure that no other task
9443          * can sync the logs while we are in progress with the rename, because
9444          * that could result in an inconsistency in case any of the inodes that
9445          * are part of this rename operation were logged before.
9446          *
9447          * We pin the logs even if at this precise moment none of the inodes was
9448          * logged before. This is because right after we checked for that, some
9449          * other task fsyncing some other inode not involved with this rename
9450          * operation could log that one of our inodes exists.
9451          *
9452          * We don't need to pin the logs before the above calls to
9453          * btrfs_insert_inode_ref(), since those don't ever need to change a log.
9454          */
9455         if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
9456                 btrfs_pin_log_trans(root);
9457                 root_log_pinned = true;
9458         }
9459         if (new_ino != BTRFS_FIRST_FREE_OBJECTID) {
9460                 btrfs_pin_log_trans(dest);
9461                 dest_log_pinned = true;
9462         }
9463
9464         /* src is a subvolume */
9465         if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9466                 ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
9467         } else { /* src is an inode */
9468                 ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
9469                                            BTRFS_I(old_dentry->d_inode),
9470                                            old_dentry->d_name.name,
9471                                            old_dentry->d_name.len);
9472                 if (!ret)
9473                         ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
9474         }
9475         if (ret) {
9476                 btrfs_abort_transaction(trans, ret);
9477                 goto out_fail;
9478         }
9479
9480         /* dest is a subvolume */
9481         if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
9482                 ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
9483         } else { /* dest is an inode */
9484                 ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
9485                                            BTRFS_I(new_dentry->d_inode),
9486                                            new_dentry->d_name.name,
9487                                            new_dentry->d_name.len);
9488                 if (!ret)
9489                         ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
9490         }
9491         if (ret) {
9492                 btrfs_abort_transaction(trans, ret);
9493                 goto out_fail;
9494         }
9495
9496         ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
9497                              new_dentry->d_name.name,
9498                              new_dentry->d_name.len, 0, old_idx);
9499         if (ret) {
9500                 btrfs_abort_transaction(trans, ret);
9501                 goto out_fail;
9502         }
9503
9504         ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
9505                              old_dentry->d_name.name,
9506                              old_dentry->d_name.len, 0, new_idx);
9507         if (ret) {
9508                 btrfs_abort_transaction(trans, ret);
9509                 goto out_fail;
9510         }
9511
9512         if (old_inode->i_nlink == 1)
9513                 BTRFS_I(old_inode)->dir_index = old_idx;
9514         if (new_inode->i_nlink == 1)
9515                 BTRFS_I(new_inode)->dir_index = new_idx;
9516
9517         if (root_log_pinned) {
9518                 btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
9519                                    new_dentry->d_parent);
9520                 btrfs_end_log_trans(root);
9521                 root_log_pinned = false;
9522         }
9523         if (dest_log_pinned) {
9524                 btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
9525                                    old_dentry->d_parent);
9526                 btrfs_end_log_trans(dest);
9527                 dest_log_pinned = false;
9528         }
9529 out_fail:
9530         /*
9531          * If we have pinned a log and an error happened, we unpin tasks
9532          * trying to sync the log and force them to fallback to a transaction
9533          * commit if the log currently contains any of the inodes involved in
9534          * this rename operation (to ensure we do not persist a log with an
9535          * inconsistent state for any of these inodes or leading to any
9536          * inconsistencies when replayed). If the transaction was aborted, the
9537          * abortion reason is propagated to userspace when attempting to commit
9538          * the transaction. If the log does not contain any of these inodes, we
9539          * allow the tasks to sync it.
9540          */
9541         if (ret && (root_log_pinned || dest_log_pinned)) {
9542                 if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
9543                     btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
9544                     btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
9545                     btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))
9546                         btrfs_set_log_full_commit(trans);
9547
9548                 if (root_log_pinned) {
9549                         btrfs_end_log_trans(root);
9550                         root_log_pinned = false;
9551                 }
9552                 if (dest_log_pinned) {
9553                         btrfs_end_log_trans(dest);
9554                         dest_log_pinned = false;
9555                 }
9556         }
9557         ret2 = btrfs_end_transaction(trans);
9558         ret = ret ? ret : ret2;
9559 out_notrans:
9560         if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
9561             old_ino == BTRFS_FIRST_FREE_OBJECTID)
9562                 up_read(&fs_info->subvol_sem);
9563
9564         return ret;
9565 }
9566
9567 static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
9568                                      struct btrfs_root *root,
9569                                      struct user_namespace *mnt_userns,
9570                                      struct inode *dir,
9571                                      struct dentry *dentry)
9572 {
9573         int ret;
9574         struct inode *inode;
9575         u64 objectid;
9576         u64 index;
9577
9578         ret = btrfs_get_free_objectid(root, &objectid);
9579         if (ret)
9580                 return ret;
9581
9582         inode = btrfs_new_inode(trans, root, mnt_userns, dir,
9583                                 dentry->d_name.name,
9584                                 dentry->d_name.len,
9585                                 btrfs_ino(BTRFS_I(dir)),
9586                                 objectid,
9587                                 S_IFCHR | WHITEOUT_MODE,
9588                                 &index);
9589
9590         if (IS_ERR(inode)) {
9591                 ret = PTR_ERR(inode);
9592                 return ret;
9593         }
9594
9595         inode->i_op = &btrfs_special_inode_operations;
9596         init_special_inode(inode, inode->i_mode,
9597                 WHITEOUT_DEV);
9598
9599         ret = btrfs_init_inode_security(trans, inode, dir,
9600                                 &dentry->d_name);
9601         if (ret)
9602                 goto out;
9603
9604         ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
9605                                 BTRFS_I(inode), 0, index);
9606         if (ret)
9607                 goto out;
9608
9609         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
9610 out:
9611         unlock_new_inode(inode);
9612         if (ret)
9613                 inode_dec_link_count(inode);
9614         iput(inode);
9615
9616         return ret;
9617 }
9618
9619 static int btrfs_rename(struct user_namespace *mnt_userns,
9620                         struct inode *old_dir, struct dentry *old_dentry,
9621                         struct inode *new_dir, struct dentry *new_dentry,
9622                         unsigned int flags)
9623 {
9624         struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
9625         struct btrfs_trans_handle *trans;
9626         unsigned int trans_num_items;
9627         struct btrfs_root *root = BTRFS_I(old_dir)->root;
9628         struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9629         struct inode *new_inode = d_inode(new_dentry);
9630         struct inode *old_inode = d_inode(old_dentry);
9631         u64 index = 0;
9632         int ret;
9633         int ret2;
9634         u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
9635         bool log_pinned = false;
9636
9637         if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
9638                 return -EPERM;
9639
9640         /* we only allow rename subvolume link between subvolumes */
9641         if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
9642                 return -EXDEV;
9643
9644         if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
9645             (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
9646                 return -ENOTEMPTY;
9647
9648         if (S_ISDIR(old_inode->i_mode) && new_inode &&
9649             new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
9650                 return -ENOTEMPTY;
9651
9652
9653         /* check for collisions, even if the  name isn't there */
9654         ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
9655                              new_dentry->d_name.name,
9656                              new_dentry->d_name.len);
9657
9658         if (ret) {
9659                 if (ret == -EEXIST) {
9660                         /* we shouldn't get
9661                          * eexist without a new_inode */
9662                         if (WARN_ON(!new_inode)) {
9663                                 return ret;
9664                         }
9665                 } else {
9666                         /* maybe -EOVERFLOW */
9667                         return ret;
9668                 }
9669         }
9670         ret = 0;
9671
9672         /*
9673          * we're using rename to replace one file with another.  Start IO on it
9674          * now so  we don't add too much work to the end of the transaction
9675          */
9676         if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
9677                 filemap_flush(old_inode->i_mapping);
9678
9679         /* close the racy window with snapshot create/destroy ioctl */
9680         if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9681                 down_read(&fs_info->subvol_sem);
9682         /*
9683          * We want to reserve the absolute worst case amount of items.  So if
9684          * both inodes are subvols and we need to unlink them then that would
9685          * require 4 item modifications, but if they are both normal inodes it
9686          * would require 5 item modifications, so we'll assume they are normal
9687          * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
9688          * should cover the worst case number of items we'll modify.
9689          * If our rename has the whiteout flag, we need more 5 units for the
9690          * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
9691          * when selinux is enabled).
9692          */
9693         trans_num_items = 11;
9694         if (flags & RENAME_WHITEOUT)
9695                 trans_num_items += 5;
9696         trans = btrfs_start_transaction(root, trans_num_items);
9697         if (IS_ERR(trans)) {
9698                 ret = PTR_ERR(trans);
9699                 goto out_notrans;
9700         }
9701
9702         if (dest != root) {
9703                 ret = btrfs_record_root_in_trans(trans, dest);
9704                 if (ret)
9705                         goto out_fail;
9706         }
9707
9708         ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
9709         if (ret)
9710                 goto out_fail;
9711
9712         BTRFS_I(old_inode)->dir_index = 0ULL;
9713         if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9714                 /* force full log commit if subvolume involved. */
9715                 btrfs_set_log_full_commit(trans);
9716         } else {
9717                 ret = btrfs_insert_inode_ref(trans, dest,
9718                                              new_dentry->d_name.name,
9719                                              new_dentry->d_name.len,
9720                                              old_ino,
9721                                              btrfs_ino(BTRFS_I(new_dir)), index);
9722                 if (ret)
9723                         goto out_fail;
9724         }
9725
9726         inode_inc_iversion(old_dir);
9727         inode_inc_iversion(new_dir);
9728         inode_inc_iversion(old_inode);
9729         old_dir->i_ctime = old_dir->i_mtime =
9730         new_dir->i_ctime = new_dir->i_mtime =
9731         old_inode->i_ctime = current_time(old_dir);
9732
9733         if (old_dentry->d_parent != new_dentry->d_parent)
9734                 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
9735                                 BTRFS_I(old_inode), 1);
9736
9737         if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9738                 ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
9739         } else {
9740                 /*
9741                  * Now pin the log. We do it to ensure that no other task can
9742                  * sync the log while we are in progress with the rename, as
9743                  * that could result in an inconsistency in case any of the
9744                  * inodes that are part of this rename operation were logged
9745                  * before.
9746                  *
9747                  * We pin the log even if at this precise moment none of the
9748                  * inodes was logged before. This is because right after we
9749                  * checked for that, some other task fsyncing some other inode
9750                  * not involved with this rename operation could log that one of
9751                  * our inodes exists.
9752                  *
9753                  * We don't need to pin the logs before the above call to
9754                  * btrfs_insert_inode_ref(), since that does not need to change
9755                  * a log.
9756                  */
9757                 btrfs_pin_log_trans(root);
9758                 log_pinned = true;
9759                 ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
9760                                         BTRFS_I(d_inode(old_dentry)),
9761                                         old_dentry->d_name.name,
9762                                         old_dentry->d_name.len);
9763                 if (!ret)
9764                         ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
9765         }
9766         if (ret) {
9767                 btrfs_abort_transaction(trans, ret);
9768                 goto out_fail;
9769         }
9770
9771         if (new_inode) {
9772                 inode_inc_iversion(new_inode);
9773                 new_inode->i_ctime = current_time(new_inode);
9774                 if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
9775                              BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
9776                         ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
9777                         BUG_ON(new_inode->i_nlink == 0);
9778                 } else {
9779                         ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
9780                                                  BTRFS_I(d_inode(new_dentry)),
9781                                                  new_dentry->d_name.name,
9782                                                  new_dentry->d_name.len);
9783                 }
9784                 if (!ret && new_inode->i_nlink == 0)
9785                         ret = btrfs_orphan_add(trans,
9786                                         BTRFS_I(d_inode(new_dentry)));
9787                 if (ret) {
9788                         btrfs_abort_transaction(trans, ret);
9789                         goto out_fail;
9790                 }
9791         }
9792
9793         ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
9794                              new_dentry->d_name.name,
9795                              new_dentry->d_name.len, 0, index);
9796         if (ret) {
9797                 btrfs_abort_transaction(trans, ret);
9798                 goto out_fail;
9799         }
9800
9801         if (old_inode->i_nlink == 1)
9802                 BTRFS_I(old_inode)->dir_index = index;
9803
9804         if (log_pinned) {
9805                 btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
9806                                    new_dentry->d_parent);
9807                 btrfs_end_log_trans(root);
9808                 log_pinned = false;
9809         }
9810
9811         if (flags & RENAME_WHITEOUT) {
9812                 ret = btrfs_whiteout_for_rename(trans, root, mnt_userns,
9813                                                 old_dir, old_dentry);
9814
9815                 if (ret) {
9816                         btrfs_abort_transaction(trans, ret);
9817                         goto out_fail;
9818                 }
9819         }
9820 out_fail:
9821         /*
9822          * If we have pinned the log and an error happened, we unpin tasks
9823          * trying to sync the log and force them to fallback to a transaction
9824          * commit if the log currently contains any of the inodes involved in
9825          * this rename operation (to ensure we do not persist a log with an
9826          * inconsistent state for any of these inodes or leading to any
9827          * inconsistencies when replayed). If the transaction was aborted, the
9828          * abortion reason is propagated to userspace when attempting to commit
9829          * the transaction. If the log does not contain any of these inodes, we
9830          * allow the tasks to sync it.
9831          */
9832         if (ret && log_pinned) {
9833                 if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
9834                     btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
9835                     btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
9836                     (new_inode &&
9837                      btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
9838                         btrfs_set_log_full_commit(trans);
9839
9840                 btrfs_end_log_trans(root);
9841                 log_pinned = false;
9842         }
9843         ret2 = btrfs_end_transaction(trans);
9844         ret = ret ? ret : ret2;
9845 out_notrans:
9846         if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9847                 up_read(&fs_info->subvol_sem);
9848
9849         return ret;
9850 }
9851
9852 static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir,
9853                          struct dentry *old_dentry, struct inode *new_dir,
9854                          struct dentry *new_dentry, unsigned int flags)
9855 {
9856         if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
9857                 return -EINVAL;
9858
9859         if (flags & RENAME_EXCHANGE)
9860                 return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
9861                                           new_dentry);
9862
9863         return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
9864                             new_dentry, flags);
9865 }
9866
/*
 * One queued flush of a single inode's delalloc data, allocated by
 * btrfs_alloc_delalloc_work() and executed by btrfs_run_delalloc_work().
 */
struct btrfs_delalloc_work {
	/* Inode to flush; the work function puts this reference when done. */
	struct inode *inode;
	/* Completed by the work function; the submitter waits on it. */
	struct completion completion;
	/* Links this item into the submitter's list of outstanding works. */
	struct list_head list;
	/* Work item that gets queued to run btrfs_run_delalloc_work(). */
	struct btrfs_work work;
};
9873
9874 static void btrfs_run_delalloc_work(struct btrfs_work *work)
9875 {
9876         struct btrfs_delalloc_work *delalloc_work;
9877         struct inode *inode;
9878
9879         delalloc_work = container_of(work, struct btrfs_delalloc_work,
9880                                      work);
9881         inode = delalloc_work->inode;
9882         filemap_flush(inode->i_mapping);
9883         if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
9884                                 &BTRFS_I(inode)->runtime_flags))
9885                 filemap_flush(inode->i_mapping);
9886
9887         iput(inode);
9888         complete(&delalloc_work->completion);
9889 }
9890
9891 static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
9892 {
9893         struct btrfs_delalloc_work *work;
9894
9895         work = kmalloc(sizeof(*work), GFP_NOFS);
9896         if (!work)
9897                 return NULL;
9898
9899         init_completion(&work->completion);
9900         INIT_LIST_HEAD(&work->list);
9901         work->inode = inode;
9902         btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
9903
9904         return work;
9905 }
9906
9907 /*
9908  * some fairly slow code that needs optimization. This walks the list
9909  * of all the inodes with pending delalloc and forces them to disk.
9910  */
9911 static int start_delalloc_inodes(struct btrfs_root *root,
9912                                  struct writeback_control *wbc, bool snapshot,
9913                                  bool in_reclaim_context)
9914 {
9915         struct btrfs_inode *binode;
9916         struct inode *inode;
9917         struct btrfs_delalloc_work *work, *next;
9918         struct list_head works;
9919         struct list_head splice;
9920         int ret = 0;
9921         bool full_flush = wbc->nr_to_write == LONG_MAX;
9922
9923         INIT_LIST_HEAD(&works);
9924         INIT_LIST_HEAD(&splice);
9925
9926         mutex_lock(&root->delalloc_mutex);
9927         spin_lock(&root->delalloc_lock);
9928         list_splice_init(&root->delalloc_inodes, &splice);
9929         while (!list_empty(&splice)) {
9930                 binode = list_entry(splice.next, struct btrfs_inode,
9931                                     delalloc_inodes);
9932
9933                 list_move_tail(&binode->delalloc_inodes,
9934                                &root->delalloc_inodes);
9935
9936                 if (in_reclaim_context &&
9937                     test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
9938                         continue;
9939
9940                 inode = igrab(&binode->vfs_inode);
9941                 if (!inode) {
9942                         cond_resched_lock(&root->delalloc_lock);
9943                         continue;
9944                 }
9945                 spin_unlock(&root->delalloc_lock);
9946
9947                 if (snapshot)
9948                         set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
9949                                 &binode->runtime_flags);
9950                 if (full_flush) {
9951                         work = btrfs_alloc_delalloc_work(inode);
9952                         if (!work) {
9953                                 iput(inode);
9954                                 ret = -ENOMEM;
9955                                 goto out;
9956                         }
9957                         list_add_tail(&work->list, &works);
9958                         btrfs_queue_work(root->fs_info->flush_workers,
9959                                          &work->work);
9960                 } else {
9961                         ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
9962                         btrfs_add_delayed_iput(inode);
9963                         if (ret || wbc->nr_to_write <= 0)
9964                                 goto out;
9965                 }
9966                 cond_resched();
9967                 spin_lock(&root->delalloc_lock);
9968         }
9969         spin_unlock(&root->delalloc_lock);
9970
9971 out:
9972         list_for_each_entry_safe(work, next, &works, list) {
9973                 list_del_init(&work->list);
9974                 wait_for_completion(&work->completion);
9975                 kfree(work);
9976         }
9977
9978         if (!list_empty(&splice)) {
9979                 spin_lock(&root->delalloc_lock);
9980                 list_splice_tail(&splice, &root->delalloc_inodes);
9981                 spin_unlock(&root->delalloc_lock);
9982         }
9983         mutex_unlock(&root->delalloc_mutex);
9984         return ret;
9985 }
9986
9987 int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
9988 {
9989         struct writeback_control wbc = {
9990                 .nr_to_write = LONG_MAX,
9991                 .sync_mode = WB_SYNC_NONE,
9992                 .range_start = 0,
9993                 .range_end = LLONG_MAX,
9994         };
9995         struct btrfs_fs_info *fs_info = root->fs_info;
9996
9997         if (BTRFS_FS_ERROR(fs_info))
9998                 return -EROFS;
9999
10000         return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
10001 }
10002
10003 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
10004                                bool in_reclaim_context)
10005 {
10006         struct writeback_control wbc = {
10007                 .nr_to_write = nr,
10008                 .sync_mode = WB_SYNC_NONE,
10009                 .range_start = 0,
10010                 .range_end = LLONG_MAX,
10011         };
10012         struct btrfs_root *root;
10013         struct list_head splice;
10014         int ret;
10015
10016         if (BTRFS_FS_ERROR(fs_info))
10017                 return -EROFS;
10018
10019         INIT_LIST_HEAD(&splice);
10020
10021         mutex_lock(&fs_info->delalloc_root_mutex);
10022         spin_lock(&fs_info->delalloc_root_lock);
10023         list_splice_init(&fs_info->delalloc_roots, &splice);
10024         while (!list_empty(&splice)) {
10025                 /*
10026                  * Reset nr_to_write here so we know that we're doing a full
10027                  * flush.
10028                  */
10029                 if (nr == LONG_MAX)
10030                         wbc.nr_to_write = LONG_MAX;
10031
10032                 root = list_first_entry(&splice, struct btrfs_root,
10033                                         delalloc_root);
10034                 root = btrfs_grab_root(root);
10035                 BUG_ON(!root);
10036                 list_move_tail(&root->delalloc_root,
10037                                &fs_info->delalloc_roots);
10038                 spin_unlock(&fs_info->delalloc_root_lock);
10039
10040                 ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
10041                 btrfs_put_root(root);
10042                 if (ret < 0 || wbc.nr_to_write <= 0)
10043                         goto out;
10044                 spin_lock(&fs_info->delalloc_root_lock);
10045         }
10046         spin_unlock(&fs_info->delalloc_root_lock);
10047
10048         ret = 0;
10049 out:
10050         if (!list_empty(&splice)) {
10051                 spin_lock(&fs_info->delalloc_root_lock);
10052                 list_splice_tail(&splice, &fs_info->delalloc_roots);
10053                 spin_unlock(&fs_info->delalloc_root_lock);
10054         }
10055         mutex_unlock(&fs_info->delalloc_root_mutex);
10056         return ret;
10057 }
10058
10059 static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
10060                          struct dentry *dentry, const char *symname)
10061 {
10062         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
10063         struct btrfs_trans_handle *trans;
10064         struct btrfs_root *root = BTRFS_I(dir)->root;
10065         struct btrfs_path *path;
10066         struct btrfs_key key;
10067         struct inode *inode = NULL;
10068         int err;
10069         u64 objectid;
10070         u64 index = 0;
10071         int name_len;
10072         int datasize;
10073         unsigned long ptr;
10074         struct btrfs_file_extent_item *ei;
10075         struct extent_buffer *leaf;
10076
10077         name_len = strlen(symname);
10078         if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
10079                 return -ENAMETOOLONG;
10080
10081         /*
10082          * 2 items for inode item and ref
10083          * 2 items for dir items
10084          * 1 item for updating parent inode item
10085          * 1 item for the inline extent item
10086          * 1 item for xattr if selinux is on
10087          */
10088         trans = btrfs_start_transaction(root, 7);
10089         if (IS_ERR(trans))
10090                 return PTR_ERR(trans);
10091
10092         err = btrfs_get_free_objectid(root, &objectid);
10093         if (err)
10094                 goto out_unlock;
10095
10096         inode = btrfs_new_inode(trans, root, mnt_userns, dir,
10097                                 dentry->d_name.name, dentry->d_name.len,
10098                                 btrfs_ino(BTRFS_I(dir)), objectid,
10099                                 S_IFLNK | S_IRWXUGO, &index);
10100         if (IS_ERR(inode)) {
10101                 err = PTR_ERR(inode);
10102                 inode = NULL;
10103                 goto out_unlock;
10104         }
10105
10106         /*
10107         * If the active LSM wants to access the inode during
10108         * d_instantiate it needs these. Smack checks to see
10109         * if the filesystem supports xattrs by looking at the
10110         * ops vector.
10111         */
10112         inode->i_fop = &btrfs_file_operations;
10113         inode->i_op = &btrfs_file_inode_operations;
10114         inode->i_mapping->a_ops = &btrfs_aops;
10115
10116         err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
10117         if (err)
10118                 goto out_unlock;
10119
10120         path = btrfs_alloc_path();
10121         if (!path) {
10122                 err = -ENOMEM;
10123                 goto out_unlock;
10124         }
10125         key.objectid = btrfs_ino(BTRFS_I(inode));
10126         key.offset = 0;
10127         key.type = BTRFS_EXTENT_DATA_KEY;
10128         datasize = btrfs_file_extent_calc_inline_size(name_len);
10129         err = btrfs_insert_empty_item(trans, root, path, &key,
10130                                       datasize);
10131         if (err) {
10132                 btrfs_free_path(path);
10133                 goto out_unlock;
10134         }
10135         leaf = path->nodes[0];
10136         ei = btrfs_item_ptr(leaf, path->slots[0],
10137                             struct btrfs_file_extent_item);
10138         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
10139         btrfs_set_file_extent_type(leaf, ei,
10140                                    BTRFS_FILE_EXTENT_INLINE);
10141         btrfs_set_file_extent_encryption(leaf, ei, 0);
10142         btrfs_set_file_extent_compression(leaf, ei, 0);
10143         btrfs_set_file_extent_other_encoding(leaf, ei, 0);
10144         btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
10145
10146         ptr = btrfs_file_extent_inline_start(ei);
10147         write_extent_buffer(leaf, symname, ptr, name_len);
10148         btrfs_mark_buffer_dirty(leaf);
10149         btrfs_free_path(path);
10150
10151         inode->i_op = &btrfs_symlink_inode_operations;
10152         inode_nohighmem(inode);
10153         inode_set_bytes(inode, name_len);
10154         btrfs_i_size_write(BTRFS_I(inode), name_len);
10155         err = btrfs_update_inode(trans, root, BTRFS_I(inode));
10156         /*
10157          * Last step, add directory indexes for our symlink inode. This is the
10158          * last step to avoid extra cleanup of these indexes if an error happens
10159          * elsewhere above.
10160          */
10161         if (!err)
10162                 err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
10163                                 BTRFS_I(inode), 0, index);
10164         if (err)
10165                 goto out_unlock;
10166
10167         d_instantiate_new(dentry, inode);
10168
10169 out_unlock:
10170         btrfs_end_transaction(trans);
10171         if (err && inode) {
10172                 inode_dec_link_count(inode);
10173                 discard_new_inode(inode);
10174         }
10175         btrfs_btree_balance_dirty(fs_info);
10176         return err;
10177 }
10178
10179 static struct btrfs_trans_handle *insert_prealloc_file_extent(
10180                                        struct btrfs_trans_handle *trans_in,
10181                                        struct btrfs_inode *inode,
10182                                        struct btrfs_key *ins,
10183                                        u64 file_offset)
10184 {
10185         struct btrfs_file_extent_item stack_fi;
10186         struct btrfs_replace_extent_info extent_info;
10187         struct btrfs_trans_handle *trans = trans_in;
10188         struct btrfs_path *path;
10189         u64 start = ins->objectid;
10190         u64 len = ins->offset;
10191         int qgroup_released;
10192         int ret;
10193
10194         memset(&stack_fi, 0, sizeof(stack_fi));
10195
10196         btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
10197         btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
10198         btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
10199         btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
10200         btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
10201         btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
10202         /* Encryption and other encoding is reserved and all 0 */
10203
10204         qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len);
10205         if (qgroup_released < 0)
10206                 return ERR_PTR(qgroup_released);
10207
10208         if (trans) {
10209                 ret = insert_reserved_file_extent(trans, inode,
10210                                                   file_offset, &stack_fi,
10211                                                   true, qgroup_released);
10212                 if (ret)
10213                         goto free_qgroup;
10214                 return trans;
10215         }
10216
10217         extent_info.disk_offset = start;
10218         extent_info.disk_len = len;
10219         extent_info.data_offset = 0;
10220         extent_info.data_len = len;
10221         extent_info.file_offset = file_offset;
10222         extent_info.extent_buf = (char *)&stack_fi;
10223         extent_info.is_new_extent = true;
10224         extent_info.qgroup_reserved = qgroup_released;
10225         extent_info.insertions = 0;
10226
10227         path = btrfs_alloc_path();
10228         if (!path) {
10229                 ret = -ENOMEM;
10230                 goto free_qgroup;
10231         }
10232
10233         ret = btrfs_replace_file_extents(inode, path, file_offset,
10234                                      file_offset + len - 1, &extent_info,
10235                                      &trans);
10236         btrfs_free_path(path);
10237         if (ret)
10238                 goto free_qgroup;
10239         return trans;
10240
10241 free_qgroup:
10242         /*
10243          * We have released qgroup data range at the beginning of the function,
10244          * and normally qgroup_released bytes will be freed when committing
10245          * transaction.
10246          * But if we error out early, we have to free what we have released
10247          * or we leak qgroup data reservation.
10248          */
10249         btrfs_qgroup_free_refroot(inode->root->fs_info,
10250                         inode->root->root_key.objectid, qgroup_released,
10251                         BTRFS_QGROUP_RSV_DATA);
10252         return ERR_PTR(ret);
10253 }
10254
/*
 * Preallocate extents for the file range [@start, @start + @num_bytes) of
 * @inode, one reserved extent per loop iteration.
 *
 * @mode:        fallocate mode flags; only FALLOC_FL_KEEP_SIZE is examined here
 * @min_size:    lower bound applied to every reservation request
 * @actual_len:  cap used when i_size is extended
 * @alloc_hint:  in/out allocation hint, moved past each extent obtained
 * @trans:       optional transaction handle; when NULL, the handle returned by
 *               insert_prealloc_file_extent() is ended after every extent
 *
 * When the loop stops, the part of the range that was never turned into a
 * reserved extent is handed back via btrfs_free_reserved_data_space().
 *
 * Returns 0 when the whole range was processed, otherwise the error from the
 * reservation, the extent insertion or the inode update that stopped the loop.
 */
10255 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
10256                                        u64 start, u64 num_bytes, u64 min_size,
10257                                        loff_t actual_len, u64 *alloc_hint,
10258                                        struct btrfs_trans_handle *trans)
10259 {
10260         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
10261         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
10262         struct extent_map *em;
10263         struct btrfs_root *root = BTRFS_I(inode)->root;
10264         struct btrfs_key ins;
10265         u64 cur_offset = start;
10266         u64 clear_offset = start;
10267         u64 i_size;
10268         u64 cur_bytes;
10269         u64 last_alloc = (u64)-1;
10270         int ret = 0;
10271         bool own_trans = true;
10272         u64 end = start + num_bytes - 1;
10273
              /* A handle passed in by the caller is never ended here. */
10274         if (trans)
10275                 own_trans = false;
10276         while (num_bytes > 0) {
                      /* Ask for at most 256M at a time, but never less than min_size. */
10277                 cur_bytes = min_t(u64, num_bytes, SZ_256M);
10278                 cur_bytes = max(cur_bytes, min_size);
10279                 /*
10280                  * If we are severely fragmented we could end up with really
10281                  * small allocations, so if the allocator is returning small
10282                  * chunks lets make its job easier by only searching for those
10283                  * sized chunks.
10284                  */
10285                 cur_bytes = min(cur_bytes, last_alloc);
10286                 ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
10287                                 min_size, 0, *alloc_hint, &ins, 1, 0);
10288                 if (ret)
10289                         break;
10290
10291                 /*
10292                  * We've reserved this space, and thus converted it from
10293                  * ->bytes_may_use to ->bytes_reserved.  Any error that happens
10294                  * from here on out we will only need to clear our reservation
10295                  * for the remaining unreserved area, so advance our
10296                  * clear_offset by our extent size.
10297                  */
10298                 clear_offset += ins.offset;
10299
10300                 last_alloc = ins.offset;
                      /* On success this yields the handle used for the rest of the iteration. */
10301                 trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
10302                                                     &ins, cur_offset);
10303                 /*
10304                  * Now that we inserted the prealloc extent we can finally
10305                  * decrement the number of reservations in the block group.
10306                  * If we did it before, we could race with relocation and have
10307                  * relocation miss the reserved extent, making it fail later.
10308                  */
10309                 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
10310                 if (IS_ERR(trans)) {
10311                         ret = PTR_ERR(trans);
10312                         btrfs_free_reserved_extent(fs_info, ins.objectid,
10313                                                    ins.offset, 0);
10314                         break;
10315                 }
10316
10317                 btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
10318                                         cur_offset + ins.offset -1, 0);
10319
10320                 em = alloc_extent_map();
10321                 if (!em) {
                              /*
                               * No extent map available: set NEEDS_FULL_SYNC on the
                               * inode and carry on without caching this extent.
                               */
10322                         set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
10323                                 &BTRFS_I(inode)->runtime_flags);
10324                         goto next;
10325                 }
10326
10327                 em->start = cur_offset;
10328                 em->orig_start = cur_offset;
10329                 em->len = ins.offset;
10330                 em->block_start = ins.objectid;
10331                 em->block_len = ins.offset;
10332                 em->orig_block_len = ins.offset;
10333                 em->ram_bytes = ins.offset;
10334                 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
10335                 em->generation = trans->transid;
10336
                      /* Retry the insertion for as long as it collides (-EEXIST). */
10337                 while (1) {
10338                         write_lock(&em_tree->lock);
10339                         ret = add_extent_mapping(em_tree, em, 1);
10340                         write_unlock(&em_tree->lock);
10341                         if (ret != -EEXIST)
10342                                 break;
10343                         btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
10344                                                 cur_offset + ins.offset - 1,
10345                                                 0);
10346                 }
10347                 free_extent_map(em);
10348 next:
10349                 num_bytes -= ins.offset;
10350                 cur_offset += ins.offset;
10351                 *alloc_hint = ins.objectid + ins.offset;
10352
10353                 inode_inc_iversion(inode);
10354                 inode->i_ctime = current_time(inode);
10355                 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
                      /*
                       * Unless KEEP_SIZE was requested, grow i_size to cover what
                       * has been allocated so far, but never beyond actual_len.
                       */
10356                 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
10357                     (actual_len > inode->i_size) &&
10358                     (cur_offset > inode->i_size)) {
10359                         if (cur_offset > actual_len)
10360                                 i_size = actual_len;
10361                         else
10362                                 i_size = cur_offset;
10363                         i_size_write(inode, i_size);
10364                         btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
10365                 }
10366
                      /* Note: this overwrites any result left by add_extent_mapping(). */
10367                 ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
10368
10369                 if (ret) {
10370                         btrfs_abort_transaction(trans, ret);
10371                         if (own_trans)
10372                                 btrfs_end_transaction(trans);
10373                         break;
10374                 }
10375
10376                 if (own_trans) {
10377                         btrfs_end_transaction(trans);
10378                         trans = NULL;
10379                 }
10380         }
              /* Give back the data space of the part of the range never reserved. */
10381         if (clear_offset < end)
10382                 btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
10383                         end - clear_offset + 1);
10384         return ret;
10385 }
10386
10387 int btrfs_prealloc_file_range(struct inode *inode, int mode,
10388                               u64 start, u64 num_bytes, u64 min_size,
10389                               loff_t actual_len, u64 *alloc_hint)
10390 {
10391         return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
10392                                            min_size, actual_len, alloc_hint,
10393                                            NULL);
10394 }
10395
10396 int btrfs_prealloc_file_range_trans(struct inode *inode,
10397                                     struct btrfs_trans_handle *trans, int mode,
10398                                     u64 start, u64 num_bytes, u64 min_size,
10399                                     loff_t actual_len, u64 *alloc_hint)
10400 {
10401         return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
10402                                            min_size, actual_len, alloc_hint, trans);
10403 }
10404
/* Mark @page dirty by delegating to __set_page_dirty_nobuffers(). */
static int btrfs_set_page_dirty(struct page *page)
{
	int ret = __set_page_dirty_nobuffers(page);

	return ret;
}
10409
10410 static int btrfs_permission(struct user_namespace *mnt_userns,
10411                             struct inode *inode, int mask)
10412 {
10413         struct btrfs_root *root = BTRFS_I(inode)->root;
10414         umode_t mode = inode->i_mode;
10415
10416         if (mask & MAY_WRITE &&
10417             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
10418                 if (btrfs_root_readonly(root))
10419                         return -EROFS;
10420                 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
10421                         return -EACCES;
10422         }
10423         return generic_permission(mnt_userns, inode, mask);
10424 }
10425
10426 static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
10427                          struct dentry *dentry, umode_t mode)
10428 {
10429         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
10430         struct btrfs_trans_handle *trans;
10431         struct btrfs_root *root = BTRFS_I(dir)->root;
10432         struct inode *inode = NULL;
10433         u64 objectid;
10434         u64 index;
10435         int ret = 0;
10436
10437         /*
10438          * 5 units required for adding orphan entry
10439          */
10440         trans = btrfs_start_transaction(root, 5);
10441         if (IS_ERR(trans))
10442                 return PTR_ERR(trans);
10443
10444         ret = btrfs_get_free_objectid(root, &objectid);
10445         if (ret)
10446                 goto out;
10447
10448         inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0,
10449                         btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
10450         if (IS_ERR(inode)) {
10451                 ret = PTR_ERR(inode);
10452                 inode = NULL;
10453                 goto out;
10454         }
10455
10456         inode->i_fop = &btrfs_file_operations;
10457         inode->i_op = &btrfs_file_inode_operations;
10458
10459         inode->i_mapping->a_ops = &btrfs_aops;
10460
10461         ret = btrfs_init_inode_security(trans, inode, dir, NULL);
10462         if (ret)
10463                 goto out;
10464
10465         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
10466         if (ret)
10467                 goto out;
10468         ret = btrfs_orphan_add(trans, BTRFS_I(inode));
10469         if (ret)
10470                 goto out;
10471
10472         /*
10473          * We set number of links to 0 in btrfs_new_inode(), and here we set
10474          * it to 1 because d_tmpfile() will issue a warning if the count is 0,
10475          * through:
10476          *
10477          *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
10478          */
10479         set_nlink(inode, 1);
10480         d_tmpfile(dentry, inode);
10481         unlock_new_inode(inode);
10482         mark_inode_dirty(inode);
10483 out:
10484         btrfs_end_transaction(trans);
10485         if (ret && inode)
10486                 discard_new_inode(inode);
10487         btrfs_btree_balance_dirty(fs_info);
10488         return ret;
10489 }
10490
10491 void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
10492 {
10493         struct btrfs_fs_info *fs_info = inode->root->fs_info;
10494         unsigned long index = start >> PAGE_SHIFT;
10495         unsigned long end_index = end >> PAGE_SHIFT;
10496         struct page *page;
10497         u32 len;
10498
10499         ASSERT(end + 1 - start <= U32_MAX);
10500         len = end + 1 - start;
10501         while (index <= end_index) {
10502                 page = find_get_page(inode->vfs_inode.i_mapping, index);
10503                 ASSERT(page); /* Pages should be in the extent_io_tree */
10504
10505                 btrfs_page_set_writeback(fs_info, page, start, len);
10506                 put_page(page);
10507                 index++;
10508         }
10509 }
10510
10511 #ifdef CONFIG_SWAP
10512 /*
10513  * Add an entry indicating a block group or device which is pinned by a
10514  * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
10515  * negative errno on failure.
10516  */
10517 static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
10518                                   bool is_block_group)
10519 {
10520         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10521         struct btrfs_swapfile_pin *sp, *entry;
10522         struct rb_node **p;
10523         struct rb_node *parent = NULL;
10524
10525         sp = kmalloc(sizeof(*sp), GFP_NOFS);
10526         if (!sp)
10527                 return -ENOMEM;
10528         sp->ptr = ptr;
10529         sp->inode = inode;
10530         sp->is_block_group = is_block_group;
10531         sp->bg_extent_count = 1;
10532
10533         spin_lock(&fs_info->swapfile_pins_lock);
10534         p = &fs_info->swapfile_pins.rb_node;
10535         while (*p) {
10536                 parent = *p;
10537                 entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
10538                 if (sp->ptr < entry->ptr ||
10539                     (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
10540                         p = &(*p)->rb_left;
10541                 } else if (sp->ptr > entry->ptr ||
10542                            (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
10543                         p = &(*p)->rb_right;
10544                 } else {
10545                         if (is_block_group)
10546                                 entry->bg_extent_count++;
10547                         spin_unlock(&fs_info->swapfile_pins_lock);
10548                         kfree(sp);
10549                         return 1;
10550                 }
10551         }
10552         rb_link_node(&sp->node, parent, p);
10553         rb_insert_color(&sp->node, &fs_info->swapfile_pins);
10554         spin_unlock(&fs_info->swapfile_pins_lock);
10555         return 0;
10556 }
10557
10558 /* Free all of the entries pinned by this swapfile. */
10559 static void btrfs_free_swapfile_pins(struct inode *inode)
10560 {
10561         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10562         struct btrfs_swapfile_pin *sp;
10563         struct rb_node *node, *next;
10564
10565         spin_lock(&fs_info->swapfile_pins_lock);
10566         node = rb_first(&fs_info->swapfile_pins);
10567         while (node) {
10568                 next = rb_next(node);
10569                 sp = rb_entry(node, struct btrfs_swapfile_pin, node);
10570                 if (sp->inode == inode) {
10571                         rb_erase(&sp->node, &fs_info->swapfile_pins);
10572                         if (sp->is_block_group) {
10573                                 btrfs_dec_block_group_swap_extents(sp->ptr,
10574                                                            sp->bg_extent_count);
10575                                 btrfs_put_block_group(sp->ptr);
10576                         }
10577                         kfree(sp);
10578                 }
10579                 node = next;
10580         }
10581         spin_unlock(&fs_info->swapfile_pins_lock);
10582 }
10583
/*
 * Accumulator used while a swap file's extents are walked: it holds the
 * pending physically contiguous run plus the running totals reported so far.
 */
10584 struct btrfs_swap_info {
10585         u64 start;              /* file offset at which the pending run begins */
10586         u64 block_start;        /* physical byte address of the pending run */
10587         u64 block_len;          /* byte length of the pending run, 0 if none */
10588         u64 lowest_ppage;       /* lowest physical page number recorded so far */
10589         u64 highest_ppage;      /* highest physical page number recorded so far */
10590         unsigned long nr_pages; /* pages passed to add_swap_extent() so far */
10591         int nr_extents;         /* sum of add_swap_extent() return values */
10592 };
10593
10594 static int btrfs_add_swap_extent(struct swap_info_struct *sis,
10595                                  struct btrfs_swap_info *bsi)
10596 {
10597         unsigned long nr_pages;
10598         u64 first_ppage, first_ppage_reported, next_ppage;
10599         int ret;
10600
10601         first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
10602         next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
10603                                 PAGE_SIZE) >> PAGE_SHIFT;
10604
10605         if (first_ppage >= next_ppage)
10606                 return 0;
10607         nr_pages = next_ppage - first_ppage;
10608
10609         first_ppage_reported = first_ppage;
10610         if (bsi->start == 0)
10611                 first_ppage_reported++;
10612         if (bsi->lowest_ppage > first_ppage_reported)
10613                 bsi->lowest_ppage = first_ppage_reported;
10614         if (bsi->highest_ppage < (next_ppage - 1))
10615                 bsi->highest_ppage = next_ppage - 1;
10616
10617         ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
10618         if (ret < 0)
10619                 return ret;
10620         bsi->nr_extents += ret;
10621         bsi->nr_pages += nr_pages;
10622         return 0;
10623 }
10624
10625 static void btrfs_swap_deactivate(struct file *file)
10626 {
10627         struct inode *inode = file_inode(file);
10628
10629         btrfs_free_swapfile_pins(inode);
10630         atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
10631 }
10632
/*
 * Swap activation for a btrfs file: check that @file can back swap and report
 * its physical extents through add_swap_extent().
 *
 * The file is rejected when the inode is flagged for compression or lacks
 * NODATACOW/NODATASUM, and when any extent is a hole, is inline, is
 * compressed, cannot be written without COW, lives in a chunk that has a
 * profile bit set, or maps to a different device than the first extent.
 * The device and every block group touched are recorded with
 * btrfs_add_swapfile_pin().
 *
 * On success fills in sis->max, sis->pages, sis->highest_bit, *@span (and
 * sis->bdev when a device was seen) and returns the accumulated extent count.
 * Failures detected during the extent walk go through btrfs_swap_deactivate()
 * before the error is returned; the earlier checks return directly.
 */
10633 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10634                                sector_t *span)
10635 {
10636         struct inode *inode = file_inode(file);
10637         struct btrfs_root *root = BTRFS_I(inode)->root;
10638         struct btrfs_fs_info *fs_info = root->fs_info;
10639         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
10640         struct extent_state *cached_state = NULL;
10641         struct extent_map *em = NULL;
10642         struct btrfs_device *device = NULL;
10643         struct btrfs_swap_info bsi = {
10644                 .lowest_ppage = (sector_t)-1ULL,
10645         };
10646         int ret = 0;
10647         u64 isize;
10648         u64 start;
10649
10650         /*
10651          * If the swap file was just created, make sure delalloc is done. If the
10652          * file changes again after this, the user is doing something stupid and
10653          * we don't really care.
10654          */
10655         ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
10656         if (ret)
10657                 return ret;
10658
10659         /*
10660          * The inode is locked, so these flags won't change after we check them.
10661          */
10662         if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
10663                 btrfs_warn(fs_info, "swapfile must not be compressed");
10664                 return -EINVAL;
10665         }
10666         if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
10667                 btrfs_warn(fs_info, "swapfile must not be copy-on-write");
10668                 return -EINVAL;
10669         }
10670         if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
10671                 btrfs_warn(fs_info, "swapfile must not be checksummed");
10672                 return -EINVAL;
10673         }
10674
10675         /*
10676          * Balance or device remove/replace/resize can move stuff around from
10677          * under us. The exclop protection makes sure they aren't running/won't
10678          * run concurrently while we are mapping the swap extents, and
10679          * fs_info->swapfile_pins prevents them from running while the swap
10680          * file is active and moving the extents. Note that this also prevents
10681          * a concurrent device add which isn't actually necessary, but it's not
10682          * really worth the trouble to allow it.
10683          */
10684         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
10685                 btrfs_warn(fs_info,
10686            "cannot activate swapfile while exclusive operation is running");
10687                 return -EBUSY;
10688         }
10689
10690         /*
10691          * Prevent snapshot creation while we are activating the swap file.
10692          * We do not want to race with snapshot creation. If snapshot creation
10693          * already started before we bumped nr_swapfiles from 0 to 1 and
10694          * completes before the first write into the swap file after it is
10695          * activated, then that write would fall back to COW.
10696          */
10697         if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
10698                 btrfs_exclop_finish(fs_info);
10699                 btrfs_warn(fs_info,
10700            "cannot activate swapfile because snapshot creation is in progress");
10701                 return -EINVAL;
10702         }
10703         /*
10704          * Snapshots can create extents which require COW even if NODATACOW is
10705          * set. We use this counter to prevent snapshots. We must increment it
10706          * before walking the extents because we don't want a concurrent
10707          * snapshot to run after we've already checked the extents.
10708          */
10709         atomic_inc(&root->nr_swapfiles);
10710
              /* Only whole sectors below i_size are considered. */
10711         isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
10712
10713         lock_extent_bits(io_tree, 0, isize - 1, &cached_state);
10714         start = 0;
10715         while (start < isize) {
10716                 u64 logical_block_start, physical_block_start;
10717                 struct btrfs_block_group *bg;
10718                 u64 len = isize - start;
10719
10720                 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
10721                 if (IS_ERR(em)) {
10722                         ret = PTR_ERR(em);
10723                         goto out;
10724                 }
10725
10726                 if (em->block_start == EXTENT_MAP_HOLE) {
10727                         btrfs_warn(fs_info, "swapfile must not have holes");
10728                         ret = -EINVAL;
10729                         goto out;
10730                 }
10731                 if (em->block_start == EXTENT_MAP_INLINE) {
10732                         /*
10733                          * It's unlikely we'll ever actually find ourselves
10734                          * here, as a file small enough to fit inline won't be
10735                          * big enough to store more than the swap header, but in
10736                          * case something changes in the future, let's catch it
10737                          * here rather than later.
10738                          */
10739                         btrfs_warn(fs_info, "swapfile must not be inline");
10740                         ret = -EINVAL;
10741                         goto out;
10742                 }
10743                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
10744                         btrfs_warn(fs_info, "swapfile must not be compressed");
10745                         ret = -EINVAL;
10746                         goto out;
10747                 }
10748
                      /* Logical address of 'start' and how much of this extent is left. */
10749                 logical_block_start = em->block_start + (start - em->start);
10750                 len = min(len, em->len - (start - em->start));
10751                 free_extent_map(em);
10752                 em = NULL;
10753
                      /* Negative: error; positive: NOCOW is possible; zero: COW needed. */
10754                 ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true);
10755                 if (ret < 0) {
10756                         goto out;
10757                 } else if (ret) {
10758                         ret = 0;
10759                 } else {
10760                         btrfs_warn(fs_info,
10761                                    "swapfile must not be copy-on-write");
10762                         ret = -EINVAL;
10763                         goto out;
10764                 }
10765
                      /* From here on 'em' refers to the chunk mapping, not the file extent. */
10766                 em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
10767                 if (IS_ERR(em)) {
10768                         ret = PTR_ERR(em);
10769                         goto out;
10770                 }
10771
10772                 if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
10773                         btrfs_warn(fs_info,
10774                                    "swapfile must have single data profile");
10775                         ret = -EINVAL;
10776                         goto out;
10777                 }
10778
                      /* First extent: remember and pin its device; later ones must match. */
10779                 if (device == NULL) {
10780                         device = em->map_lookup->stripes[0].dev;
10781                         ret = btrfs_add_swapfile_pin(inode, device, false);
10782                         if (ret == 1)
10783                                 ret = 0;
10784                         else if (ret)
10785                                 goto out;
10786                 } else if (device != em->map_lookup->stripes[0].dev) {
10787                         btrfs_warn(fs_info, "swapfile must be on one device");
10788                         ret = -EINVAL;
10789                         goto out;
10790                 }
10791
10792                 physical_block_start = (em->map_lookup->stripes[0].physical +
10793                                         (logical_block_start - em->start));
10794                 len = min(len, em->len - (logical_block_start - em->start));
10795                 free_extent_map(em);
10796                 em = NULL;
10797
10798                 bg = btrfs_lookup_block_group(fs_info, logical_block_start);
10799                 if (!bg) {
10800                         btrfs_warn(fs_info,
10801                            "could not find block group containing swapfile");
10802                         ret = -EINVAL;
10803                         goto out;
10804                 }
10805
10806                 if (!btrfs_inc_block_group_swap_extents(bg)) {
10807                         btrfs_warn(fs_info,
10808                            "block group for swapfile at %llu is read-only%s",
10809                            bg->start,
10810                            atomic_read(&fs_info->scrubs_running) ?
10811                                        " (scrub running)" : "");
10812                         btrfs_put_block_group(bg);
10813                         ret = -EINVAL;
10814                         goto out;
10815                 }
10816
                      /*
                       * Any non-zero result means no new pin entry took over 'bg', so
                       * put it here; 1 (entry already present) is not an error.
                       */
10817                 ret = btrfs_add_swapfile_pin(inode, bg, true);
10818                 if (ret) {
10819                         btrfs_put_block_group(bg);
10820                         if (ret == 1)
10821                                 ret = 0;
10822                         else
10823                                 goto out;
10824                 }
10825
                      /*
                       * Grow the pending run when this piece is physically contiguous
                       * with it; otherwise report the pending run and start a new one.
                       */
10826                 if (bsi.block_len &&
10827                     bsi.block_start + bsi.block_len == physical_block_start) {
10828                         bsi.block_len += len;
10829                 } else {
10830                         if (bsi.block_len) {
10831                                 ret = btrfs_add_swap_extent(sis, &bsi);
10832                                 if (ret)
10833                                         goto out;
10834                         }
10835                         bsi.start = start;
10836                         bsi.block_start = physical_block_start;
10837                         bsi.block_len = len;
10838                 }
10839
10840                 start += len;
10841         }
10842
              /* Report the last pending run, if there is one. */
10843         if (bsi.block_len)
10844                 ret = btrfs_add_swap_extent(sis, &bsi);
10845
10846 out:
              /* 'em' may be NULL, an ERR_PTR or a live map depending on where we left. */
10847         if (!IS_ERR_OR_NULL(em))
10848                 free_extent_map(em);
10849
10850         unlock_extent_cached(io_tree, 0, isize - 1, &cached_state);
10851
10852         if (ret)
10853                 btrfs_swap_deactivate(file);
10854
10855         btrfs_drew_write_unlock(&root->snapshot_lock);
10856
10857         btrfs_exclop_finish(fs_info);
10858
10859         if (ret)
10860                 return ret;
10861
10862         if (device)
10863                 sis->bdev = device->bdev;
10864         *span = bsi.highest_ppage - bsi.lowest_ppage + 1;
10865         sis->max = bsi.nr_pages;
10866         sis->pages = bsi.nr_pages - 1;
10867         sis->highest_bit = bsi.nr_pages - 1;
10868         return bsi.nr_extents;
10869 }
10870 #else
/* Variant built when CONFIG_SWAP is not defined: intentionally does nothing. */
10871 static void btrfs_swap_deactivate(struct file *file)
10872 {
10873 }
10874
/*
 * Variant built when CONFIG_SWAP is not defined: activation is always refused
 * with -EOPNOTSUPP and none of the arguments are looked at.
 */
10875 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10876                                sector_t *span)
10877 {
10878         return -EOPNOTSUPP;
10879 }
10880 #endif
10881
10882 /*
10883  * Update the number of bytes used in the VFS' inode. When we replace extents in
10884  * a range (clone, dedupe, fallocate's zero range), we must update the number of
10885  * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
10886  * always get a correct value.
10887  */
10888 void btrfs_update_inode_bytes(struct btrfs_inode *inode,
10889                               const u64 add_bytes,
10890                               const u64 del_bytes)
10891 {
10892         if (add_bytes == del_bytes)
10893                 return;
10894
10895         spin_lock(&inode->lock);
10896         if (del_bytes > 0)
10897                 inode_sub_bytes(&inode->vfs_inode, del_bytes);
10898         if (add_bytes > 0)
10899                 inode_add_bytes(&inode->vfs_inode, add_bytes);
10900         spin_unlock(&inode->lock);
10901 }
10902
10903 static const struct inode_operations btrfs_dir_inode_operations = {
10904         .getattr        = btrfs_getattr,
10905         .lookup         = btrfs_lookup,
10906         .create         = btrfs_create,
10907         .unlink         = btrfs_unlink,
10908         .link           = btrfs_link,
10909         .mkdir          = btrfs_mkdir,
10910         .rmdir          = btrfs_rmdir,
10911         .rename         = btrfs_rename2,
10912         .symlink        = btrfs_symlink,
10913         .setattr        = btrfs_setattr,
10914         .mknod          = btrfs_mknod,
10915         .listxattr      = btrfs_listxattr,
10916         .permission     = btrfs_permission,
10917         .get_acl        = btrfs_get_acl,
10918         .set_acl        = btrfs_set_acl,
10919         .update_time    = btrfs_update_time,
10920         .tmpfile        = btrfs_tmpfile,
10921         .fileattr_get   = btrfs_fileattr_get,
10922         .fileattr_set   = btrfs_fileattr_set,
10923 };
10924
10925 static const struct file_operations btrfs_dir_file_operations = {
10926         .llseek         = generic_file_llseek,
10927         .read           = generic_read_dir,
10928         .iterate_shared = btrfs_real_readdir,
10929         .open           = btrfs_opendir,
10930         .unlocked_ioctl = btrfs_ioctl,
10931 #ifdef CONFIG_COMPAT
10932         .compat_ioctl   = btrfs_compat_ioctl,
10933 #endif
10934         .release        = btrfs_release_file,
10935         .fsync          = btrfs_sync_file,
10936 };
10937
10938 /*
10939  * btrfs doesn't support the bmap operation because swapfiles
10940  * use bmap to make a mapping of extents in the file.  They assume
10941  * these extents won't change over the life of the file and they
10942  * use the bmap result to do IO directly to the drive.
10943  *
10944  * the btrfs bmap call would return logical addresses that aren't
10945  * suitable for IO and they also will change frequently as COW
10946  * operations happen.  So, swapfile + btrfs == corruption.
10947  *
10948  * For now we're avoiding this by dropping bmap.
10949  */
10950 static const struct address_space_operations btrfs_aops = {
10951         .readpage       = btrfs_readpage,
10952         .writepage      = btrfs_writepage,
10953         .writepages     = btrfs_writepages,
10954         .readahead      = btrfs_readahead,
10955         .direct_IO      = noop_direct_IO,
10956         .invalidatepage = btrfs_invalidatepage,
10957         .releasepage    = btrfs_releasepage,
10958 #ifdef CONFIG_MIGRATION
10959         .migratepage    = btrfs_migratepage,
10960 #endif
10961         .set_page_dirty = btrfs_set_page_dirty,
10962         .error_remove_page = generic_error_remove_page,
10963         .swap_activate  = btrfs_swap_activate,
10964         .swap_deactivate = btrfs_swap_deactivate,
10965 };
10966
10967 static const struct inode_operations btrfs_file_inode_operations = {
10968         .getattr        = btrfs_getattr,
10969         .setattr        = btrfs_setattr,
10970         .listxattr      = btrfs_listxattr,
10971         .permission     = btrfs_permission,
10972         .fiemap         = btrfs_fiemap,
10973         .get_acl        = btrfs_get_acl,
10974         .set_acl        = btrfs_set_acl,
10975         .update_time    = btrfs_update_time,
10976         .fileattr_get   = btrfs_fileattr_get,
10977         .fileattr_set   = btrfs_fileattr_set,
10978 };
10979 static const struct inode_operations btrfs_special_inode_operations = {
10980         .getattr        = btrfs_getattr,
10981         .setattr        = btrfs_setattr,
10982         .permission     = btrfs_permission,
10983         .listxattr      = btrfs_listxattr,
10984         .get_acl        = btrfs_get_acl,
10985         .set_acl        = btrfs_set_acl,
10986         .update_time    = btrfs_update_time,
10987 };
10988 static const struct inode_operations btrfs_symlink_inode_operations = {
10989         .get_link       = page_get_link,
10990         .getattr        = btrfs_getattr,
10991         .setattr        = btrfs_setattr,
10992         .permission     = btrfs_permission,
10993         .listxattr      = btrfs_listxattr,
10994         .update_time    = btrfs_update_time,
10995 };
10996
10997 const struct dentry_operations btrfs_dentry_operations = {
10998         .d_delete       = btrfs_dentry_delete,
10999 };