fs/btrfs/direct-io.c

   1 // SPDX-License-Identifier: GPL-2.0
   2
   3 #include <linux/fsverity.h>
   4 #include <linux/iomap.h>
   5 #include "ctree.h"
   6 #include "delalloc-space.h"
   7 #include "direct-io.h"
   8 #include "extent-tree.h"
   9 #include "file.h"
  10 #include "fs.h"
  11 #include "transaction.h"
  12 #include "volumes.h"
  13
  14 struct btrfs_dio_data {
             /*
              * Per-mapping state of a direct I/O request.  It is reached through
              * the ->private pointer of the iomap_iter and is zeroed by
              * btrfs_dio_iomap_begin() before each mapping is set up.
              */

             /*
              * Bytes submitted for the current mapping; btrfs_dio_iomap_end()
              * compares it with the mapped length to find the unsubmitted tail.
              * NOTE(review): it is not updated in the code visible here,
              * presumably the bio submission path does that - confirm.
              */
  15         ssize_t submitted;
             /*
              * Changeset handed to btrfs_check_data_free_space() and
              * btrfs_free_reserved_data_space(), released with
              * extent_changeset_free().
              */
  16         struct extent_changeset *data_reserved;
             /*
              * Ordered extent created by btrfs_create_dio_extent() for a write;
              * btrfs_dio_iomap_end() drops the reference and resets this to NULL.
              */
  17         struct btrfs_ordered_extent *ordered;
             /* Set when btrfs_check_data_free_space() succeeded in iomap_begin. */
  18         bool data_space_reserved;
             /*
              * Set by btrfs_get_blocks_direct_write() when the write reuses an
              * existing extent; iomap_begin then returns the whole data
              * reservation.
              */
  19         bool nocow_done;
  20 };
  21
  22 struct btrfs_dio_private {
             /*
              * Per-bio direct I/O context.  btrfs_dio_end_io() recovers it from
              * the embedded btrfs_bio with container_of().
              */

  23         /* Range of I/O */
  24         u64 file_offset;
  25         u32 bytes;
  26
             /*
              * NOTE(review): the reason is not visible here; presumably the
              * embedded bio has to be the tail of the allocation made from the
              * bio_set - confirm.
              */
  27         /* This must be last */
  28         struct btrfs_bio bbio;
  29 };
  30
  31 static struct bio_set btrfs_dio_bioset;
     /*
      * NOTE(review): btrfs_dio_bioset is not referenced by any function visible in
      * this excerpt.  Presumably it is the bio_set that the bios embedded in
      * struct btrfs_dio_private are allocated from - confirm against the
      * submission and init code.
      */
  32
  33 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
  34                               struct extent_state **cached_state,
  35                               unsigned int iomap_flags)
  36 {
             /*
              * Lock the file range [lockstart, lockend] for direct I/O: first the
              * DIO extent lock, then the regular extent lock, retrying until no
              * ordered extent overlaps the range and, for writes, no page cache
              * page is present in it.
              *
              * Only IOMAP_WRITE and IOMAP_NOWAIT are looked at in @iomap_flags.
              *
              * Returns 0 with both locks held.  Returns -EAGAIN when a NOWAIT
              * request would have to block, or -ENOTBLK when the request can not
              * be served here; in both cases neither lock is held on return.
              */
  37         const bool writing = (iomap_flags & IOMAP_WRITE);
  38         const bool nowait = (iomap_flags & IOMAP_NOWAIT);
  39         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
  40         struct btrfs_ordered_extent *ordered;
  41         int ret = 0;
  42
  43         /* Direct lock must be taken before the extent lock. */
  44         if (nowait) {
  45                 if (!try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
  46                         return -EAGAIN;
  47         } else {
  48                 lock_dio_extent(io_tree, lockstart, lockend, cached_state);
  49         }
  50
  51         while (1) {
  52                 if (nowait) {
  53                         if (!try_lock_extent(io_tree, lockstart, lockend,
  54                                              cached_state)) {
  55                                 ret = -EAGAIN;
  56                                 break;
  57                         }
  58                 } else {
  59                         lock_extent(io_tree, lockstart, lockend, cached_state);
  60                 }
  61                 /*
  62                  * We're concerned with the entire range that we're going to be
  63                  * doing DIO to, so we need to make sure there's no ordered
  64                  * extents in this range.
  65                  */
  66                 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
  67                                                      lockend - lockstart + 1);
  68
  69                 /*
  70                  * We need to make sure there are no buffered pages in this
  71                  * range either, we could have raced between the invalidate in
  72                  * generic_file_direct_write and locking the extent.  The
  73                  * invalidate needs to happen so that reads after a write do not
  74                  * get stale data.
  75                  */
  76                 if (!ordered &&
  77                     (!writing || !filemap_range_has_page(inode->i_mapping,
  78                                                          lockstart, lockend)))
  79                         break;
  80
                     /* Drop only the extent lock; the DIO lock stays held while retrying. */
  81                 unlock_extent(io_tree, lockstart, lockend, cached_state);
  82
  83                 if (ordered) {
  84                         if (nowait) {
  85                                 btrfs_put_ordered_extent(ordered);
  86                                 ret = -EAGAIN;
  87                                 break;
  88                         }
  89                         /*
  90                          * If we are doing a DIO read and the ordered extent we
  91                          * found is for a buffered write, we can not wait for it
  92                          * to complete and retry, because if we do so we can
  93                          * deadlock with concurrent buffered writes on page
  94                          * locks. This happens only if our DIO read covers more
  95                          * than one extent map, if at this point it has already
  96                          * created an ordered extent for a previous extent map
  97                          * and locked its range in the inode's io tree, and a
  98                          * concurrent write against that previous extent map's
  99                          * range and this range started (we unlock the ranges
 100                          * in the io tree only when the bios complete and
 101                          * buffered writes always lock pages before attempting
 102                          * to lock range in the io tree).
                              *
                              * NOWAIT requests already bailed out above, so the
                              * fallback below always evaluates to -ENOTBLK.
 103                          */
 104                         if (writing ||
 105                             test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
 106                                 btrfs_start_ordered_extent(ordered);
 107                         else
 108                                 ret = nowait ? -EAGAIN : -ENOTBLK;
 109                         btrfs_put_ordered_extent(ordered);
 110                 } else {
 111                         /*
 112                          * We could trigger writeback for this range (and wait
 113                          * for it to complete) and then invalidate the pages for
 114                          * this range (through invalidate_inode_pages2_range()),
 115                          * but that can lead us to a deadlock with a concurrent
 116                          * call to readahead (a buffered read or a defrag call
 117                          * triggered a readahead) on a page lock due to an
 118                          * ordered dio extent we created before but did not have
 119                          * yet a corresponding bio submitted (whence it can not
 120                          * complete), which makes readahead wait for that
 121                          * ordered extent to complete while holding a lock on
 122                          * that page.
 123                          */
 124                         ret = nowait ? -EAGAIN : -ENOTBLK;
 125                 }
 126
 127                 if (ret)
 128                         break;
 129
 130                 cond_resched();
 131         }
 132
             /* Every failing exit from the loop has the extent lock released already. */
 133         if (ret)
 134                 unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
 135         return ret;
 136 }
 137
 138 static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
 139                                                   struct btrfs_dio_data *dio_data,
 140                                                   const u64 start,
 141                                                   const struct btrfs_file_extent *file_extent,
 142                                                   const int type)
     {
             /*
              * Set up the tracking structures for a direct write at @start: an
              * I/O extent map (skipped when @type is BTRFS_ORDERED_NOCOW) and an
              * ordered extent flagged with @type plus BTRFS_ORDERED_DIRECT.
              *
              * On success the ordered extent is stored in dio_data->ordered and
              * the extent map created here is returned (NULL for NOCOW).  On
              * failure an ERR_PTR is returned and an extent map created here is
              * released and dropped from the inode again.
              */
             const int oe_flags = (1 << type) | (1 << BTRFS_ORDERED_DIRECT);
             struct btrfs_ordered_extent *oe;
             struct extent_map *io_em = NULL;

             if (type != BTRFS_ORDERED_NOCOW) {
                     io_em = btrfs_create_io_em(inode, start, file_extent, type);
                     if (IS_ERR(io_em))
                             return io_em;
             }

             oe = btrfs_alloc_ordered_extent(inode, start, file_extent, oe_flags);
             if (!IS_ERR(oe)) {
                     /* Only one ordered extent may be pending per mapping. */
                     ASSERT(!dio_data->ordered);
                     dio_data->ordered = oe;
                     return io_em;
             }

             /* No ordered extent: undo the extent map we may have inserted. */
             if (io_em) {
                     free_extent_map(io_em);
                     btrfs_drop_extent_map_range(inode, start,
                                                 start + file_extent->num_bytes - 1,
                                                 false);
             }

             return ERR_CAST(oe);
     }
 171
 172 static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
 173                                                   struct btrfs_dio_data *dio_data,
 174                                                   u64 start, u64 len)
     {
             /*
              * Reserve a new, uncompressed data extent of @len bytes and create
              * the direct I/O extent map and ordered extent for it at file
              * offset @start.
              *
              * Returns the extent map from btrfs_create_dio_extent(), or an
              * ERR_PTR; on failure the reserved extent is given back.
              */
             struct btrfs_root *root = inode->root;
             struct btrfs_fs_info *fs_info = root->fs_info;
             const u64 hint = btrfs_get_extent_allocation_hint(inode, start, len);
             struct btrfs_file_extent file_extent;
             struct extent_map *new_em;
             struct btrfs_key ins;
             int ret;

             for (;;) {
                     ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
                                                0, hint, &ins, 1, 1);
                     if (ret != -EAGAIN)
                             break;

                     /*
                      * -EAGAIN is only expected on zoned filesystems: wait on the
                      * BTRFS_FS_NEED_ZONE_FINISH bit and try the reservation again.
                      */
                     ASSERT(btrfs_is_zoned(fs_info));
                     wait_on_bit_io(&fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
                                    TASK_UNINTERRUPTIBLE);
             }
             if (ret)
                     return ERR_PTR(ret);

             /* ins.objectid is the disk start, ins.offset the reserved length. */
             file_extent.compression = BTRFS_COMPRESS_NONE;
             file_extent.offset = 0;
             file_extent.disk_bytenr = ins.objectid;
             file_extent.disk_num_bytes = ins.offset;
             file_extent.num_bytes = ins.offset;
             file_extent.ram_bytes = ins.offset;

             new_em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent,
                                              BTRFS_ORDERED_REGULAR);
             btrfs_dec_block_group_reservations(fs_info, ins.objectid);
             if (!IS_ERR(new_em))
                     return new_em;

             btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
             return new_em;
     }
 212
 213 static int btrfs_get_blocks_direct_write(struct extent_map **map,
 214                                          struct inode *inode,
 215                                          struct btrfs_dio_data *dio_data,
 216                                          u64 start, u64 *lenp,
 217                                          unsigned int iomap_flags)
 218 {
             /*
              * Prepare the extent that backs a direct write of up to *lenp bytes
              * at file offset @start, either by reusing the existing extent
              * (NOCOW / PREALLOC) or by allocating a new one (COW).
              *
              * @map:   in: extent map covering @start, referenced by the caller.
              *         out, on success: the extent map to use for the write (the
              *         input map for a NOCOW write, a newly created one
              *         otherwise).  On error the input map has been released and
              *         *map is NULL (or an ERR_PTR when btrfs_new_extent_direct()
              *         failed); the caller must not free it.
              * @dio_data: ->data_space_reserved is consulted; ->ordered and
              *         ->nocow_done are filled in.
              * @lenp:  in: requested length; out: the possibly shortened length.
              * @iomap_flags: only IOMAP_NOWAIT is looked at.
              *
              * Returns 0 on success or a negative errno: -EAGAIN when a NOWAIT
              * request would need COW or its metadata reservation failed with
              * -ENOSPC/-EDQUOT, -ENOSPC when COW is needed but no data space was
              * reserved beforehand, or the error of the failing helper.
              */
 219         const bool nowait = (iomap_flags & IOMAP_NOWAIT);
 220         struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
 221         struct btrfs_file_extent file_extent;
 222         struct extent_map *em = *map;
 223         int type;
 224         u64 block_start;
 225         struct btrfs_block_group *bg;
 226         bool can_nocow = false;
 227         bool space_reserved = false;
 228         u64 len = *lenp;
 229         u64 prev_len;
 230         int ret = 0;
 231
 232         /*
 233          * We don't allocate a new extent in the following cases
 234          *
 235          * 1) The inode is marked as NODATACOW. In this case we'll just use the
 236          * existing extent.
 237          * 2) The extent is marked as PREALLOC. We're good to go here and can
 238          * just use the extent.
 239          *
 240          */
 241         if ((em->flags & EXTENT_FLAG_PREALLOC) ||
 242             ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
 243              em->disk_bytenr != EXTENT_MAP_HOLE)) {
 244                 if (em->flags & EXTENT_FLAG_PREALLOC)
 245                         type = BTRFS_ORDERED_PREALLOC;
 246                 else
 247                         type = BTRFS_ORDERED_NOCOW;
 248                 len = min(len, em->len - (start - em->start));
 249                 block_start = extent_map_block_start(em) + (start - em->start);
 250
                     /* @bg and @type are only valid when can_nocow ends up true. */
 251                 if (can_nocow_extent(inode, start, &len,
 252                                      &file_extent, false, false) == 1) {
 253                         bg = btrfs_inc_nocow_writers(fs_info, block_start);
 254                         if (bg)
 255                                 can_nocow = true;
 256                 }
 257         }
 258
 259         prev_len = len;
 260         if (can_nocow) {
 261                 struct extent_map *em2;
 262
 263                 /* We can NOCOW, so only need to reserve metadata space. */
 264                 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
 265                                                       nowait);
 266                 if (ret < 0) {
 267                         /* Our caller expects us to free the input extent map. */
 268                         free_extent_map(em);
 269                         *map = NULL;
 270                         btrfs_dec_nocow_writers(bg);
 271                         if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
 272                                 ret = -EAGAIN;
 273                         goto out;
 274                 }
 275                 space_reserved = true;
 276
 277                 em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,
 278                                               &file_extent, type);
 279                 btrfs_dec_nocow_writers(bg);
 280                 if (type == BTRFS_ORDERED_PREALLOC) {
 281                         free_extent_map(em);
 282                         *map = em2;
 283                         em = em2;
 284                 }
 285
 286                 if (IS_ERR(em2)) {
                             /*
                              * Our caller expects us to free the input extent map
                              * on error.  For PREALLOC that already happened above
                              * (and @em now holds the error pointer); for NOCOW
                              * the input map is still referenced, so drop it here
                              * instead of leaking it.
                              */
                             if (type != BTRFS_ORDERED_PREALLOC)
                                     free_extent_map(em);
                             *map = NULL;
 287                         ret = PTR_ERR(em2);
 288                         goto out;
 289                 }
 290
 291                 dio_data->nocow_done = true;
 292         } else {
 293                 /* Our caller expects us to free the input extent map. */
 294                 free_extent_map(em);
 295                 *map = NULL;
 296
 297                 if (nowait) {
 298                         ret = -EAGAIN;
 299                         goto out;
 300                 }
 301
 302                 /*
 303                  * If we could not allocate data space before locking the file
 304                  * range and we can't do a NOCOW write, then we have to fail.
 305                  */
 306                 if (!dio_data->data_space_reserved) {
 307                         ret = -ENOSPC;
 308                         goto out;
 309                 }
 310
 311                 /*
 312                  * We have to COW and we have already reserved data space before,
 313                  * so now we reserve only metadata.
 314                  */
 315                 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
 316                                                       false);
 317                 if (ret < 0)
 318                         goto out;
 319                 space_reserved = true;
 320
 321                 em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
 322                 if (IS_ERR(em)) {
 323                         ret = PTR_ERR(em);
 324                         goto out;
 325                 }
 326                 *map = em;
 327                 len = min(len, em->len - (start - em->start));
                     /* The new extent is shorter: return the surplus metadata. */
 328                 if (len < prev_len)
 329                         btrfs_delalloc_release_metadata(BTRFS_I(inode),
 330                                                         prev_len - len, true);
 331         }
 332
 333         /*
 334          * We have created our ordered extent, so we can now release our reservation
 335          * for an outstanding extent.
 336          */
 337         btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
 338
 339         /*
 340          * Need to update the i_size under the extent lock so buffered
 341          * readers will get the updated i_size when we unlock.
 342          */
 343         if (start + len > i_size_read(inode))
 344                 i_size_write(inode, start + len);
 345 out:
             /* On failure give back the metadata reservation taken above. */
 346         if (ret && space_reserved) {
 347                 btrfs_delalloc_release_extents(BTRFS_I(inode), len);
 348                 btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
 349         }
 350         *lenp = len;
 351         return ret;
 352 }
 353
 354 static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
 355                 loff_t length, unsigned int flags, struct iomap *iomap,
 356                 struct iomap *srcmap)
 357 {
             /*
              * Map the file range starting at @start (at most @length bytes) for a
              * direct I/O request and describe it in @iomap.
              *
              * The btrfs_dio_data of the request is taken from the iomap_iter that
              * embeds @iomap and is reset here.  @srcmap is not used.
              *
              * On success returns 0 with iomap->offset/length/addr/type/bdev set
              * and the extent lock on the range released.  For writes the DIO
              * lock is released as well; for reads it stays held on
              * [start, start + iomap->length) and is dropped for the remainder of
              * the locked range.
              *
              * On failure returns a negative errno (-EAGAIN for NOWAIT requests
              * that can not proceed, -ENOTBLK for inline or compressed extents,
              * or the error of a failing helper) with the range unlocked and any
              * data space reserved here released.
              */
 358         struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
 359         struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
 360         struct extent_map *em;
 361         struct extent_state *cached_state = NULL;
 362         struct btrfs_dio_data *dio_data = iter->private;
 363         u64 lockstart, lockend;
 364         const bool write = !!(flags & IOMAP_WRITE);
 365         int ret = 0;
 366         u64 len = length;
 367         const u64 data_alloc_len = length;
 368         u32 unlock_bits = EXTENT_LOCKED;
 369
 370         /*
 371          * We could potentially fault if we have a buffer > PAGE_SIZE, and if
 372          * we're NOWAIT we may submit a bio for a partial range and return
 373          * EIOCBQUEUED, which would result in an errant short read.
 374          *
 375          * The best way to handle this would be to allow for partial completions
 376          * of iocb's, so we could submit the partial bio, return and fault in
 377          * the rest of the pages, and then submit the io for the rest of the
 378          * range.  However we don't have that currently, so simply return
 379          * -EAGAIN at this point so that the normal path is used.
 380          */
 381         if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
 382                 return -EAGAIN;
 383
 384         /*
 385          * Cap the size of reads to that usually seen in buffered I/O as we need
 386          * to allocate a contiguous array for the checksums.
 387          */
 388         if (!write)
 389                 len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
 390
 391         lockstart = start;
 392         lockend = start + len - 1;
 393
 394         /*
 395          * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
 396          * enough if we've written compressed pages to this area, so we need to
 397          * flush the dirty pages again to make absolutely sure that any
 398          * outstanding dirty pages are on disk - the first flush only starts
 399          * compression on the data, while keeping the pages locked, so by the
 400          * time the second flush returns we know bios for the compressed pages
 401          * were submitted and finished, and the pages no longer under writeback.
 402          *
 403          * If we have a NOWAIT request and we have any pages in the range that
 404          * are locked, likely due to compression still in progress, we don't want
 405          * to block on page locks. We also don't want to block on pages marked as
 406          * dirty or under writeback (same as for the non-compression case).
 407          * iomap_dio_rw() did the same check, but after that and before we got
 408          * here, mmap'ed writes may have happened or buffered reads started
 409          * (readpage() and readahead(), which lock pages), as we haven't locked
 410          * the file range yet.
 411          */
 412         if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
 413                      &BTRFS_I(inode)->runtime_flags)) {
 414                 if (flags & IOMAP_NOWAIT) {
 415                         if (filemap_range_needs_writeback(inode->i_mapping,
 416                                                           lockstart, lockend))
 417                                 return -EAGAIN;
 418                 } else {
 419                         ret = filemap_fdatawrite_range(inode->i_mapping, start,
 420                                                        start + length - 1);
 421                         if (ret)
 422                                 return ret;
 423                 }
 424         }
 425
             /* Start from a clean per-mapping state; the error path below relies on it. */
 426         memset(dio_data, 0, sizeof(*dio_data));
 427
 428         /*
 429          * We always try to allocate data space and must do it before locking
 430          * the file range, to avoid deadlocks with concurrent writes to the same
 431          * range if the range has several extents and the writes don't expand the
 432          * current i_size (the inode lock is taken in shared mode). If we fail to
 433          * allocate data space here we continue and later, after locking the
 434          * file range, we fail with ENOSPC only if we figure out we can not do a
 435          * NOCOW write.
 436          */
 437         if (write && !(flags & IOMAP_NOWAIT)) {
 438                 ret = btrfs_check_data_free_space(BTRFS_I(inode),
 439                                                   &dio_data->data_reserved,
 440                                                   start, data_alloc_len, false);
 441                 if (!ret)
 442                         dio_data->data_space_reserved = true;
 443                 else if (ret && !(BTRFS_I(inode)->flags &
 444                                   (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
 445                         goto err;
 446         }
 447
 448         /*
 449          * If this errors out it's because we couldn't invalidate pagecache for
 450          * this range and we need to fallback to buffered IO, or we are doing a
 451          * NOWAIT read/write and we need to block.
 452          */
 453         ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
 454         if (ret < 0)
 455                 goto err;
 456
 457         em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
 458         if (IS_ERR(em)) {
 459                 ret = PTR_ERR(em);
 460                 goto unlock_err;
 461         }
 462
 463         /*
 464          * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
 465          * io.  INLINE is special, and we could probably kludge it in here, but
 466          * it's still buffered so for safety lets just fall back to the generic
 467          * buffered path.
 468          *
 469          * For COMPRESSED we _have_ to read the entire extent in so we can
 470          * decompress it, so there will be buffering required no matter what we
 471          * do, so go ahead and fallback to buffered.
 472          *
 473          * We return -ENOTBLK because that's what makes DIO go ahead and go back
 474          * to buffered IO.  Don't blame me, this is the price we pay for using
 475          * the generic code.
 476          */
 477         if (extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
 478                 free_extent_map(em);
 479                 /*
 480                  * If we are in a NOWAIT context, return -EAGAIN in order to
 481                  * fallback to buffered IO. This is not only because we can
 482                  * block with buffered IO (no support for NOWAIT semantics at
 483                  * the moment) but also to avoid returning short reads to user
 484                  * space - this happens if we were able to read some data from
 485                  * previous non-compressed extents and then when we fallback to
 486                  * buffered IO, at btrfs_file_read_iter() by calling
 487                  * filemap_read(), we fail to fault in pages for the read buffer,
 488                  * in which case filemap_read() returns a short read (the number
 489                  * of bytes previously read is > 0, so it does not return -EFAULT).
 490                  */
 491                 ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
 492                 goto unlock_err;
 493         }
 494
 495         len = min(len, em->len - (start - em->start));
 496
 497         /*
 498          * If we have a NOWAIT request and the range contains multiple extents
 499          * (or a mix of extents and holes), then we return -EAGAIN to make the
 500          * caller fallback to a context where it can do a blocking (without
 501          * NOWAIT) request. This way we avoid doing partial IO and returning
 502          * success to the caller, which is not optimal for writes and for reads
 503          * it can result in unexpected behaviour for an application.
 504          *
 505          * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
 506          * iomap_dio_rw(), we can end up returning less data than what the caller
 507          * asked for, resulting in an unexpected, and incorrect, short read.
 508          * That is, the caller asked to read N bytes and we return less than that,
 509          * which is wrong unless we are crossing EOF. This happens if we get a
 510          * page fault error when trying to fault in pages for the buffer that is
 511          * associated to the struct iov_iter passed to iomap_dio_rw(), and we
 512          * have previously submitted bios for other extents in the range, in
 513          * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
 514          * those bios have completed by the time we get the page fault error,
 515          * which we return back to our caller - we should only return EIOCBQUEUED
 516          * after we have submitted bios for all the extents in the range.
 517          */
 518         if ((flags & IOMAP_NOWAIT) && len < length) {
 519                 free_extent_map(em);
 520                 ret = -EAGAIN;
 521                 goto unlock_err;
 522         }
 523
 524         if (write) {
                     /* On failure the callee has dealt with @em; it is not freed here. */
 525                 ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
 526                                                     start, &len, flags);
 527                 if (ret < 0)
 528                         goto unlock_err;
 529                 /* Recalc len in case the new em is smaller than requested */
 530                 len = min(len, em->len - (start - em->start));
 531                 if (dio_data->data_space_reserved) {
 532                         u64 release_offset;
 533                         u64 release_len = 0;
 534
                             /*
                              * A NOCOW write needs none of the data space reserved
                              * up front; a COW write only keeps what the new
                              * extent covers.
                              */
 535                         if (dio_data->nocow_done) {
 536                                 release_offset = start;
 537                                 release_len = data_alloc_len;
 538                         } else if (len < data_alloc_len) {
 539                                 release_offset = start + len;
 540                                 release_len = data_alloc_len - len;
 541                         }
 542
 543                         if (release_len > 0)
 544                                 btrfs_free_reserved_data_space(BTRFS_I(inode),
 545                                                                dio_data->data_reserved,
 546                                                                release_offset,
 547                                                                release_len);
 548                 }
 549         }
 550
 551         /*
 552          * Translate extent map information to iomap.
 553          * We trim the extents (and move the addr) even though iomap code does
 554          * that, since we have locked only the parts we are performing I/O in.
 555          */
 556         if ((em->disk_bytenr == EXTENT_MAP_HOLE) ||
 557             ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
 558                 iomap->addr = IOMAP_NULL_ADDR;
 559                 iomap->type = IOMAP_HOLE;
 560         } else {
 561                 iomap->addr = extent_map_block_start(em) + (start - em->start);
 562                 iomap->type = IOMAP_MAPPED;
 563         }
 564         iomap->offset = start;
 565         iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
 566         iomap->length = len;
 567         free_extent_map(em);
 568
 569         /*
 570          * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed,
 571          * writes only hold it for this part.  We hold the extent lock until
 572          * we're completely done with the extent map to make sure it remains
 573          * valid.
 574          */
 575         if (write)
 576                 unlock_bits |= EXTENT_DIO_LOCKED;
 577
 578         clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
 579                          unlock_bits, &cached_state);
 580
 581         /* We didn't use everything, unlock the dio extent for the remainder. */
 582         if (!write && (start + len) < lockend)
 583                 unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
 584                                   lockend, NULL);
 585
 586         return 0;
 587
 588 unlock_err:
 589         /*
 590          * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget
 591          * to update this, be explicit that we expect EXTENT_LOCKED and
 592          * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing.
 593          */
 594         clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
 595                          EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
 596 err:
             /* Give back the whole data reservation taken before locking. */
 597         if (dio_data->data_space_reserved) {
 598                 btrfs_free_reserved_data_space(BTRFS_I(inode),
 599                                                dio_data->data_reserved,
 600                                                start, data_alloc_len);
 601                 extent_changeset_free(dio_data->data_reserved);
 602         }
 603
 604         return ret;
 605 }
 606
 607 static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
 608                 ssize_t written, unsigned int flags, struct iomap *iomap)
     {
             /*
              * Finish the mapping of [pos, pos + length) set up by
              * btrfs_dio_iomap_begin().
              *
              * A read from a hole only drops the DIO lock on the range.
              * Otherwise, when fewer than @length bytes were submitted, the
              * unsubmitted tail is cleaned up (the ordered extent range is
              * finished as failed for writes, the DIO lock is dropped for reads)
              * and -ENOTBLK is returned.  For writes the reference on the ordered
              * extent and the data reservation changeset are released in every
              * case.
              *
              * Returns 0, or -ENOTBLK for a partially submitted mapping.
              */
             struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
             struct btrfs_dio_data *dio_data = iter->private;
             struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
             const bool is_write = (flags & IOMAP_WRITE) != 0;
             const size_t done = dio_data->submitted;
             int ret = 0;

             /* If reading from a hole, unlock and return. */
             if (!is_write && iomap->type == IOMAP_HOLE) {
                     unlock_dio_extent(io_tree, pos, pos + length - 1, NULL);
                     return 0;
             }

             if (done < length) {
                     /* Only the part that was never submitted needs cleaning up. */
                     const loff_t tail_start = pos + done;
                     const loff_t tail_len = length - done;

                     if (!is_write)
                             unlock_dio_extent(io_tree, tail_start,
                                               tail_start + tail_len - 1, NULL);
                     else
                             btrfs_finish_ordered_extent(dio_data->ordered, NULL,
                                                         tail_start, tail_len,
                                                         false);
                     ret = -ENOTBLK;
             }

             if (is_write) {
                     btrfs_put_ordered_extent(dio_data->ordered);
                     dio_data->ordered = NULL;
                     extent_changeset_free(dio_data->data_reserved);
             }

             return ret;
     }
 643
 644 static void btrfs_dio_end_io(struct btrfs_bio *bbio)
     {
             /*
              * Completion handler for a direct I/O bio: warn on failure, finish
              * the ordered extent range (writes) or drop the DIO lock on the
              * range (reads), then restore the bio's private pointer and pass the
              * bio to iomap_dio_bio_end_io().
              */
             struct btrfs_dio_private *dip =
                     container_of(bbio, struct btrfs_dio_private, bbio);
             struct bio *bio = &bbio->bio;
             struct btrfs_inode *inode = bbio->inode;
             const u64 io_start = dip->file_offset;
             const u32 io_bytes = dip->bytes;

             if (bio->bi_status) {
                     btrfs_warn(inode->root->fs_info,
                     "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
                                btrfs_ino(inode), bio->bi_opf,
                                io_start, io_bytes, bio->bi_status);
             }

             if (btrfs_op(bio) != BTRFS_MAP_WRITE)
                     unlock_dio_extent(&inode->io_tree, io_start,
                                       io_start + io_bytes - 1, NULL);
             else
                     btrfs_finish_ordered_extent(bbio->ordered, NULL, io_start,
                                                 io_bytes, !bio->bi_status);

             /* Hand the bio back with the private pointer saved in the btrfs_bio. */
             bio->bi_private = bbio->private;
             iomap_dio_bio_end_io(bio);
     }
 670
 671 static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
 672                                         struct btrfs_ordered_extent *ordered)
 673 {
 674         u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
 675         u64 len = bbio->bio.bi_iter.bi_size;
 676         struct btrfs_ordered_extent *new;
 677         int ret;
 678
 679         /* Must always be called for the beginning of an ordered extent. */
 680         if (WARN_ON_ONCE(start != ordered->disk_bytenr))
 681                 return -EINVAL;
 682
 683         /* No need to split if the ordered extent covers the entire bio. */
 684         if (ordered->disk_num_bytes == len) {
 685                 refcount_inc(&ordered->refs);
 686                 bbio->ordered = ordered;
 687                 return 0;
 688         }
 689
 690         /*
 691          * Don't split the extent_map for NOCOW extents, as we're writing into
 692          * a pre-existing one.
 693          */
 694         if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
 695                 ret = split_extent_map(bbio->inode, bbio->file_offset,
 696                                        ordered->num_bytes, len,
 697                                        ordered->disk_bytenr);
 698                 if (ret)
 699                         return ret;
 700         }
 701
 702         new = btrfs_split_ordered_extent(ordered, len);
 703         if (IS_ERR(new))
 704                 return PTR_ERR(new);
 705         bbio->ordered = new;
 706         return 0;
 707 }
 708
 709 static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
 710                                 loff_t file_offset)
 711 {
 712         struct btrfs_bio *bbio = btrfs_bio(bio);
 713         struct btrfs_dio_private *dip =
 714                 container_of(bbio, struct btrfs_dio_private, bbio);
 715         struct btrfs_dio_data *dio_data = iter->private;
 716
 717         btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
 718                        btrfs_dio_end_io, bio->bi_private);
 719         bbio->inode = BTRFS_I(iter->inode);
 720         bbio->file_offset = file_offset;
 721
 722         dip->file_offset = file_offset;
 723         dip->bytes = bio->bi_iter.bi_size;
 724
 725         dio_data->submitted += bio->bi_iter.bi_size;
 726
 727         /*
 728          * Check if we are doing a partial write.  If we are, we need to split
 729          * the ordered extent to match the submitted bio.  Hang on to the
 730          * remaining unfinishable ordered_extent in dio_data so that it can be
 731          * cancelled in iomap_end to avoid a deadlock wherein faulting the
 732          * remaining pages is blocked on the outstanding ordered extent.
 733          */
 734         if (iter->flags & IOMAP_WRITE) {
 735                 int ret;
 736
 737                 ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
 738                 if (ret) {
 739                         btrfs_finish_ordered_extent(dio_data->ordered, NULL,
 740                                                     file_offset, dip->bytes,
 741                                                     !ret);
 742                         bio->bi_status = errno_to_blk_status(ret);
 743                         iomap_dio_bio_end_io(bio);
 744                         return;
 745                 }
 746         }
 747
 748         btrfs_submit_bbio(bbio, 0);
 749 }
 750
 751 static const struct iomap_ops btrfs_dio_iomap_ops = {
 752         .iomap_begin            = btrfs_dio_iomap_begin,
 753         .iomap_end              = btrfs_dio_iomap_end,
 754 };
 755
 756 static const struct iomap_dio_ops btrfs_dio_ops = {
 757         .submit_io              = btrfs_dio_submit_io,
 758         .bio_set                = &btrfs_dio_bioset,
 759 };
 760
 761 static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
 762                               size_t done_before)
 763 {
 764         struct btrfs_dio_data data = { 0 };
 765
 766         return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
 767                             IOMAP_DIO_PARTIAL, &data, done_before);
 768 }
 769
 770 static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
 771                                          size_t done_before)
 772 {
 773         struct btrfs_dio_data data = { 0 };
 774
 775         return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
 776                             IOMAP_DIO_PARTIAL, &data, done_before);
 777 }
 778
 779 static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
 780                                const struct iov_iter *iter, loff_t offset)
 781 {
 782         const u32 blocksize_mask = fs_info->sectorsize - 1;
 783
 784         if (offset & blocksize_mask)
 785                 return -EINVAL;
 786
 787         if (iov_iter_alignment(iter) & blocksize_mask)
 788                 return -EINVAL;
 789
 790         return 0;
 791 }
 792
 793 ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
 794 {
 795         struct file *file = iocb->ki_filp;
 796         struct inode *inode = file_inode(file);
 797         struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
 798         loff_t pos;
 799         ssize_t written = 0;
 800         ssize_t written_buffered;
 801         size_t prev_left = 0;
 802         loff_t endbyte;
 803         ssize_t ret;
 804         unsigned int ilock_flags = 0;
 805         struct iomap_dio *dio;
 806
 807         if (iocb->ki_flags & IOCB_NOWAIT)
 808                 ilock_flags |= BTRFS_ILOCK_TRY;
 809
 810         /*
 811          * If the write DIO is within EOF, use a shared lock and also only if
 812          * security bits will likely not be dropped by file_remove_privs() called
 813          * from btrfs_write_check(). Either will need to be rechecked after the
 814          * lock was acquired.
 815          */
 816         if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
 817                 ilock_flags |= BTRFS_ILOCK_SHARED;
 818
 819 relock:
 820         ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
 821         if (ret < 0)
 822                 return ret;
 823
 824         /* Shared lock cannot be used with security bits set. */
 825         if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
 826                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
 827                 ilock_flags &= ~BTRFS_ILOCK_SHARED;
 828                 goto relock;
 829         }
 830
 831         ret = generic_write_checks(iocb, from);
 832         if (ret <= 0) {
 833                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
 834                 return ret;
 835         }
 836
 837         ret = btrfs_write_check(iocb, from, ret);
 838         if (ret < 0) {
 839                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
 840                 goto out;
 841         }
 842
 843         pos = iocb->ki_pos;
 844         /*
 845          * Re-check since file size may have changed just before taking the
 846          * lock or pos may have changed because of O_APPEND in generic_write_check()
 847          */
 848         if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
 849             pos + iov_iter_count(from) > i_size_read(inode)) {
 850                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
 851                 ilock_flags &= ~BTRFS_ILOCK_SHARED;
 852                 goto relock;
 853         }
 854
 855         if (check_direct_IO(fs_info, from, pos)) {
 856                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
 857                 goto buffered;
 858         }
 859
 860         /*
 861          * The iov_iter can be mapped to the same file range we are writing to.
 862          * If that's the case, then we will deadlock in the iomap code, because
 863          * it first calls our callback btrfs_dio_iomap_begin(), which will create
 864          * an ordered extent, and after that it will fault in the pages that the
 865          * iov_iter refers to. During the fault in we end up in the readahead
 866          * pages code (starting at btrfs_readahead()), which will lock the range,
 867          * find that ordered extent and then wait for it to complete (at
 868          * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
 869          * obviously the ordered extent can never complete as we didn't submit
 870          * yet the respective bio(s). This always happens when the buffer is
 871          * memory mapped to the same file range, since the iomap DIO code always
 872          * invalidates pages in the target file range (after starting and waiting
 873          * for any writeback).
 874          *
 875          * So here we disable page faults in the iov_iter and then retry if we
 876          * got -EFAULT, faulting in the pages before the retry.
 877          */
 878 again:
 879         from->nofault = true;
 880         dio = btrfs_dio_write(iocb, from, written);
 881         from->nofault = false;
 882
 883         if (IS_ERR_OR_NULL(dio)) {
 884                 ret = PTR_ERR_OR_ZERO(dio);
 885         } else {
 886                 /*
 887                  * If we have a synchronous write, we must make sure the fsync
 888                  * triggered by the iomap_dio_complete() call below doesn't
 889                  * deadlock on the inode lock - we are already holding it and we
 890                  * can't call it after unlocking because we may need to complete
 891                  * partial writes due to the input buffer (or parts of it) not
 892                  * being already faulted in.
 893                  */
 894                 ASSERT(current->journal_info == NULL);
 895                 current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
 896                 ret = iomap_dio_complete(dio);
 897                 current->journal_info = NULL;
 898         }
 899
 900         /* No increment (+=) because iomap returns a cumulative value. */
 901         if (ret > 0)
 902                 written = ret;
 903
 904         if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
 905                 const size_t left = iov_iter_count(from);
 906                 /*
 907                  * We have more data left to write. Try to fault in as many as
 908                  * possible of the remainder pages and retry. We do this without
 909                  * releasing and locking again the inode, to prevent races with
 910                  * truncate.
 911                  *
 912                  * Also, in case the iov refers to pages in the file range of the
 913                  * file we want to write to (due to a mmap), we could enter an
 914                  * infinite loop if we retry after faulting the pages in, since
 915                  * iomap will invalidate any pages in the range early on, before
 916                  * it tries to fault in the pages of the iov. So we keep track of
 917                  * how much was left of iov in the previous EFAULT and fallback
 918                  * to buffered IO in case we haven't made any progress.
 919                  */
 920                 if (left == prev_left) {
 921                         ret = -ENOTBLK;
 922                 } else {
 923                         fault_in_iov_iter_readable(from, left);
 924                         prev_left = left;
 925                         goto again;
 926                 }
 927         }
 928
 929         btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
 930
 931         /*
 932          * If 'ret' is -ENOTBLK or we have not written all data, then it means
 933          * we must fallback to buffered IO.
 934          */
 935         if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
 936                 goto out;
 937
 938 buffered:
 939         /*
 940          * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
 941          * it must retry the operation in a context where blocking is acceptable,
 942          * because even if we end up not blocking during the buffered IO attempt
 943          * below, we will block when flushing and waiting for the IO.
 944          */
 945         if (iocb->ki_flags & IOCB_NOWAIT) {
 946                 ret = -EAGAIN;
 947                 goto out;
 948         }
 949
 950         pos = iocb->ki_pos;
 951         written_buffered = btrfs_buffered_write(iocb, from);
 952         if (written_buffered < 0) {
 953                 ret = written_buffered;
 954                 goto out;
 955         }
 956         /*
 957          * Ensure all data is persisted. We want the next direct IO read to be
 958          * able to read what was just written.
 959          */
 960         endbyte = pos + written_buffered - 1;
 961         ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
 962         if (ret)
 963                 goto out;
 964         ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
 965         if (ret)
 966                 goto out;
 967         written += written_buffered;
 968         iocb->ki_pos = pos + written_buffered;
 969         invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
 970                                  endbyte >> PAGE_SHIFT);
 971 out:
 972         return ret < 0 ? ret : written;
 973 }
 974
 975 static int check_direct_read(struct btrfs_fs_info *fs_info,
 976                              const struct iov_iter *iter, loff_t offset)
 977 {
 978         int ret;
 979         int i, seg;
 980
 981         ret = check_direct_IO(fs_info, iter, offset);
 982         if (ret < 0)
 983                 return ret;
 984
 985         if (!iter_is_iovec(iter))
 986                 return 0;
 987
 988         for (seg = 0; seg < iter->nr_segs; seg++) {
 989                 for (i = seg + 1; i < iter->nr_segs; i++) {
 990                         const struct iovec *iov1 = iter_iov(iter) + seg;
 991                         const struct iovec *iov2 = iter_iov(iter) + i;
 992
 993                         if (iov1->iov_base == iov2->iov_base)
 994                                 return -EINVAL;
 995                 }
 996         }
 997         return 0;
 998 }
 999
1000 ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
1001 {
1002         struct inode *inode = file_inode(iocb->ki_filp);
1003         size_t prev_left = 0;
1004         ssize_t read = 0;
1005         ssize_t ret;
1006
1007         if (fsverity_active(inode))
1008                 return 0;
1009
1010         if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
1011                 return 0;
1012
1013         btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
1014 again:
1015         /*
1016          * This is similar to what we do for direct IO writes, see the comment
1017          * at btrfs_direct_write(), but we also disable page faults in addition
1018          * to disabling them only at the iov_iter level. This is because when
1019          * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
1020          * which can still trigger page fault ins despite having set ->nofault
1021          * to true of our 'to' iov_iter.
1022          *
1023          * The difference to direct IO writes is that we deadlock when trying
1024          * to lock the extent range in the inode's tree during he page reads
1025          * triggered by the fault in (while for writes it is due to waiting for
1026          * our own ordered extent). This is because for direct IO reads,
1027          * btrfs_dio_iomap_begin() returns with the extent range locked, which
1028          * is only unlocked in the endio callback (end_bio_extent_readpage()).
1029          */
1030         pagefault_disable();
1031         to->nofault = true;
1032         ret = btrfs_dio_read(iocb, to, read);
1033         to->nofault = false;
1034         pagefault_enable();
1035
1036         /* No increment (+=) because iomap returns a cumulative value. */
1037         if (ret > 0)
1038                 read = ret;
1039
1040         if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
1041                 const size_t left = iov_iter_count(to);
1042
1043                 if (left == prev_left) {
1044                         /*
1045                          * We didn't make any progress since the last attempt,
1046                          * fallback to a buffered read for the remainder of the
1047                          * range. This is just to avoid any possibility of looping
1048                          * for too long.
1049                          */
1050                         ret = read;
1051                 } else {
1052                         /*
1053                          * We made some progress since the last retry or this is
1054                          * the first time we are retrying. Fault in as many pages
1055                          * as possible and retry.
1056                          */
1057                         fault_in_iov_iter_writeable(to, left);
1058                         prev_left = left;
1059                         goto again;
1060                 }
1061         }
1062         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
1063         return ret < 0 ? ret : read;
1064 }
1065
1066 int __init btrfs_init_dio(void)
1067 {
1068         if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
1069                         offsetof(struct btrfs_dio_private, bbio.bio),
1070                         BIOSET_NEED_BVECS))
1071                 return -ENOMEM;
1072
1073         return 0;
1074 }
1075
1076 void __cold btrfs_destroy_dio(void)
1077 {
1078         bioset_exit(&btrfs_dio_bioset);
1079 }