Merge branch 'work.iov_iter' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
author Linus Torvalds <[email protected]>
Sat, 3 Jul 2021 18:30:04 +0000 (11:30 -0700)
committer Linus Torvalds <[email protected]>
Sat, 3 Jul 2021 18:30:04 +0000 (11:30 -0700)
Pull iov_iter updates from Al Viro:
 "iov_iter cleanups and fixes.

  There are followups, but this is what had sat in -next this cycle. IMO
  the macro forest in there became much thinner and easier to follow..."

* 'work.iov_iter' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (37 commits)
  csum_and_copy_to_pipe_iter(): leave handling of csum_state to caller
  clean up copy_mc_pipe_to_iter()
  pipe_zero(): we don't need no stinkin' kmap_atomic()...
  iov_iter: clean csum_and_copy_...() primitives up a bit
  copy_page_from_iter(): don't need kmap_atomic() for kvec/bvec cases
  copy_page_to_iter(): don't bother with kmap_atomic() for bvec/kvec cases
  iterate_xarray(): only of the first iteration we might get offset != 0
  pull handling of ->iov_offset into iterate_{iovec,bvec,xarray}
  iov_iter: make iterator callbacks use base and len instead of iovec
  iov_iter: make the amount already copied available to iterator callbacks
  iov_iter: get rid of separate bvec and xarray callbacks
  iov_iter: teach iterate_{bvec,xarray}() about possible short copies
  iterate_bvec(): expand bvec.h macro forest, massage a bit
  iov_iter: unify iterate_iovec and iterate_kvec
  iov_iter: massage iterate_iovec and iterate_kvec to logics similar to iterate_bvec
  iterate_and_advance(): get rid of magic in case when n is 0
  csum_and_copy_to_iter(): massage into form closer to csum_and_copy_from_iter()
  iov_iter: replace iov_iter_copy_from_user_atomic() with iterator-advancing variant
  [xarray] iov_iter_npages(): just use DIV_ROUND_UP()
  iov_iter_npages(): don't bother with iterate_all_kinds()
  ...

fs/btrfs/file.c
fs/iomap/buffered-io.c
mm/filemap.c
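
The three combined diffs below all follow the calling convention introduced by the "iterator-advancing variant" commits: copy_page_from_iter_atomic() advances the iov_iter as it copies, so a caller that decides not to accept the data calls iov_iter_revert() instead of doing a separate iov_iter_advance() afterwards. A minimal userspace sketch of that convention (the toy_* names are made up for illustration and are not the kernel API):

/*
 * Minimal userspace sketch, not kernel code: the copy helper advances the
 * iterator by however much it copied, and a caller that rejects the data
 * puts it back with a revert instead of advancing by hand afterwards.
 */
#include <stdio.h>
#include <string.h>

struct toy_iter {
	const char *buf;	/* stands in for the iovec/bvec machinery */
	size_t count;		/* bytes left in the iterator */
};

/* Copy up to @len bytes into @dst and advance the iterator, the way
 * copy_page_from_iter_atomic() now advances the iov_iter it is given. */
static size_t toy_copy_and_advance(char *dst, size_t len, struct toy_iter *i)
{
	size_t n = len < i->count ? len : i->count;

	memcpy(dst, i->buf, n);
	i->buf += n;
	i->count -= n;
	return n;
}

/* Undo part of an earlier advance, the way iov_iter_revert() does. */
static void toy_revert(struct toy_iter *i, size_t n)
{
	i->buf -= n;
	i->count += n;
}

int main(void)
{
	char page[32];
	struct toy_iter it = { .buf = "hello, iov_iter", .count = 15 };
	size_t want = sizeof(page);
	size_t copied = toy_copy_and_advance(page, want, &it);

	if (copied < want) {
		/* Mirrors the btrfs hunk below: the caller decides not to
		 * accept a short copy, so it reverts rather than keeping it. */
		toy_revert(&it, copied);
		copied = 0;
	}
	printf("accepted %zu bytes, %zu left in the iterator\n",
	       copied, it.count);
	return 0;
}

All three file diffs use this shape: copy (which advances the iterator), then revert whatever the consumer did not accept, with no separate advance step.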

diff --combined fs/btrfs/file.c
index 28a05ba47060e66544eff306834d92534331a2c1,78cb8f9eaa6b42710214b64065fc4f22302690c5..ee34497500e169415efdb47817c5af65260aebfd
@@@ -28,7 -28,6 +28,7 @@@
  #include "compression.h"
  #include "delalloc-space.h"
  #include "reflink.h"
 +#include "subpage.h"
  
  static struct kmem_cache *btrfs_inode_defrag_cachep;
  /*
@@@ -399,7 -398,7 +399,7 @@@ static noinline int btrfs_copy_from_use
                /*
                 * Copy data from userspace to the current page
                 */
-               copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
+               copied = copy_page_from_iter_atomic(page, offset, count, i);
  
                /* Flush processor's dcache for this page */
                flush_dcache_page(page);
                 * The rest of the btrfs_file_write code will fall
                 * back to page at a time copies after we return 0.
                 */
-               if (!PageUptodate(page) && copied < count)
-                       copied = 0;
+               if (unlikely(copied < count)) {
+                       if (!PageUptodate(page)) {
+                               iov_iter_revert(i, copied);
+                               copied = 0;
+                       }
+                       if (!copied)
+                               break;
+               }
  
-               iov_iter_advance(i, copied);
                write_bytes -= copied;
                total_copied += copied;
-               /* Return to btrfs_file_write_iter to fault page */
-               if (unlikely(copied == 0))
-                       break;
-               if (copied < PAGE_SIZE - offset) {
-                       offset += copied;
-               } else {
+               offset += copied;
+               if (offset == PAGE_SIZE) {
                        pg++;
                        offset = 0;
                }
@@@ -483,7 -481,6 +482,7 @@@ int btrfs_dirty_pages(struct btrfs_inod
        start_pos = round_down(pos, fs_info->sectorsize);
        num_bytes = round_up(write_bytes + pos - start_pos,
                             fs_info->sectorsize);
 +      ASSERT(num_bytes <= U32_MAX);
  
        end_of_last_block = start_pos + num_bytes - 1;
  
  
        for (i = 0; i < num_pages; i++) {
                struct page *p = pages[i];
 -              SetPageUptodate(p);
 +
 +              btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
                ClearPageChecked(p);
 -              set_page_dirty(p);
 +              btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
        }
  
        /*
@@@ -1097,7 -1093,7 +1096,7 @@@ int btrfs_mark_extent_written(struct bt
        int del_nr = 0;
        int del_slot = 0;
        int recow;
 -      int ret;
 +      int ret = 0;
        u64 ino = btrfs_ino(inode);
  
        path = btrfs_alloc_path();
@@@ -1318,7 -1314,7 +1317,7 @@@ again
        }
  out:
        btrfs_free_path(path);
 -      return 0;
 +      return ret;
  }
  
  /*
@@@ -2070,30 -2066,6 +2069,30 @@@ static int start_ordered_ops(struct ino
        return ret;
  }
  
 +static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
 +{
 +      struct btrfs_inode *inode = BTRFS_I(ctx->inode);
 +      struct btrfs_fs_info *fs_info = inode->root->fs_info;
 +
 +      if (btrfs_inode_in_log(inode, fs_info->generation) &&
 +          list_empty(&ctx->ordered_extents))
 +              return true;
 +
 +      /*
 +       * If we are doing a fast fsync we can not bail out if the inode's
 +       * last_trans is <= then the last committed transaction, because we only
 +       * update the last_trans of the inode during ordered extent completion,
 +       * and for a fast fsync we don't wait for that, we only wait for the
 +       * writeback to complete.
 +       */
 +      if (inode->last_trans <= fs_info->last_trans_committed &&
 +          (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
 +           list_empty(&ctx->ordered_extents)))
 +              return true;
 +
 +      return false;
 +}
 +
  /*
   * fsync call for both files and directories.  This logs the inode into
   * the tree log instead of forcing full commits whenever possible.
@@@ -2212,8 -2184,17 +2211,8 @@@ int btrfs_sync_file(struct file *file, 
  
        atomic_inc(&root->log_batch);
  
 -      /*
 -       * If we are doing a fast fsync we can not bail out if the inode's
 -       * last_trans is <= then the last committed transaction, because we only
 -       * update the last_trans of the inode during ordered extent completion,
 -       * and for a fast fsync we don't wait for that, we only wait for the
 -       * writeback to complete.
 -       */
        smp_mb();
 -      if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) ||
 -          (BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed &&
 -           (full_sync || list_empty(&ctx.ordered_extents)))) {
 +      if (skip_inode_logging(&ctx)) {
                /*
                 * We've had everything committed since the last time we were
                 * modified so clear this flag in case it was set for whatever
@@@ -2486,17 -2467,6 +2485,17 @@@ static int btrfs_punch_hole_lock_range(
                                       const u64 lockend,
                                       struct extent_state **cached_state)
  {
 +      /*
 +       * For subpage case, if the range is not at page boundary, we could
 +       * have pages at the leading/tailing part of the range.
 +       * This could lead to dead loop since filemap_range_has_page()
 +       * will always return true.
 +       * So here we need to do extra page alignment for
 +       * filemap_range_has_page().
 +       */
 +      const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
 +      const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
 +
        while (1) {
                struct btrfs_ordered_extent *ordered;
                int ret;
                    (ordered->file_offset + ordered->num_bytes <= lockstart ||
                     ordered->file_offset > lockend)) &&
                     !filemap_range_has_page(inode->i_mapping,
 -                                           lockstart, lockend)) {
 +                                           page_lockstart, page_lockend)) {
                        if (ordered)
                                btrfs_put_ordered_extent(ordered);
                        break;
@@@ -3048,20 -3018,22 +3047,20 @@@ struct falloc_range 
   */
  static int add_falloc_range(struct list_head *head, u64 start, u64 len)
  {
 -      struct falloc_range *prev = NULL;
        struct falloc_range *range = NULL;
  
 -      if (list_empty(head))
 -              goto insert;
 -
 -      /*
 -       * As fallocate iterate by bytenr order, we only need to check
 -       * the last range.
 -       */
 -      prev = list_entry(head->prev, struct falloc_range, list);
 -      if (prev->start + prev->len == start) {
 -              prev->len += len;
 -              return 0;
 +      if (!list_empty(head)) {
 +              /*
 +               * As fallocate iterates by bytenr order, we only need to check
 +               * the last range.
 +               */
 +              range = list_last_entry(head, struct falloc_range, list);
 +              if (range->start + range->len == start) {
 +                      range->len += len;
 +                      return 0;
 +              }
        }
 -insert:
 +
        range = kmalloc(sizeof(*range), GFP_KERNEL);
        if (!range)
                return -ENOMEM;
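
The add_falloc_range() change above replaces the prev/goto-insert pattern with a check of the list tail only; that is enough because fallocate iterates the file in increasing bytenr order, so only the most recently added range can be contiguous with a new one. A stand-alone sketch of that coalescing idea (plain pointers and made-up names rather than the kernel's list_head API; error handling and freeing are omitted):

/*
 * Userspace illustration of last-entry coalescing: ranges arrive in
 * increasing offset order, so only the tail can be extended.
 */
#include <stdio.h>
#include <stdlib.h>

struct range {
	unsigned long long start;
	unsigned long long len;
	struct range *next;
};

/* Append @start/@len, merging it into the tail when contiguous. */
static int add_range(struct range **headp, struct range **tailp,
		     unsigned long long start, unsigned long long len)
{
	struct range *r;

	if (*tailp && (*tailp)->start + (*tailp)->len == start) {
		(*tailp)->len += len;		/* contiguous: extend the tail */
		return 0;
	}
	r = malloc(sizeof(*r));
	if (!r)
		return -1;
	r->start = start;
	r->len = len;
	r->next = NULL;
	if (*tailp)
		(*tailp)->next = r;
	else
		*headp = r;
	*tailp = r;
	return 0;
}

int main(void)
{
	struct range *head = NULL, *tail = NULL, *r;

	add_range(&head, &tail, 0, 4096);
	add_range(&head, &tail, 4096, 4096);	/* merges into the first range */
	add_range(&head, &tail, 16384, 4096);	/* gap: becomes a new entry */

	for (r = head; r; r = r->next)
		printf("range [%llu, %llu)\n", r->start, r->start + r->len);
	return 0;
}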
diff --combined fs/iomap/buffered-io.c
index 0065781935c7e30966e11b12fde49d5819ce01be,c5ff13e0e7cfbb10382e15450462fd27c6f7cb3b..41da4f14c00bb515b67acc28fbb0acef3662a9a6
@@@ -394,7 -394,7 +394,7 @@@ void iomap_readahead(struct readahead_c
  {
        struct inode *inode = rac->mapping->host;
        loff_t pos = readahead_pos(rac);
 -      loff_t length = readahead_length(rac);
 +      size_t length = readahead_length(rac);
        struct iomap_readpage_ctx ctx = {
                .rac    = rac,
        };
        trace_iomap_readahead(inode, readahead_count(rac));
  
        while (length > 0) {
 -              loff_t ret = iomap_apply(inode, pos, length, 0, ops,
 +              ssize_t ret = iomap_apply(inode, pos, length, 0, ops,
                                &ctx, iomap_readahead_actor);
                if (ret <= 0) {
                        WARN_ON_ONCE(ret == 0);
@@@ -640,6 -640,31 +640,6 @@@ out_no_page
        return status;
  }
  
 -int
 -iomap_set_page_dirty(struct page *page)
 -{
 -      struct address_space *mapping = page_mapping(page);
 -      int newly_dirty;
 -
 -      if (unlikely(!mapping))
 -              return !TestSetPageDirty(page);
 -
 -      /*
 -       * Lock out page's memcg migration to keep PageDirty
 -       * synchronized with per-memcg dirty page counters.
 -       */
 -      lock_page_memcg(page);
 -      newly_dirty = !TestSetPageDirty(page);
 -      if (newly_dirty)
 -              __set_page_dirty(page, mapping, 0);
 -      unlock_page_memcg(page);
 -
 -      if (newly_dirty)
 -              __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 -      return newly_dirty;
 -}
 -EXPORT_SYMBOL_GPL(iomap_set_page_dirty);
 -
  static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
                size_t copied, struct page *page)
  {
        if (unlikely(copied < len && !PageUptodate(page)))
                return 0;
        iomap_set_range_uptodate(page, offset_in_page(pos), len);
 -      iomap_set_page_dirty(page);
 +      __set_page_dirty_nobuffers(page);
        return copied;
  }
  
@@@ -746,10 -771,6 +746,6 @@@ again
                 * Otherwise there's a nasty deadlock on copying from the
                 * same page as we're writing to, without it being marked
                 * up-to-date.
-                *
-                * Not only is this an optimisation, but it is also required
-                * to check that the address is actually valid, when atomic
-                * usercopies are used, below.
                 */
                if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
                        status = -EFAULT;
                if (mapping_writably_mapped(inode->i_mapping))
                        flush_dcache_page(page);
  
-               copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+               copied = copy_page_from_iter_atomic(page, offset, bytes, i);
  
-               copied = iomap_write_end(inode, pos, bytes, copied, page, iomap,
+               status = iomap_write_end(inode, pos, bytes, copied, page, iomap,
                                srcmap);
  
-               cond_resched();
+               if (unlikely(copied != status))
+                       iov_iter_revert(i, copied - status);
  
-               iov_iter_advance(i, copied);
-               if (unlikely(copied == 0)) {
+               cond_resched();
+               if (unlikely(status == 0)) {
                        /*
-                        * If we were unable to copy any data at all, we must
-                        * fall back to a single segment length write.
-                        *
-                        * If we didn't fallback here, we could livelock
-                        * because not all segments in the iov can be copied at
-                        * once without a pagefault.
+                        * A short copy made iomap_write_end() reject the
+                        * thing entirely.  Might be memory poisoning
+                        * halfway through, might be a race with munmap,
+                        * might be severe memory pressure.
                         */
-                       bytes = min_t(unsigned long, PAGE_SIZE - offset,
-                                               iov_iter_single_seg_count(i));
+                       if (copied)
+                               bytes = copied;
                        goto again;
                }
-               pos += copied;
-               written += copied;
-               length -= copied;
+               pos += status;
+               written += status;
+               length -= status;
  
                balance_dirty_pages_ratelimited(inode->i_mapping);
        } while (iov_iter_count(i) && length);
diff --combined mm/filemap.c
index ac82a93d4f38c5d6f134284a72230583738453ab,cf9de790f49391264d35aa90a1db9e67f885bddc..d1458ecf2f51ec2100eae25e3c53801e96acc0e5
@@@ -872,7 -872,7 +872,7 @@@ noinline int __add_to_page_cache_locked
        page->index = offset;
  
        if (!huge) {
 -              error = mem_cgroup_charge(page, current->mm, gfp);
 +              error = mem_cgroup_charge(page, NULL, gfp);
                if (error)
                        goto error;
                charged = true;
@@@ -3642,10 -3642,6 +3642,6 @@@ again
                 * Otherwise there's a nasty deadlock on copying from the
                 * same page as we're writing to, without it being marked
                 * up-to-date.
-                *
-                * Not only is this an optimisation, but it is also required
-                * to check that the address is actually valid, when atomic
-                * usercopies are used, below.
                 */
                if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
                        status = -EFAULT;
                if (mapping_writably_mapped(mapping))
                        flush_dcache_page(page);
  
-               copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+               copied = copy_page_from_iter_atomic(page, offset, bytes, i);
                flush_dcache_page(page);
  
                status = a_ops->write_end(file, mapping, pos, bytes, copied,
                                                page, fsdata);
-               if (unlikely(status < 0))
-                       break;
-               copied = status;
+               if (unlikely(status != copied)) {
+                       iov_iter_revert(i, copied - max(status, 0L));
+                       if (unlikely(status < 0))
+                               break;
+               }
                cond_resched();
  
-               iov_iter_advance(i, copied);
-               if (unlikely(copied == 0)) {
+               if (unlikely(status == 0)) {
                        /*
-                        * If we were unable to copy any data at all, we must
-                        * fall back to a single segment length write.
-                        *
-                        * If we didn't fallback here, we could livelock
-                        * because not all segments in the iov can be copied at
-                        * once without a pagefault.
+                        * A short copy made ->write_end() reject the
+                        * thing entirely.  Might be memory poisoning
+                        * halfway through, might be a race with munmap,
+                        * might be severe memory pressure.
                         */
-                       bytes = min_t(unsigned long, PAGE_SIZE - offset,
-                                               iov_iter_single_seg_count(i));
+                       if (copied)
+                               bytes = copied;
                        goto again;
                }
-               pos += copied;
-               written += copied;
+               pos += status;
+               written += status;
  
                balance_dirty_pages_ratelimited(mapping);
        } while (iov_iter_count(i));
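
The iomap loop earlier and the mm/filemap.c loop in this last diff handle a rejected short copy the same way: revert the bytes that iomap_write_end() or ->write_end() did not accept, and when nothing was accepted retry with exactly the amount that did copy instead of the old iov_iter_single_seg_count() fallback. A self-contained toy model of that retry logic (all toy_* names are invented; the real loops also fault the user pages in up front and bail out if that keeps failing, which this sketch omits):

/*
 * Toy model of revert-and-retry after a rejected short copy.
 * Not kernel code; every name here is made up for illustration.
 */
#include <stdio.h>
#include <string.h>

struct toy_iter { const char *buf; size_t count; };

static size_t toy_copy_and_advance(char *dst, size_t len, struct toy_iter *i)
{
	size_t n = len < i->count ? len : i->count;

	memcpy(dst, i->buf, n);
	i->buf += n;
	i->count -= n;
	return n;
}

static void toy_revert(struct toy_iter *i, size_t n)
{
	i->buf -= n;
	i->count += n;
}

/* Pretend ->write_end(): rejects the first short copy it sees. */
static size_t toy_write_end(size_t requested, size_t copied)
{
	static int rejected_once;

	if (copied < requested && !rejected_once) {
		rejected_once = 1;
		return 0;
	}
	return copied;
}

int main(void)
{
	char page[16];
	struct toy_iter it = { .buf = "short-copy retry demo", .count = 21 };
	size_t written = 0;

	while (it.count) {
		size_t bytes = sizeof(page);
		size_t copied, status;
again:
		copied = toy_copy_and_advance(page, bytes, &it);
		status = toy_write_end(bytes, copied);
		if (status != copied)
			toy_revert(&it, copied - status);	/* give back what was not accepted */
		if (status == 0) {
			if (copied)
				bytes = copied;	/* retry just the part known to copy */
			goto again;
		}
		written += status;
	}
	printf("wrote %zu bytes\n", written);
	return 0;
}

The made-up toy_write_end() rejects the first short copy it sees, so running the program exercises the revert-and-retry path exactly once before the loop completes.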