Merge tag 'vfs-5.10-merge-1' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

author Linus Torvalds <[email protected]>

Fri, 23 Oct 2020 18:33:41 +0000 (11:33 -0700)

committer Linus Torvalds <[email protected]>

Fri, 23 Oct 2020 18:33:41 +0000 (11:33 -0700)
author Linus Torvalds <[email protected]>
Fri, 23 Oct 2020 18:33:41 +0000 (11:33 -0700)
committer Linus Torvalds <[email protected]>
Fri, 23 Oct 2020 18:33:41 +0000 (11:33 -0700)
diff --combined fs/Makefile

index 7bb2a05fda1f18cf7f20ba960bbecd0a0d32e1c0,7173350739c5f6303662cbfa6731332a96c289e4..999d1a23f036c9f96a06e056d333e2e3832cdc37
--- 1/fs/Makefile
--- 2/fs/Makefile
+++ b/fs/Makefile
@@@ -14,7 -14,7 +14,7 @@@ obj-y :=      open.o read_write.o file_table
                 pnode.o splice.o sync.o utimes.o d_path.o \
                 stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
                 fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
-               kernel_read_file.o
- -              remap_range.o
++              kernel_read_file.o remap_range.o
   
   ifeq ($(CONFIG_BLOCK),y)
   obj-y +=      buffer.o block_dev.o direct-io.o mpage.o
@@@ -38,6 -38,7 +38,6 @@@ obj-$(CONFIG_FS_DAX)          += dax.
   obj-$(CONFIG_FS_ENCRYPTION)   += crypto/
   obj-$(CONFIG_FS_VERITY)               += verity/
   obj-$(CONFIG_FILE_LOCKING)      += locks.o
- -obj-$(CONFIG_COMPAT)          += compat.o
   obj-$(CONFIG_BINFMT_AOUT)     += binfmt_aout.o
   obj-$(CONFIG_BINFMT_EM86)     += binfmt_em86.o
   obj-$(CONFIG_BINFMT_MISC)     += binfmt_misc.o
diff --combined fs/read_write.c

index a669fb049b84451951bde0866d6ebb0014c98e7a,016444255d3e6f1170652757e54117e96e89d2c3..75f764b434184238a33f3c9bc688bfb395b83cb9
--- 1/fs/read_write.c
--- 2/fs/read_write.c
+++ b/fs/read_write.c
@@@ -419,42 -419,27 +419,42 @@@ static ssize_t new_sync_read(struct fil
         return ret;
   }
   
+ +static int warn_unsupported(struct file *file, const char *op)
+ +{
+ +      pr_warn_ratelimited(
+ +              "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
+ +              op, file, current->pid, current->comm);
+ +      return -EINVAL;
+ +}
+ +
   ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
   {
- -      mm_segment_t old_fs = get_fs();
+ +      struct kvec iov = {
+ +              .iov_base       = buf,
+ +              .iov_len        = min_t(size_t, count, MAX_RW_COUNT),
+ +      };
+ +      struct kiocb kiocb;
+ +      struct iov_iter iter;
         ssize_t ret;
   
         if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
                 return -EINVAL;
         if (!(file->f_mode & FMODE_CAN_READ))
                 return -EINVAL;
+ +      /*
+ +       * Also fail if ->read_iter and ->read are both wired up as that
+ +       * implies very convoluted semantics.
+ +       */
+ +      if (unlikely(!file->f_op->read_iter || file->f_op->read))
+ +              return warn_unsupported(file, "read");
   
- -      if (count > MAX_RW_COUNT)
- -              count =  MAX_RW_COUNT;
- -      set_fs(KERNEL_DS);
- -      if (file->f_op->read)
- -              ret = file->f_op->read(file, (void __user *)buf, count, pos);
- -      else if (file->f_op->read_iter)
- -              ret = new_sync_read(file, (void __user *)buf, count, pos);
- -      else
- -              ret = -EINVAL;
- -      set_fs(old_fs);
+ +      init_sync_kiocb(&kiocb, file);
+ +      kiocb.ki_pos = pos ? *pos : 0;
+ +      iov_iter_kvec(&iter, READ, &iov, 1, iov.iov_len);
+ +      ret = file->f_op->read_iter(&kiocb, &iter);
         if (ret > 0) {
+ +              if (pos)
+ +                      *pos = kiocb.ki_pos;
                 fsnotify_access(file);
                 add_rchar(current, ret);
         }
@@@ -525,32 -510,28 +525,32 @@@ static ssize_t new_sync_write(struct fi
   /* caller is responsible for file_start_write/file_end_write */
   ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
   {
- -      mm_segment_t old_fs;
- -      const char __user *p;
+ +      struct kvec iov = {
+ +              .iov_base       = (void *)buf,
+ +              .iov_len        = min_t(size_t, count, MAX_RW_COUNT),
+ +      };
+ +      struct kiocb kiocb;
+ +      struct iov_iter iter;
         ssize_t ret;
   
         if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
                 return -EBADF;
         if (!(file->f_mode & FMODE_CAN_WRITE))
                 return -EINVAL;
+ +      /*
+ +       * Also fail if ->write_iter and ->write are both wired up as that
+ +       * implies very convoluted semantics.
+ +       */
+ +      if (unlikely(!file->f_op->write_iter || file->f_op->write))
+ +              return warn_unsupported(file, "write");
   
- -      old_fs = get_fs();
- -      set_fs(KERNEL_DS);
- -      p = (__force const char __user *)buf;
- -      if (count > MAX_RW_COUNT)
- -              count =  MAX_RW_COUNT;
- -      if (file->f_op->write)
- -              ret = file->f_op->write(file, p, count, pos);
- -      else if (file->f_op->write_iter)
- -              ret = new_sync_write(file, p, count, pos);
- -      else
- -              ret = -EINVAL;
- -      set_fs(old_fs);
+ +      init_sync_kiocb(&kiocb, file);
+ +      kiocb.ki_pos = pos ? *pos : 0;
+ +      iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
+ +      ret = file->f_op->write_iter(&kiocb, &iter);
         if (ret > 0) {
+ +              if (pos)
+ +                      *pos = kiocb.ki_pos;
                 fsnotify_modify(file);
                 add_wchar(current, ret);
         }
@@@ -779,6 -760,185 +779,6 @@@ static ssize_t do_loop_readv_writev(str
         return ret;
   }
   
- -/**
- - * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
- - *     into the kernel and check that it is valid.
- - *
- - * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE.
- - * @uvector: Pointer to the userspace array.
- - * @nr_segs: Number of elements in userspace array.
- - * @fast_segs: Number of elements in @fast_pointer.
- - * @fast_pointer: Pointer to (usually small on-stack) kernel array.
- - * @ret_pointer: (output parameter) Pointer to a variable that will point to
- - *     either @fast_pointer, a newly allocated kernel array, or NULL,
- - *     depending on which array was used.
- - *
- - * This function copies an array of &struct iovec of @nr_segs from
- - * userspace into the kernel and checks that each element is valid (e.g.
- - * it does not point to a kernel address or cause overflow by being too
- - * large, etc.).
- - *
- - * As an optimization, the caller may provide a pointer to a small
- - * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long
- - * (the size of this array, or 0 if unused, should be given in @fast_segs).
- - *
- - * @ret_pointer will always point to the array that was used, so the
- - * caller must take care not to call kfree() on it e.g. in case the
- - * @fast_pointer array was used and it was allocated on the stack.
- - *
- - * Return: The total number of bytes covered by the iovec array on success
- - *   or a negative error code on error.
- - */
- -ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
- -                            unsigned long nr_segs, unsigned long fast_segs,
- -                            struct iovec *fast_pointer,
- -                            struct iovec **ret_pointer)
- -{
- -      unsigned long seg;
- -      ssize_t ret;
- -      struct iovec *iov = fast_pointer;
- -
- -      /*
- -       * SuS says "The readv() function *may* fail if the iovcnt argument
- -       * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
- -       * traditionally returned zero for zero segments, so...
- -       */
- -      if (nr_segs == 0) {
- -              ret = 0;
- -              goto out;
- -      }
- -
- -      /*
- -       * First get the "struct iovec" from user memory and
- -       * verify all the pointers
- -       */
- -      if (nr_segs > UIO_MAXIOV) {
- -              ret = -EINVAL;
- -              goto out;
- -      }
- -      if (nr_segs > fast_segs) {
- -              iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
- -              if (iov == NULL) {
- -                      ret = -ENOMEM;
- -                      goto out;
- -              }
- -      }
- -      if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
- -              ret = -EFAULT;
- -              goto out;
- -      }
- -
- -      /*
- -       * According to the Single Unix Specification we should return EINVAL
- -       * if an element length is < 0 when cast to ssize_t or if the
- -       * total length would overflow the ssize_t return value of the
- -       * system call.
- -       *
- -       * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
- -       * overflow case.
- -       */
- -      ret = 0;
- -      for (seg = 0; seg < nr_segs; seg++) {
- -              void __user *buf = iov[seg].iov_base;
- -              ssize_t len = (ssize_t)iov[seg].iov_len;
- -
- -              /* see if we we're about to use an invalid len or if
- -               * it's about to overflow ssize_t */
- -              if (len < 0) {
- -                      ret = -EINVAL;
- -                      goto out;
- -              }
- -              if (type >= 0
- -                  && unlikely(!access_ok(buf, len))) {
- -                      ret = -EFAULT;
- -                      goto out;
- -              }
- -              if (len > MAX_RW_COUNT - ret) {
- -                      len = MAX_RW_COUNT - ret;
- -                      iov[seg].iov_len = len;
- -              }
- -              ret += len;
- -      }
- -out:
- -      *ret_pointer = iov;
- -      return ret;
- -}
- -
- -#ifdef CONFIG_COMPAT
- -ssize_t compat_rw_copy_check_uvector(int type,
- -              const struct compat_iovec __user *uvector, unsigned long nr_segs,
- -              unsigned long fast_segs, struct iovec *fast_pointer,
- -              struct iovec **ret_pointer)
- -{
- -      compat_ssize_t tot_len;
- -      struct iovec *iov = *ret_pointer = fast_pointer;
- -      ssize_t ret = 0;
- -      int seg;
- -
- -      /*
- -       * SuS says "The readv() function *may* fail if the iovcnt argument
- -       * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
- -       * traditionally returned zero for zero segments, so...
- -       */
- -      if (nr_segs == 0)
- -              goto out;
- -
- -      ret = -EINVAL;
- -      if (nr_segs > UIO_MAXIOV)
- -              goto out;
- -      if (nr_segs > fast_segs) {
- -              ret = -ENOMEM;
- -              iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
- -              if (iov == NULL)
- -                      goto out;
- -      }
- -      *ret_pointer = iov;
- -
- -      ret = -EFAULT;
- -      if (!access_ok(uvector, nr_segs*sizeof(*uvector)))
- -              goto out;
- -
- -      /*
- -       * Single unix specification:
- -       * We should -EINVAL if an element length is not >= 0 and fitting an
- -       * ssize_t.
- -       *
- -       * In Linux, the total length is limited to MAX_RW_COUNT, there is
- -       * no overflow possibility.
- -       */
- -      tot_len = 0;
- -      ret = -EINVAL;
- -      for (seg = 0; seg < nr_segs; seg++) {
- -              compat_uptr_t buf;
- -              compat_ssize_t len;
- -
- -              if (__get_user(len, &uvector->iov_len) ||
- -                 __get_user(buf, &uvector->iov_base)) {
- -                      ret = -EFAULT;
- -                      goto out;
- -              }
- -              if (len < 0)    /* size_t not fitting in compat_ssize_t .. */
- -                      goto out;
- -              if (type >= 0 &&
- -                  !access_ok(compat_ptr(buf), len)) {
- -                      ret = -EFAULT;
- -                      goto out;
- -              }
- -              if (len > MAX_RW_COUNT - tot_len)
- -                      len = MAX_RW_COUNT - tot_len;
- -              tot_len += len;
- -              iov->iov_base = compat_ptr(buf);
- -              iov->iov_len = (compat_size_t) len;
- -              uvector++;
- -              iov++;
- -      }
- -      ret = tot_len;
- -
- -out:
- -      return ret;
- -}
- -#endif
- -
   static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
                 loff_t *pos, rwf_t flags)
   {
@@@ -908,7 -1068,7 +908,7 @@@ ssize_t vfs_iter_write(struct file *fil
   }
   EXPORT_SYMBOL(vfs_iter_write);
   
- -ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
+ +static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
                   unsigned long vlen, loff_t *pos, rwf_t flags)
   {
         struct iovec iovstack[UIO_FASTIOV];
@@@ -1095,93 -1255,224 +1095,93 @@@ SYSCALL_DEFINE6(pwritev2, unsigned long
         return do_pwritev(fd, vec, vlen, pos, flags);
   }
   
+ +/*
+ + * Various compat syscalls.  Note that they all pretend to take a native
+ + * iovec - import_iovec will properly treat those as compat_iovecs based on
+ + * in_compat_syscall().
+ + */
   #ifdef CONFIG_COMPAT
- -static size_t compat_readv(struct file *file,
- -                         const struct compat_iovec __user *vec,
- -                         unsigned long vlen, loff_t *pos, rwf_t flags)
- -{
- -      struct iovec iovstack[UIO_FASTIOV];
- -      struct iovec *iov = iovstack;
- -      struct iov_iter iter;
- -      ssize_t ret;
- -
- -      ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter);
- -      if (ret >= 0) {
- -              ret = do_iter_read(file, &iter, pos, flags);
- -              kfree(iov);
- -      }
- -      if (ret > 0)
- -              add_rchar(current, ret);
- -      inc_syscr(current);
- -      return ret;
- -}
- -
- -static size_t do_compat_readv(compat_ulong_t fd,
- -                               const struct compat_iovec __user *vec,
- -                               compat_ulong_t vlen, rwf_t flags)
- -{
- -      struct fd f = fdget_pos(fd);
- -      ssize_t ret;
- -      loff_t pos;
- -
- -      if (!f.file)
- -              return -EBADF;
- -      pos = f.file->f_pos;
- -      ret = compat_readv(f.file, vec, vlen, &pos, flags);
- -      if (ret >= 0)
- -              f.file->f_pos = pos;
- -      fdput_pos(f);
- -      return ret;
- -
- -}
- -
- -COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
- -              const struct compat_iovec __user *,vec,
- -              compat_ulong_t, vlen)
- -{
- -      return do_compat_readv(fd, vec, vlen, 0);
- -}
- -
- -static long do_compat_preadv64(unsigned long fd,
- -                                const struct compat_iovec __user *vec,
- -                                unsigned long vlen, loff_t pos, rwf_t flags)
- -{
- -      struct fd f;
- -      ssize_t ret;
- -
- -      if (pos < 0)
- -              return -EINVAL;
- -      f = fdget(fd);
- -      if (!f.file)
- -              return -EBADF;
- -      ret = -ESPIPE;
- -      if (f.file->f_mode & FMODE_PREAD)
- -              ret = compat_readv(f.file, vec, vlen, &pos, flags);
- -      fdput(f);
- -      return ret;
- -}
- -
   #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
   COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
- -              const struct compat_iovec __user *,vec,
+ +              const struct iovec __user *, vec,
                 unsigned long, vlen, loff_t, pos)
   {
- -      return do_compat_preadv64(fd, vec, vlen, pos, 0);
+ +      return do_preadv(fd, vec, vlen, pos, 0);
   }
   #endif
   
   COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
- -              const struct compat_iovec __user *,vec,
+ +              const struct iovec __user *, vec,
                 compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
   {
         loff_t pos = ((loff_t)pos_high << 32) | pos_low;
   
- -      return do_compat_preadv64(fd, vec, vlen, pos, 0);
+ +      return do_preadv(fd, vec, vlen, pos, 0);
   }
   
   #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
   COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
- -              const struct compat_iovec __user *,vec,
+ +              const struct iovec __user *, vec,
                 unsigned long, vlen, loff_t, pos, rwf_t, flags)
   {
         if (pos == -1)
- -              return do_compat_readv(fd, vec, vlen, flags);
- -
- -      return do_compat_preadv64(fd, vec, vlen, pos, flags);
+ +              return do_readv(fd, vec, vlen, flags);
+ +      return do_preadv(fd, vec, vlen, pos, flags);
   }
   #endif
   
   COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
- -              const struct compat_iovec __user *,vec,
+ +              const struct iovec __user *, vec,
                 compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
                 rwf_t, flags)
   {
         loff_t pos = ((loff_t)pos_high << 32) | pos_low;
   
         if (pos == -1)
- -              return do_compat_readv(fd, vec, vlen, flags);
- -
- -      return do_compat_preadv64(fd, vec, vlen, pos, flags);
- -}
- -
- -static size_t compat_writev(struct file *file,
- -                          const struct compat_iovec __user *vec,
- -                          unsigned long vlen, loff_t *pos, rwf_t flags)
- -{
- -      struct iovec iovstack[UIO_FASTIOV];
- -      struct iovec *iov = iovstack;
- -      struct iov_iter iter;
- -      ssize_t ret;
- -
- -      ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter);
- -      if (ret >= 0) {
- -              file_start_write(file);
- -              ret = do_iter_write(file, &iter, pos, flags);
- -              file_end_write(file);
- -              kfree(iov);
- -      }
- -      if (ret > 0)
- -              add_wchar(current, ret);
- -      inc_syscw(current);
- -      return ret;
- -}
- -
- -static size_t do_compat_writev(compat_ulong_t fd,
- -                                const struct compat_iovec __user* vec,
- -                                compat_ulong_t vlen, rwf_t flags)
- -{
- -      struct fd f = fdget_pos(fd);
- -      ssize_t ret;
- -      loff_t pos;
- -
- -      if (!f.file)
- -              return -EBADF;
- -      pos = f.file->f_pos;
- -      ret = compat_writev(f.file, vec, vlen, &pos, flags);
- -      if (ret >= 0)
- -              f.file->f_pos = pos;
- -      fdput_pos(f);
- -      return ret;
- -}
- -
- -COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
- -              const struct compat_iovec __user *, vec,
- -              compat_ulong_t, vlen)
- -{
- -      return do_compat_writev(fd, vec, vlen, 0);
- -}
- -
- -static long do_compat_pwritev64(unsigned long fd,
- -                                 const struct compat_iovec __user *vec,
- -                                 unsigned long vlen, loff_t pos, rwf_t flags)
- -{
- -      struct fd f;
- -      ssize_t ret;
- -
- -      if (pos < 0)
- -              return -EINVAL;
- -      f = fdget(fd);
- -      if (!f.file)
- -              return -EBADF;
- -      ret = -ESPIPE;
- -      if (f.file->f_mode & FMODE_PWRITE)
- -              ret = compat_writev(f.file, vec, vlen, &pos, flags);
- -      fdput(f);
- -      return ret;
+ +              return do_readv(fd, vec, vlen, flags);
+ +      return do_preadv(fd, vec, vlen, pos, flags);
   }
   
   #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
   COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
- -              const struct compat_iovec __user *,vec,
+ +              const struct iovec __user *, vec,
                 unsigned long, vlen, loff_t, pos)
   {
- -      return do_compat_pwritev64(fd, vec, vlen, pos, 0);
+ +      return do_pwritev(fd, vec, vlen, pos, 0);
   }
   #endif
   
   COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
- -              const struct compat_iovec __user *,vec,
+ +              const struct iovec __user *,vec,
                 compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
   {
         loff_t pos = ((loff_t)pos_high << 32) | pos_low;
   
- -      return do_compat_pwritev64(fd, vec, vlen, pos, 0);
+ +      return do_pwritev(fd, vec, vlen, pos, 0);
   }
   
   #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
   COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
- -              const struct compat_iovec __user *,vec,
+ +              const struct iovec __user *, vec,
                 unsigned long, vlen, loff_t, pos, rwf_t, flags)
   {
         if (pos == -1)
- -              return do_compat_writev(fd, vec, vlen, flags);
- -
- -      return do_compat_pwritev64(fd, vec, vlen, pos, flags);
+ +              return do_writev(fd, vec, vlen, flags);
+ +      return do_pwritev(fd, vec, vlen, pos, flags);
   }
   #endif
   
   COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
- -              const struct compat_iovec __user *,vec,
+ +              const struct iovec __user *,vec,
                 compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
   {
         loff_t pos = ((loff_t)pos_high << 32) | pos_low;
   
         if (pos == -1)
- -              return do_compat_writev(fd, vec, vlen, flags);
- -
- -      return do_compat_pwritev64(fd, vec, vlen, pos, flags);
+ +              return do_writev(fd, vec, vlen, flags);
+ +      return do_pwritev(fd, vec, vlen, pos, flags);
   }
- -
- -#endif
+ +#endif /* CONFIG_COMPAT */
   
   static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
                            size_t count, loff_t max)
@@@ -1410,6 -1701,59 +1410,59 @@@ static ssize_t do_copy_file_range(struc
                                        flags);
   }
   
+ /*
+  * Performs necessary checks before doing a file copy
+  *
+  * Can adjust amount of bytes to copy via @req_count argument.
+  * Returns appropriate error code that caller should return or
+  * zero in case the copy should be allowed.
+  */
+ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
+                                   struct file *file_out, loff_t pos_out,
+                                   size_t *req_count, unsigned int flags)
+ {
+       struct inode *inode_in = file_inode(file_in);
+       struct inode *inode_out = file_inode(file_out);
+       uint64_t count = *req_count;
+       loff_t size_in;
+       int ret;
+ 
+       ret = generic_file_rw_checks(file_in, file_out);
+       if (ret)
+               return ret;
+ 
+       /* Don't touch certain kinds of inodes */
+       if (IS_IMMUTABLE(inode_out))
+               return -EPERM;
+ 
+       if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+               return -ETXTBSY;
+ 
+       /* Ensure offsets don't wrap. */
+       if (pos_in + count < pos_in || pos_out + count < pos_out)
+               return -EOVERFLOW;
+ 
+       /* Shorten the copy to EOF */
+       size_in = i_size_read(inode_in);
+       if (pos_in >= size_in)
+               count = 0;
+       else
+               count = min(count, size_in - (uint64_t)pos_in);
+ 
+       ret = generic_write_check_limits(file_out, pos_out, &count);
+       if (ret)
+               return ret;
+ 
+       /* Don't allow overlapped copying within the same file. */
+       if (inode_in == inode_out &&
+           pos_out + count > pos_in &&
+           pos_out < pos_in + count)
+               return -EINVAL;
+ 
+       *req_count = count;
+       return 0;
+ }
+ 
   /*
    * copy_file_range() differs from regular file read and write in that it
    * specifically allows return partial success.  When it does so is up to
@@@ -1542,475 -1886,92 +1595,92 @@@ out2
         return ret;
   }
   
- static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
-                            bool write)
- {
-       struct inode *inode = file_inode(file);
- 
-       if (unlikely(pos < 0 || len < 0))
-               return -EINVAL;
- 
-        if (unlikely((loff_t) (pos + len) < 0))
-               return -EINVAL;
- 
-       if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
-               loff_t end = len ? pos + len - 1 : OFFSET_MAX;
-               int retval;
- 
-               retval = locks_mandatory_area(inode, file, pos, end,
-                               write ? F_WRLCK : F_RDLCK);
-               if (retval < 0)
-                       return retval;
-       }
- 
-       return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
- }
- /*
-  * Ensure that we don't remap a partial EOF block in the middle of something
-  * else.  Assume that the offsets have already been checked for block
-  * alignment.
-  *
-  * For clone we only link a partial EOF block above or at the destination file's
-  * EOF.  For deduplication we accept a partial EOF block only if it ends at the
-  * destination file's EOF (can not link it into the middle of a file).
-  *
-  * Shorten the request if possible.
-  */
- static int generic_remap_check_len(struct inode *inode_in,
-                                  struct inode *inode_out,
-                                  loff_t pos_out,
-                                  loff_t *len,
-                                  unsigned int remap_flags)
- {
-       u64 blkmask = i_blocksize(inode_in) - 1;
-       loff_t new_len = *len;
- 
-       if ((*len & blkmask) == 0)
-               return 0;
- 
-       if (pos_out + *len < i_size_read(inode_out))
-               new_len &= ~blkmask;
- 
-       if (new_len == *len)
-               return 0;
- 
-       if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
-               *len = new_len;
-               return 0;
-       }
- 
-       return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
- }
- 
- /* Read a page's worth of file data into the page cache. */
- static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
- {
-       struct page *page;
- 
-       page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
-       if (IS_ERR(page))
-               return page;
-       if (!PageUptodate(page)) {
-               put_page(page);
-               return ERR_PTR(-EIO);
-       }
-       return page;
- }
- 
   /*
-  * Lock two pages, ensuring that we lock in offset order if the pages are from
-  * the same file.
+  * Don't operate on ranges the page cache doesn't support, and don't exceed the
+  * LFS limits.  If pos is under the limit it becomes a short access.  If it
+  * exceeds the limit we return -EFBIG.
    */
- static void vfs_lock_two_pages(struct page *page1, struct page *page2)
- {
-       /* Always lock in order of increasing index. */
-       if (page1->index > page2->index)
-               swap(page1, page2);
- 
-       lock_page(page1);
-       if (page1 != page2)
-               lock_page(page2);
- }
- 
- /* Unlock two pages, being careful not to unlock the same page twice. */
- static void vfs_unlock_two_pages(struct page *page1, struct page *page2)
+ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
   {
-       unlock_page(page1);
-       if (page1 != page2)
-               unlock_page(page2);
- }
- 
- /*
-  * Compare extents of two files to see if they are the same.
-  * Caller must have locked both inodes to prevent write races.
-  */
- static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
-                                        struct inode *dest, loff_t destoff,
-                                        loff_t len, bool *is_same)
- {
-       loff_t src_poff;
-       loff_t dest_poff;
-       void *src_addr;
-       void *dest_addr;
-       struct page *src_page;
-       struct page *dest_page;
-       loff_t cmp_len;
-       bool same;
-       int error;
- 
-       error = -EINVAL;
-       same = true;
-       while (len) {
-               src_poff = srcoff & (PAGE_SIZE - 1);
-               dest_poff = destoff & (PAGE_SIZE - 1);
-               cmp_len = min(PAGE_SIZE - src_poff,
-                             PAGE_SIZE - dest_poff);
-               cmp_len = min(cmp_len, len);
-               if (cmp_len <= 0)
-                       goto out_error;
- 
-               src_page = vfs_dedupe_get_page(src, srcoff);
-               if (IS_ERR(src_page)) {
-                       error = PTR_ERR(src_page);
-                       goto out_error;
-               }
-               dest_page = vfs_dedupe_get_page(dest, destoff);
-               if (IS_ERR(dest_page)) {
-                       error = PTR_ERR(dest_page);
-                       put_page(src_page);
-                       goto out_error;
-               }
- 
-               vfs_lock_two_pages(src_page, dest_page);
+       struct inode *inode = file->f_mapping->host;
+       loff_t max_size = inode->i_sb->s_maxbytes;
+       loff_t limit = rlimit(RLIMIT_FSIZE);
   
-               /*
-                * Now that we've locked both pages, make sure they're still
-                * mapped to the file data we're interested in.  If not,
-                * someone is invalidating pages on us and we lose.
-                */
-               if (!PageUptodate(src_page) || !PageUptodate(dest_page) ||
-                   src_page->mapping != src->i_mapping ||
-                   dest_page->mapping != dest->i_mapping) {
-                       same = false;
-                       goto unlock;
+       if (limit != RLIM_INFINITY) {
+               if (pos >= limit) {
+                       send_sig(SIGXFSZ, current, 0);
+                       return -EFBIG;
                 }
+               *count = min(*count, limit - pos);
+       }
   
-               src_addr = kmap_atomic(src_page);
-               dest_addr = kmap_atomic(dest_page);
- 
-               flush_dcache_page(src_page);
-               flush_dcache_page(dest_page);
- 
-               if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
-                       same = false;
+       if (!(file->f_flags & O_LARGEFILE))
+               max_size = MAX_NON_LFS;
   
-               kunmap_atomic(dest_addr);
-               kunmap_atomic(src_addr);
- unlock:
-               vfs_unlock_two_pages(src_page, dest_page);
-               put_page(dest_page);
-               put_page(src_page);
+       if (unlikely(pos >= max_size))
+               return -EFBIG;
   
-               if (!same)
-                       break;
+       *count = min(*count, max_size - pos);
   
-               srcoff += cmp_len;
-               destoff += cmp_len;
-               len -= cmp_len;
-       }
- 
-       *is_same = same;
         return 0;
- 
- out_error:
-       return error;
   }
   
   /*
-  * Check that the two inodes are eligible for cloning, the ranges make
-  * sense, and then flush all dirty data.  Caller must ensure that the
-  * inodes have been locked against any other modifications.
+  * Performs necessary checks before doing a write
    *
-  * If there's an error, then the usual negative error code is returned.
-  * Otherwise returns 0 with *len set to the request length.
+  * Can adjust writing position or amount of bytes to write.
+  * Returns appropriate error code that caller should return or
+  * zero in case that write should be allowed.
    */
- int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
-                                 struct file *file_out, loff_t pos_out,
-                                 loff_t *len, unsigned int remap_flags)
+ ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
   {
-       struct inode *inode_in = file_inode(file_in);
-       struct inode *inode_out = file_inode(file_out);
-       bool same_inode = (inode_in == inode_out);
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file->f_mapping->host;
+       loff_t count;
         int ret;
   
-       /* Don't touch certain kinds of inodes */
-       if (IS_IMMUTABLE(inode_out))
-               return -EPERM;
- 
-       if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+       if (IS_SWAPFILE(inode))
                 return -ETXTBSY;
   
-       /* Don't reflink dirs, pipes, sockets... */
-       if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
-               return -EISDIR;
-       if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
-               return -EINVAL;
- 
-       /* Zero length dedupe exits immediately; reflink goes to EOF. */
-       if (*len == 0) {
-               loff_t isize = i_size_read(inode_in);
- 
-               if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
-                       return 0;
-               if (pos_in > isize)
-                       return -EINVAL;
-               *len = isize - pos_in;
-               if (*len == 0)
-                       return 0;
-       }
- 
-       /* Check that we don't violate system file offset limits. */
-       ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
-                       remap_flags);
-       if (ret)
-               return ret;
- 
-       /* Wait for the completion of any pending IOs on both files */
-       inode_dio_wait(inode_in);
-       if (!same_inode)
-               inode_dio_wait(inode_out);
- 
-       ret = filemap_write_and_wait_range(inode_in->i_mapping,
-                       pos_in, pos_in + *len - 1);
-       if (ret)
-               return ret;
- 
-       ret = filemap_write_and_wait_range(inode_out->i_mapping,
-                       pos_out, pos_out + *len - 1);
-       if (ret)
-               return ret;
- 
-       /*
-        * Check that the extents are the same.
-        */
-       if (remap_flags & REMAP_FILE_DEDUP) {
-               bool            is_same = false;
- 
-               ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
-                               inode_out, pos_out, *len, &is_same);
-               if (ret)
-                       return ret;
-               if (!is_same)
-                       return -EBADE;
-       }
- 
-       ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
-                       remap_flags);
-       if (ret)
-               return ret;
- 
-       /* If can't alter the file contents, we're done. */
-       if (!(remap_flags & REMAP_FILE_DEDUP))
-               ret = file_modified(file_out);
- 
-       return ret;
- }
- EXPORT_SYMBOL(generic_remap_file_range_prep);
- 
- loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
-                          struct file *file_out, loff_t pos_out,
-                          loff_t len, unsigned int remap_flags)
- {
-       loff_t ret;
- 
-       WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
- 
-       /*
-        * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
-        * the same mount. Practically, they only need to be on the same file
-        * system.
-        */
-       if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
-               return -EXDEV;
- 
-       ret = generic_file_rw_checks(file_in, file_out);
-       if (ret < 0)
-               return ret;
- 
-       if (!file_in->f_op->remap_file_range)
-               return -EOPNOTSUPP;
- 
-       ret = remap_verify_area(file_in, pos_in, len, false);
-       if (ret)
-               return ret;
- 
-       ret = remap_verify_area(file_out, pos_out, len, true);
-       if (ret)
-               return ret;
- 
-       ret = file_in->f_op->remap_file_range(file_in, pos_in,
-                       file_out, pos_out, len, remap_flags);
-       if (ret < 0)
-               return ret;
- 
-       fsnotify_access(file_in);
-       fsnotify_modify(file_out);
-       return ret;
- }
- EXPORT_SYMBOL(do_clone_file_range);
- 
- loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
-                           struct file *file_out, loff_t pos_out,
-                           loff_t len, unsigned int remap_flags)
- {
-       loff_t ret;
- 
-       file_start_write(file_out);
-       ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
-                                 remap_flags);
-       file_end_write(file_out);
- 
-       return ret;
- }
- EXPORT_SYMBOL(vfs_clone_file_range);
- 
- /* Check whether we are allowed to dedupe the destination file */
- static bool allow_file_dedupe(struct file *file)
- {
-       if (capable(CAP_SYS_ADMIN))
-               return true;
-       if (file->f_mode & FMODE_WRITE)
-               return true;
-       if (uid_eq(current_fsuid(), file_inode(file)->i_uid))
-               return true;
-       if (!inode_permission(file_inode(file), MAY_WRITE))
-               return true;
-       return false;
- }
+       if (!iov_iter_count(from))
+               return 0;
   
- loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
-                                struct file *dst_file, loff_t dst_pos,
-                                loff_t len, unsigned int remap_flags)
- {
-       loff_t ret;
+       /* FIXME: this is for backwards compatibility with 2.4 */
+       if (iocb->ki_flags & IOCB_APPEND)
+               iocb->ki_pos = i_size_read(inode);
   
-       WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
-                                    REMAP_FILE_CAN_SHORTEN));
+       if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+               return -EINVAL;
   
-       ret = mnt_want_write_file(dst_file);
+       count = iov_iter_count(from);
+       ret = generic_write_check_limits(file, iocb->ki_pos, &count);
         if (ret)
                 return ret;
   
-       ret = remap_verify_area(dst_file, dst_pos, len, true);
-       if (ret < 0)
-               goto out_drop_write;
- 
-       ret = -EPERM;
-       if (!allow_file_dedupe(dst_file))
-               goto out_drop_write;
- 
-       ret = -EXDEV;
-       if (src_file->f_path.mnt != dst_file->f_path.mnt)
-               goto out_drop_write;
- 
-       ret = -EISDIR;
-       if (S_ISDIR(file_inode(dst_file)->i_mode))
-               goto out_drop_write;
- 
-       ret = -EINVAL;
-       if (!dst_file->f_op->remap_file_range)
-               goto out_drop_write;
- 
-       if (len == 0) {
-               ret = 0;
-               goto out_drop_write;
-       }
- 
-       ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
-                       dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
- out_drop_write:
-       mnt_drop_write_file(dst_file);
- 
-       return ret;
+       iov_iter_truncate(from, count);
+       return iov_iter_count(from);
   }
- EXPORT_SYMBOL(vfs_dedupe_file_range_one);
+ EXPORT_SYMBOL(generic_write_checks);
   
- int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
+ /*
+  * Performs common checks before doing a file copy/clone
+  * from @file_in to @file_out.
+  */
+ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
   {
-       struct file_dedupe_range_info *info;
-       struct inode *src = file_inode(file);
-       u64 off;
-       u64 len;
-       int i;
-       int ret;
-       u16 count = same->dest_count;
-       loff_t deduped;
- 
-       if (!(file->f_mode & FMODE_READ))
-               return -EINVAL;
- 
-       if (same->reserved1 || same->reserved2)
-               return -EINVAL;
- 
-       off = same->src_offset;
-       len = same->src_length;
+       struct inode *inode_in = file_inode(file_in);
+       struct inode *inode_out = file_inode(file_out);
   
-       if (S_ISDIR(src->i_mode))
+       /* Don't copy dirs, pipes, sockets... */
+       if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
                 return -EISDIR;
- 
-       if (!S_ISREG(src->i_mode))
-               return -EINVAL;
- 
-       if (!file->f_op->remap_file_range)
-               return -EOPNOTSUPP;
- 
-       ret = remap_verify_area(file, off, len, false);
-       if (ret < 0)
-               return ret;
-       ret = 0;
- 
-       if (off + len > i_size_read(src))
+       if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
                 return -EINVAL;
   
-       /* Arbitrary 1G limit on a single dedupe request, can be raised. */
-       len = min_t(u64, len, 1 << 30);
- 
-       /* pre-format output fields to sane values */
-       for (i = 0; i < count; i++) {
-               same->info[i].bytes_deduped = 0ULL;
-               same->info[i].status = FILE_DEDUPE_RANGE_SAME;
-       }
- 
-       for (i = 0, info = same->info; i < count; i++, info++) {
-               struct fd dst_fd = fdget(info->dest_fd);
-               struct file *dst_file = dst_fd.file;
- 
-               if (!dst_file) {
-                       info->status = -EBADF;
-                       goto next_loop;
-               }
- 
-               if (info->reserved) {
-                       info->status = -EINVAL;
-                       goto next_fdput;
-               }
- 
-               deduped = vfs_dedupe_file_range_one(file, off, dst_file,
-                                                   info->dest_offset, len,
-                                                   REMAP_FILE_CAN_SHORTEN);
-               if (deduped == -EBADE)
-                       info->status = FILE_DEDUPE_RANGE_DIFFERS;
-               else if (deduped < 0)
-                       info->status = deduped;
-               else
-                       info->bytes_deduped = len;
+       if (!(file_in->f_mode & FMODE_READ) ||
+           !(file_out->f_mode & FMODE_WRITE) ||
+           (file_out->f_flags & O_APPEND))
+               return -EBADF;
   
- next_fdput:
-               fdput(dst_fd);
- next_loop:
-               if (fatal_signal_pending(current))
-                       break;
-       }
-       return ret;
+       return 0;
   }
- EXPORT_SYMBOL(vfs_dedupe_file_range);
diff --combined include/linux/fs.h

index 83817d24e9023434c614fbe53ebf6aa3c835aa3d,8fb063ab7d5034f5270949402fadfb2f22fbc801..16e3789634d3b1bac014bc39f00332eb9c7e8265
--- 1/include/linux/fs.h
--- 2/include/linux/fs.h
+++ b/include/linux/fs.h
@@@ -178,6 -178,14 +178,6 @@@ typedef int (dio_iodone_t)(struct kioc
   /* File supports async buffered reads */
   #define FMODE_BUF_RASYNC      ((__force fmode_t)0x40000000)
   
- -/*
- - * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
- - * that indicates that they should check the contents of the iovec are
- - * valid, but not check the memory that the iovec elements
- - * points too.
- - */
- -#define CHECK_IOVEC_ONLY -1
- -
   /*
    * Attribute flags.  These should be or-ed together to figure out what
    * has been changed!
@@@ -302,20 -310,17 +302,20 @@@ enum rw_hint 
         WRITE_LIFE_EXTREME      = RWH_WRITE_LIFE_EXTREME,
   };
   
- -#define IOCB_EVENTFD          (1 << 0)
- -#define IOCB_APPEND           (1 << 1)
- -#define IOCB_DIRECT           (1 << 2)
- -#define IOCB_HIPRI            (1 << 3)
- -#define IOCB_DSYNC            (1 << 4)
- -#define IOCB_SYNC             (1 << 5)
- -#define IOCB_WRITE            (1 << 6)
- -#define IOCB_NOWAIT           (1 << 7)
+ +/* Match RWF_* bits to IOCB bits */
+ +#define IOCB_HIPRI            (__force int) RWF_HIPRI
+ +#define IOCB_DSYNC            (__force int) RWF_DSYNC
+ +#define IOCB_SYNC             (__force int) RWF_SYNC
+ +#define IOCB_NOWAIT           (__force int) RWF_NOWAIT
+ +#define IOCB_APPEND           (__force int) RWF_APPEND
+ +
+ +/* non-RWF related bits - start at 16 */
+ +#define IOCB_EVENTFD          (1 << 16)
+ +#define IOCB_DIRECT           (1 << 17)
+ +#define IOCB_WRITE            (1 << 18)
   /* iocb->ki_waitq is valid */
- -#define IOCB_WAITQ            (1 << 8)
- -#define IOCB_NOIO             (1 << 9)
+ +#define IOCB_WAITQ            (1 << 19)
+ +#define IOCB_NOIO             (1 << 20)
   
   struct kiocb {
         struct file             *ki_filp;
@@@ -1366,12 -1371,6 +1366,12 @@@ extern int send_sigurg(struct fown_stru
   #define SB_ACTIVE     (1<<30)
   #define SB_NOUSER     (1<<31)
   
+ +/* These flags relate to encoding and casefolding */
+ +#define SB_ENC_STRICT_MODE_FL (1 << 0)
+ +
+ +#define sb_has_strict_encoding(sb) \
+ +      (sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL)
+ +
   /*
    *    Umount options
    */
@@@ -1386,7 -1385,7 +1386,7 @@@
   #define SB_I_CGROUPWB 0x00000001      /* cgroup-aware writeback enabled */
   #define SB_I_NOEXEC   0x00000002      /* Ignore executables on this fs */
   #define SB_I_NODEV    0x00000004      /* Ignore devices on this fs */
- -#define SB_I_MULTIROOT        0x00000008      /* Multiple roots to the dentry tree */
+ +#define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */
   
   /* sb->s_iflags to limit user namespace mounts */
   #define SB_I_USERNS_VISIBLE           0x00000010 /* fstype already mounted */
@@@ -1441,10 -1440,6 +1441,10 @@@ struct super_block 
   #endif
   #ifdef CONFIG_FS_VERITY
         const struct fsverity_operations *s_vop;
+ +#endif
+ +#ifdef CONFIG_UNICODE
+ +      struct unicode_map *s_encoding;
+ +      __u16 s_encoding_flags;
   #endif
         struct hlist_bl_head    s_roots;        /* alternate root dentries for NFS */
         struct list_head        s_mounts;       /* list of mounts; _not_ for fs use */
@@@ -1892,8 -1887,15 +1892,8 @@@ static inline int call_mmap(struct fil
         return file->f_op->mmap(file, vma);
   }
   
- -ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
- -                            unsigned long nr_segs, unsigned long fast_segs,
- -                            struct iovec *fast_pointer,
- -                            struct iovec **ret_pointer);
- -
   extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
   extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
- -extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
- -              unsigned long, loff_t *, rwf_t);
   extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
                                    loff_t, size_t, unsigned int);
   extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
@@@ -2217,7 -2219,6 +2217,7 @@@ struct file_system_type 
   #define FS_HAS_SUBTYPE                4
   #define FS_USERNS_MOUNT               8       /* Can be mounted by userns root */
   #define FS_DISALLOW_NOTIFY_PERM       16      /* Disable fanotify permission events */
+ +#define FS_THP_SUPPORT                8192    /* Remove once all fs converted */
   #define FS_RENAME_DOES_D_MOVE 32768   /* FS will handle d_move() during rename() internally. */
         int (*init_fs_context)(struct fs_context *);
         const struct fs_parameter_spec *parameters;
@@@ -2590,10 -2591,6 +2590,10 @@@ extern bool is_bad_inode(struct inode *
   unsigned long invalidate_mapping_pages(struct address_space *mapping,
                                         pgoff_t start, pgoff_t end);
   
+ +void invalidate_mapping_pagevec(struct address_space *mapping,
+ +                              pgoff_t start, pgoff_t end,
+ +                              unsigned long *nr_pagevec);
+ +
   static inline void invalidate_remote_inode(struct inode *inode)
   {
         if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
@@@ -2705,6 -2702,33 +2705,6 @@@ static inline errseq_t file_sample_sb_e
         return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err);
   }
   
- -static inline int filemap_nr_thps(struct address_space *mapping)
- -{
- -#ifdef CONFIG_READ_ONLY_THP_FOR_FS
- -      return atomic_read(&mapping->nr_thps);
- -#else
- -      return 0;
- -#endif
- -}
- -
- -static inline void filemap_nr_thps_inc(struct address_space *mapping)
- -{
- -#ifdef CONFIG_READ_ONLY_THP_FOR_FS
- -      atomic_inc(&mapping->nr_thps);
- -#else
- -      WARN_ON_ONCE(1);
- -#endif
- -}
- -
- -static inline void filemap_nr_thps_dec(struct address_space *mapping)
- -{
- -#ifdef CONFIG_READ_ONLY_THP_FOR_FS
- -      atomic_dec(&mapping->nr_thps);
- -#else
- -      WARN_ON_ONCE(1);
- -#endif
- -}
- -
   extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
                            int datasync);
   extern int vfs_fsync(struct file *file, int datasync);
@@@ -2834,6 -2858,45 +2834,6 @@@ static inline void i_readcount_inc(stru
   #endif
   extern int do_pipe_flags(int *, int);
   
- -#define __kernel_read_file_id(id) \
- -      id(UNKNOWN, unknown)            \
- -      id(FIRMWARE, firmware)          \
- -      id(FIRMWARE_PREALLOC_BUFFER, firmware)  \
- -      id(FIRMWARE_EFI_EMBEDDED, firmware)     \
- -      id(MODULE, kernel-module)               \
- -      id(KEXEC_IMAGE, kexec-image)            \
- -      id(KEXEC_INITRAMFS, kexec-initramfs)    \
- -      id(POLICY, security-policy)             \
- -      id(X509_CERTIFICATE, x509-certificate)  \
- -      id(MAX_ID, )
- -
- -#define __fid_enumify(ENUM, dummy) READING_ ## ENUM,
- -#define __fid_stringify(dummy, str) #str,
- -
- -enum kernel_read_file_id {
- -      __kernel_read_file_id(__fid_enumify)
- -};
- -
- -static const char * const kernel_read_file_str[] = {
- -      __kernel_read_file_id(__fid_stringify)
- -};
- -
- -static inline const char *kernel_read_file_id_str(enum kernel_read_file_id id)
- -{
- -      if ((unsigned)id >= READING_MAX_ID)
- -              return kernel_read_file_str[READING_UNKNOWN];
- -
- -      return kernel_read_file_str[id];
- -}
- -
- -extern int kernel_read_file(struct file *, void **, loff_t *, loff_t,
- -                          enum kernel_read_file_id);
- -extern int kernel_read_file_from_path(const char *, void **, loff_t *, loff_t,
- -                                    enum kernel_read_file_id);
- -extern int kernel_read_file_from_path_initns(const char *, void **, loff_t *, loff_t,
- -                                           enum kernel_read_file_id);
- -extern int kernel_read_file_from_fd(int, void **, loff_t *, loff_t,
- -                                  enum kernel_read_file_id);
   extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *);
   ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos);
   extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *);
@@@ -2946,13 -3009,9 +2946,9 @@@ extern int sb_min_blocksize(struct supe
   extern int generic_file_mmap(struct file *, struct vm_area_struct *);
   extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
   extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
- extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
-                               struct file *file_out, loff_t pos_out,
-                               loff_t *count, unsigned int remap_flags);
+ extern int generic_write_check_limits(struct file *file, loff_t pos,
+               loff_t *count);
   extern int generic_file_rw_checks(struct file *file_in, struct file *file_out);
- extern int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
-                                   struct file *file_out, loff_t pos_out,
-                                   size_t *count, unsigned int flags);
   extern ssize_t generic_file_buffered_read(struct kiocb *iocb,
                 struct iov_iter *to, ssize_t already_read);
   extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
@@@ -3016,6 -3075,8 +3012,6 @@@ enum 
         DIO_SKIP_HOLES  = 0x02,
   };
   
- -void dio_end_io(struct bio *bio);
- -
   ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
                              struct block_device *bdev, struct iov_iter *iter,
                              get_block_t get_block,
@@@ -3197,12 -3258,6 +3193,12 @@@ extern int generic_file_fsync(struct fi
   
   extern int generic_check_addressable(unsigned, u64);
   
+ +#ifdef CONFIG_UNICODE
+ +extern int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str);
+ +extern int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
+ +                              const char *str, const struct qstr *name);
+ +#endif
+ +
   #ifdef CONFIG_MIGRATION
   extern int buffer_migrate_page(struct address_space *,
                                 struct page *, struct page *,
@@@ -3258,9 -3313,6 +3254,9 @@@ static inline int kiocb_set_rw_flags(st
   {
         int kiocb_flags = 0;
   
+ +      /* make sure there's no overlap between RWF and private IOCB flags */
+ +      BUILD_BUG_ON((__force int) RWF_SUPPORTED & IOCB_EVENTFD);
+ +
         if (!flags)
                 return 0;
         if (unlikely(flags & ~RWF_SUPPORTED))
@@@ -3269,11 -3321,16 +3265,11 @@@
         if (flags & RWF_NOWAIT) {
                 if (!(ki->ki_filp->f_mode & FMODE_NOWAIT))
                         return -EOPNOTSUPP;
- -              kiocb_flags |= IOCB_NOWAIT | IOCB_NOIO;
+ +              kiocb_flags |= IOCB_NOIO;
         }
- -      if (flags & RWF_HIPRI)
- -              kiocb_flags |= IOCB_HIPRI;
- -      if (flags & RWF_DSYNC)
- -              kiocb_flags |= IOCB_DSYNC;
+ +      kiocb_flags |= (__force int) (flags & RWF_SUPPORTED);
         if (flags & RWF_SYNC)
- -              kiocb_flags |= (IOCB_DSYNC | IOCB_SYNC);
- -      if (flags & RWF_APPEND)
- -              kiocb_flags |= IOCB_APPEND;
+ +              kiocb_flags |= IOCB_DSYNC;
   
         ki->ki_flags |= kiocb_flags;
         return 0;
@@@ -3453,6 -3510,15 +3449,6 @@@ extern int vfs_fadvise(struct file *fil
   extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
                            int advice);
   
- -#if defined(CONFIG_IO_URING)
- -extern struct sock *io_uring_get_socket(struct file *file);
- -#else
- -static inline struct sock *io_uring_get_socket(struct file *file)
- -{
- -      return NULL;
- -}
- -#endif
- -
   int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags,
                              unsigned int flags);
   
diff --combined mm/filemap.c

index e4101b5bfa82c8ca6e38ccbb5c56f2c889b7d021,9962fd682f202525d0d407965b8a87a6b1d4eca6..d5e7c2029d16b4d8d8801d1058221b645904d5d7
--- 1/mm/filemap.c
--- 2/mm/filemap.c
+++ b/mm/filemap.c
@@@ -249,7 -249,7 +249,7 @@@ static void page_cache_free_page(struc
                 freepage(page);
   
         if (PageTransHuge(page) && !PageHuge(page)) {
- -              page_ref_sub(page, HPAGE_PMD_NR);
+ +              page_ref_sub(page, thp_nr_pages(page));
                 VM_BUG_ON_PAGE(page_count(page) <= 0, page);
         } else {
                 put_page(page);
@@@ -414,7 -414,7 +414,7 @@@ int __filemap_fdatawrite_range(struct a
                 .range_end = end,
         };
   
- -      if (!mapping_cap_writeback_dirty(mapping) ||
+ +      if (!mapping_can_writeback(mapping) ||
             !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                 return 0;
   
@@@ -827,14 -827,15 +827,14 @@@ int replace_page_cache_page(struct pag
   }
   EXPORT_SYMBOL_GPL(replace_page_cache_page);
   
- -static int __add_to_page_cache_locked(struct page *page,
- -                                    struct address_space *mapping,
- -                                    pgoff_t offset, gfp_t gfp_mask,
- -                                    void **shadowp)
+ +noinline int __add_to_page_cache_locked(struct page *page,
+ +                                      struct address_space *mapping,
+ +                                      pgoff_t offset, gfp_t gfp,
+ +                                      void **shadowp)
   {
         XA_STATE(xas, &mapping->i_pages, offset);
         int huge = PageHuge(page);
         int error;
- -      void *old;
   
         VM_BUG_ON_PAGE(!PageLocked(page), page);
         VM_BUG_ON_PAGE(PageSwapBacked(page), page);
@@@ -845,46 -846,25 +845,46 @@@
         page->index = offset;
   
         if (!huge) {
- -              error = mem_cgroup_charge(page, current->mm, gfp_mask);
+ +              error = mem_cgroup_charge(page, current->mm, gfp);
                 if (error)
                         goto error;
         }
   
+ +      gfp &= GFP_RECLAIM_MASK;
+ +
         do {
+ +              unsigned int order = xa_get_order(xas.xa, xas.xa_index);
+ +              void *entry, *old = NULL;
+ +
+ +              if (order > thp_order(page))
+ +                      xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
+ +                                      order, gfp);
                 xas_lock_irq(&xas);
- -              old = xas_load(&xas);
- -              if (old && !xa_is_value(old))
- -                      xas_set_err(&xas, -EEXIST);
+ +              xas_for_each_conflict(&xas, entry) {
+ +                      old = entry;
+ +                      if (!xa_is_value(entry)) {
+ +                              xas_set_err(&xas, -EEXIST);
+ +                              goto unlock;
+ +                      }
+ +              }
+ +
+ +              if (old) {
+ +                      if (shadowp)
+ +                              *shadowp = old;
+ +                      /* entry may have been split before we acquired lock */
+ +                      order = xa_get_order(xas.xa, xas.xa_index);
+ +                      if (order > thp_order(page)) {
+ +                              xas_split(&xas, old, order);
+ +                              xas_reset(&xas);
+ +                      }
+ +              }
+ +
                 xas_store(&xas, page);
                 if (xas_error(&xas))
                         goto unlock;
   
- -              if (xa_is_value(old)) {
+ +              if (old)
                         mapping->nrexceptional--;
- -                      if (shadowp)
- -                              *shadowp = old;
- -              }
                 mapping->nrpages++;
   
                 /* hugetlb pages do not participate in page cache accounting */
@@@ -892,7 -872,7 +892,7 @@@
                         __inc_lruvec_page_state(page, NR_FILE_PAGES);
   unlock:
                 xas_unlock_irq(&xas);
- -      } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
+ +      } while (xas_nomem(&xas, gfp));
   
         if (xas_error(&xas)) {
                 error = xas_error(&xas);
@@@ -1445,7 -1425,7 +1445,7 @@@ static inline bool clear_bit_unlock_is_
    * unlock_page - unlock a locked page
    * @page: the page
    *
- - * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
+ + * Unlocks the page and wakes up sleepers in wait_on_page_locked().
    * Also wakes sleepers in wait_on_page_writeback() because the wakeup
    * mechanism between PageLocked pages and PageWriteback pages is shared.
    * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
@@@ -1665,19 -1645,19 +1665,19 @@@ EXPORT_SYMBOL(page_cache_prev_miss)
   /**
    * find_get_entry - find and get a page cache entry
    * @mapping: the address_space to search
- - * @offset: the page cache index
+ + * @index: The page cache index.
    *
    * Looks up the page cache slot at @mapping & @offset.  If there is a
- - * page cache page, it is returned with an increased refcount.
+ + * page cache page, the head page is returned with an increased refcount.
    *
    * If the slot holds a shadow entry of a previously evicted page, or a
    * swap entry from shmem/tmpfs, it is returned.
    *
- - * Return: the found page or shadow entry, %NULL if nothing is found.
+ + * Return: The head page or shadow entry, %NULL if nothing is found.
    */
- -struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
+ +struct page *find_get_entry(struct address_space *mapping, pgoff_t index)
   {
- -      XA_STATE(xas, &mapping->i_pages, offset);
+ +      XA_STATE(xas, &mapping->i_pages, index);
         struct page *page;
   
         rcu_read_lock();
@@@ -1705,6 -1685,7 +1705,6 @@@ repeat
                 put_page(page);
                 goto repeat;
         }
- -      page = find_subpage(page, offset);
   out:
         rcu_read_unlock();
   
@@@ -1712,37 -1693,40 +1712,37 @@@
   }
   
   /**
- - * find_lock_entry - locate, pin and lock a page cache entry
- - * @mapping: the address_space to search
- - * @offset: the page cache index
+ + * find_lock_entry - Locate and lock a page cache entry.
+ + * @mapping: The address_space to search.
+ + * @index: The page cache index.
    *
- - * Looks up the page cache slot at @mapping & @offset.  If there is a
- - * page cache page, it is returned locked and with an increased
- - * refcount.
+ + * Looks up the page at @mapping & @index.  If there is a page in the
+ + * cache, the head page is returned locked and with an increased refcount.
    *
    * If the slot holds a shadow entry of a previously evicted page, or a
    * swap entry from shmem/tmpfs, it is returned.
    *
- - * find_lock_entry() may sleep.
- - *
- - * Return: the found page or shadow entry, %NULL if nothing is found.
+ + * Context: May sleep.
+ + * Return: The head page or shadow entry, %NULL if nothing is found.
    */
- -struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
+ +struct page *find_lock_entry(struct address_space *mapping, pgoff_t index)
   {
         struct page *page;
   
   repeat:
- -      page = find_get_entry(mapping, offset);
+ +      page = find_get_entry(mapping, index);
         if (page && !xa_is_value(page)) {
                 lock_page(page);
                 /* Has the page been truncated? */
- -              if (unlikely(page_mapping(page) != mapping)) {
+ +              if (unlikely(page->mapping != mapping)) {
                         unlock_page(page);
                         put_page(page);
                         goto repeat;
                 }
- -              VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
+ +              VM_BUG_ON_PAGE(!thp_contains(page, index), page);
         }
         return page;
   }
- -EXPORT_SYMBOL(find_lock_entry);
   
   /**
    * pagecache_get_page - Find and get a reference to a page.
@@@ -1757,8 -1741,6 +1757,8 @@@
    *
    * * %FGP_ACCESSED - The page will be marked accessed.
    * * %FGP_LOCK - The page is returned locked.
+ + * * %FGP_HEAD - If the page is present and a THP, return the head page
+ + *   rather than the exact page specified by the index.
    * * %FGP_CREAT - If no page is present then a new page is allocated using
    *   @gfp_mask and added to the page cache and the VM's LRU list.
    *   The page is returned locked and with an increased refcount.
@@@ -1799,12 -1781,12 +1799,12 @@@ repeat
                 }
   
                 /* Has the page been truncated? */
- -              if (unlikely(compound_head(page)->mapping != mapping)) {
+ +              if (unlikely(page->mapping != mapping)) {
                         unlock_page(page);
                         put_page(page);
                         goto repeat;
                 }
- -              VM_BUG_ON_PAGE(page->index != index, page);
+ +              VM_BUG_ON_PAGE(!thp_contains(page, index), page);
         }
   
         if (fgp_flags & FGP_ACCESSED)
@@@ -1814,13 -1796,11 +1814,13 @@@
                 if (page_is_idle(page))
                         clear_page_idle(page);
         }
+ +      if (!(fgp_flags & FGP_HEAD))
+ +              page = find_subpage(page, index);
   
   no_page:
         if (!page && (fgp_flags & FGP_CREAT)) {
                 int err;
- -              if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
+ +              if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
                         gfp_mask |= __GFP_WRITE;
                 if (fgp_flags & FGP_NOFS)
                         gfp_mask &= ~__GFP_FS;
@@@ -2199,14 -2179,6 +2199,14 @@@ ssize_t generic_file_buffered_read(stru
         last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
         offset = *ppos & ~PAGE_MASK;
   
+ +      /*
+ +       * If we've already successfully copied some data, then we
+ +       * can no longer safely return -EIOCBQUEUED. Hence mark
+ +       * an async read NOWAIT at that point.
+ +       */
+ +      if (written && (iocb->ki_flags & IOCB_WAITQ))
+ +              iocb->ki_flags |= IOCB_NOWAIT;
+ +
         for (;;) {
                 struct page *page;
                 pgoff_t end_index;
@@@ -2596,8 -2568,8 +2596,8 @@@ static struct file *do_sync_mmap_readah
         struct file *file = vmf->vma->vm_file;
         struct file_ra_state *ra = &file->f_ra;
         struct address_space *mapping = file->f_mapping;
+ +      DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff);
         struct file *fpin = NULL;
- -      pgoff_t offset = vmf->pgoff;
         unsigned int mmap_miss;
   
         /* If we don't want any read-ahead, don't bother */
@@@ -2608,7 -2580,8 +2608,7 @@@
   
         if (vmf->vma->vm_flags & VM_SEQ_READ) {
                 fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- -              page_cache_sync_readahead(mapping, ra, file, offset,
- -                                        ra->ra_pages);
+ +              page_cache_sync_ra(&ractl, ra, ra->ra_pages);
                 return fpin;
         }
   
@@@ -2628,11 -2601,10 +2628,11 @@@
          * mmap read-around
          */
         fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- -      ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
+ +      ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
         ra->size = ra->ra_pages;
         ra->async_size = ra->ra_pages / 4;
- -      ra_submit(ra, mapping, file);
+ +      ractl._index = ra->start;
+ +      do_page_cache_ra(&ractl, ra->size, ra->async_size);
         return fpin;
   }
   
@@@ -2821,42 -2793,42 +2821,42 @@@ void filemap_map_pages(struct vm_fault 
         pgoff_t last_pgoff = start_pgoff;
         unsigned long max_idx;
         XA_STATE(xas, &mapping->i_pages, start_pgoff);
- -      struct page *page;
+ +      struct page *head, *page;
         unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
   
         rcu_read_lock();
- -      xas_for_each(&xas, page, end_pgoff) {
- -              if (xas_retry(&xas, page))
+ +      xas_for_each(&xas, head, end_pgoff) {
+ +              if (xas_retry(&xas, head))
                         continue;
- -              if (xa_is_value(page))
+ +              if (xa_is_value(head))
                         goto next;
   
                 /*
                  * Check for a locked page first, as a speculative
                  * reference may adversely influence page migration.
                  */
- -              if (PageLocked(page))
+ +              if (PageLocked(head))
                         goto next;
- -              if (!page_cache_get_speculative(page))
+ +              if (!page_cache_get_speculative(head))
                         goto next;
   
                 /* Has the page moved or been split? */
- -              if (unlikely(page != xas_reload(&xas)))
+ +              if (unlikely(head != xas_reload(&xas)))
                         goto skip;
- -              page = find_subpage(page, xas.xa_index);
+ +              page = find_subpage(head, xas.xa_index);
   
- -              if (!PageUptodate(page) ||
+ +              if (!PageUptodate(head) ||
                                 PageReadahead(page) ||
                                 PageHWPoison(page))
                         goto skip;
- -              if (!trylock_page(page))
+ +              if (!trylock_page(head))
                         goto skip;
   
- -              if (page->mapping != mapping || !PageUptodate(page))
+ +              if (head->mapping != mapping || !PageUptodate(head))
                         goto unlock;
   
                 max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
- -              if (page->index >= max_idx)
+ +              if (xas.xa_index >= max_idx)
                         goto unlock;
   
                 if (mmap_miss > 0)
@@@ -2868,12 -2840,12 +2868,12 @@@
                 last_pgoff = xas.xa_index;
                 if (alloc_set_pte(vmf, page))
                         goto unlock;
- -              unlock_page(page);
+ +              unlock_page(head);
                 goto next;
   unlock:
- -              unlock_page(page);
+ +              unlock_page(head);
   skip:
- -              put_page(page);
+ +              put_page(head);
   next:
                 /* Huge page is mapped? No need to proceed. */
                 if (pmd_trans_huge(*vmf->pmd))
@@@ -3012,7 -2984,7 +3012,7 @@@ filler
                 goto out;
   
         /*
- -       * Page is not up to date and may be locked due one of the following
+ +       * Page is not up to date and may be locked due to one of the following
          * case a: Page is being filled and the page lock is held
          * case b: Read/write error clearing the page uptodate status
          * case c: Truncation in progress (page locked)
@@@ -3121,228 -3093,6 +3121,6 @@@ struct page *read_cache_page_gfp(struc
   }
   EXPORT_SYMBOL(read_cache_page_gfp);
   
- /*
-  * Don't operate on ranges the page cache doesn't support, and don't exceed the
-  * LFS limits.  If pos is under the limit it becomes a short access.  If it
-  * exceeds the limit we return -EFBIG.
-  */
- static int generic_write_check_limits(struct file *file, loff_t pos,
-                                     loff_t *count)
- {
-       struct inode *inode = file->f_mapping->host;
-       loff_t max_size = inode->i_sb->s_maxbytes;
-       loff_t limit = rlimit(RLIMIT_FSIZE);
- 
-       if (limit != RLIM_INFINITY) {
-               if (pos >= limit) {
-                       send_sig(SIGXFSZ, current, 0);
-                       return -EFBIG;
-               }
-               *count = min(*count, limit - pos);
-       }
- 
-       if (!(file->f_flags & O_LARGEFILE))
-               max_size = MAX_NON_LFS;
- 
-       if (unlikely(pos >= max_size))
-               return -EFBIG;
- 
-       *count = min(*count, max_size - pos);
- 
-       return 0;
- }
- 
- /*
-  * Performs necessary checks before doing a write
-  *
-  * Can adjust writing position or amount of bytes to write.
-  * Returns appropriate error code that caller should return or
-  * zero in case that write should be allowed.
-  */
- inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
- {
-       struct file *file = iocb->ki_filp;
-       struct inode *inode = file->f_mapping->host;
-       loff_t count;
-       int ret;
- 
-       if (IS_SWAPFILE(inode))
-               return -ETXTBSY;
- 
-       if (!iov_iter_count(from))
-               return 0;
- 
-       /* FIXME: this is for backwards compatibility with 2.4 */
-       if (iocb->ki_flags & IOCB_APPEND)
-               iocb->ki_pos = i_size_read(inode);
- 
-       if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
-               return -EINVAL;
- 
-       count = iov_iter_count(from);
-       ret = generic_write_check_limits(file, iocb->ki_pos, &count);
-       if (ret)
-               return ret;
- 
-       iov_iter_truncate(from, count);
-       return iov_iter_count(from);
- }
- EXPORT_SYMBOL(generic_write_checks);
- 
- /*
-  * Performs necessary checks before doing a clone.
-  *
-  * Can adjust amount of bytes to clone via @req_count argument.
-  * Returns appropriate error code that caller should return or
-  * zero in case the clone should be allowed.
-  */
- int generic_remap_checks(struct file *file_in, loff_t pos_in,
-                        struct file *file_out, loff_t pos_out,
-                        loff_t *req_count, unsigned int remap_flags)
- {
-       struct inode *inode_in = file_in->f_mapping->host;
-       struct inode *inode_out = file_out->f_mapping->host;
-       uint64_t count = *req_count;
-       uint64_t bcount;
-       loff_t size_in, size_out;
-       loff_t bs = inode_out->i_sb->s_blocksize;
-       int ret;
- 
-       /* The start of both ranges must be aligned to an fs block. */
-       if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
-               return -EINVAL;
- 
-       /* Ensure offsets don't wrap. */
-       if (pos_in + count < pos_in || pos_out + count < pos_out)
-               return -EINVAL;
- 
-       size_in = i_size_read(inode_in);
-       size_out = i_size_read(inode_out);
- 
-       /* Dedupe requires both ranges to be within EOF. */
-       if ((remap_flags & REMAP_FILE_DEDUP) &&
-           (pos_in >= size_in || pos_in + count > size_in ||
-            pos_out >= size_out || pos_out + count > size_out))
-               return -EINVAL;
- 
-       /* Ensure the infile range is within the infile. */
-       if (pos_in >= size_in)
-               return -EINVAL;
-       count = min(count, size_in - (uint64_t)pos_in);
- 
-       ret = generic_write_check_limits(file_out, pos_out, &count);
-       if (ret)
-               return ret;
- 
-       /*
-        * If the user wanted us to link to the infile's EOF, round up to the
-        * next block boundary for this check.
-        *
-        * Otherwise, make sure the count is also block-aligned, having
-        * already confirmed the starting offsets' block alignment.
-        */
-       if (pos_in + count == size_in) {
-               bcount = ALIGN(size_in, bs) - pos_in;
-       } else {
-               if (!IS_ALIGNED(count, bs))
-                       count = ALIGN_DOWN(count, bs);
-               bcount = count;
-       }
- 
-       /* Don't allow overlapped cloning within the same file. */
-       if (inode_in == inode_out &&
-           pos_out + bcount > pos_in &&
-           pos_out < pos_in + bcount)
-               return -EINVAL;
- 
-       /*
-        * We shortened the request but the caller can't deal with that, so
-        * bounce the request back to userspace.
-        */
-       if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
-               return -EINVAL;
- 
-       *req_count = count;
-       return 0;
- }
- 
- 
- /*
-  * Performs common checks before doing a file copy/clone
-  * from @file_in to @file_out.
-  */
- int generic_file_rw_checks(struct file *file_in, struct file *file_out)
- {
-       struct inode *inode_in = file_inode(file_in);
-       struct inode *inode_out = file_inode(file_out);
- 
-       /* Don't copy dirs, pipes, sockets... */
-       if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
-               return -EISDIR;
-       if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
-               return -EINVAL;
- 
-       if (!(file_in->f_mode & FMODE_READ) ||
-           !(file_out->f_mode & FMODE_WRITE) ||
-           (file_out->f_flags & O_APPEND))
-               return -EBADF;
- 
-       return 0;
- }
- 
- /*
-  * Performs necessary checks before doing a file copy
-  *
-  * Can adjust amount of bytes to copy via @req_count argument.
-  * Returns appropriate error code that caller should return or
-  * zero in case the copy should be allowed.
-  */
- int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
-                            struct file *file_out, loff_t pos_out,
-                            size_t *req_count, unsigned int flags)
- {
-       struct inode *inode_in = file_inode(file_in);
-       struct inode *inode_out = file_inode(file_out);
-       uint64_t count = *req_count;
-       loff_t size_in;
-       int ret;
- 
-       ret = generic_file_rw_checks(file_in, file_out);
-       if (ret)
-               return ret;
- 
-       /* Don't touch certain kinds of inodes */
-       if (IS_IMMUTABLE(inode_out))
-               return -EPERM;
- 
-       if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
-               return -ETXTBSY;
- 
-       /* Ensure offsets don't wrap. */
-       if (pos_in + count < pos_in || pos_out + count < pos_out)
-               return -EOVERFLOW;
- 
-       /* Shorten the copy to EOF */
-       size_in = i_size_read(inode_in);
-       if (pos_in >= size_in)
-               count = 0;
-       else
-               count = min(count, size_in - (uint64_t)pos_in);
- 
-       ret = generic_write_check_limits(file_out, pos_out, &count);
-       if (ret)
-               return ret;
- 
-       /* Don't allow overlapped copying within the same file. */
-       if (inode_in == inode_out &&
-           pos_out + count > pos_in &&
-           pos_out < pos_in + count)
-               return -EINVAL;
- 
-       *req_count = count;
-       return 0;
- }
- 
   int pagecache_write_begin(struct file *file, struct address_space *mapping,
                                 loff_t pos, unsigned len, unsigned flags,
                                 struct page **pagep, void **fsdata)
author	Linus Torvalds <[email protected]>
	Fri, 23 Oct 2020 18:33:41 +0000 (11:33 -0700)
committer	Linus Torvalds <[email protected]>
	Fri, 23 Oct 2020 18:33:41 +0000 (11:33 -0700)
		1	2
fs/Makefile	patch \|	diff1 \|	diff2 \|	blob \| history
fs/read_write.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/fs.h	patch \|	diff1 \|	diff2 \|	blob \| history
mm/filemap.c	patch \|	diff1 \|	diff2 \|	blob \| history