From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 23 Oct 2020 18:33:41 +0000 (-0700)
Subject: Merge tag 'vfs-5.10-merge-1' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
X-Git-Tag: v5.10-rc1~43
X-Git-Url: https://repo.jachan.dev/linux.git/commitdiff_plain/c4728cfbed0f54eacc21138c99da2a91895c8c5a?hp=-c

Merge tag 'vfs-5.10-merge-1' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull clone/dedupe/remap code refactoring from Darrick Wong:
 "Move the generic file range remap (aka reflink and dedupe) functions
  out of mm/filemap.c and fs/read_write.c and into fs/remap_range.c to
  reduce clutter in the first two files"

* tag 'vfs-5.10-merge-1' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
  vfs: move the generic write and copy checks out of mm
  vfs: move the remap range helpers to remap_range.c
  vfs: move generic_remap_checks out of mm
---

c4728cfbed0f54eacc21138c99da2a91895c8c5a
diff --combined fs/Makefile
index 7bb2a05fda1f,7173350739c5..999d1a23f036
--- a/fs/Makefile
+++ b/fs/Makefile
@@@ -14,7 -14,7 +14,7 @@@ obj-y :=	open.o read_write.o file_table
  		pnode.o splice.o sync.o utimes.o d_path.o \
  		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
  		fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
- 		kernel_read_file.o
 -		remap_range.o
++		kernel_read_file.o remap_range.o
  
  ifeq ($(CONFIG_BLOCK),y)
  obj-y +=	buffer.o block_dev.o direct-io.o mpage.o
@@@ -38,6 -38,7 +38,6 @@@ obj-$(CONFIG_FS_DAX)		+= dax.
  obj-$(CONFIG_FS_ENCRYPTION)	+= crypto/
  obj-$(CONFIG_FS_VERITY)		+= verity/
  obj-$(CONFIG_FILE_LOCKING)      += locks.o
 -obj-$(CONFIG_COMPAT)		+= compat.o
  obj-$(CONFIG_BINFMT_AOUT)	+= binfmt_aout.o
  obj-$(CONFIG_BINFMT_EM86)	+= binfmt_em86.o
  obj-$(CONFIG_BINFMT_MISC)	+= binfmt_misc.o
diff --combined fs/read_write.c
index a669fb049b84,016444255d3e..75f764b43418
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@@ -419,42 -419,27 +419,42 @@@ static ssize_t new_sync_read(struct fil
  	return ret;
  }
  
 +static int warn_unsupported(struct file *file, const char *op)
 +{
 +	pr_warn_ratelimited(
 +		"kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
 +		op, file, current->pid, current->comm);
 +	return -EINVAL;
 +}
 +
  ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
  {
 -	mm_segment_t old_fs = get_fs();
 +	struct kvec iov = {
 +		.iov_base	= buf,
 +		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
 +	};
 +	struct kiocb kiocb;
 +	struct iov_iter iter;
  	ssize_t ret;
  
  	if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
  		return -EINVAL;
  	if (!(file->f_mode & FMODE_CAN_READ))
  		return -EINVAL;
 +	/*
 +	 * Also fail if ->read_iter and ->read are both wired up as that
 +	 * implies very convoluted semantics.
 +	 */
 +	if (unlikely(!file->f_op->read_iter || file->f_op->read))
 +		return warn_unsupported(file, "read");
  
 -	if (count > MAX_RW_COUNT)
 -		count =  MAX_RW_COUNT;
 -	set_fs(KERNEL_DS);
 -	if (file->f_op->read)
 -		ret = file->f_op->read(file, (void __user *)buf, count, pos);
 -	else if (file->f_op->read_iter)
 -		ret = new_sync_read(file, (void __user *)buf, count, pos);
 -	else
 -		ret = -EINVAL;
 -	set_fs(old_fs);
 +	init_sync_kiocb(&kiocb, file);
 +	kiocb.ki_pos = pos ? *pos : 0;
 +	iov_iter_kvec(&iter, READ, &iov, 1, iov.iov_len);
 +	ret = file->f_op->read_iter(&kiocb, &iter);
  	if (ret > 0) {
 +		if (pos)
 +			*pos = kiocb.ki_pos;
  		fsnotify_access(file);
  		add_rchar(current, ret);
  	}
@@@ -525,32 -510,28 +525,32 @@@ static ssize_t new_sync_write(struct fi
  /* caller is responsible for file_start_write/file_end_write */
  ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
  {
 -	mm_segment_t old_fs;
 -	const char __user *p;
 +	struct kvec iov = {
 +		.iov_base	= (void *)buf,
 +		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
 +	};
 +	struct kiocb kiocb;
 +	struct iov_iter iter;
  	ssize_t ret;
  
  	if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
  		return -EBADF;
  	if (!(file->f_mode & FMODE_CAN_WRITE))
  		return -EINVAL;
 +	/*
 +	 * Also fail if ->write_iter and ->write are both wired up as that
 +	 * implies very convoluted semantics.
 +	 */
 +	if (unlikely(!file->f_op->write_iter || file->f_op->write))
 +		return warn_unsupported(file, "write");
  
 -	old_fs = get_fs();
 -	set_fs(KERNEL_DS);
 -	p = (__force const char __user *)buf;
 -	if (count > MAX_RW_COUNT)
 -		count =  MAX_RW_COUNT;
 -	if (file->f_op->write)
 -		ret = file->f_op->write(file, p, count, pos);
 -	else if (file->f_op->write_iter)
 -		ret = new_sync_write(file, p, count, pos);
 -	else
 -		ret = -EINVAL;
 -	set_fs(old_fs);
 +	init_sync_kiocb(&kiocb, file);
 +	kiocb.ki_pos = pos ? *pos : 0;
 +	iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
 +	ret = file->f_op->write_iter(&kiocb, &iter);
  	if (ret > 0) {
 +		if (pos)
 +			*pos = kiocb.ki_pos;
  		fsnotify_modify(file);
  		add_wchar(current, ret);
  	}
@@@ -779,6 -760,185 +779,6 @@@ static ssize_t do_loop_readv_writev(str
  	return ret;
  }
  
 -/**
 - * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
 - *     into the kernel and check that it is valid.
 - *
 - * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE.
 - * @uvector: Pointer to the userspace array.
 - * @nr_segs: Number of elements in userspace array.
 - * @fast_segs: Number of elements in @fast_pointer.
 - * @fast_pointer: Pointer to (usually small on-stack) kernel array.
 - * @ret_pointer: (output parameter) Pointer to a variable that will point to
 - *     either @fast_pointer, a newly allocated kernel array, or NULL,
 - *     depending on which array was used.
 - *
 - * This function copies an array of &struct iovec of @nr_segs from
 - * userspace into the kernel and checks that each element is valid (e.g.
 - * it does not point to a kernel address or cause overflow by being too
 - * large, etc.).
 - *
 - * As an optimization, the caller may provide a pointer to a small
 - * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long
 - * (the size of this array, or 0 if unused, should be given in @fast_segs).
 - *
 - * @ret_pointer will always point to the array that was used, so the
 - * caller must take care not to call kfree() on it e.g. in case the
 - * @fast_pointer array was used and it was allocated on the stack.
 - *
 - * Return: The total number of bytes covered by the iovec array on success
 - *   or a negative error code on error.
 - */
 -ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 -			      unsigned long nr_segs, unsigned long fast_segs,
 -			      struct iovec *fast_pointer,
 -			      struct iovec **ret_pointer)
 -{
 -	unsigned long seg;
 -	ssize_t ret;
 -	struct iovec *iov = fast_pointer;
 -
 -	/*
 -	 * SuS says "The readv() function *may* fail if the iovcnt argument
 -	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 -	 * traditionally returned zero for zero segments, so...
 -	 */
 -	if (nr_segs == 0) {
 -		ret = 0;
 -		goto out;
 -	}
 -
 -	/*
 -	 * First get the "struct iovec" from user memory and
 -	 * verify all the pointers
 -	 */
 -	if (nr_segs > UIO_MAXIOV) {
 -		ret = -EINVAL;
 -		goto out;
 -	}
 -	if (nr_segs > fast_segs) {
 -		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
 -		if (iov == NULL) {
 -			ret = -ENOMEM;
 -			goto out;
 -		}
 -	}
 -	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
 -		ret = -EFAULT;
 -		goto out;
 -	}
 -
 -	/*
 -	 * According to the Single Unix Specification we should return EINVAL
 -	 * if an element length is < 0 when cast to ssize_t or if the
 -	 * total length would overflow the ssize_t return value of the
 -	 * system call.
 -	 *
 -	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
 -	 * overflow case.
 -	 */
 -	ret = 0;
 -	for (seg = 0; seg < nr_segs; seg++) {
 -		void __user *buf = iov[seg].iov_base;
 -		ssize_t len = (ssize_t)iov[seg].iov_len;
 -
 -		/* see if we we're about to use an invalid len or if
 -		 * it's about to overflow ssize_t */
 -		if (len < 0) {
 -			ret = -EINVAL;
 -			goto out;
 -		}
 -		if (type >= 0
 -		    && unlikely(!access_ok(buf, len))) {
 -			ret = -EFAULT;
 -			goto out;
 -		}
 -		if (len > MAX_RW_COUNT - ret) {
 -			len = MAX_RW_COUNT - ret;
 -			iov[seg].iov_len = len;
 -		}
 -		ret += len;
 -	}
 -out:
 -	*ret_pointer = iov;
 -	return ret;
 -}
 -
 -#ifdef CONFIG_COMPAT
 -ssize_t compat_rw_copy_check_uvector(int type,
 -		const struct compat_iovec __user *uvector, unsigned long nr_segs,
 -		unsigned long fast_segs, struct iovec *fast_pointer,
 -		struct iovec **ret_pointer)
 -{
 -	compat_ssize_t tot_len;
 -	struct iovec *iov = *ret_pointer = fast_pointer;
 -	ssize_t ret = 0;
 -	int seg;
 -
 -	/*
 -	 * SuS says "The readv() function *may* fail if the iovcnt argument
 -	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 -	 * traditionally returned zero for zero segments, so...
 -	 */
 -	if (nr_segs == 0)
 -		goto out;
 -
 -	ret = -EINVAL;
 -	if (nr_segs > UIO_MAXIOV)
 -		goto out;
 -	if (nr_segs > fast_segs) {
 -		ret = -ENOMEM;
 -		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
 -		if (iov == NULL)
 -			goto out;
 -	}
 -	*ret_pointer = iov;
 -
 -	ret = -EFAULT;
 -	if (!access_ok(uvector, nr_segs*sizeof(*uvector)))
 -		goto out;
 -
 -	/*
 -	 * Single unix specification:
 -	 * We should -EINVAL if an element length is not >= 0 and fitting an
 -	 * ssize_t.
 -	 *
 -	 * In Linux, the total length is limited to MAX_RW_COUNT, there is
 -	 * no overflow possibility.
 -	 */
 -	tot_len = 0;
 -	ret = -EINVAL;
 -	for (seg = 0; seg < nr_segs; seg++) {
 -		compat_uptr_t buf;
 -		compat_ssize_t len;
 -
 -		if (__get_user(len, &uvector->iov_len) ||
 -		   __get_user(buf, &uvector->iov_base)) {
 -			ret = -EFAULT;
 -			goto out;
 -		}
 -		if (len < 0)	/* size_t not fitting in compat_ssize_t .. */
 -			goto out;
 -		if (type >= 0 &&
 -		    !access_ok(compat_ptr(buf), len)) {
 -			ret = -EFAULT;
 -			goto out;
 -		}
 -		if (len > MAX_RW_COUNT - tot_len)
 -			len = MAX_RW_COUNT - tot_len;
 -		tot_len += len;
 -		iov->iov_base = compat_ptr(buf);
 -		iov->iov_len = (compat_size_t) len;
 -		uvector++;
 -		iov++;
 -	}
 -	ret = tot_len;
 -
 -out:
 -	return ret;
 -}
 -#endif
 -
  static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
  		loff_t *pos, rwf_t flags)
  {
@@@ -908,7 -1068,7 +908,7 @@@ ssize_t vfs_iter_write(struct file *fil
  }
  EXPORT_SYMBOL(vfs_iter_write);
  
 -ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 +static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
  		  unsigned long vlen, loff_t *pos, rwf_t flags)
  {
  	struct iovec iovstack[UIO_FASTIOV];
@@@ -1095,93 -1255,224 +1095,93 @@@ SYSCALL_DEFINE6(pwritev2, unsigned long
  	return do_pwritev(fd, vec, vlen, pos, flags);
  }
  
 +/*
 + * Various compat syscalls.  Note that they all pretend to take a native
 + * iovec - import_iovec will properly treat those as compat_iovecs based on
 + * in_compat_syscall().
 + */
  #ifdef CONFIG_COMPAT
 -static size_t compat_readv(struct file *file,
 -			   const struct compat_iovec __user *vec,
 -			   unsigned long vlen, loff_t *pos, rwf_t flags)
 -{
 -	struct iovec iovstack[UIO_FASTIOV];
 -	struct iovec *iov = iovstack;
 -	struct iov_iter iter;
 -	ssize_t ret;
 -
 -	ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter);
 -	if (ret >= 0) {
 -		ret = do_iter_read(file, &iter, pos, flags);
 -		kfree(iov);
 -	}
 -	if (ret > 0)
 -		add_rchar(current, ret);
 -	inc_syscr(current);
 -	return ret;
 -}
 -
 -static size_t do_compat_readv(compat_ulong_t fd,
 -				 const struct compat_iovec __user *vec,
 -				 compat_ulong_t vlen, rwf_t flags)
 -{
 -	struct fd f = fdget_pos(fd);
 -	ssize_t ret;
 -	loff_t pos;
 -
 -	if (!f.file)
 -		return -EBADF;
 -	pos = f.file->f_pos;
 -	ret = compat_readv(f.file, vec, vlen, &pos, flags);
 -	if (ret >= 0)
 -		f.file->f_pos = pos;
 -	fdput_pos(f);
 -	return ret;
 -
 -}
 -
 -COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
 -		const struct compat_iovec __user *,vec,
 -		compat_ulong_t, vlen)
 -{
 -	return do_compat_readv(fd, vec, vlen, 0);
 -}
 -
 -static long do_compat_preadv64(unsigned long fd,
 -				  const struct compat_iovec __user *vec,
 -				  unsigned long vlen, loff_t pos, rwf_t flags)
 -{
 -	struct fd f;
 -	ssize_t ret;
 -
 -	if (pos < 0)
 -		return -EINVAL;
 -	f = fdget(fd);
 -	if (!f.file)
 -		return -EBADF;
 -	ret = -ESPIPE;
 -	if (f.file->f_mode & FMODE_PREAD)
 -		ret = compat_readv(f.file, vec, vlen, &pos, flags);
 -	fdput(f);
 -	return ret;
 -}
 -
  #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
  COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
 -		const struct compat_iovec __user *,vec,
 +		const struct iovec __user *, vec,
  		unsigned long, vlen, loff_t, pos)
  {
 -	return do_compat_preadv64(fd, vec, vlen, pos, 0);
 +	return do_preadv(fd, vec, vlen, pos, 0);
  }
  #endif
  
  COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
 -		const struct compat_iovec __user *,vec,
 +		const struct iovec __user *, vec,
  		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
  {
  	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
  
 -	return do_compat_preadv64(fd, vec, vlen, pos, 0);
 +	return do_preadv(fd, vec, vlen, pos, 0);
  }
  
  #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
  COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
 -		const struct compat_iovec __user *,vec,
 +		const struct iovec __user *, vec,
  		unsigned long, vlen, loff_t, pos, rwf_t, flags)
  {
  	if (pos == -1)
 -		return do_compat_readv(fd, vec, vlen, flags);
 -
 -	return do_compat_preadv64(fd, vec, vlen, pos, flags);
 +		return do_readv(fd, vec, vlen, flags);
 +	return do_preadv(fd, vec, vlen, pos, flags);
  }
  #endif
  
  COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
 -		const struct compat_iovec __user *,vec,
 +		const struct iovec __user *, vec,
  		compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
  		rwf_t, flags)
  {
  	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
  
  	if (pos == -1)
 -		return do_compat_readv(fd, vec, vlen, flags);
 -
 -	return do_compat_preadv64(fd, vec, vlen, pos, flags);
 -}
 -
 -static size_t compat_writev(struct file *file,
 -			    const struct compat_iovec __user *vec,
 -			    unsigned long vlen, loff_t *pos, rwf_t flags)
 -{
 -	struct iovec iovstack[UIO_FASTIOV];
 -	struct iovec *iov = iovstack;
 -	struct iov_iter iter;
 -	ssize_t ret;
 -
 -	ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter);
 -	if (ret >= 0) {
 -		file_start_write(file);
 -		ret = do_iter_write(file, &iter, pos, flags);
 -		file_end_write(file);
 -		kfree(iov);
 -	}
 -	if (ret > 0)
 -		add_wchar(current, ret);
 -	inc_syscw(current);
 -	return ret;
 -}
 -
 -static size_t do_compat_writev(compat_ulong_t fd,
 -				  const struct compat_iovec __user* vec,
 -				  compat_ulong_t vlen, rwf_t flags)
 -{
 -	struct fd f = fdget_pos(fd);
 -	ssize_t ret;
 -	loff_t pos;
 -
 -	if (!f.file)
 -		return -EBADF;
 -	pos = f.file->f_pos;
 -	ret = compat_writev(f.file, vec, vlen, &pos, flags);
 -	if (ret >= 0)
 -		f.file->f_pos = pos;
 -	fdput_pos(f);
 -	return ret;
 -}
 -
 -COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
 -		const struct compat_iovec __user *, vec,
 -		compat_ulong_t, vlen)
 -{
 -	return do_compat_writev(fd, vec, vlen, 0);
 -}
 -
 -static long do_compat_pwritev64(unsigned long fd,
 -				   const struct compat_iovec __user *vec,
 -				   unsigned long vlen, loff_t pos, rwf_t flags)
 -{
 -	struct fd f;
 -	ssize_t ret;
 -
 -	if (pos < 0)
 -		return -EINVAL;
 -	f = fdget(fd);
 -	if (!f.file)
 -		return -EBADF;
 -	ret = -ESPIPE;
 -	if (f.file->f_mode & FMODE_PWRITE)
 -		ret = compat_writev(f.file, vec, vlen, &pos, flags);
 -	fdput(f);
 -	return ret;
 +		return do_readv(fd, vec, vlen, flags);
 +	return do_preadv(fd, vec, vlen, pos, flags);
  }
  
  #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
  COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
 -		const struct compat_iovec __user *,vec,
 +		const struct iovec __user *, vec,
  		unsigned long, vlen, loff_t, pos)
  {
 -	return do_compat_pwritev64(fd, vec, vlen, pos, 0);
 +	return do_pwritev(fd, vec, vlen, pos, 0);
  }
  #endif
  
  COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
 -		const struct compat_iovec __user *,vec,
 +		const struct iovec __user *,vec,
  		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
  {
  	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
  
 -	return do_compat_pwritev64(fd, vec, vlen, pos, 0);
 +	return do_pwritev(fd, vec, vlen, pos, 0);
  }
  
  #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
  COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
 -		const struct compat_iovec __user *,vec,
 +		const struct iovec __user *, vec,
  		unsigned long, vlen, loff_t, pos, rwf_t, flags)
  {
  	if (pos == -1)
 -		return do_compat_writev(fd, vec, vlen, flags);
 -
 -	return do_compat_pwritev64(fd, vec, vlen, pos, flags);
 +		return do_writev(fd, vec, vlen, flags);
 +	return do_pwritev(fd, vec, vlen, pos, flags);
  }
  #endif
  
  COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
 -		const struct compat_iovec __user *,vec,
 +		const struct iovec __user *,vec,
  		compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
  {
  	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
  
  	if (pos == -1)
 -		return do_compat_writev(fd, vec, vlen, flags);
 -
 -	return do_compat_pwritev64(fd, vec, vlen, pos, flags);
 +		return do_writev(fd, vec, vlen, flags);
 +	return do_pwritev(fd, vec, vlen, pos, flags);
  }
 -
 -#endif
 +#endif /* CONFIG_COMPAT */
  
  static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
  		  	   size_t count, loff_t max)
@@@ -1410,6 -1701,59 +1410,59 @@@ static ssize_t do_copy_file_range(struc
  				       flags);
  }
  
+ /*
+  * Performs necessary checks before doing a file copy
+  *
+  * Can adjust amount of bytes to copy via @req_count argument.
+  * Returns appropriate error code that caller should return or
+  * zero in case the copy should be allowed.
+  */
+ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
+ 				    struct file *file_out, loff_t pos_out,
+ 				    size_t *req_count, unsigned int flags)
+ {
+ 	struct inode *inode_in = file_inode(file_in);
+ 	struct inode *inode_out = file_inode(file_out);
+ 	uint64_t count = *req_count;
+ 	loff_t size_in;
+ 	int ret;
+ 
+ 	ret = generic_file_rw_checks(file_in, file_out);
+ 	if (ret)
+ 		return ret;
+ 
+ 	/* Don't touch certain kinds of inodes */
+ 	if (IS_IMMUTABLE(inode_out))
+ 		return -EPERM;
+ 
+ 	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+ 		return -ETXTBSY;
+ 
+ 	/* Ensure offsets don't wrap. */
+ 	if (pos_in + count < pos_in || pos_out + count < pos_out)
+ 		return -EOVERFLOW;
+ 
+ 	/* Shorten the copy to EOF */
+ 	size_in = i_size_read(inode_in);
+ 	if (pos_in >= size_in)
+ 		count = 0;
+ 	else
+ 		count = min(count, size_in - (uint64_t)pos_in);
+ 
+ 	ret = generic_write_check_limits(file_out, pos_out, &count);
+ 	if (ret)
+ 		return ret;
+ 
+ 	/* Don't allow overlapped copying within the same file. */
+ 	if (inode_in == inode_out &&
+ 	    pos_out + count > pos_in &&
+ 	    pos_out < pos_in + count)
+ 		return -EINVAL;
+ 
+ 	*req_count = count;
+ 	return 0;
+ }
+ 
  /*
   * copy_file_range() differs from regular file read and write in that it
   * specifically allows return partial success.  When it does so is up to
@@@ -1542,475 -1886,92 +1595,92 @@@ out2
  	return ret;
  }
  
- static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
- 			     bool write)
- {
- 	struct inode *inode = file_inode(file);
- 
- 	if (unlikely(pos < 0 || len < 0))
- 		return -EINVAL;
- 
- 	 if (unlikely((loff_t) (pos + len) < 0))
- 		return -EINVAL;
- 
- 	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
- 		loff_t end = len ? pos + len - 1 : OFFSET_MAX;
- 		int retval;
- 
- 		retval = locks_mandatory_area(inode, file, pos, end,
- 				write ? F_WRLCK : F_RDLCK);
- 		if (retval < 0)
- 			return retval;
- 	}
- 
- 	return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
- }
- /*
-  * Ensure that we don't remap a partial EOF block in the middle of something
-  * else.  Assume that the offsets have already been checked for block
-  * alignment.
-  *
-  * For clone we only link a partial EOF block above or at the destination file's
-  * EOF.  For deduplication we accept a partial EOF block only if it ends at the
-  * destination file's EOF (can not link it into the middle of a file).
-  *
-  * Shorten the request if possible.
-  */
- static int generic_remap_check_len(struct inode *inode_in,
- 				   struct inode *inode_out,
- 				   loff_t pos_out,
- 				   loff_t *len,
- 				   unsigned int remap_flags)
- {
- 	u64 blkmask = i_blocksize(inode_in) - 1;
- 	loff_t new_len = *len;
- 
- 	if ((*len & blkmask) == 0)
- 		return 0;
- 
- 	if (pos_out + *len < i_size_read(inode_out))
- 		new_len &= ~blkmask;
- 
- 	if (new_len == *len)
- 		return 0;
- 
- 	if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
- 		*len = new_len;
- 		return 0;
- 	}
- 
- 	return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
- }
- 
- /* Read a page's worth of file data into the page cache. */
- static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
- {
- 	struct page *page;
- 
- 	page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
- 	if (IS_ERR(page))
- 		return page;
- 	if (!PageUptodate(page)) {
- 		put_page(page);
- 		return ERR_PTR(-EIO);
- 	}
- 	return page;
- }
- 
  /*
-  * Lock two pages, ensuring that we lock in offset order if the pages are from
-  * the same file.
+  * Don't operate on ranges the page cache doesn't support, and don't exceed the
+  * LFS limits.  If pos is under the limit it becomes a short access.  If it
+  * exceeds the limit we return -EFBIG.
   */
- static void vfs_lock_two_pages(struct page *page1, struct page *page2)
- {
- 	/* Always lock in order of increasing index. */
- 	if (page1->index > page2->index)
- 		swap(page1, page2);
- 
- 	lock_page(page1);
- 	if (page1 != page2)
- 		lock_page(page2);
- }
- 
- /* Unlock two pages, being careful not to unlock the same page twice. */
- static void vfs_unlock_two_pages(struct page *page1, struct page *page2)
+ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
  {
- 	unlock_page(page1);
- 	if (page1 != page2)
- 		unlock_page(page2);
- }
- 
- /*
-  * Compare extents of two files to see if they are the same.
-  * Caller must have locked both inodes to prevent write races.
-  */
- static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
- 					 struct inode *dest, loff_t destoff,
- 					 loff_t len, bool *is_same)
- {
- 	loff_t src_poff;
- 	loff_t dest_poff;
- 	void *src_addr;
- 	void *dest_addr;
- 	struct page *src_page;
- 	struct page *dest_page;
- 	loff_t cmp_len;
- 	bool same;
- 	int error;
- 
- 	error = -EINVAL;
- 	same = true;
- 	while (len) {
- 		src_poff = srcoff & (PAGE_SIZE - 1);
- 		dest_poff = destoff & (PAGE_SIZE - 1);
- 		cmp_len = min(PAGE_SIZE - src_poff,
- 			      PAGE_SIZE - dest_poff);
- 		cmp_len = min(cmp_len, len);
- 		if (cmp_len <= 0)
- 			goto out_error;
- 
- 		src_page = vfs_dedupe_get_page(src, srcoff);
- 		if (IS_ERR(src_page)) {
- 			error = PTR_ERR(src_page);
- 			goto out_error;
- 		}
- 		dest_page = vfs_dedupe_get_page(dest, destoff);
- 		if (IS_ERR(dest_page)) {
- 			error = PTR_ERR(dest_page);
- 			put_page(src_page);
- 			goto out_error;
- 		}
- 
- 		vfs_lock_two_pages(src_page, dest_page);
+ 	struct inode *inode = file->f_mapping->host;
+ 	loff_t max_size = inode->i_sb->s_maxbytes;
+ 	loff_t limit = rlimit(RLIMIT_FSIZE);
  
- 		/*
- 		 * Now that we've locked both pages, make sure they're still
- 		 * mapped to the file data we're interested in.  If not,
- 		 * someone is invalidating pages on us and we lose.
- 		 */
- 		if (!PageUptodate(src_page) || !PageUptodate(dest_page) ||
- 		    src_page->mapping != src->i_mapping ||
- 		    dest_page->mapping != dest->i_mapping) {
- 			same = false;
- 			goto unlock;
+ 	if (limit != RLIM_INFINITY) {
+ 		if (pos >= limit) {
+ 			send_sig(SIGXFSZ, current, 0);
+ 			return -EFBIG;
  		}
+ 		*count = min(*count, limit - pos);
+ 	}
  
- 		src_addr = kmap_atomic(src_page);
- 		dest_addr = kmap_atomic(dest_page);
- 
- 		flush_dcache_page(src_page);
- 		flush_dcache_page(dest_page);
- 
- 		if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
- 			same = false;
+ 	if (!(file->f_flags & O_LARGEFILE))
+ 		max_size = MAX_NON_LFS;
  
- 		kunmap_atomic(dest_addr);
- 		kunmap_atomic(src_addr);
- unlock:
- 		vfs_unlock_two_pages(src_page, dest_page);
- 		put_page(dest_page);
- 		put_page(src_page);
+ 	if (unlikely(pos >= max_size))
+ 		return -EFBIG;
  
- 		if (!same)
- 			break;
+ 	*count = min(*count, max_size - pos);
  
- 		srcoff += cmp_len;
- 		destoff += cmp_len;
- 		len -= cmp_len;
- 	}
- 
- 	*is_same = same;
  	return 0;
- 
- out_error:
- 	return error;
  }
  
  /*
-  * Check that the two inodes are eligible for cloning, the ranges make
-  * sense, and then flush all dirty data.  Caller must ensure that the
-  * inodes have been locked against any other modifications.
+  * Performs necessary checks before doing a write
   *
-  * If there's an error, then the usual negative error code is returned.
-  * Otherwise returns 0 with *len set to the request length.
+  * Can adjust writing position or amount of bytes to write.
+  * Returns appropriate error code that caller should return or
+  * zero in case that write should be allowed.
   */
- int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- 				  struct file *file_out, loff_t pos_out,
- 				  loff_t *len, unsigned int remap_flags)
+ ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
  {
- 	struct inode *inode_in = file_inode(file_in);
- 	struct inode *inode_out = file_inode(file_out);
- 	bool same_inode = (inode_in == inode_out);
+ 	struct file *file = iocb->ki_filp;
+ 	struct inode *inode = file->f_mapping->host;
+ 	loff_t count;
  	int ret;
  
- 	/* Don't touch certain kinds of inodes */
- 	if (IS_IMMUTABLE(inode_out))
- 		return -EPERM;
- 
- 	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+ 	if (IS_SWAPFILE(inode))
  		return -ETXTBSY;
  
- 	/* Don't reflink dirs, pipes, sockets... */
- 	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
- 		return -EISDIR;
- 	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
- 		return -EINVAL;
- 
- 	/* Zero length dedupe exits immediately; reflink goes to EOF. */
- 	if (*len == 0) {
- 		loff_t isize = i_size_read(inode_in);
- 
- 		if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
- 			return 0;
- 		if (pos_in > isize)
- 			return -EINVAL;
- 		*len = isize - pos_in;
- 		if (*len == 0)
- 			return 0;
- 	}
- 
- 	/* Check that we don't violate system file offset limits. */
- 	ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
- 			remap_flags);
- 	if (ret)
- 		return ret;
- 
- 	/* Wait for the completion of any pending IOs on both files */
- 	inode_dio_wait(inode_in);
- 	if (!same_inode)
- 		inode_dio_wait(inode_out);
- 
- 	ret = filemap_write_and_wait_range(inode_in->i_mapping,
- 			pos_in, pos_in + *len - 1);
- 	if (ret)
- 		return ret;
- 
- 	ret = filemap_write_and_wait_range(inode_out->i_mapping,
- 			pos_out, pos_out + *len - 1);
- 	if (ret)
- 		return ret;
- 
- 	/*
- 	 * Check that the extents are the same.
- 	 */
- 	if (remap_flags & REMAP_FILE_DEDUP) {
- 		bool		is_same = false;
- 
- 		ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
- 				inode_out, pos_out, *len, &is_same);
- 		if (ret)
- 			return ret;
- 		if (!is_same)
- 			return -EBADE;
- 	}
- 
- 	ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
- 			remap_flags);
- 	if (ret)
- 		return ret;
- 
- 	/* If can't alter the file contents, we're done. */
- 	if (!(remap_flags & REMAP_FILE_DEDUP))
- 		ret = file_modified(file_out);
- 
- 	return ret;
- }
- EXPORT_SYMBOL(generic_remap_file_range_prep);
- 
- loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
- 			   struct file *file_out, loff_t pos_out,
- 			   loff_t len, unsigned int remap_flags)
- {
- 	loff_t ret;
- 
- 	WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
- 
- 	/*
- 	 * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
- 	 * the same mount. Practically, they only need to be on the same file
- 	 * system.
- 	 */
- 	if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
- 		return -EXDEV;
- 
- 	ret = generic_file_rw_checks(file_in, file_out);
- 	if (ret < 0)
- 		return ret;
- 
- 	if (!file_in->f_op->remap_file_range)
- 		return -EOPNOTSUPP;
- 
- 	ret = remap_verify_area(file_in, pos_in, len, false);
- 	if (ret)
- 		return ret;
- 
- 	ret = remap_verify_area(file_out, pos_out, len, true);
- 	if (ret)
- 		return ret;
- 
- 	ret = file_in->f_op->remap_file_range(file_in, pos_in,
- 			file_out, pos_out, len, remap_flags);
- 	if (ret < 0)
- 		return ret;
- 
- 	fsnotify_access(file_in);
- 	fsnotify_modify(file_out);
- 	return ret;
- }
- EXPORT_SYMBOL(do_clone_file_range);
- 
- loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
- 			    struct file *file_out, loff_t pos_out,
- 			    loff_t len, unsigned int remap_flags)
- {
- 	loff_t ret;
- 
- 	file_start_write(file_out);
- 	ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
- 				  remap_flags);
- 	file_end_write(file_out);
- 
- 	return ret;
- }
- EXPORT_SYMBOL(vfs_clone_file_range);
- 
- /* Check whether we are allowed to dedupe the destination file */
- static bool allow_file_dedupe(struct file *file)
- {
- 	if (capable(CAP_SYS_ADMIN))
- 		return true;
- 	if (file->f_mode & FMODE_WRITE)
- 		return true;
- 	if (uid_eq(current_fsuid(), file_inode(file)->i_uid))
- 		return true;
- 	if (!inode_permission(file_inode(file), MAY_WRITE))
- 		return true;
- 	return false;
- }
+ 	if (!iov_iter_count(from))
+ 		return 0;
  
- loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
- 				 struct file *dst_file, loff_t dst_pos,
- 				 loff_t len, unsigned int remap_flags)
- {
- 	loff_t ret;
+ 	/* FIXME: this is for backwards compatibility with 2.4 */
+ 	if (iocb->ki_flags & IOCB_APPEND)
+ 		iocb->ki_pos = i_size_read(inode);
  
- 	WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
- 				     REMAP_FILE_CAN_SHORTEN));
+ 	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+ 		return -EINVAL;
  
- 	ret = mnt_want_write_file(dst_file);
+ 	count = iov_iter_count(from);
+ 	ret = generic_write_check_limits(file, iocb->ki_pos, &count);
  	if (ret)
  		return ret;
  
- 	ret = remap_verify_area(dst_file, dst_pos, len, true);
- 	if (ret < 0)
- 		goto out_drop_write;
- 
- 	ret = -EPERM;
- 	if (!allow_file_dedupe(dst_file))
- 		goto out_drop_write;
- 
- 	ret = -EXDEV;
- 	if (src_file->f_path.mnt != dst_file->f_path.mnt)
- 		goto out_drop_write;
- 
- 	ret = -EISDIR;
- 	if (S_ISDIR(file_inode(dst_file)->i_mode))
- 		goto out_drop_write;
- 
- 	ret = -EINVAL;
- 	if (!dst_file->f_op->remap_file_range)
- 		goto out_drop_write;
- 
- 	if (len == 0) {
- 		ret = 0;
- 		goto out_drop_write;
- 	}
- 
- 	ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
- 			dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
- out_drop_write:
- 	mnt_drop_write_file(dst_file);
- 
- 	return ret;
+ 	iov_iter_truncate(from, count);
+ 	return iov_iter_count(from);
  }
- EXPORT_SYMBOL(vfs_dedupe_file_range_one);
+ EXPORT_SYMBOL(generic_write_checks);
  
- int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
+ /*
+  * Performs common checks before doing a file copy/clone
+  * from @file_in to @file_out.
+  */
+ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
  {
- 	struct file_dedupe_range_info *info;
- 	struct inode *src = file_inode(file);
- 	u64 off;
- 	u64 len;
- 	int i;
- 	int ret;
- 	u16 count = same->dest_count;
- 	loff_t deduped;
- 
- 	if (!(file->f_mode & FMODE_READ))
- 		return -EINVAL;
- 
- 	if (same->reserved1 || same->reserved2)
- 		return -EINVAL;
- 
- 	off = same->src_offset;
- 	len = same->src_length;
+ 	struct inode *inode_in = file_inode(file_in);
+ 	struct inode *inode_out = file_inode(file_out);
  
- 	if (S_ISDIR(src->i_mode))
+ 	/* Don't copy dirs, pipes, sockets... */
+ 	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
  		return -EISDIR;
- 
- 	if (!S_ISREG(src->i_mode))
- 		return -EINVAL;
- 
- 	if (!file->f_op->remap_file_range)
- 		return -EOPNOTSUPP;
- 
- 	ret = remap_verify_area(file, off, len, false);
- 	if (ret < 0)
- 		return ret;
- 	ret = 0;
- 
- 	if (off + len > i_size_read(src))
+ 	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
  		return -EINVAL;
  
- 	/* Arbitrary 1G limit on a single dedupe request, can be raised. */
- 	len = min_t(u64, len, 1 << 30);
- 
- 	/* pre-format output fields to sane values */
- 	for (i = 0; i < count; i++) {
- 		same->info[i].bytes_deduped = 0ULL;
- 		same->info[i].status = FILE_DEDUPE_RANGE_SAME;
- 	}
- 
- 	for (i = 0, info = same->info; i < count; i++, info++) {
- 		struct fd dst_fd = fdget(info->dest_fd);
- 		struct file *dst_file = dst_fd.file;
- 
- 		if (!dst_file) {
- 			info->status = -EBADF;
- 			goto next_loop;
- 		}
- 
- 		if (info->reserved) {
- 			info->status = -EINVAL;
- 			goto next_fdput;
- 		}
- 
- 		deduped = vfs_dedupe_file_range_one(file, off, dst_file,
- 						    info->dest_offset, len,
- 						    REMAP_FILE_CAN_SHORTEN);
- 		if (deduped == -EBADE)
- 			info->status = FILE_DEDUPE_RANGE_DIFFERS;
- 		else if (deduped < 0)
- 			info->status = deduped;
- 		else
- 			info->bytes_deduped = len;
+ 	if (!(file_in->f_mode & FMODE_READ) ||
+ 	    !(file_out->f_mode & FMODE_WRITE) ||
+ 	    (file_out->f_flags & O_APPEND))
+ 		return -EBADF;
  
- next_fdput:
- 		fdput(dst_fd);
- next_loop:
- 		if (fatal_signal_pending(current))
- 			break;
- 	}
- 	return ret;
+ 	return 0;
  }
- EXPORT_SYMBOL(vfs_dedupe_file_range);
diff --combined include/linux/fs.h
index 83817d24e902,8fb063ab7d50..16e3789634d3
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@@ -178,6 -178,14 +178,6 @@@ typedef int (dio_iodone_t)(struct kioc
  /* File supports async buffered reads */
  #define FMODE_BUF_RASYNC	((__force fmode_t)0x40000000)
  
 -/*
 - * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
 - * that indicates that they should check the contents of the iovec are
 - * valid, but not check the memory that the iovec elements
 - * points too.
 - */
 -#define CHECK_IOVEC_ONLY -1
 -
  /*
   * Attribute flags.  These should be or-ed together to figure out what
   * has been changed!
@@@ -302,20 -310,17 +302,20 @@@ enum rw_hint 
  	WRITE_LIFE_EXTREME	= RWH_WRITE_LIFE_EXTREME,
  };
  
 -#define IOCB_EVENTFD		(1 << 0)
 -#define IOCB_APPEND		(1 << 1)
 -#define IOCB_DIRECT		(1 << 2)
 -#define IOCB_HIPRI		(1 << 3)
 -#define IOCB_DSYNC		(1 << 4)
 -#define IOCB_SYNC		(1 << 5)
 -#define IOCB_WRITE		(1 << 6)
 -#define IOCB_NOWAIT		(1 << 7)
 +/* Match RWF_* bits to IOCB bits */
 +#define IOCB_HIPRI		(__force int) RWF_HIPRI
 +#define IOCB_DSYNC		(__force int) RWF_DSYNC
 +#define IOCB_SYNC		(__force int) RWF_SYNC
 +#define IOCB_NOWAIT		(__force int) RWF_NOWAIT
 +#define IOCB_APPEND		(__force int) RWF_APPEND
 +
 +/* non-RWF related bits - start at 16 */
 +#define IOCB_EVENTFD		(1 << 16)
 +#define IOCB_DIRECT		(1 << 17)
 +#define IOCB_WRITE		(1 << 18)
  /* iocb->ki_waitq is valid */
 -#define IOCB_WAITQ		(1 << 8)
 -#define IOCB_NOIO		(1 << 9)
 +#define IOCB_WAITQ		(1 << 19)
 +#define IOCB_NOIO		(1 << 20)
  
  struct kiocb {
  	struct file		*ki_filp;
@@@ -1366,12 -1371,6 +1366,12 @@@ extern int send_sigurg(struct fown_stru
  #define SB_ACTIVE	(1<<30)
  #define SB_NOUSER	(1<<31)
  
 +/* These flags relate to encoding and casefolding */
 +#define SB_ENC_STRICT_MODE_FL	(1 << 0)
 +
 +#define sb_has_strict_encoding(sb) \
 +	(sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL)
 +
  /*
   *	Umount options
   */
@@@ -1386,7 -1385,7 +1386,7 @@@
  #define SB_I_CGROUPWB	0x00000001	/* cgroup-aware writeback enabled */
  #define SB_I_NOEXEC	0x00000002	/* Ignore executables on this fs */
  #define SB_I_NODEV	0x00000004	/* Ignore devices on this fs */
 -#define SB_I_MULTIROOT	0x00000008	/* Multiple roots to the dentry tree */
 +#define SB_I_STABLE_WRITES 0x00000008	/* don't modify blks until WB is done */
  
  /* sb->s_iflags to limit user namespace mounts */
  #define SB_I_USERNS_VISIBLE		0x00000010 /* fstype already mounted */
@@@ -1441,10 -1440,6 +1441,10 @@@ struct super_block 
  #endif
  #ifdef CONFIG_FS_VERITY
  	const struct fsverity_operations *s_vop;
 +#endif
 +#ifdef CONFIG_UNICODE
 +	struct unicode_map *s_encoding;
 +	__u16 s_encoding_flags;
  #endif
  	struct hlist_bl_head	s_roots;	/* alternate root dentries for NFS */
  	struct list_head	s_mounts;	/* list of mounts; _not_ for fs use */
@@@ -1892,8 -1887,15 +1892,8 @@@ static inline int call_mmap(struct fil
  	return file->f_op->mmap(file, vma);
  }
  
 -ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 -			      unsigned long nr_segs, unsigned long fast_segs,
 -			      struct iovec *fast_pointer,
 -			      struct iovec **ret_pointer);
 -
  extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
  extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
 -extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
 -		unsigned long, loff_t *, rwf_t);
  extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
  				   loff_t, size_t, unsigned int);
  extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
@@@ -2217,7 -2219,6 +2217,7 @@@ struct file_system_type 
  #define FS_HAS_SUBTYPE		4
  #define FS_USERNS_MOUNT		8	/* Can be mounted by userns root */
  #define FS_DISALLOW_NOTIFY_PERM	16	/* Disable fanotify permission events */
 +#define FS_THP_SUPPORT		8192	/* Remove once all fs converted */
  #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
  	int (*init_fs_context)(struct fs_context *);
  	const struct fs_parameter_spec *parameters;
@@@ -2590,10 -2591,6 +2590,10 @@@ extern bool is_bad_inode(struct inode *
  unsigned long invalidate_mapping_pages(struct address_space *mapping,
  					pgoff_t start, pgoff_t end);
  
 +void invalidate_mapping_pagevec(struct address_space *mapping,
 +				pgoff_t start, pgoff_t end,
 +				unsigned long *nr_pagevec);
 +
  static inline void invalidate_remote_inode(struct inode *inode)
  {
  	if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
@@@ -2705,6 -2702,33 +2705,6 @@@ static inline errseq_t file_sample_sb_e
  	return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err);
  }
  
 -static inline int filemap_nr_thps(struct address_space *mapping)
 -{
 -#ifdef CONFIG_READ_ONLY_THP_FOR_FS
 -	return atomic_read(&mapping->nr_thps);
 -#else
 -	return 0;
 -#endif
 -}
 -
 -static inline void filemap_nr_thps_inc(struct address_space *mapping)
 -{
 -#ifdef CONFIG_READ_ONLY_THP_FOR_FS
 -	atomic_inc(&mapping->nr_thps);
 -#else
 -	WARN_ON_ONCE(1);
 -#endif
 -}
 -
 -static inline void filemap_nr_thps_dec(struct address_space *mapping)
 -{
 -#ifdef CONFIG_READ_ONLY_THP_FOR_FS
 -	atomic_dec(&mapping->nr_thps);
 -#else
 -	WARN_ON_ONCE(1);
 -#endif
 -}
 -
  extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
  			   int datasync);
  extern int vfs_fsync(struct file *file, int datasync);
@@@ -2834,6 -2858,45 +2834,6 @@@ static inline void i_readcount_inc(stru
  #endif
  extern int do_pipe_flags(int *, int);
  
 -#define __kernel_read_file_id(id) \
 -	id(UNKNOWN, unknown)		\
 -	id(FIRMWARE, firmware)		\
 -	id(FIRMWARE_PREALLOC_BUFFER, firmware)	\
 -	id(FIRMWARE_EFI_EMBEDDED, firmware)	\
 -	id(MODULE, kernel-module)		\
 -	id(KEXEC_IMAGE, kexec-image)		\
 -	id(KEXEC_INITRAMFS, kexec-initramfs)	\
 -	id(POLICY, security-policy)		\
 -	id(X509_CERTIFICATE, x509-certificate)	\
 -	id(MAX_ID, )
 -
 -#define __fid_enumify(ENUM, dummy) READING_ ## ENUM,
 -#define __fid_stringify(dummy, str) #str,
 -
 -enum kernel_read_file_id {
 -	__kernel_read_file_id(__fid_enumify)
 -};
 -
 -static const char * const kernel_read_file_str[] = {
 -	__kernel_read_file_id(__fid_stringify)
 -};
 -
 -static inline const char *kernel_read_file_id_str(enum kernel_read_file_id id)
 -{
 -	if ((unsigned)id >= READING_MAX_ID)
 -		return kernel_read_file_str[READING_UNKNOWN];
 -
 -	return kernel_read_file_str[id];
 -}
 -
 -extern int kernel_read_file(struct file *, void **, loff_t *, loff_t,
 -			    enum kernel_read_file_id);
 -extern int kernel_read_file_from_path(const char *, void **, loff_t *, loff_t,
 -				      enum kernel_read_file_id);
 -extern int kernel_read_file_from_path_initns(const char *, void **, loff_t *, loff_t,
 -					     enum kernel_read_file_id);
 -extern int kernel_read_file_from_fd(int, void **, loff_t *, loff_t,
 -				    enum kernel_read_file_id);
  extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *);
  ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos);
  extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *);
@@@ -2946,13 -3009,9 +2946,9 @@@ extern int sb_min_blocksize(struct supe
  extern int generic_file_mmap(struct file *, struct vm_area_struct *);
  extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
  extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
- extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
- 				struct file *file_out, loff_t pos_out,
- 				loff_t *count, unsigned int remap_flags);
+ extern int generic_write_check_limits(struct file *file, loff_t pos,
+ 		loff_t *count);
  extern int generic_file_rw_checks(struct file *file_in, struct file *file_out);
- extern int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
- 				    struct file *file_out, loff_t pos_out,
- 				    size_t *count, unsigned int flags);
  extern ssize_t generic_file_buffered_read(struct kiocb *iocb,
  		struct iov_iter *to, ssize_t already_read);
  extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
@@@ -3016,6 -3075,8 +3012,6 @@@ enum 
  	DIO_SKIP_HOLES	= 0x02,
  };
  
 -void dio_end_io(struct bio *bio);
 -
  ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
  			     struct block_device *bdev, struct iov_iter *iter,
  			     get_block_t get_block,
@@@ -3197,12 -3258,6 +3193,12 @@@ extern int generic_file_fsync(struct fi
  
  extern int generic_check_addressable(unsigned, u64);
  
 +#ifdef CONFIG_UNICODE
 +extern int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str);
 +extern int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
 +				const char *str, const struct qstr *name);
 +#endif
 +
  #ifdef CONFIG_MIGRATION
  extern int buffer_migrate_page(struct address_space *,
  				struct page *, struct page *,
@@@ -3258,9 -3313,6 +3254,9 @@@ static inline int kiocb_set_rw_flags(st
  {
  	int kiocb_flags = 0;
  
 +	/* make sure there's no overlap between RWF and private IOCB flags */
 +	BUILD_BUG_ON((__force int) RWF_SUPPORTED & IOCB_EVENTFD);
 +
  	if (!flags)
  		return 0;
  	if (unlikely(flags & ~RWF_SUPPORTED))
@@@ -3269,11 -3321,16 +3265,11 @@@
  	if (flags & RWF_NOWAIT) {
  		if (!(ki->ki_filp->f_mode & FMODE_NOWAIT))
  			return -EOPNOTSUPP;
 -		kiocb_flags |= IOCB_NOWAIT | IOCB_NOIO;
 +		kiocb_flags |= IOCB_NOIO;
  	}
 -	if (flags & RWF_HIPRI)
 -		kiocb_flags |= IOCB_HIPRI;
 -	if (flags & RWF_DSYNC)
 -		kiocb_flags |= IOCB_DSYNC;
 +	kiocb_flags |= (__force int) (flags & RWF_SUPPORTED);
  	if (flags & RWF_SYNC)
 -		kiocb_flags |= (IOCB_DSYNC | IOCB_SYNC);
 -	if (flags & RWF_APPEND)
 -		kiocb_flags |= IOCB_APPEND;
 +		kiocb_flags |= IOCB_DSYNC;
  
  	ki->ki_flags |= kiocb_flags;
  	return 0;
@@@ -3453,6 -3510,15 +3449,6 @@@ extern int vfs_fadvise(struct file *fil
  extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
  			   int advice);
  
 -#if defined(CONFIG_IO_URING)
 -extern struct sock *io_uring_get_socket(struct file *file);
 -#else
 -static inline struct sock *io_uring_get_socket(struct file *file)
 -{
 -	return NULL;
 -}
 -#endif
 -
  int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags,
  			     unsigned int flags);
  
diff --combined mm/filemap.c
index e4101b5bfa82,9962fd682f20..d5e7c2029d16
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@@ -249,7 -249,7 +249,7 @@@ static void page_cache_free_page(struc
  		freepage(page);
  
  	if (PageTransHuge(page) && !PageHuge(page)) {
 -		page_ref_sub(page, HPAGE_PMD_NR);
 +		page_ref_sub(page, thp_nr_pages(page));
  		VM_BUG_ON_PAGE(page_count(page) <= 0, page);
  	} else {
  		put_page(page);
@@@ -414,7 -414,7 +414,7 @@@ int __filemap_fdatawrite_range(struct a
  		.range_end = end,
  	};
  
 -	if (!mapping_cap_writeback_dirty(mapping) ||
 +	if (!mapping_can_writeback(mapping) ||
  	    !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
  		return 0;
  
@@@ -827,14 -827,15 +827,14 @@@ int replace_page_cache_page(struct pag
  }
  EXPORT_SYMBOL_GPL(replace_page_cache_page);
  
 -static int __add_to_page_cache_locked(struct page *page,
 -				      struct address_space *mapping,
 -				      pgoff_t offset, gfp_t gfp_mask,
 -				      void **shadowp)
 +noinline int __add_to_page_cache_locked(struct page *page,
 +					struct address_space *mapping,
 +					pgoff_t offset, gfp_t gfp,
 +					void **shadowp)
  {
  	XA_STATE(xas, &mapping->i_pages, offset);
  	int huge = PageHuge(page);
  	int error;
 -	void *old;
  
  	VM_BUG_ON_PAGE(!PageLocked(page), page);
  	VM_BUG_ON_PAGE(PageSwapBacked(page), page);
@@@ -845,46 -846,25 +845,46 @@@
  	page->index = offset;
  
  	if (!huge) {
 -		error = mem_cgroup_charge(page, current->mm, gfp_mask);
 +		error = mem_cgroup_charge(page, current->mm, gfp);
  		if (error)
  			goto error;
  	}
  
 +	gfp &= GFP_RECLAIM_MASK;
 +
  	do {
 +		unsigned int order = xa_get_order(xas.xa, xas.xa_index);
 +		void *entry, *old = NULL;
 +
 +		if (order > thp_order(page))
 +			xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
 +					order, gfp);
  		xas_lock_irq(&xas);
 -		old = xas_load(&xas);
 -		if (old && !xa_is_value(old))
 -			xas_set_err(&xas, -EEXIST);
 +		xas_for_each_conflict(&xas, entry) {
 +			old = entry;
 +			if (!xa_is_value(entry)) {
 +				xas_set_err(&xas, -EEXIST);
 +				goto unlock;
 +			}
 +		}
 +
 +		if (old) {
 +			if (shadowp)
 +				*shadowp = old;
 +			/* entry may have been split before we acquired lock */
 +			order = xa_get_order(xas.xa, xas.xa_index);
 +			if (order > thp_order(page)) {
 +				xas_split(&xas, old, order);
 +				xas_reset(&xas);
 +			}
 +		}
 +
  		xas_store(&xas, page);
  		if (xas_error(&xas))
  			goto unlock;
  
 -		if (xa_is_value(old)) {
 +		if (old)
  			mapping->nrexceptional--;
 -			if (shadowp)
 -				*shadowp = old;
 -		}
  		mapping->nrpages++;
  
  		/* hugetlb pages do not participate in page cache accounting */
@@@ -892,7 -872,7 +892,7 @@@
  			__inc_lruvec_page_state(page, NR_FILE_PAGES);
  unlock:
  		xas_unlock_irq(&xas);
 -	} while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
 +	} while (xas_nomem(&xas, gfp));
  
  	if (xas_error(&xas)) {
  		error = xas_error(&xas);
@@@ -1445,7 -1425,7 +1445,7 @@@ static inline bool clear_bit_unlock_is_
   * unlock_page - unlock a locked page
   * @page: the page
   *
 - * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
 + * Unlocks the page and wakes up sleepers in wait_on_page_locked().
   * Also wakes sleepers in wait_on_page_writeback() because the wakeup
   * mechanism between PageLocked pages and PageWriteback pages is shared.
   * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
@@@ -1665,19 -1645,19 +1665,19 @@@ EXPORT_SYMBOL(page_cache_prev_miss)
  /**
   * find_get_entry - find and get a page cache entry
   * @mapping: the address_space to search
 - * @offset: the page cache index
 + * @index: The page cache index.
   *
   * Looks up the page cache slot at @mapping & @index.  If there is a
 - * page cache page, it is returned with an increased refcount.
 + * page cache page, the head page is returned with an increased refcount.
   *
   * If the slot holds a shadow entry of a previously evicted page, or a
   * swap entry from shmem/tmpfs, it is returned.
   *
 - * Return: the found page or shadow entry, %NULL if nothing is found.
 + * Return: The head page or shadow entry, %NULL if nothing is found.
   */
 -struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
 +struct page *find_get_entry(struct address_space *mapping, pgoff_t index)
  {
 -	XA_STATE(xas, &mapping->i_pages, offset);
 +	XA_STATE(xas, &mapping->i_pages, index);
  	struct page *page;
  
  	rcu_read_lock();
@@@ -1705,6 -1685,7 +1705,6 @@@ repeat
  		put_page(page);
  		goto repeat;
  	}
 -	page = find_subpage(page, offset);
  out:
  	rcu_read_unlock();
  
@@@ -1712,37 -1693,40 +1712,37 @@@
  }
  
  /**
 - * find_lock_entry - locate, pin and lock a page cache entry
 - * @mapping: the address_space to search
 - * @offset: the page cache index
 + * find_lock_entry - Locate and lock a page cache entry.
 + * @mapping: The address_space to search.
 + * @index: The page cache index.
   *
 - * Looks up the page cache slot at @mapping & @offset.  If there is a
 - * page cache page, it is returned locked and with an increased
 - * refcount.
 + * Looks up the page at @mapping & @index.  If there is a page in the
 + * cache, the head page is returned locked and with an increased refcount.
   *
   * If the slot holds a shadow entry of a previously evicted page, or a
   * swap entry from shmem/tmpfs, it is returned.
   *
 - * find_lock_entry() may sleep.
 - *
 - * Return: the found page or shadow entry, %NULL if nothing is found.
 + * Context: May sleep.
 + * Return: The head page or shadow entry, %NULL if nothing is found.
   */
 -struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
 +struct page *find_lock_entry(struct address_space *mapping, pgoff_t index)
  {
  	struct page *page;
  
  repeat:
 -	page = find_get_entry(mapping, offset);
 +	page = find_get_entry(mapping, index);
  	if (page && !xa_is_value(page)) {
  		lock_page(page);
  		/* Has the page been truncated? */
 -		if (unlikely(page_mapping(page) != mapping)) {
 +		if (unlikely(page->mapping != mapping)) {
  			unlock_page(page);
  			put_page(page);
  			goto repeat;
  		}
 -		VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
 +		VM_BUG_ON_PAGE(!thp_contains(page, index), page);
  	}
  	return page;
  }
 -EXPORT_SYMBOL(find_lock_entry);
  
  /**
   * pagecache_get_page - Find and get a reference to a page.
@@@ -1757,8 -1741,6 +1757,8 @@@
   *
   * * %FGP_ACCESSED - The page will be marked accessed.
   * * %FGP_LOCK - The page is returned locked.
 + * * %FGP_HEAD - If the page is present and a THP, return the head page
 + *   rather than the exact page specified by the index.
   * * %FGP_CREAT - If no page is present then a new page is allocated using
   *   @gfp_mask and added to the page cache and the VM's LRU list.
   *   The page is returned locked and with an increased refcount.
@@@ -1799,12 -1781,12 +1799,12 @@@ repeat
  		}
  
  		/* Has the page been truncated? */
 -		if (unlikely(compound_head(page)->mapping != mapping)) {
 +		if (unlikely(page->mapping != mapping)) {
  			unlock_page(page);
  			put_page(page);
  			goto repeat;
  		}
 -		VM_BUG_ON_PAGE(page->index != index, page);
 +		VM_BUG_ON_PAGE(!thp_contains(page, index), page);
  	}
  
  	if (fgp_flags & FGP_ACCESSED)
@@@ -1814,13 -1796,11 +1814,13 @@@
  		if (page_is_idle(page))
  			clear_page_idle(page);
  	}
 +	if (!(fgp_flags & FGP_HEAD))
 +		page = find_subpage(page, index);
  
  no_page:
  	if (!page && (fgp_flags & FGP_CREAT)) {
  		int err;
 -		if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
 +		if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
  			gfp_mask |= __GFP_WRITE;
  		if (fgp_flags & FGP_NOFS)
  			gfp_mask &= ~__GFP_FS;
@@@ -2199,14 -2179,6 +2199,14 @@@ ssize_t generic_file_buffered_read(stru
  	last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
  	offset = *ppos & ~PAGE_MASK;
  
 +	/*
 +	 * If we've already successfully copied some data, then we
 +	 * can no longer safely return -EIOCBQUEUED. Hence mark
 +	 * an async read NOWAIT at that point.
 +	 */
 +	if (written && (iocb->ki_flags & IOCB_WAITQ))
 +		iocb->ki_flags |= IOCB_NOWAIT;
 +
  	for (;;) {
  		struct page *page;
  		pgoff_t end_index;
@@@ -2596,8 -2568,8 +2596,8 @@@ static struct file *do_sync_mmap_readah
  	struct file *file = vmf->vma->vm_file;
  	struct file_ra_state *ra = &file->f_ra;
  	struct address_space *mapping = file->f_mapping;
 +	DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff);
  	struct file *fpin = NULL;
 -	pgoff_t offset = vmf->pgoff;
  	unsigned int mmap_miss;
  
  	/* If we don't want any read-ahead, don't bother */
@@@ -2608,7 -2580,8 +2608,7 @@@
  
  	if (vmf->vma->vm_flags & VM_SEQ_READ) {
  		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
 -		page_cache_sync_readahead(mapping, ra, file, offset,
 -					  ra->ra_pages);
 +		page_cache_sync_ra(&ractl, ra, ra->ra_pages);
  		return fpin;
  	}
  
@@@ -2628,11 -2601,10 +2628,11 @@@
  	 * mmap read-around
  	 */
  	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
 -	ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
 +	ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
  	ra->size = ra->ra_pages;
  	ra->async_size = ra->ra_pages / 4;
 -	ra_submit(ra, mapping, file);
 +	ractl._index = ra->start;
 +	do_page_cache_ra(&ractl, ra->size, ra->async_size);
  	return fpin;
  }
  
@@@ -2821,42 -2793,42 +2821,42 @@@ void filemap_map_pages(struct vm_fault 
  	pgoff_t last_pgoff = start_pgoff;
  	unsigned long max_idx;
  	XA_STATE(xas, &mapping->i_pages, start_pgoff);
 -	struct page *page;
 +	struct page *head, *page;
  	unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
  
  	rcu_read_lock();
 -	xas_for_each(&xas, page, end_pgoff) {
 -		if (xas_retry(&xas, page))
 +	xas_for_each(&xas, head, end_pgoff) {
 +		if (xas_retry(&xas, head))
  			continue;
 -		if (xa_is_value(page))
 +		if (xa_is_value(head))
  			goto next;
  
  		/*
  		 * Check for a locked page first, as a speculative
  		 * reference may adversely influence page migration.
  		 */
 -		if (PageLocked(page))
 +		if (PageLocked(head))
  			goto next;
 -		if (!page_cache_get_speculative(page))
 +		if (!page_cache_get_speculative(head))
  			goto next;
  
  		/* Has the page moved or been split? */
 -		if (unlikely(page != xas_reload(&xas)))
 +		if (unlikely(head != xas_reload(&xas)))
  			goto skip;
 -		page = find_subpage(page, xas.xa_index);
 +		page = find_subpage(head, xas.xa_index);
  
 -		if (!PageUptodate(page) ||
 +		if (!PageUptodate(head) ||
  				PageReadahead(page) ||
  				PageHWPoison(page))
  			goto skip;
 -		if (!trylock_page(page))
 +		if (!trylock_page(head))
  			goto skip;
  
 -		if (page->mapping != mapping || !PageUptodate(page))
 +		if (head->mapping != mapping || !PageUptodate(head))
  			goto unlock;
  
  		max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
 -		if (page->index >= max_idx)
 +		if (xas.xa_index >= max_idx)
  			goto unlock;
  
  		if (mmap_miss > 0)
@@@ -2868,12 -2840,12 +2868,12 @@@
  		last_pgoff = xas.xa_index;
  		if (alloc_set_pte(vmf, page))
  			goto unlock;
 -		unlock_page(page);
 +		unlock_page(head);
  		goto next;
  unlock:
 -		unlock_page(page);
 +		unlock_page(head);
  skip:
 -		put_page(page);
 +		put_page(head);
  next:
  		/* Huge page is mapped? No need to proceed. */
  		if (pmd_trans_huge(*vmf->pmd))
@@@ -3012,7 -2984,7 +3012,7 @@@ filler
  		goto out;
  
  	/*
 -	 * Page is not up to date and may be locked due one of the following
 +	 * Page is not up to date and may be locked due to one of the following
  	 * case a: Page is being filled and the page lock is held
  	 * case b: Read/write error clearing the page uptodate status
  	 * case c: Truncation in progress (page locked)
@@@ -3121,228 -3093,6 +3121,6 @@@ struct page *read_cache_page_gfp(struc
  }
  EXPORT_SYMBOL(read_cache_page_gfp);
  
- /*
-  * Don't operate on ranges the page cache doesn't support, and don't exceed the
-  * LFS limits.  If pos is under the limit it becomes a short access.  If it
-  * exceeds the limit we return -EFBIG.
-  */
- static int generic_write_check_limits(struct file *file, loff_t pos,
- 				      loff_t *count)
- {
- 	struct inode *inode = file->f_mapping->host;
- 	loff_t max_size = inode->i_sb->s_maxbytes;
- 	loff_t limit = rlimit(RLIMIT_FSIZE);
- 
- 	if (limit != RLIM_INFINITY) {
- 		if (pos >= limit) {
- 			send_sig(SIGXFSZ, current, 0);
- 			return -EFBIG;
- 		}
- 		*count = min(*count, limit - pos);
- 	}
- 
- 	if (!(file->f_flags & O_LARGEFILE))
- 		max_size = MAX_NON_LFS;
- 
- 	if (unlikely(pos >= max_size))
- 		return -EFBIG;
- 
- 	*count = min(*count, max_size - pos);
- 
- 	return 0;
- }
- 
- /*
-  * Performs necessary checks before doing a write
-  *
-  * Can adjust writing position or amount of bytes to write.
-  * Returns appropriate error code that caller should return or
-  * zero in case that write should be allowed.
-  */
- inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
- {
- 	struct file *file = iocb->ki_filp;
- 	struct inode *inode = file->f_mapping->host;
- 	loff_t count;
- 	int ret;
- 
- 	if (IS_SWAPFILE(inode))
- 		return -ETXTBSY;
- 
- 	if (!iov_iter_count(from))
- 		return 0;
- 
- 	/* FIXME: this is for backwards compatibility with 2.4 */
- 	if (iocb->ki_flags & IOCB_APPEND)
- 		iocb->ki_pos = i_size_read(inode);
- 
- 	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
- 		return -EINVAL;
- 
- 	count = iov_iter_count(from);
- 	ret = generic_write_check_limits(file, iocb->ki_pos, &count);
- 	if (ret)
- 		return ret;
- 
- 	iov_iter_truncate(from, count);
- 	return iov_iter_count(from);
- }
- EXPORT_SYMBOL(generic_write_checks);
- 
- /*
-  * Performs necessary checks before doing a clone.
-  *
-  * Can adjust amount of bytes to clone via @req_count argument.
-  * Returns appropriate error code that caller should return or
-  * zero in case the clone should be allowed.
-  */
- int generic_remap_checks(struct file *file_in, loff_t pos_in,
- 			 struct file *file_out, loff_t pos_out,
- 			 loff_t *req_count, unsigned int remap_flags)
- {
- 	struct inode *inode_in = file_in->f_mapping->host;
- 	struct inode *inode_out = file_out->f_mapping->host;
- 	uint64_t count = *req_count;
- 	uint64_t bcount;
- 	loff_t size_in, size_out;
- 	loff_t bs = inode_out->i_sb->s_blocksize;
- 	int ret;
- 
- 	/* The start of both ranges must be aligned to an fs block. */
- 	if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
- 		return -EINVAL;
- 
- 	/* Ensure offsets don't wrap. */
- 	if (pos_in + count < pos_in || pos_out + count < pos_out)
- 		return -EINVAL;
- 
- 	size_in = i_size_read(inode_in);
- 	size_out = i_size_read(inode_out);
- 
- 	/* Dedupe requires both ranges to be within EOF. */
- 	if ((remap_flags & REMAP_FILE_DEDUP) &&
- 	    (pos_in >= size_in || pos_in + count > size_in ||
- 	     pos_out >= size_out || pos_out + count > size_out))
- 		return -EINVAL;
- 
- 	/* Ensure the infile range is within the infile. */
- 	if (pos_in >= size_in)
- 		return -EINVAL;
- 	count = min(count, size_in - (uint64_t)pos_in);
- 
- 	ret = generic_write_check_limits(file_out, pos_out, &count);
- 	if (ret)
- 		return ret;
- 
- 	/*
- 	 * If the user wanted us to link to the infile's EOF, round up to the
- 	 * next block boundary for this check.
- 	 *
- 	 * Otherwise, make sure the count is also block-aligned, having
- 	 * already confirmed the starting offsets' block alignment.
- 	 */
- 	if (pos_in + count == size_in) {
- 		bcount = ALIGN(size_in, bs) - pos_in;
- 	} else {
- 		if (!IS_ALIGNED(count, bs))
- 			count = ALIGN_DOWN(count, bs);
- 		bcount = count;
- 	}
- 
- 	/* Don't allow overlapped cloning within the same file. */
- 	if (inode_in == inode_out &&
- 	    pos_out + bcount > pos_in &&
- 	    pos_out < pos_in + bcount)
- 		return -EINVAL;
- 
- 	/*
- 	 * We shortened the request but the caller can't deal with that, so
- 	 * bounce the request back to userspace.
- 	 */
- 	if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
- 		return -EINVAL;
- 
- 	*req_count = count;
- 	return 0;
- }
- 
- 
- /*
-  * Performs common checks before doing a file copy/clone
-  * from @file_in to @file_out.
-  */
- int generic_file_rw_checks(struct file *file_in, struct file *file_out)
- {
- 	struct inode *inode_in = file_inode(file_in);
- 	struct inode *inode_out = file_inode(file_out);
- 
- 	/* Don't copy dirs, pipes, sockets... */
- 	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
- 		return -EISDIR;
- 	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
- 		return -EINVAL;
- 
- 	if (!(file_in->f_mode & FMODE_READ) ||
- 	    !(file_out->f_mode & FMODE_WRITE) ||
- 	    (file_out->f_flags & O_APPEND))
- 		return -EBADF;
- 
- 	return 0;
- }
- 
- /*
-  * Performs necessary checks before doing a file copy
-  *
-  * Can adjust amount of bytes to copy via @req_count argument.
-  * Returns appropriate error code that caller should return or
-  * zero in case the copy should be allowed.
-  */
- int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
- 			     struct file *file_out, loff_t pos_out,
- 			     size_t *req_count, unsigned int flags)
- {
- 	struct inode *inode_in = file_inode(file_in);
- 	struct inode *inode_out = file_inode(file_out);
- 	uint64_t count = *req_count;
- 	loff_t size_in;
- 	int ret;
- 
- 	ret = generic_file_rw_checks(file_in, file_out);
- 	if (ret)
- 		return ret;
- 
- 	/* Don't touch certain kinds of inodes */
- 	if (IS_IMMUTABLE(inode_out))
- 		return -EPERM;
- 
- 	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
- 		return -ETXTBSY;
- 
- 	/* Ensure offsets don't wrap. */
- 	if (pos_in + count < pos_in || pos_out + count < pos_out)
- 		return -EOVERFLOW;
- 
- 	/* Shorten the copy to EOF */
- 	size_in = i_size_read(inode_in);
- 	if (pos_in >= size_in)
- 		count = 0;
- 	else
- 		count = min(count, size_in - (uint64_t)pos_in);
- 
- 	ret = generic_write_check_limits(file_out, pos_out, &count);
- 	if (ret)
- 		return ret;
- 
- 	/* Don't allow overlapped copying within the same file. */
- 	if (inode_in == inode_out &&
- 	    pos_out + count > pos_in &&
- 	    pos_out < pos_in + count)
- 		return -EINVAL;
- 
- 	*req_count = count;
- 	return 0;
- }
- 
  int pagecache_write_begin(struct file *file, struct address_space *mapping,
  				loff_t pos, unsigned len, unsigned flags,
  				struct page **pagep, void **fsdata)