return ret;
}
+static int warn_unsupported(struct file *file, const char *op)
+{
+ pr_warn_ratelimited(
+ "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
+ op, file, current->pid, current->comm);
+ return -EINVAL;
+}
+
ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
{
- mm_segment_t old_fs = get_fs();
+ struct kvec iov = {
+ .iov_base = buf,
+ .iov_len = min_t(size_t, count, MAX_RW_COUNT),
+ };
+ struct kiocb kiocb;
+ struct iov_iter iter;
ssize_t ret;
if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
return -EINVAL;
if (!(file->f_mode & FMODE_CAN_READ))
return -EINVAL;
+ /*
+ * Also fail if ->read_iter and ->read are both wired up as that
+ * implies very convoluted semantics.
+ */
+ if (unlikely(!file->f_op->read_iter || file->f_op->read))
+ return warn_unsupported(file, "read");
- if (count > MAX_RW_COUNT)
- count = MAX_RW_COUNT;
- set_fs(KERNEL_DS);
- if (file->f_op->read)
- ret = file->f_op->read(file, (void __user *)buf, count, pos);
- else if (file->f_op->read_iter)
- ret = new_sync_read(file, (void __user *)buf, count, pos);
- else
- ret = -EINVAL;
- set_fs(old_fs);
+ init_sync_kiocb(&kiocb, file);
+ kiocb.ki_pos = pos ? *pos : 0;
+ iov_iter_kvec(&iter, READ, &iov, 1, iov.iov_len);
+ ret = file->f_op->read_iter(&kiocb, &iter);
if (ret > 0) {
+ if (pos)
+ *pos = kiocb.ki_pos;
fsnotify_access(file);
add_rchar(current, ret);
}
/* caller is responsible for file_start_write/file_end_write */
ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
{
- mm_segment_t old_fs;
- const char __user *p;
+ struct kvec iov = {
+ .iov_base = (void *)buf,
+ .iov_len = min_t(size_t, count, MAX_RW_COUNT),
+ };
+ struct kiocb kiocb;
+ struct iov_iter iter;
ssize_t ret;
if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
return -EBADF;
if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
+ /*
+ * Also fail if ->write_iter and ->write are both wired up as that
+ * implies very convoluted semantics.
+ */
+ if (unlikely(!file->f_op->write_iter || file->f_op->write))
+ return warn_unsupported(file, "write");
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- p = (__force const char __user *)buf;
- if (count > MAX_RW_COUNT)
- count = MAX_RW_COUNT;
- if (file->f_op->write)
- ret = file->f_op->write(file, p, count, pos);
- else if (file->f_op->write_iter)
- ret = new_sync_write(file, p, count, pos);
- else
- ret = -EINVAL;
- set_fs(old_fs);
+ init_sync_kiocb(&kiocb, file);
+ kiocb.ki_pos = pos ? *pos : 0;
+ iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
+ ret = file->f_op->write_iter(&kiocb, &iter);
if (ret > 0) {
+ if (pos)
+ *pos = kiocb.ki_pos;
fsnotify_modify(file);
add_wchar(current, ret);
}
return ret;
}
-/**
- * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
- * into the kernel and check that it is valid.
- *
- * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE.
- * @uvector: Pointer to the userspace array.
- * @nr_segs: Number of elements in userspace array.
- * @fast_segs: Number of elements in @fast_pointer.
- * @fast_pointer: Pointer to (usually small on-stack) kernel array.
- * @ret_pointer: (output parameter) Pointer to a variable that will point to
- * either @fast_pointer, a newly allocated kernel array, or NULL,
- * depending on which array was used.
- *
- * This function copies an array of &struct iovec of @nr_segs from
- * userspace into the kernel and checks that each element is valid (e.g.
- * it does not point to a kernel address or cause overflow by being too
- * large, etc.).
- *
- * As an optimization, the caller may provide a pointer to a small
- * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long
- * (the size of this array, or 0 if unused, should be given in @fast_segs).
- *
- * @ret_pointer will always point to the array that was used, so the
- * caller must take care not to call kfree() on it e.g. in case the
- * @fast_pointer array was used and it was allocated on the stack.
- *
- * Return: The total number of bytes covered by the iovec array on success
- * or a negative error code on error.
- */
-ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
- unsigned long nr_segs, unsigned long fast_segs,
- struct iovec *fast_pointer,
- struct iovec **ret_pointer)
-{
- unsigned long seg;
- ssize_t ret;
- struct iovec *iov = fast_pointer;
-
- /*
- * SuS says "The readv() function *may* fail if the iovcnt argument
- * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
- * traditionally returned zero for zero segments, so...
- */
- if (nr_segs == 0) {
- ret = 0;
- goto out;
- }
-
- /*
- * First get the "struct iovec" from user memory and
- * verify all the pointers
- */
- if (nr_segs > UIO_MAXIOV) {
- ret = -EINVAL;
- goto out;
- }
- if (nr_segs > fast_segs) {
- iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
- if (iov == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- }
- if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
- ret = -EFAULT;
- goto out;
- }
-
- /*
- * According to the Single Unix Specification we should return EINVAL
- * if an element length is < 0 when cast to ssize_t or if the
- * total length would overflow the ssize_t return value of the
- * system call.
- *
- * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
- * overflow case.
- */
- ret = 0;
- for (seg = 0; seg < nr_segs; seg++) {
- void __user *buf = iov[seg].iov_base;
- ssize_t len = (ssize_t)iov[seg].iov_len;
-
- /* see if we we're about to use an invalid len or if
- * it's about to overflow ssize_t */
- if (len < 0) {
- ret = -EINVAL;
- goto out;
- }
- if (type >= 0
- && unlikely(!access_ok(buf, len))) {
- ret = -EFAULT;
- goto out;
- }
- if (len > MAX_RW_COUNT - ret) {
- len = MAX_RW_COUNT - ret;
- iov[seg].iov_len = len;
- }
- ret += len;
- }
-out:
- *ret_pointer = iov;
- return ret;
-}
-
-#ifdef CONFIG_COMPAT
-ssize_t compat_rw_copy_check_uvector(int type,
- const struct compat_iovec __user *uvector, unsigned long nr_segs,
- unsigned long fast_segs, struct iovec *fast_pointer,
- struct iovec **ret_pointer)
-{
- compat_ssize_t tot_len;
- struct iovec *iov = *ret_pointer = fast_pointer;
- ssize_t ret = 0;
- int seg;
-
- /*
- * SuS says "The readv() function *may* fail if the iovcnt argument
- * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
- * traditionally returned zero for zero segments, so...
- */
- if (nr_segs == 0)
- goto out;
-
- ret = -EINVAL;
- if (nr_segs > UIO_MAXIOV)
- goto out;
- if (nr_segs > fast_segs) {
- ret = -ENOMEM;
- iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
- if (iov == NULL)
- goto out;
- }
- *ret_pointer = iov;
-
- ret = -EFAULT;
- if (!access_ok(uvector, nr_segs*sizeof(*uvector)))
- goto out;
-
- /*
- * Single unix specification:
- * We should -EINVAL if an element length is not >= 0 and fitting an
- * ssize_t.
- *
- * In Linux, the total length is limited to MAX_RW_COUNT, there is
- * no overflow possibility.
- */
- tot_len = 0;
- ret = -EINVAL;
- for (seg = 0; seg < nr_segs; seg++) {
- compat_uptr_t buf;
- compat_ssize_t len;
-
- if (__get_user(len, &uvector->iov_len) ||
- __get_user(buf, &uvector->iov_base)) {
- ret = -EFAULT;
- goto out;
- }
- if (len < 0) /* size_t not fitting in compat_ssize_t .. */
- goto out;
- if (type >= 0 &&
- !access_ok(compat_ptr(buf), len)) {
- ret = -EFAULT;
- goto out;
- }
- if (len > MAX_RW_COUNT - tot_len)
- len = MAX_RW_COUNT - tot_len;
- tot_len += len;
- iov->iov_base = compat_ptr(buf);
- iov->iov_len = (compat_size_t) len;
- uvector++;
- iov++;
- }
- ret = tot_len;
-
-out:
- return ret;
-}
-#endif
-
static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
loff_t *pos, rwf_t flags)
{
}
EXPORT_SYMBOL(vfs_iter_write);
-ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
+static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
unsigned long vlen, loff_t *pos, rwf_t flags)
{
struct iovec iovstack[UIO_FASTIOV];
return do_pwritev(fd, vec, vlen, pos, flags);
}
+/*
+ * Various compat syscalls. Note that they all pretend to take a native
+ * iovec - import_iovec will properly treat those as compat_iovecs based on
+ * in_compat_syscall().
+ */
#ifdef CONFIG_COMPAT
-static size_t compat_readv(struct file *file,
- const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t *pos, rwf_t flags)
-{
- struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov = iovstack;
- struct iov_iter iter;
- ssize_t ret;
-
- ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter);
- if (ret >= 0) {
- ret = do_iter_read(file, &iter, pos, flags);
- kfree(iov);
- }
- if (ret > 0)
- add_rchar(current, ret);
- inc_syscr(current);
- return ret;
-}
-
-static size_t do_compat_readv(compat_ulong_t fd,
- const struct compat_iovec __user *vec,
- compat_ulong_t vlen, rwf_t flags)
-{
- struct fd f = fdget_pos(fd);
- ssize_t ret;
- loff_t pos;
-
- if (!f.file)
- return -EBADF;
- pos = f.file->f_pos;
- ret = compat_readv(f.file, vec, vlen, &pos, flags);
- if (ret >= 0)
- f.file->f_pos = pos;
- fdput_pos(f);
- return ret;
-
-}
-
-COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
- const struct compat_iovec __user *,vec,
- compat_ulong_t, vlen)
-{
- return do_compat_readv(fd, vec, vlen, 0);
-}
-
-static long do_compat_preadv64(unsigned long fd,
- const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t pos, rwf_t flags)
-{
- struct fd f;
- ssize_t ret;
-
- if (pos < 0)
- return -EINVAL;
- f = fdget(fd);
- if (!f.file)
- return -EBADF;
- ret = -ESPIPE;
- if (f.file->f_mode & FMODE_PREAD)
- ret = compat_readv(f.file, vec, vlen, &pos, flags);
- fdput(f);
- return ret;
-}
-
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
unsigned long, vlen, loff_t, pos)
{
- return do_compat_preadv64(fd, vec, vlen, pos, 0);
+ return do_preadv(fd, vec, vlen, pos, 0);
}
#endif
COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
- return do_compat_preadv64(fd, vec, vlen, pos, 0);
+ return do_preadv(fd, vec, vlen, pos, 0);
}
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
if (pos == -1)
- return do_compat_readv(fd, vec, vlen, flags);
-
- return do_compat_preadv64(fd, vec, vlen, pos, flags);
+ return do_readv(fd, vec, vlen, flags);
+ return do_preadv(fd, vec, vlen, pos, flags);
}
#endif
COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
rwf_t, flags)
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
if (pos == -1)
- return do_compat_readv(fd, vec, vlen, flags);
-
- return do_compat_preadv64(fd, vec, vlen, pos, flags);
-}
-
-static size_t compat_writev(struct file *file,
- const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t *pos, rwf_t flags)
-{
- struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov = iovstack;
- struct iov_iter iter;
- ssize_t ret;
-
- ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter);
- if (ret >= 0) {
- file_start_write(file);
- ret = do_iter_write(file, &iter, pos, flags);
- file_end_write(file);
- kfree(iov);
- }
- if (ret > 0)
- add_wchar(current, ret);
- inc_syscw(current);
- return ret;
-}
-
-static size_t do_compat_writev(compat_ulong_t fd,
- const struct compat_iovec __user* vec,
- compat_ulong_t vlen, rwf_t flags)
-{
- struct fd f = fdget_pos(fd);
- ssize_t ret;
- loff_t pos;
-
- if (!f.file)
- return -EBADF;
- pos = f.file->f_pos;
- ret = compat_writev(f.file, vec, vlen, &pos, flags);
- if (ret >= 0)
- f.file->f_pos = pos;
- fdput_pos(f);
- return ret;
-}
-
-COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
- const struct compat_iovec __user *, vec,
- compat_ulong_t, vlen)
-{
- return do_compat_writev(fd, vec, vlen, 0);
-}
-
-static long do_compat_pwritev64(unsigned long fd,
- const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t pos, rwf_t flags)
-{
- struct fd f;
- ssize_t ret;
-
- if (pos < 0)
- return -EINVAL;
- f = fdget(fd);
- if (!f.file)
- return -EBADF;
- ret = -ESPIPE;
- if (f.file->f_mode & FMODE_PWRITE)
- ret = compat_writev(f.file, vec, vlen, &pos, flags);
- fdput(f);
- return ret;
+ return do_readv(fd, vec, vlen, flags);
+ return do_preadv(fd, vec, vlen, pos, flags);
}
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
unsigned long, vlen, loff_t, pos)
{
- return do_compat_pwritev64(fd, vec, vlen, pos, 0);
+ return do_pwritev(fd, vec, vlen, pos, 0);
}
#endif
COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *,vec,
compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
- return do_compat_pwritev64(fd, vec, vlen, pos, 0);
+ return do_pwritev(fd, vec, vlen, pos, 0);
}
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
if (pos == -1)
- return do_compat_writev(fd, vec, vlen, flags);
-
- return do_compat_pwritev64(fd, vec, vlen, pos, flags);
+ return do_writev(fd, vec, vlen, flags);
+ return do_pwritev(fd, vec, vlen, pos, flags);
}
#endif
COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *,vec,
compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
if (pos == -1)
- return do_compat_writev(fd, vec, vlen, flags);
-
- return do_compat_pwritev64(fd, vec, vlen, pos, flags);
+ return do_writev(fd, vec, vlen, flags);
+ return do_pwritev(fd, vec, vlen, pos, flags);
}
-
-#endif
+#endif /* CONFIG_COMPAT */
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
size_t count, loff_t max)
flags);
}
+ /*
+ * Performs necessary checks before doing a file copy
+ *
+ * Can adjust amount of bytes to copy via @req_count argument.
+ * Returns appropriate error code that caller should return or
+ * zero in case the copy should be allowed.
+ */
+ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t *req_count, unsigned int flags)
+ {
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ uint64_t count = *req_count;
+ loff_t size_in;
+ int ret;
+
+ ret = generic_file_rw_checks(file_in, file_out);
+ if (ret)
+ return ret;
+
+ /* Don't touch certain kinds of inodes */
+ if (IS_IMMUTABLE(inode_out))
+ return -EPERM;
+
+ if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+ return -ETXTBSY;
+
+ /* Ensure offsets don't wrap. */
+ if (pos_in + count < pos_in || pos_out + count < pos_out)
+ return -EOVERFLOW;
+
+ /* Shorten the copy to EOF */
+ size_in = i_size_read(inode_in);
+ if (pos_in >= size_in)
+ count = 0;
+ else
+ count = min(count, size_in - (uint64_t)pos_in);
+
+ ret = generic_write_check_limits(file_out, pos_out, &count);
+ if (ret)
+ return ret;
+
+ /* Don't allow overlapped copying within the same file. */
+ if (inode_in == inode_out &&
+ pos_out + count > pos_in &&
+ pos_out < pos_in + count)
+ return -EINVAL;
+
+ *req_count = count;
+ return 0;
+ }
+
/*
* copy_file_range() differs from regular file read and write in that it
* specifically allows return partial success. When it does so is up to
return ret;
}
- static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
- bool write)
- {
- struct inode *inode = file_inode(file);
-
- if (unlikely(pos < 0 || len < 0))
- return -EINVAL;
-
- if (unlikely((loff_t) (pos + len) < 0))
- return -EINVAL;
-
- if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
- loff_t end = len ? pos + len - 1 : OFFSET_MAX;
- int retval;
-
- retval = locks_mandatory_area(inode, file, pos, end,
- write ? F_WRLCK : F_RDLCK);
- if (retval < 0)
- return retval;
- }
-
- return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
- }
- /*
- * Ensure that we don't remap a partial EOF block in the middle of something
- * else. Assume that the offsets have already been checked for block
- * alignment.
- *
- * For clone we only link a partial EOF block above or at the destination file's
- * EOF. For deduplication we accept a partial EOF block only if it ends at the
- * destination file's EOF (can not link it into the middle of a file).
- *
- * Shorten the request if possible.
- */
- static int generic_remap_check_len(struct inode *inode_in,
- struct inode *inode_out,
- loff_t pos_out,
- loff_t *len,
- unsigned int remap_flags)
- {
- u64 blkmask = i_blocksize(inode_in) - 1;
- loff_t new_len = *len;
-
- if ((*len & blkmask) == 0)
- return 0;
-
- if (pos_out + *len < i_size_read(inode_out))
- new_len &= ~blkmask;
-
- if (new_len == *len)
- return 0;
-
- if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
- *len = new_len;
- return 0;
- }
-
- return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
- }
-
- /* Read a page's worth of file data into the page cache. */
- static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
- {
- struct page *page;
-
- page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
- if (IS_ERR(page))
- return page;
- if (!PageUptodate(page)) {
- put_page(page);
- return ERR_PTR(-EIO);
- }
- return page;
- }
-
/*
- * Lock two pages, ensuring that we lock in offset order if the pages are from
- * the same file.
+ * Don't operate on ranges the page cache doesn't support, and don't exceed the
+ * LFS limits. If pos is under the limit it becomes a short access. If it
+ * exceeds the limit we return -EFBIG.
*/
- static void vfs_lock_two_pages(struct page *page1, struct page *page2)
- {
- /* Always lock in order of increasing index. */
- if (page1->index > page2->index)
- swap(page1, page2);
-
- lock_page(page1);
- if (page1 != page2)
- lock_page(page2);
- }
-
- /* Unlock two pages, being careful not to unlock the same page twice. */
- static void vfs_unlock_two_pages(struct page *page1, struct page *page2)
+ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
{
- unlock_page(page1);
- if (page1 != page2)
- unlock_page(page2);
- }
-
- /*
- * Compare extents of two files to see if they are the same.
- * Caller must have locked both inodes to prevent write races.
- */
- static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
- struct inode *dest, loff_t destoff,
- loff_t len, bool *is_same)
- {
- loff_t src_poff;
- loff_t dest_poff;
- void *src_addr;
- void *dest_addr;
- struct page *src_page;
- struct page *dest_page;
- loff_t cmp_len;
- bool same;
- int error;
-
- error = -EINVAL;
- same = true;
- while (len) {
- src_poff = srcoff & (PAGE_SIZE - 1);
- dest_poff = destoff & (PAGE_SIZE - 1);
- cmp_len = min(PAGE_SIZE - src_poff,
- PAGE_SIZE - dest_poff);
- cmp_len = min(cmp_len, len);
- if (cmp_len <= 0)
- goto out_error;
-
- src_page = vfs_dedupe_get_page(src, srcoff);
- if (IS_ERR(src_page)) {
- error = PTR_ERR(src_page);
- goto out_error;
- }
- dest_page = vfs_dedupe_get_page(dest, destoff);
- if (IS_ERR(dest_page)) {
- error = PTR_ERR(dest_page);
- put_page(src_page);
- goto out_error;
- }
-
- vfs_lock_two_pages(src_page, dest_page);
+ struct inode *inode = file->f_mapping->host;
+ loff_t max_size = inode->i_sb->s_maxbytes;
+ loff_t limit = rlimit(RLIMIT_FSIZE);
- /*
- * Now that we've locked both pages, make sure they're still
- * mapped to the file data we're interested in. If not,
- * someone is invalidating pages on us and we lose.
- */
- if (!PageUptodate(src_page) || !PageUptodate(dest_page) ||
- src_page->mapping != src->i_mapping ||
- dest_page->mapping != dest->i_mapping) {
- same = false;
- goto unlock;
+ if (limit != RLIM_INFINITY) {
+ if (pos >= limit) {
+ send_sig(SIGXFSZ, current, 0);
+ return -EFBIG;
}
+ *count = min(*count, limit - pos);
+ }
- src_addr = kmap_atomic(src_page);
- dest_addr = kmap_atomic(dest_page);
-
- flush_dcache_page(src_page);
- flush_dcache_page(dest_page);
-
- if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
- same = false;
+ if (!(file->f_flags & O_LARGEFILE))
+ max_size = MAX_NON_LFS;
- kunmap_atomic(dest_addr);
- kunmap_atomic(src_addr);
- unlock:
- vfs_unlock_two_pages(src_page, dest_page);
- put_page(dest_page);
- put_page(src_page);
+ if (unlikely(pos >= max_size))
+ return -EFBIG;
- if (!same)
- break;
+ *count = min(*count, max_size - pos);
- srcoff += cmp_len;
- destoff += cmp_len;
- len -= cmp_len;
- }
-
- *is_same = same;
return 0;
-
- out_error:
- return error;
}
/*
- * Check that the two inodes are eligible for cloning, the ranges make
- * sense, and then flush all dirty data. Caller must ensure that the
- * inodes have been locked against any other modifications.
+ * Performs necessary checks before doing a write
*
- * If there's an error, then the usual negative error code is returned.
- * Otherwise returns 0 with *len set to the request length.
+ * Can adjust writing position or amount of bytes to write.
+ * Returns appropriate error code that caller should return or
+ * zero in case that write should be allowed.
*/
- int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *len, unsigned int remap_flags)
+ ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- bool same_inode = (inode_in == inode_out);
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ loff_t count;
int ret;
- /* Don't touch certain kinds of inodes */
- if (IS_IMMUTABLE(inode_out))
- return -EPERM;
-
- if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+ if (IS_SWAPFILE(inode))
return -ETXTBSY;
- /* Don't reflink dirs, pipes, sockets... */
- if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
- return -EISDIR;
- if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
- return -EINVAL;
-
- /* Zero length dedupe exits immediately; reflink goes to EOF. */
- if (*len == 0) {
- loff_t isize = i_size_read(inode_in);
-
- if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
- return 0;
- if (pos_in > isize)
- return -EINVAL;
- *len = isize - pos_in;
- if (*len == 0)
- return 0;
- }
-
- /* Check that we don't violate system file offset limits. */
- ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* Wait for the completion of any pending IOs on both files */
- inode_dio_wait(inode_in);
- if (!same_inode)
- inode_dio_wait(inode_out);
-
- ret = filemap_write_and_wait_range(inode_in->i_mapping,
- pos_in, pos_in + *len - 1);
- if (ret)
- return ret;
-
- ret = filemap_write_and_wait_range(inode_out->i_mapping,
- pos_out, pos_out + *len - 1);
- if (ret)
- return ret;
-
- /*
- * Check that the extents are the same.
- */
- if (remap_flags & REMAP_FILE_DEDUP) {
- bool is_same = false;
-
- ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
- inode_out, pos_out, *len, &is_same);
- if (ret)
- return ret;
- if (!is_same)
- return -EBADE;
- }
-
- ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* If can't alter the file contents, we're done. */
- if (!(remap_flags & REMAP_FILE_DEDUP))
- ret = file_modified(file_out);
-
- return ret;
- }
- EXPORT_SYMBOL(generic_remap_file_range_prep);
-
- loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t len, unsigned int remap_flags)
- {
- loff_t ret;
-
- WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
-
- /*
- * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
- * the same mount. Practically, they only need to be on the same file
- * system.
- */
- if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
- return -EXDEV;
-
- ret = generic_file_rw_checks(file_in, file_out);
- if (ret < 0)
- return ret;
-
- if (!file_in->f_op->remap_file_range)
- return -EOPNOTSUPP;
-
- ret = remap_verify_area(file_in, pos_in, len, false);
- if (ret)
- return ret;
-
- ret = remap_verify_area(file_out, pos_out, len, true);
- if (ret)
- return ret;
-
- ret = file_in->f_op->remap_file_range(file_in, pos_in,
- file_out, pos_out, len, remap_flags);
- if (ret < 0)
- return ret;
-
- fsnotify_access(file_in);
- fsnotify_modify(file_out);
- return ret;
- }
- EXPORT_SYMBOL(do_clone_file_range);
-
- loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t len, unsigned int remap_flags)
- {
- loff_t ret;
-
- file_start_write(file_out);
- ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
- remap_flags);
- file_end_write(file_out);
-
- return ret;
- }
- EXPORT_SYMBOL(vfs_clone_file_range);
-
- /* Check whether we are allowed to dedupe the destination file */
- static bool allow_file_dedupe(struct file *file)
- {
- if (capable(CAP_SYS_ADMIN))
- return true;
- if (file->f_mode & FMODE_WRITE)
- return true;
- if (uid_eq(current_fsuid(), file_inode(file)->i_uid))
- return true;
- if (!inode_permission(file_inode(file), MAY_WRITE))
- return true;
- return false;
- }
+ if (!iov_iter_count(from))
+ return 0;
- loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
- struct file *dst_file, loff_t dst_pos,
- loff_t len, unsigned int remap_flags)
- {
- loff_t ret;
+ /* FIXME: this is for backwards compatibility with 2.4 */
+ if (iocb->ki_flags & IOCB_APPEND)
+ iocb->ki_pos = i_size_read(inode);
- WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
- REMAP_FILE_CAN_SHORTEN));
+ if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+ return -EINVAL;
- ret = mnt_want_write_file(dst_file);
+ count = iov_iter_count(from);
+ ret = generic_write_check_limits(file, iocb->ki_pos, &count);
if (ret)
return ret;
- ret = remap_verify_area(dst_file, dst_pos, len, true);
- if (ret < 0)
- goto out_drop_write;
-
- ret = -EPERM;
- if (!allow_file_dedupe(dst_file))
- goto out_drop_write;
-
- ret = -EXDEV;
- if (src_file->f_path.mnt != dst_file->f_path.mnt)
- goto out_drop_write;
-
- ret = -EISDIR;
- if (S_ISDIR(file_inode(dst_file)->i_mode))
- goto out_drop_write;
-
- ret = -EINVAL;
- if (!dst_file->f_op->remap_file_range)
- goto out_drop_write;
-
- if (len == 0) {
- ret = 0;
- goto out_drop_write;
- }
-
- ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
- dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
- out_drop_write:
- mnt_drop_write_file(dst_file);
-
- return ret;
+ iov_iter_truncate(from, count);
+ return iov_iter_count(from);
}
- EXPORT_SYMBOL(vfs_dedupe_file_range_one);
+ EXPORT_SYMBOL(generic_write_checks);
- int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
+ /*
+ * Performs common checks before doing a file copy/clone
+ * from @file_in to @file_out.
+ */
+ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
{
- struct file_dedupe_range_info *info;
- struct inode *src = file_inode(file);
- u64 off;
- u64 len;
- int i;
- int ret;
- u16 count = same->dest_count;
- loff_t deduped;
-
- if (!(file->f_mode & FMODE_READ))
- return -EINVAL;
-
- if (same->reserved1 || same->reserved2)
- return -EINVAL;
-
- off = same->src_offset;
- len = same->src_length;
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
- if (S_ISDIR(src->i_mode))
+ /* Don't copy dirs, pipes, sockets... */
+ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
return -EISDIR;
-
- if (!S_ISREG(src->i_mode))
- return -EINVAL;
-
- if (!file->f_op->remap_file_range)
- return -EOPNOTSUPP;
-
- ret = remap_verify_area(file, off, len, false);
- if (ret < 0)
- return ret;
- ret = 0;
-
- if (off + len > i_size_read(src))
+ if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
return -EINVAL;
- /* Arbitrary 1G limit on a single dedupe request, can be raised. */
- len = min_t(u64, len, 1 << 30);
-
- /* pre-format output fields to sane values */
- for (i = 0; i < count; i++) {
- same->info[i].bytes_deduped = 0ULL;
- same->info[i].status = FILE_DEDUPE_RANGE_SAME;
- }
-
- for (i = 0, info = same->info; i < count; i++, info++) {
- struct fd dst_fd = fdget(info->dest_fd);
- struct file *dst_file = dst_fd.file;
-
- if (!dst_file) {
- info->status = -EBADF;
- goto next_loop;
- }
-
- if (info->reserved) {
- info->status = -EINVAL;
- goto next_fdput;
- }
-
- deduped = vfs_dedupe_file_range_one(file, off, dst_file,
- info->dest_offset, len,
- REMAP_FILE_CAN_SHORTEN);
- if (deduped == -EBADE)
- info->status = FILE_DEDUPE_RANGE_DIFFERS;
- else if (deduped < 0)
- info->status = deduped;
- else
- info->bytes_deduped = len;
+ if (!(file_in->f_mode & FMODE_READ) ||
+ !(file_out->f_mode & FMODE_WRITE) ||
+ (file_out->f_flags & O_APPEND))
+ return -EBADF;
- next_fdput:
- fdput(dst_fd);
- next_loop:
- if (fatal_signal_pending(current))
- break;
- }
- return ret;
+ return 0;
}
- EXPORT_SYMBOL(vfs_dedupe_file_range);
/* File supports async buffered reads */
#define FMODE_BUF_RASYNC ((__force fmode_t)0x40000000)
-/*
- * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
- * that indicates that they should check the contents of the iovec are
- * valid, but not check the memory that the iovec elements
- * points too.
- */
-#define CHECK_IOVEC_ONLY -1
-
/*
* Attribute flags. These should be or-ed together to figure out what
* has been changed!
WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME,
};
-#define IOCB_EVENTFD (1 << 0)
-#define IOCB_APPEND (1 << 1)
-#define IOCB_DIRECT (1 << 2)
-#define IOCB_HIPRI (1 << 3)
-#define IOCB_DSYNC (1 << 4)
-#define IOCB_SYNC (1 << 5)
-#define IOCB_WRITE (1 << 6)
-#define IOCB_NOWAIT (1 << 7)
+/* Match RWF_* bits to IOCB bits */
+#define IOCB_HIPRI (__force int) RWF_HIPRI
+#define IOCB_DSYNC (__force int) RWF_DSYNC
+#define IOCB_SYNC (__force int) RWF_SYNC
+#define IOCB_NOWAIT (__force int) RWF_NOWAIT
+#define IOCB_APPEND (__force int) RWF_APPEND
+
+/* non-RWF related bits - start at 16 */
+#define IOCB_EVENTFD (1 << 16)
+#define IOCB_DIRECT (1 << 17)
+#define IOCB_WRITE (1 << 18)
/* iocb->ki_waitq is valid */
-#define IOCB_WAITQ (1 << 8)
-#define IOCB_NOIO (1 << 9)
+#define IOCB_WAITQ (1 << 19)
+#define IOCB_NOIO (1 << 20)
struct kiocb {
struct file *ki_filp;
#define SB_ACTIVE (1<<30)
#define SB_NOUSER (1<<31)
+/* These flags relate to encoding and casefolding */
+#define SB_ENC_STRICT_MODE_FL (1 << 0)
+
+#define sb_has_strict_encoding(sb) \
+ (sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL)
+
/*
* Umount options
*/
#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */
#define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */
#define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */
-#define SB_I_MULTIROOT 0x00000008 /* Multiple roots to the dentry tree */
+#define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */
/* sb->s_iflags to limit user namespace mounts */
#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */
#endif
#ifdef CONFIG_FS_VERITY
const struct fsverity_operations *s_vop;
+#endif
+#ifdef CONFIG_UNICODE
+ struct unicode_map *s_encoding;
+ __u16 s_encoding_flags;
#endif
struct hlist_bl_head s_roots; /* alternate root dentries for NFS */
struct list_head s_mounts; /* list of mounts; _not_ for fs use */
return file->f_op->mmap(file, vma);
}
-ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
- unsigned long nr_segs, unsigned long fast_segs,
- struct iovec *fast_pointer,
- struct iovec **ret_pointer);
-
extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
-extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
- unsigned long, loff_t *, rwf_t);
extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
loff_t, size_t, unsigned int);
extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
#define FS_HAS_SUBTYPE 4
#define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */
#define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */
+#define FS_THP_SUPPORT 8192 /* Remove once all fs converted */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
int (*init_fs_context)(struct fs_context *);
const struct fs_parameter_spec *parameters;
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end);
+void invalidate_mapping_pagevec(struct address_space *mapping,
+ pgoff_t start, pgoff_t end,
+ unsigned long *nr_pagevec);
+
static inline void invalidate_remote_inode(struct inode *inode)
{
if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err);
}
-static inline int filemap_nr_thps(struct address_space *mapping)
-{
-#ifdef CONFIG_READ_ONLY_THP_FOR_FS
- return atomic_read(&mapping->nr_thps);
-#else
- return 0;
-#endif
-}
-
-static inline void filemap_nr_thps_inc(struct address_space *mapping)
-{
-#ifdef CONFIG_READ_ONLY_THP_FOR_FS
- atomic_inc(&mapping->nr_thps);
-#else
- WARN_ON_ONCE(1);
-#endif
-}
-
-static inline void filemap_nr_thps_dec(struct address_space *mapping)
-{
-#ifdef CONFIG_READ_ONLY_THP_FOR_FS
- atomic_dec(&mapping->nr_thps);
-#else
- WARN_ON_ONCE(1);
-#endif
-}
-
extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
int datasync);
extern int vfs_fsync(struct file *file, int datasync);
#endif
extern int do_pipe_flags(int *, int);
-#define __kernel_read_file_id(id) \
- id(UNKNOWN, unknown) \
- id(FIRMWARE, firmware) \
- id(FIRMWARE_PREALLOC_BUFFER, firmware) \
- id(FIRMWARE_EFI_EMBEDDED, firmware) \
- id(MODULE, kernel-module) \
- id(KEXEC_IMAGE, kexec-image) \
- id(KEXEC_INITRAMFS, kexec-initramfs) \
- id(POLICY, security-policy) \
- id(X509_CERTIFICATE, x509-certificate) \
- id(MAX_ID, )
-
-#define __fid_enumify(ENUM, dummy) READING_ ## ENUM,
-#define __fid_stringify(dummy, str) #str,
-
-enum kernel_read_file_id {
- __kernel_read_file_id(__fid_enumify)
-};
-
-static const char * const kernel_read_file_str[] = {
- __kernel_read_file_id(__fid_stringify)
-};
-
-static inline const char *kernel_read_file_id_str(enum kernel_read_file_id id)
-{
- if ((unsigned)id >= READING_MAX_ID)
- return kernel_read_file_str[READING_UNKNOWN];
-
- return kernel_read_file_str[id];
-}
-
-extern int kernel_read_file(struct file *, void **, loff_t *, loff_t,
- enum kernel_read_file_id);
-extern int kernel_read_file_from_path(const char *, void **, loff_t *, loff_t,
- enum kernel_read_file_id);
-extern int kernel_read_file_from_path_initns(const char *, void **, loff_t *, loff_t,
- enum kernel_read_file_id);
-extern int kernel_read_file_from_fd(int, void **, loff_t *, loff_t,
- enum kernel_read_file_id);
extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *);
ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos);
extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *);
extern int generic_file_mmap(struct file *, struct vm_area_struct *);
extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
- extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *count, unsigned int remap_flags);
+ extern int generic_write_check_limits(struct file *file, loff_t pos,
+ loff_t *count);
extern int generic_file_rw_checks(struct file *file_in, struct file *file_out);
- extern int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- size_t *count, unsigned int flags);
extern ssize_t generic_file_buffered_read(struct kiocb *iocb,
struct iov_iter *to, ssize_t already_read);
extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
DIO_SKIP_HOLES = 0x02,
};
-void dio_end_io(struct bio *bio);
-
ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, struct iov_iter *iter,
get_block_t get_block,
extern int generic_check_addressable(unsigned, u64);
+#ifdef CONFIG_UNICODE
+extern int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str);
+extern int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
+ const char *str, const struct qstr *name);
+#endif
+
#ifdef CONFIG_MIGRATION
extern int buffer_migrate_page(struct address_space *,
struct page *, struct page *,
{
int kiocb_flags = 0;
+ /* make sure there's no overlap between RWF and private IOCB flags */
+ BUILD_BUG_ON((__force int) RWF_SUPPORTED & IOCB_EVENTFD);
+
if (!flags)
return 0;
if (unlikely(flags & ~RWF_SUPPORTED))
if (flags & RWF_NOWAIT) {
if (!(ki->ki_filp->f_mode & FMODE_NOWAIT))
return -EOPNOTSUPP;
- kiocb_flags |= IOCB_NOWAIT | IOCB_NOIO;
+ kiocb_flags |= IOCB_NOIO;
}
- if (flags & RWF_HIPRI)
- kiocb_flags |= IOCB_HIPRI;
- if (flags & RWF_DSYNC)
- kiocb_flags |= IOCB_DSYNC;
+ kiocb_flags |= (__force int) (flags & RWF_SUPPORTED);
if (flags & RWF_SYNC)
- kiocb_flags |= (IOCB_DSYNC | IOCB_SYNC);
- if (flags & RWF_APPEND)
- kiocb_flags |= IOCB_APPEND;
+ kiocb_flags |= IOCB_DSYNC;
ki->ki_flags |= kiocb_flags;
return 0;
extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
int advice);
-#if defined(CONFIG_IO_URING)
-extern struct sock *io_uring_get_socket(struct file *file);
-#else
-static inline struct sock *io_uring_get_socket(struct file *file)
-{
- return NULL;
-}
-#endif
-
int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags,
unsigned int flags);
freepage(page);
if (PageTransHuge(page) && !PageHuge(page)) {
- page_ref_sub(page, HPAGE_PMD_NR);
+ page_ref_sub(page, thp_nr_pages(page));
VM_BUG_ON_PAGE(page_count(page) <= 0, page);
} else {
put_page(page);
.range_end = end,
};
- if (!mapping_cap_writeback_dirty(mapping) ||
+ if (!mapping_can_writeback(mapping) ||
!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);
-static int __add_to_page_cache_locked(struct page *page,
- struct address_space *mapping,
- pgoff_t offset, gfp_t gfp_mask,
- void **shadowp)
+noinline int __add_to_page_cache_locked(struct page *page,
+ struct address_space *mapping,
+ pgoff_t offset, gfp_t gfp,
+ void **shadowp)
{
XA_STATE(xas, &mapping->i_pages, offset);
int huge = PageHuge(page);
int error;
- void *old;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
page->index = offset;
if (!huge) {
- error = mem_cgroup_charge(page, current->mm, gfp_mask);
+ error = mem_cgroup_charge(page, current->mm, gfp);
if (error)
goto error;
}
+ gfp &= GFP_RECLAIM_MASK;
+
do {
+ unsigned int order = xa_get_order(xas.xa, xas.xa_index);
+ void *entry, *old = NULL;
+
+ if (order > thp_order(page))
+ xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
+ order, gfp);
xas_lock_irq(&xas);
- old = xas_load(&xas);
- if (old && !xa_is_value(old))
- xas_set_err(&xas, -EEXIST);
+ xas_for_each_conflict(&xas, entry) {
+ old = entry;
+ if (!xa_is_value(entry)) {
+ xas_set_err(&xas, -EEXIST);
+ goto unlock;
+ }
+ }
+
+ if (old) {
+ if (shadowp)
+ *shadowp = old;
+ /* entry may have been split before we acquired lock */
+ order = xa_get_order(xas.xa, xas.xa_index);
+ if (order > thp_order(page)) {
+ xas_split(&xas, old, order);
+ xas_reset(&xas);
+ }
+ }
+
xas_store(&xas, page);
if (xas_error(&xas))
goto unlock;
- if (xa_is_value(old)) {
+ if (old)
mapping->nrexceptional--;
- if (shadowp)
- *shadowp = old;
- }
mapping->nrpages++;
/* hugetlb pages do not participate in page cache accounting */
__inc_lruvec_page_state(page, NR_FILE_PAGES);
unlock:
xas_unlock_irq(&xas);
- } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
+ } while (xas_nomem(&xas, gfp));
if (xas_error(&xas)) {
error = xas_error(&xas);
* unlock_page - unlock a locked page
* @page: the page
*
- * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
+ * Unlocks the page and wakes up sleepers in wait_on_page_locked().
* Also wakes sleepers in wait_on_page_writeback() because the wakeup
* mechanism between PageLocked pages and PageWriteback pages is shared.
* But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
/**
* find_get_entry - find and get a page cache entry
* @mapping: the address_space to search
- * @offset: the page cache index
+ * @index: The page cache index.
*
* Looks up the page cache slot at @mapping & @offset. If there is a
- * page cache page, it is returned with an increased refcount.
+ * page cache page, the head page is returned with an increased refcount.
*
* If the slot holds a shadow entry of a previously evicted page, or a
* swap entry from shmem/tmpfs, it is returned.
*
- * Return: the found page or shadow entry, %NULL if nothing is found.
+ * Return: The head page or shadow entry, %NULL if nothing is found.
*/
-struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
+struct page *find_get_entry(struct address_space *mapping, pgoff_t index)
{
- XA_STATE(xas, &mapping->i_pages, offset);
+ XA_STATE(xas, &mapping->i_pages, index);
struct page *page;
rcu_read_lock();
put_page(page);
goto repeat;
}
- page = find_subpage(page, offset);
out:
rcu_read_unlock();
}
/**
- * find_lock_entry - locate, pin and lock a page cache entry
- * @mapping: the address_space to search
- * @offset: the page cache index
+ * find_lock_entry - Locate and lock a page cache entry.
+ * @mapping: The address_space to search.
+ * @index: The page cache index.
*
- * Looks up the page cache slot at @mapping & @offset. If there is a
- * page cache page, it is returned locked and with an increased
- * refcount.
+ * Looks up the page at @mapping & @index. If there is a page in the
+ * cache, the head page is returned locked and with an increased refcount.
*
* If the slot holds a shadow entry of a previously evicted page, or a
* swap entry from shmem/tmpfs, it is returned.
*
- * find_lock_entry() may sleep.
- *
- * Return: the found page or shadow entry, %NULL if nothing is found.
+ * Context: May sleep.
+ * Return: The head page or shadow entry, %NULL if nothing is found.
*/
-struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
+struct page *find_lock_entry(struct address_space *mapping, pgoff_t index)
{
struct page *page;
repeat:
- page = find_get_entry(mapping, offset);
+ page = find_get_entry(mapping, index);
if (page && !xa_is_value(page)) {
lock_page(page);
/* Has the page been truncated? */
- if (unlikely(page_mapping(page) != mapping)) {
+ if (unlikely(page->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto repeat;
}
- VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
+ VM_BUG_ON_PAGE(!thp_contains(page, index), page);
}
return page;
}
-EXPORT_SYMBOL(find_lock_entry);
/**
* pagecache_get_page - Find and get a reference to a page.
*
* * %FGP_ACCESSED - The page will be marked accessed.
* * %FGP_LOCK - The page is returned locked.
+ * * %FGP_HEAD - If the page is present and a THP, return the head page
+ * rather than the exact page specified by the index.
* * %FGP_CREAT - If no page is present then a new page is allocated using
* @gfp_mask and added to the page cache and the VM's LRU list.
* The page is returned locked and with an increased refcount.
}
/* Has the page been truncated? */
- if (unlikely(compound_head(page)->mapping != mapping)) {
+ if (unlikely(page->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto repeat;
}
- VM_BUG_ON_PAGE(page->index != index, page);
+ VM_BUG_ON_PAGE(!thp_contains(page, index), page);
}
if (fgp_flags & FGP_ACCESSED)
if (page_is_idle(page))
clear_page_idle(page);
}
+ if (!(fgp_flags & FGP_HEAD))
+ page = find_subpage(page, index);
no_page:
if (!page && (fgp_flags & FGP_CREAT)) {
int err;
- if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
+ if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
gfp_mask |= __GFP_WRITE;
if (fgp_flags & FGP_NOFS)
gfp_mask &= ~__GFP_FS;
last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
offset = *ppos & ~PAGE_MASK;
+ /*
+ * If we've already successfully copied some data, then we
+ * can no longer safely return -EIOCBQUEUED. Hence mark
+ * an async read NOWAIT at that point.
+ */
+ if (written && (iocb->ki_flags & IOCB_WAITQ))
+ iocb->ki_flags |= IOCB_NOWAIT;
+
for (;;) {
struct page *page;
pgoff_t end_index;
struct file *file = vmf->vma->vm_file;
struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
+ DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff);
struct file *fpin = NULL;
- pgoff_t offset = vmf->pgoff;
unsigned int mmap_miss;
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_SEQ_READ) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- page_cache_sync_readahead(mapping, ra, file, offset,
- ra->ra_pages);
+ page_cache_sync_ra(&ractl, ra, ra->ra_pages);
return fpin;
}
* mmap read-around
*/
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
+ ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
ra->size = ra->ra_pages;
ra->async_size = ra->ra_pages / 4;
- ra_submit(ra, mapping, file);
+ ractl._index = ra->start;
+ do_page_cache_ra(&ractl, ra->size, ra->async_size);
return fpin;
}
pgoff_t last_pgoff = start_pgoff;
unsigned long max_idx;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
- struct page *page;
+ struct page *head, *page;
unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
rcu_read_lock();
- xas_for_each(&xas, page, end_pgoff) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, head, end_pgoff) {
+ if (xas_retry(&xas, head))
continue;
- if (xa_is_value(page))
+ if (xa_is_value(head))
goto next;
/*
* Check for a locked page first, as a speculative
* reference may adversely influence page migration.
*/
- if (PageLocked(page))
+ if (PageLocked(head))
goto next;
- if (!page_cache_get_speculative(page))
+ if (!page_cache_get_speculative(head))
goto next;
/* Has the page moved or been split? */
- if (unlikely(page != xas_reload(&xas)))
+ if (unlikely(head != xas_reload(&xas)))
goto skip;
- page = find_subpage(page, xas.xa_index);
+ page = find_subpage(head, xas.xa_index);
- if (!PageUptodate(page) ||
+ if (!PageUptodate(head) ||
PageReadahead(page) ||
PageHWPoison(page))
goto skip;
- if (!trylock_page(page))
+ if (!trylock_page(head))
goto skip;
- if (page->mapping != mapping || !PageUptodate(page))
+ if (head->mapping != mapping || !PageUptodate(head))
goto unlock;
max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
- if (page->index >= max_idx)
+ if (xas.xa_index >= max_idx)
goto unlock;
if (mmap_miss > 0)
last_pgoff = xas.xa_index;
if (alloc_set_pte(vmf, page))
goto unlock;
- unlock_page(page);
+ unlock_page(head);
goto next;
unlock:
- unlock_page(page);
+ unlock_page(head);
skip:
- put_page(page);
+ put_page(head);
next:
/* Huge page is mapped? No need to proceed. */
if (pmd_trans_huge(*vmf->pmd))
goto out;
/*
- * Page is not up to date and may be locked due one of the following
+ * Page is not up to date and may be locked due to one of the following
* case a: Page is being filled and the page lock is held
* case b: Read/write error clearing the page uptodate status
* case c: Truncation in progress (page locked)
}
EXPORT_SYMBOL(read_cache_page_gfp);
- /*
- * Don't operate on ranges the page cache doesn't support, and don't exceed the
- * LFS limits. If pos is under the limit it becomes a short access. If it
- * exceeds the limit we return -EFBIG.
- */
- static int generic_write_check_limits(struct file *file, loff_t pos,
- loff_t *count)
- {
- struct inode *inode = file->f_mapping->host;
- loff_t max_size = inode->i_sb->s_maxbytes;
- loff_t limit = rlimit(RLIMIT_FSIZE);
-
- if (limit != RLIM_INFINITY) {
- if (pos >= limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
- *count = min(*count, limit - pos);
- }
-
- if (!(file->f_flags & O_LARGEFILE))
- max_size = MAX_NON_LFS;
-
- if (unlikely(pos >= max_size))
- return -EFBIG;
-
- *count = min(*count, max_size - pos);
-
- return 0;
- }
-
- /*
- * Performs necessary checks before doing a write
- *
- * Can adjust writing position or amount of bytes to write.
- * Returns appropriate error code that caller should return or
- * zero in case that write should be allowed.
- */
- inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
- {
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- loff_t count;
- int ret;
-
- if (IS_SWAPFILE(inode))
- return -ETXTBSY;
-
- if (!iov_iter_count(from))
- return 0;
-
- /* FIXME: this is for backwards compatibility with 2.4 */
- if (iocb->ki_flags & IOCB_APPEND)
- iocb->ki_pos = i_size_read(inode);
-
- if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
- return -EINVAL;
-
- count = iov_iter_count(from);
- ret = generic_write_check_limits(file, iocb->ki_pos, &count);
- if (ret)
- return ret;
-
- iov_iter_truncate(from, count);
- return iov_iter_count(from);
- }
- EXPORT_SYMBOL(generic_write_checks);
-
- /*
- * Performs necessary checks before doing a clone.
- *
- * Can adjust amount of bytes to clone via @req_count argument.
- * Returns appropriate error code that caller should return or
- * zero in case the clone should be allowed.
- */
- int generic_remap_checks(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *req_count, unsigned int remap_flags)
- {
- struct inode *inode_in = file_in->f_mapping->host;
- struct inode *inode_out = file_out->f_mapping->host;
- uint64_t count = *req_count;
- uint64_t bcount;
- loff_t size_in, size_out;
- loff_t bs = inode_out->i_sb->s_blocksize;
- int ret;
-
- /* The start of both ranges must be aligned to an fs block. */
- if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
- return -EINVAL;
-
- /* Ensure offsets don't wrap. */
- if (pos_in + count < pos_in || pos_out + count < pos_out)
- return -EINVAL;
-
- size_in = i_size_read(inode_in);
- size_out = i_size_read(inode_out);
-
- /* Dedupe requires both ranges to be within EOF. */
- if ((remap_flags & REMAP_FILE_DEDUP) &&
- (pos_in >= size_in || pos_in + count > size_in ||
- pos_out >= size_out || pos_out + count > size_out))
- return -EINVAL;
-
- /* Ensure the infile range is within the infile. */
- if (pos_in >= size_in)
- return -EINVAL;
- count = min(count, size_in - (uint64_t)pos_in);
-
- ret = generic_write_check_limits(file_out, pos_out, &count);
- if (ret)
- return ret;
-
- /*
- * If the user wanted us to link to the infile's EOF, round up to the
- * next block boundary for this check.
- *
- * Otherwise, make sure the count is also block-aligned, having
- * already confirmed the starting offsets' block alignment.
- */
- if (pos_in + count == size_in) {
- bcount = ALIGN(size_in, bs) - pos_in;
- } else {
- if (!IS_ALIGNED(count, bs))
- count = ALIGN_DOWN(count, bs);
- bcount = count;
- }
-
- /* Don't allow overlapped cloning within the same file. */
- if (inode_in == inode_out &&
- pos_out + bcount > pos_in &&
- pos_out < pos_in + bcount)
- return -EINVAL;
-
- /*
- * We shortened the request but the caller can't deal with that, so
- * bounce the request back to userspace.
- */
- if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
- return -EINVAL;
-
- *req_count = count;
- return 0;
- }
-
-
- /*
- * Performs common checks before doing a file copy/clone
- * from @file_in to @file_out.
- */
- int generic_file_rw_checks(struct file *file_in, struct file *file_out)
- {
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
-
- /* Don't copy dirs, pipes, sockets... */
- if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
- return -EISDIR;
- if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
- return -EINVAL;
-
- if (!(file_in->f_mode & FMODE_READ) ||
- !(file_out->f_mode & FMODE_WRITE) ||
- (file_out->f_flags & O_APPEND))
- return -EBADF;
-
- return 0;
- }
-
- /*
- * Performs necessary checks before doing a file copy
- *
- * Can adjust amount of bytes to copy via @req_count argument.
- * Returns appropriate error code that caller should return or
- * zero in case the copy should be allowed.
- */
- int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- size_t *req_count, unsigned int flags)
- {
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- uint64_t count = *req_count;
- loff_t size_in;
- int ret;
-
- ret = generic_file_rw_checks(file_in, file_out);
- if (ret)
- return ret;
-
- /* Don't touch certain kinds of inodes */
- if (IS_IMMUTABLE(inode_out))
- return -EPERM;
-
- if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
- return -ETXTBSY;
-
- /* Ensure offsets don't wrap. */
- if (pos_in + count < pos_in || pos_out + count < pos_out)
- return -EOVERFLOW;
-
- /* Shorten the copy to EOF */
- size_in = i_size_read(inode_in);
- if (pos_in >= size_in)
- count = 0;
- else
- count = min(count, size_in - (uint64_t)pos_in);
-
- ret = generic_write_check_limits(file_out, pos_out, &count);
- if (ret)
- return ret;
-
- /* Don't allow overlapped copying within the same file. */
- if (inode_in == inode_out &&
- pos_out + count > pos_in &&
- pos_out < pos_in + count)
- return -EINVAL;
-
- *req_count = count;
- return 0;
- }
-
int pagecache_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)