// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
 */
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_format.h"
14 #include "scrub/xfile.h"
15 #include "scrub/xfarray.h"
16 #include "scrub/scrub.h"
17 #include "scrub/trace.h"
18 #include <linux/shmem_fs.h>
/*
 * Swappable Temporary Memory
 * ==========================
 *
 * Online checking sometimes needs to be able to stage a large amount of data
 * in memory.  This information might not fit in the available memory and it
 * doesn't all need to be accessible at all times.  In other words, we want an
 * indexed data buffer to store data that can be paged out.
 *
 * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those
 * requirements.  Therefore, the xfile mechanism uses an unlinked shmem file
 * to store our staging data.  This file is not installed in the file
 * descriptor table so that user programs cannot access the data, which means
 * that the xfile must be freed with xfile_destroy.
 *
 * xfiles assume that the caller will handle all required concurrency
 * management; standard vfs locks (freezer and inode) are not taken.  Reads
 * and writes are satisfied directly from the page cache.
 *
 * NOTE: The current shmemfs implementation has a quirk that in-kernel reads
 * of a hole cause a page to be mapped into the file.  If you are going to
 * create a sparse xfile, please be careful about reading from uninitialized
 * parts of the file.  These pages are !Uptodate and will eventually be
 * reclaimed if not written, but in the short term this boosts memory
 * consumption.
 */
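/*
 * A minimal usage sketch (illustrative only, not part of this file): stage
 * a value in an xfile and read it back.  The function name and error
 * handling here are hypothetical.
 */
#if 0
static int
example_xfile_roundtrip(void)
{
	struct xfile	*xf;
	__u64		val = 42;
	ssize_t		ret;
	int		error;

	error = xfile_create("example staging data", 0, &xf);
	if (error)
		return error;

	/* xfile_pread/xfile_pwrite take (buffer, byte count, file offset). */
	ret = xfile_pwrite(xf, &val, sizeof(val), 0);
	if (ret == sizeof(val))
		ret = xfile_pread(xf, &val, sizeof(val), 0);

	xfile_destroy(xf);
	return ret < 0 ? ret : 0;
}
#endif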
/*
 * xfiles must not be exposed to userspace and require upper layers to
 * coordinate access to the one handle returned by the constructor, so
 * establish a separate lock class for xfiles to avoid confusing lockdep.
 */
static struct lock_class_key xfile_i_mutex_key;
/*
 * Create an xfile of the given size.  The description will be used in the
 * trace output.
 */
int
xfile_create(
	const char		*description,
	loff_t			isize,
	struct xfile		**xfilep)
{
	struct inode		*inode;
	struct xfile		*xf;
	int			error = -ENOMEM;

	xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS);
	if (!xf)
		return -ENOMEM;

	xf->file = shmem_file_setup(description, isize, 0);
	if (!xf->file)
		goto out_xfile;
	if (IS_ERR(xf->file)) {
		error = PTR_ERR(xf->file);
		goto out_xfile;
	}

	/*
	 * We want a large sparse file that we can pread, pwrite, and seek.
	 * xfile users are responsible for keeping the xfile hidden away from
	 * all other callers, so we skip timestamp updates and security checks.
	 * Make the inode only accessible by root, just in case the xfile ever
	 * escapes.
	 */
	xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME |
			    FMODE_LSEEK;
	xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME;
	inode = file_inode(xf->file);
	inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME;
	inode->i_mode &= ~0177;
	inode->i_uid = GLOBAL_ROOT_UID;
	inode->i_gid = GLOBAL_ROOT_GID;

	lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key);

	trace_xfile_create(xf);

	*xfilep = xf;
	return 0;
out_xfile:
	kfree(xf);
	return error;
}
/* Close the file and release all resources. */
void
xfile_destroy(
	struct xfile		*xf)
{
	struct inode		*inode = file_inode(xf->file);

	trace_xfile_destroy(xf);

	lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_key);
	fput(xf->file);
	kfree(xf);
}
/*
 * Read a memory object directly from the xfile's page cache.  Unlike regular
 * pread, we return -E2BIG and -EFBIG for reads that are too large or at too
 * high an offset, instead of truncating the read.  Otherwise, we return the
 * bytes read or an error code, like regular pread.
 */
ssize_t
xfile_pread(
	struct xfile		*xf,
	void			*buf,
	size_t			count,
	loff_t			pos)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	struct page		*page = NULL;
	ssize_t			read = 0;
	unsigned int		pflags;
	int			error = 0;

	if (count > MAX_RW_COUNT)
		return -E2BIG;
	if (inode->i_sb->s_maxbytes - pos < count)
		return -EFBIG;

	trace_xfile_pread(xf, pos, count);

	pflags = memalloc_nofs_save();
	while (count > 0) {
		void		*p, *kaddr;
		unsigned int	len;

		len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));

		/*
		 * In-kernel reads of a shmem file cause it to allocate a page
		 * if the mapping shows a hole.  Therefore, if we hit ENOMEM
		 * we can continue by zeroing the caller's buffer.
		 */
		page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT,
				__GFP_NOWARN);
		if (IS_ERR(page)) {
			error = PTR_ERR(page);
			if (error != -ENOMEM)
				break;

			memset(buf, 0, len);
			goto advance;
		}

		if (PageUptodate(page)) {
			/*
			 * xfile pages must never be mapped into userspace, so
			 * we skip the dcache flush.
			 */
			kaddr = kmap_local_page(page);
			p = kaddr + offset_in_page(pos);
			memcpy(buf, p, len);
			kunmap_local(kaddr);
		} else {
			memset(buf, 0, len);
		}
		put_page(page);

advance:
		count -= len;
		pos += len;
		buf += len;
		read += len;
	}
	memalloc_nofs_restore(pflags);

	if (read > 0)
		return read;
	return error;
}
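/*
 * Sketch (hypothetical helper, not in this file): callers that load
 * fixed-size objects usually want all-or-nothing behavior, so a thin
 * wrapper can flatten a short read into a single error code.
 */
#if 0
static inline int
example_obj_load(struct xfile *xf, void *buf, size_t count, loff_t pos)
{
	ssize_t	ret = xfile_pread(xf, buf, count, pos);

	/* A partial read means an error hit partway through; report ENOMEM. */
	if (ret < 0 || (size_t)ret != count)
		return -ENOMEM;
	return 0;
}
#endif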
/*
 * Write a memory object directly to the xfile's page cache.  Unlike regular
 * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at
 * too high an offset, instead of truncating the write.  Otherwise, we return
 * the bytes written or an error code, like regular pwrite.
 */
ssize_t
xfile_pwrite(
	struct xfile		*xf,
	const void		*buf,
	size_t			count,
	loff_t			pos)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	struct page		*page = NULL;
	ssize_t			written = 0;
	unsigned int		pflags;
	int			error = 0;

	if (count > MAX_RW_COUNT)
		return -E2BIG;
	if (inode->i_sb->s_maxbytes - pos < count)
		return -EFBIG;

	trace_xfile_pwrite(xf, pos, count);

	pflags = memalloc_nofs_save();
	while (count > 0) {
		void		*fsdata = NULL;
		void		*p, *kaddr;
		unsigned int	len;
		int		ret;

		len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));

		/*
		 * We call write_begin directly here to avoid all the freezer
		 * protection lock-taking that happens in the normal path.
		 * shmem doesn't support fs freeze, but lockdep doesn't know
		 * that and will trip over that.
		 */
		error = aops->write_begin(NULL, mapping, pos, len, &page,
				&fsdata);
		if (error)
			break;

		/*
		 * xfile pages must never be mapped into userspace, so we skip
		 * the dcache flush.  If the page is not uptodate, zero it
		 * before writing data.
		 */
		kaddr = kmap_local_page(page);
		if (!PageUptodate(page)) {
			memset(kaddr, 0, PAGE_SIZE);
			SetPageUptodate(page);
		}
		p = kaddr + offset_in_page(pos);
		memcpy(p, buf, len);
		kunmap_local(kaddr);

		ret = aops->write_end(NULL, mapping, pos, len, len, page,
				fsdata);
		if (ret < 0) {
			error = ret;
			break;
		}

		written += ret;
		if (ret != len) {
			error = -EIO;
			break;
		}

		count -= ret;
		pos += ret;
		buf += ret;
	}
	memalloc_nofs_restore(pflags);

	if (written > 0)
		return written;
	return error;
}
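/*
 * Sketch (hypothetical helper): the store-side counterpart of the load
 * wrapper above, flattening a short write from xfile_pwrite into one error.
 */
#if 0
static inline int
example_obj_store(struct xfile *xf, const void *buf, size_t count, loff_t pos)
{
	ssize_t	ret = xfile_pwrite(xf, buf, count, pos);

	if (ret < 0 || (size_t)ret != count)
		return -ENOMEM;
	return 0;
}
#endif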
/* Find the next written area in the xfile data for a given offset. */
loff_t
xfile_seek_data(
	struct xfile		*xf,
	loff_t			pos)
{
	loff_t			ret;

	ret = vfs_llseek(xf->file, pos, SEEK_DATA);
	trace_xfile_seek_data(xf, pos, ret);
	return ret;
}
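/*
 * Sketch (illustrative only): walking just the written regions of a sparse
 * xfile.  vfs_llseek returns a negative errno (-ENXIO) once no data remains
 * past the offset, which ends the loop.
 */
#if 0
static void
example_walk_data(struct xfile *xf, loff_t isize)
{
	loff_t	pos = 0;

	while ((pos = xfile_seek_data(xf, pos)) >= 0 && pos < isize) {
		/* ...process the data at pos... */
		pos = round_up(pos + 1, PAGE_SIZE);
	}
}
#endif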
/* Query stat information for an xfile. */
int
xfile_stat(
	struct xfile		*xf,
	struct xfile_stat	*statbuf)
{
	struct kstat		ks;
	int			error;

	error = vfs_getattr_nosec(&xf->file->f_path, &ks,
			STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC);
	if (error)
		return error;

	statbuf->size = ks.size;
	statbuf->bytes = ks.blocks << SECTOR_SHIFT;
	return 0;
}
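/*
 * Sketch (illustrative only): ks.blocks counts 512-byte sectors, so
 * statbuf->bytes reflects the pages actually allocated; for a sparse xfile
 * it can be far smaller than statbuf->size (e.g. 4096 bytes for a single
 * allocated page in a 1GiB file).
 */
#if 0
static loff_t
example_xfile_bytes_in_memory(struct xfile *xf)
{
	struct xfile_stat	xs;

	if (xfile_stat(xf, &xs))
		return 0;
	return xs.bytes;
}
#endif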
/*
 * Grab the (locked) page for a memory object.  The object cannot span a page
 * boundary.  Returns 0 (and a locked page) if successful, -ENOTBLK if we
 * cannot grab the page, or the usual negative errno.
 */
int
xfile_get_page(
	struct xfile		*xf,
	loff_t			pos,
	unsigned int		len,
	struct xfile_page	*xfpage)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	struct page		*page = NULL;
	void			*fsdata = NULL;
	loff_t			key = round_down(pos, PAGE_SIZE);
	unsigned int		pflags;
	int			error;

	if (inode->i_sb->s_maxbytes - pos < len)
		return -ENOMEM;
	if (len > PAGE_SIZE - offset_in_page(pos))
		return -ENOTBLK;

	trace_xfile_get_page(xf, pos, len);

	pflags = memalloc_nofs_save();

	/*
	 * We call write_begin directly here to avoid all the freezer
	 * protection lock-taking that happens in the normal path.  shmem
	 * doesn't support fs freeze, but lockdep doesn't know that and will
	 * trip over that.
	 */
	error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page,
			&fsdata);
	if (error)
		goto out_pflags;

	/* We got the page, so make sure we push out EOF. */
	if (i_size_read(inode) < pos + len)
		i_size_write(inode, pos + len);

	/*
	 * If the page isn't up to date, fill it with zeroes before we hand it
	 * to the caller and make sure the backing store will hold on to them.
	 */
	if (!PageUptodate(page)) {
		void	*kaddr;

		kaddr = kmap_local_page(page);
		memset(kaddr, 0, PAGE_SIZE);
		kunmap_local(kaddr);
		SetPageUptodate(page);
	}

	/*
	 * Mark each page dirty so that the contents are written to some
	 * backing store when we drop this buffer, and take an extra reference
	 * to prevent the xfile page from being swapped or removed from the
	 * page cache by reclaim if the caller unlocks the page.
	 */
	set_page_dirty(page);
	get_page(page);

	xfpage->page = page;
	xfpage->fsdata = fsdata;
	xfpage->pos = key;
out_pflags:
	memalloc_nofs_restore(pflags);
	return error;
}
/*
 * Release the (locked) page for a memory object.  Returns 0 or a negative
 * errno.
 */
int
xfile_put_page(
	struct xfile		*xf,
	struct xfile_page	*xfpage)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	unsigned int		pflags;
	int			ret;

	trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE);

	/* Give back the reference that we took in xfile_get_page. */
	put_page(xfpage->page);

	pflags = memalloc_nofs_save();
	ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE,
			xfpage->page, xfpage->fsdata);
	memalloc_nofs_restore(pflags);
	memset(xfpage, 0, sizeof(struct xfile_page));

	if (ret < 0)
		return ret;
	if (ret != PAGE_SIZE)
		return -EIO;
	return 0;
}
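/*
 * Sketch (illustrative only, hypothetical function name): direct page access
 * brackets an in-place update between xfile_get_page and xfile_put_page; the
 * object must not cross a page boundary.
 */
#if 0
static int
example_update_in_place(struct xfile *xf, loff_t pos)
{
	struct xfile_page	xfpage;
	__u32			*p;
	int			error;

	error = xfile_get_page(xf, pos, sizeof(__u32), &xfpage);
	if (error)
		return error;

	/* The page is locked, uptodate, and dirty; bump a counter directly. */
	p = kmap_local_page(xfpage.page);
	p[offset_in_page(pos) / sizeof(__u32)]++;
	kunmap_local(p);

	return xfile_put_page(xf, &xfpage);
}
#endif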