such reservations
because writeback will not consume the reservation.
-The ``iomap_file_buffered_write_punch_delalloc`` can be called from a
+The ``iomap_write_delalloc_release`` function can be called from a
``->iomap_end`` function to find all the clean areas of the folios
caching a fresh (``IOMAP_F_NEW``) delalloc mapping.
It takes the ``invalidate_lock``.
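As a minimal sketch of a caller (the ``myfs_*`` names are hypothetical; only
the ``iomap_write_delalloc_release`` signature and the iomap flags come from
the declarations later in this section), a buffered-write ``->iomap_end``
handler could release the unused part of a fresh delalloc reservation roughly
as follows::

	static void myfs_punch_delalloc(struct inode *inode, loff_t offset,
			loff_t length, struct iomap *iomap)
	{
		/* filesystem-specific removal of the delalloc reservation */
	}

	static int myfs_buffered_write_iomap_end(struct inode *inode, loff_t pos,
			loff_t length, ssize_t written, unsigned flags,
			struct iomap *iomap)
	{
		loff_t start_byte, end_byte;

		/* Only freshly allocated delalloc reservations may be punched out. */
		if (iomap->type != IOMAP_DELALLOC || !(iomap->flags & IOMAP_F_NEW))
			return 0;

		/* Nothing to release if every block in the range was written. */
		start_byte = iomap_last_written_block(inode, pos, written);
		end_byte = round_up(pos + length, i_blocksize(inode));
		if (start_byte >= end_byte)
			return 0;

		iomap_write_delalloc_release(inode, start_byte, end_byte, flags,
				iomap, myfs_punch_delalloc);
		return 0;
	}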
if the mapping is unwritten and the filesystem cannot handle zeroing
the unaligned regions without exposing stale contents.
+ * ``IOMAP_ATOMIC``: This write is being issued with torn-write
+ protection.
+ Only a single bio can be created for the write, and the write must
+ not be split into multiple I/O requests, i.e. flag REQ_ATOMIC must be
+ set.
+ The file range to write must be aligned to satisfy the requirements
+ of both the filesystem and the underlying block device's atomic
+ commit capabilities.
+ If filesystem metadata updates are required (e.g. unwritten extent
+ conversion or copy on write), all updates for the entire file range
+ must be committed atomically as well.
+ Only one space mapping is allowed per untorn write.
+ Untorn writes must be aligned to, and must not be longer than, a
+ single file block.
+
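For illustration, a userspace sketch of issuing such an untorn write (the
path and the 4 KiB size are hypothetical; ``RWF_ATOMIC`` requires direct I/O
and recent headers, so its value from ``include/uapi/linux/fs.h`` is mirrored
here as a fallback)::

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/uio.h>

	#ifndef RWF_ATOMIC
	#define RWF_ATOMIC	0x00000040	/* mirrors include/uapi/linux/fs.h */
	#endif

	int main(void)
	{
		struct iovec iov;
		void *buf;
		int fd;

		fd = open("/mnt/test/file", O_RDWR | O_DIRECT);
		if (fd < 0 || posix_memalign(&buf, 4096, 4096))
			return 1;
		memset(buf, 0xab, 4096);

		iov.iov_base = buf;
		iov.iov_len = 4096;	/* exactly one aligned file block */

		/* The kernel submits this as a single REQ_ATOMIC bio. */
		if (pwritev2(fd, &iov, 1, 0, RWF_ATOMIC) < 0)
			perror("pwritev2(RWF_ATOMIC)");
		return 0;
	}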
Callers commonly hold ``i_rwsem`` in shared or exclusive mode before
calling this function.
return 0;
}
+ /*
+ * ext4_atomic_write_init: Initializes filesystem min & max atomic write units.
+ * @sb: super block
+ * TODO: Later add support for bigalloc
+ */
+ static void ext4_atomic_write_init(struct super_block *sb)
+ {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct block_device *bdev = sb->s_bdev;
+
+ if (!bdev_can_atomic_write(bdev))
+ return;
+
+ if (!ext4_has_feature_extents(sb))
+ return;
+
+ sbi->s_awu_min = max(sb->s_blocksize,
+ bdev_atomic_write_unit_min_bytes(bdev));
+ sbi->s_awu_max = min(sb->s_blocksize,
+ bdev_atomic_write_unit_max_bytes(bdev));
+ if (sbi->s_awu_min && sbi->s_awu_max &&
+ sbi->s_awu_min <= sbi->s_awu_max) {
+ ext4_msg(sb, KERN_NOTICE, "Supports (experimental) DIO atomic writes awu_min: %u, awu_max: %u",
+ sbi->s_awu_min, sbi->s_awu_max);
+ } else {
+ sbi->s_awu_min = 0;
+ sbi->s_awu_max = 0;
+ }
+ }
+
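For example (illustrative device limits), with 4 KiB filesystem blocks on a
bdev advertising an atomic write unit range of 4 KiB to 64 KiB, both fields
collapse to one block: s_awu_min = max(4096, 4096) = 4096 and
s_awu_max = min(4096, 65536) = 4096, so ext4 currently advertises untorn
writes of exactly one filesystem block.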
static void ext4_fast_commit_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
spin_lock_init(&sbi->s_bdev_wb_lock);
+ ext4_atomic_write_init(sb);
ext4_fast_commit_init(sb);
sb->s_root = NULL;
.init_fs_context = ext4_init_fs_context,
.parameters = ext4_param_specs,
.kill_sb = ext4_kill_sb,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
};
MODULE_ALIAS_FS("ext4");
return ret;
}
+/*
+ * Take care of zeroing post-EOF blocks when they might exist.
+ *
+ * Returns 0 if successful, a negative error on failure, or 1 if this
+ * function dropped the iolock and reacquired it exclusively and the caller
+ * needs to restart the write sanity checks.
+ */
+static ssize_t
+xfs_file_write_zero_eof(
+ struct kiocb *iocb,
+ struct iov_iter *from,
+ unsigned int *iolock,
+ size_t count,
+ bool *drained_dio)
+{
+ struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
+ loff_t isize;
+ int error;
+
+ /*
+ * We need to serialise against EOF updates that occur in IO completions
+ * here. We want to make sure that nobody is changing the size while
+ * we do this check until we have placed an IO barrier (i.e. hold
+ * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
+ * spinlock effectively forms a memory barrier once we have
+ * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
+ * hence be able to correctly determine if we need to run zeroing.
+ */
+ spin_lock(&ip->i_flags_lock);
+ isize = i_size_read(VFS_I(ip));
+ if (iocb->ki_pos <= isize) {
+ spin_unlock(&ip->i_flags_lock);
+ return 0;
+ }
+ spin_unlock(&ip->i_flags_lock);
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+
+ if (!*drained_dio) {
+ /*
+ * If zeroing is needed and we are currently holding the iolock
+ * shared, we need to update it to exclusive which implies
+ * having to redo all checks before.
+ */
+ if (*iolock == XFS_IOLOCK_SHARED) {
+ xfs_iunlock(ip, *iolock);
+ *iolock = XFS_IOLOCK_EXCL;
+ xfs_ilock(ip, *iolock);
+ iov_iter_reexpand(from, count);
+ }
+
+ /*
+ * We now have an IO submission barrier in place, but AIO can do
+ * EOF updates during IO completion and hence we now need to
+ * wait for all of them to drain. Non-AIO DIO will have drained
+ * before we are given the XFS_IOLOCK_EXCL, and so for most
+ * cases this wait is a no-op.
+ */
+ inode_dio_wait(VFS_I(ip));
+ *drained_dio = true;
+ return 1;
+ }
+
+ trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
+
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
+ xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+
+ return error;
+}
+
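Note the three-way return contract: 0 means no zeroing was needed (or it
completed), a negative value is an error, and 1 means the iolock is now held
exclusively and in-flight AIO has drained, so the caller below jumps back to
its restart label and redoes the write sanity checks.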
/*
* Common pre-write limit and setup checks.
*
- * Called with the iolocked held either shared and exclusive according to
+ * Called with the iolock held either shared or exclusive according to
* @iolock, and returns with it held. Might upgrade the iolock to exclusive
* if called for a direct write beyond i_size.
*/
struct iov_iter *from,
unsigned int *iolock)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- ssize_t error = 0;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
size_t count = iov_iter_count(from);
bool drained_dio = false;
- loff_t isize;
+ ssize_t error;
restart:
error = generic_write_checks(iocb, from);
* exclusively.
*/
if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
- xfs_iunlock(ip, *iolock);
+ xfs_iunlock(XFS_I(inode), *iolock);
*iolock = XFS_IOLOCK_EXCL;
error = xfs_ilock_iocb(iocb, *iolock);
if (error) {
}
/*
- * If the offset is beyond the size of the file, we need to zero any
+ * If the offset is beyond the size of the file, we need to zero all
* blocks that fall between the existing EOF and the start of this
- * write. If zeroing is needed and we are currently holding the iolock
- * shared, we need to update it to exclusive which implies having to
- * redo all checks before.
- *
- * We need to serialise against EOF updates that occur in IO completions
- * here. We want to make sure that nobody is changing the size while we
- * do this check until we have placed an IO barrier (i.e. hold the
- * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
- * spinlock effectively forms a memory barrier once we have the
- * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
- * hence be able to correctly determine if we need to run zeroing.
+ * write.
*
- * We can do an unlocked check here safely as IO completion can only
- * extend EOF. Truncate is locked out at this point, so the EOF can
- * not move backwards, only forwards. Hence we only need to take the
- * slow path and spin locks when we are at or beyond the current EOF.
+ * We can do an unlocked check for i_size here safely as I/O completion
+ * can only extend EOF. Truncate is locked out at this point, so the
+ * EOF can not move backwards, only forwards. Hence we only need to take
+ * the slow path when we are at or beyond the current EOF.
*/
- if (iocb->ki_pos <= i_size_read(inode))
- goto out;
-
- spin_lock(&ip->i_flags_lock);
- isize = i_size_read(inode);
- if (iocb->ki_pos > isize) {
- spin_unlock(&ip->i_flags_lock);
-
- if (iocb->ki_flags & IOCB_NOWAIT)
- return -EAGAIN;
-
- if (!drained_dio) {
- if (*iolock == XFS_IOLOCK_SHARED) {
- xfs_iunlock(ip, *iolock);
- *iolock = XFS_IOLOCK_EXCL;
- xfs_ilock(ip, *iolock);
- iov_iter_reexpand(from, count);
- }
- /*
- * We now have an IO submission barrier in place, but
- * AIO can do EOF updates during IO completion and hence
- * we now need to wait for all of them to drain. Non-AIO
- * DIO will have drained before we are given the
- * XFS_IOLOCK_EXCL, and so for most cases this wait is a
- * no-op.
- */
- inode_dio_wait(inode);
- drained_dio = true;
+ if (iocb->ki_pos > i_size_read(inode)) {
+ error = xfs_file_write_zero_eof(iocb, from, iolock, count,
+ &drained_dio);
+ if (error == 1)
goto restart;
- }
-
- trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
- error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
if (error)
return error;
- } else
- spin_unlock(&ip->i_flags_lock);
+ }
-out:
return kiocb_modified(iocb);
}
if (IS_DAX(inode))
return xfs_file_dax_write(iocb, from);
+ if (iocb->ki_flags & IOCB_ATOMIC) {
+ /*
+ * Currently only atomic writing of a single FS block is
+ * supported. It would be possible to support atomic writes smaller
+ * than a FS block, but there is no requirement to do so.
+ * Note that iomap also does not support this yet.
+ */
+ if (ocount != ip->i_mount->m_sb.sb_blocksize)
+ return -EINVAL;
+ ret = generic_atomic_write_valid(iocb, from);
+ if (ret)
+ return ret;
+ }
+
if (iocb->ki_flags & IOCB_DIRECT) {
/*
* Allow a directio write to fall back to a buffered
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
+ if (xfs_inode_can_atomicwrite(XFS_I(inode)))
+ file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
return generic_file_open(inode, file);
}
return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
}
+static inline bool xfs_inode_has_filedata(const struct xfs_inode *ip)
+{
+ return ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0;
+}
+
/*
* Check if an inode has any data in the COW fork. This might be often false
* even for inodes with the reflink flag when there is no pending COW operation.
(XFS_IS_REALTIME_INODE(ip) ? \
(ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp)
+ static inline bool
+ xfs_inode_can_atomicwrite(
+ struct xfs_inode *ip)
+ {
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+
+ if (mp->m_sb.sb_blocksize < target->bt_bdev_awu_min)
+ return false;
+ if (mp->m_sb.sb_blocksize > target->bt_bdev_awu_max)
+ return false;
+
+ return true;
+ }
+
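In other words, untorn writes are only advertised when the filesystem block
size falls inside the device's [awu_min, awu_max] window: with illustrative
values, a 4 KiB-block filesystem on a device reporting 4 KiB to 64 KiB units
qualifies, while a device without atomic write support (presumably leaving
both limits at zero) fails the second comparison.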
/*
* In-core inode flags.
*/
return max_t(uint32_t, PAGE_SIZE, mp->m_sb.sb_blocksize);
}
+ static void
+ xfs_get_atomic_write_attr(
+ struct xfs_inode *ip,
+ unsigned int *unit_min,
+ unsigned int *unit_max)
+ {
+ if (!xfs_inode_can_atomicwrite(ip)) {
+ *unit_min = *unit_max = 0;
+ return;
+ }
+
+ *unit_min = *unit_max = ip->i_mount->m_sb.sb_blocksize;
+ }
+
STATIC int
xfs_vn_getattr(
struct mnt_idmap *idmap,
stat->gid = vfsgid_into_kgid(vfsgid);
stat->ino = ip->i_ino;
stat->atime = inode_get_atime(inode);
- stat->mtime = inode_get_mtime(inode);
- stat->ctime = inode_get_ctime(inode);
+
+ fill_mg_cmtime(stat, request_mask, inode);
+
stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks);
if (xfs_has_v3inodes(mp)) {
}
}
- if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) {
- stat->change_cookie = inode_query_iversion(inode);
- stat->result_mask |= STATX_CHANGE_COOKIE;
- }
-
/*
* Note: If you add another clause to set an attribute flag, please
* update attributes_mask below.
stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
stat->dio_offset_align = bdev_logical_block_size(bdev);
}
+ if (request_mask & STATX_WRITE_ATOMIC) {
+ unsigned int unit_min, unit_max;
+
+ xfs_get_atomic_write_attr(ip, &unit_min,
+ &unit_max);
+ generic_fill_statx_atomic_writes(stat,
+ unit_min, unit_max);
+ }
fallthrough;
default:
stat->blksize = xfs_stat_blksize(ip);
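A userspace sketch of how these values surface through statx(2) (path is
hypothetical; needs glibc and uapi headers new enough to provide
STATX_WRITE_ATOMIC and the stx_atomic_write_* fields):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <sys/stat.h>
	#include <stdio.h>

	int main(void)
	{
		struct statx stx;

		if (statx(AT_FDCWD, "/mnt/test/file", 0, STATX_WRITE_ATOMIC, &stx)) {
			perror("statx");
			return 1;
		}
		printf("awu_min=%u awu_max=%u max_segments=%u\n",
		       stx.stx_atomic_write_unit_min,
		       stx.stx_atomic_write_unit_max,
		       stx.stx_atomic_write_segments_max);
		return 0;
	}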
#include <linux/slab.h>
#include <linux/maple_tree.h>
#include <linux/rw_hint.h>
+#include <linux/file_ref.h>
+#include <linux/unicode.h>
#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
#define IOP_NOFOLLOW 0x0004
#define IOP_XATTR 0x0008
#define IOP_DEFAULT_READLINK 0x0010
+#define IOP_MGTIME 0x0020
/*
* Keep mostly read-only and often accessed (especially for
/**
* struct file - Represents a file
- * @f_count: reference count
+ * @f_ref: reference count
* @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context.
* @f_mode: FMODE_* flags often used in hotpaths
* @f_op: file operations
* @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.)
*/
struct file {
- atomic_long_t f_count;
+ file_ref_t f_ref;
spinlock_t f_lock;
fmode_t f_mode;
const struct file_operations *f_op;
static inline struct file *get_file(struct file *f)
{
- long prior = atomic_long_fetch_inc_relaxed(&f->f_count);
- WARN_ONCE(!prior, "struct file::f_count incremented from zero; use-after-free condition present!\n");
+ file_ref_inc(&f->f_ref);
return f;
}
struct file *get_file_rcu(struct file __rcu **f);
struct file *get_file_active(struct file **f);
-#define file_count(x) atomic_long_read(&(x)->f_count)
+#define file_count(f) file_ref_read(&(f)->f_ref)
#define MAX_NON_LFS ((1UL<<31) - 1)
struct timespec64 current_time(struct inode *inode);
struct timespec64 inode_set_ctime_current(struct inode *inode);
+struct timespec64 inode_set_ctime_deleg(struct inode *inode,
+ struct timespec64 update);
static inline time64_t inode_get_atime_sec(const struct inode *inode)
{
return inode_set_mtime_to_ts(inode, ts);
}
+/*
+ * Multigrain timestamps
+ *
+ * Conditionally use fine-grained ctime and mtime timestamps when there
+ * are users actively observing them via getattr. The primary use-case
+ * for this is NFS clients that use the ctime to distinguish between
+ * different states of the file, and that are often fooled by multiple
+ * operations that occur in the same coarse-grained timer tick.
+ */
+#define I_CTIME_QUERIED ((u32)BIT(31))
+
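An editorial illustration of the encoding (not kernel code): valid nanosecond
values are always below 10^9, which is less than 2^31, so bit 31 of
i_ctime_nsec is free to carry the queried flag alongside the timestamp:

	u32 raw = inode->i_ctime_nsec;		/* may carry I_CTIME_QUERIED */
	bool queried = raw & I_CTIME_QUERIED;	/* was the ctime observed via getattr? */
	u32 nsec = raw & ~I_CTIME_QUERIED;	/* real nanoseconds, < NSEC_PER_SEC */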
static inline time64_t inode_get_ctime_sec(const struct inode *inode)
{
return inode->i_ctime_sec;
static inline long inode_get_ctime_nsec(const struct inode *inode)
{
- return inode->i_ctime_nsec;
+ return inode->i_ctime_nsec & ~I_CTIME_QUERIED;
}
static inline struct timespec64 inode_get_ctime(const struct inode *inode)
return ts;
}
-static inline struct timespec64 inode_set_ctime_to_ts(struct inode *inode,
- struct timespec64 ts)
-{
- inode->i_ctime_sec = ts.tv_sec;
- inode->i_ctime_nsec = ts.tv_nsec;
- return ts;
-}
+struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts);
/**
* inode_set_ctime - set the ctime in the inode
#define FOP_HUGE_PAGES ((__force fop_flags_t)(1 << 4))
/* Treat loff_t as unsigned (e.g., /dev/mem) */
#define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5))
+/* Supports asynchronous lock callbacks */
+#define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6))
/* Wrap a directory iterator that needs exclusive inode access */
int wrap_directory_iterator(struct file *, struct dir_context *,
#define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */
#define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */
#define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */
+#define FS_MGTIME 64 /* FS uses multigrain timestamps */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
int (*init_fs_context)(struct fs_context *);
const struct fs_parameter_spec *parameters;
#define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME)
+/**
+ * is_mgtime: is this inode using multigrain timestamps
+ * @inode: inode to test for multigrain timestamps
+ *
+ * Return true if the inode uses multigrain timestamps, false otherwise.
+ */
+static inline bool is_mgtime(const struct inode *inode)
+{
+ return inode->i_opflags & IOP_MGTIME;
+}
+
extern struct dentry *mount_bdev(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data,
int (*fill_super)(struct super_block *, void *, int));
extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence);
-extern int inode_init_always(struct super_block *, struct inode *);
+extern int inode_init_always_gfp(struct super_block *, struct inode *, gfp_t);
+static inline int inode_init_always(struct super_block *sb, struct inode *inode)
+{
+ return inode_init_always_gfp(sb, inode, GFP_NOFS);
+}
+
extern void inode_init_once(struct inode *);
extern void address_space_init_once(struct address_space *mapping);
extern struct inode * igrab(struct inode *);
extern int page_symlink(struct inode *inode, const char *symname, int len);
extern const struct inode_operations page_symlink_inode_operations;
extern void kfree_link(void *);
+void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode);
void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *);
void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
void generic_fill_statx_atomic_writes(struct kstat *stat,
const struct qstr *folded_name,
const u8 *de_name, u32 de_name_len);
+#if IS_ENABLED(CONFIG_UNICODE)
+int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str);
+int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
+ const char *str, const struct qstr *name);
+
+/**
+ * generic_ci_validate_strict_name - Check if a given name is suitable
+ * for a directory
+ *
+ * This function checks if the proposed filename is valid for the
+ * parent directory. That means that only valid UTF-8 filenames will be
+ * accepted for casefold directories from filesystems created with the
+ * strict encoding flag. That also means that any name will be
+ * accepted for directories that don't have casefold enabled, or
+ * aren't being strict with the encoding.
+ *
+ * @dir: inode of the directory where the new file will be created
+ * @name: name of the new file
+ *
+ * Return:
+ * * True: if the filename is suitable for this directory. It can be
+ * true even for a name that would be rejected by a strict-encoding
+ * directory, as long as the directory being used isn't strict.
+ * * False: if the filename isn't suitable for this directory. This only
+ * happens when a directory is casefolded and the filesystem is strict
+ * about its encoding.
+ */
+static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name)
+{
+ if (!IS_CASEFOLDED(dir) || !sb_has_strict_encoding(dir->i_sb))
+ return true;
+
+ /*
+ * A casefold dir must have an encoding set, unless the filesystem
+ * is corrupted
+ */
+ if (WARN_ON_ONCE(!dir->i_sb->s_encoding))
+ return true;
+
+ return !utf8_validate(dir->i_sb->s_encoding, name);
+}
+#else
+static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name)
+{
+ return true;
+}
+#endif
+
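A hedged sketch of the intended call site in a filesystem's create or rename
path (the surrounding code is hypothetical):

	/* Reject invalid UTF-8 names on strictly encoded casefold directories. */
	if (!generic_ci_validate_strict_name(dir, &dentry->d_name))
		return -EINVAL;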
static inline bool sb_has_encoding(const struct super_block *sb)
{
#if IS_ENABLED(CONFIG_UNICODE)
return !c;
}
- bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos);
+ int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter);
#endif /* _LINUX_FS_H */
#else
#define IOMAP_DAX 0
#endif /* CONFIG_FS_DAX */
+ #define IOMAP_ATOMIC (1 << 9)
struct iomap_ops {
/*
return &i->iomap;
}
+/*
+ * Return the file offset for the first unchanged block after a short write.
+ *
+ * If nothing was written, round @pos down to point at the first block in
+ * the range, else round up to include the partially written block.
+ */
+static inline loff_t iomap_last_written_block(struct inode *inode, loff_t pos,
+ ssize_t written)
+{
+ if (unlikely(!written))
+ return round_down(pos, i_blocksize(inode));
+ return round_up(pos + written, i_blocksize(inode));
+}
+
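For example, with 4 KiB blocks and a write starting at pos 6144: if nothing
was written the result is round_down(6144, 4096) = 4096, while a short write
of 100 bytes gives round_up(6244, 4096) = 8192, keeping the block that did
receive data out of any subsequent punch range.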
+/*
+ * Check if the range needs to be unshared for a FALLOC_FL_UNSHARE_RANGE
+ * operation.
+ *
+ * Don't bother with blocks that are not shared to start with; or mappings that
+ * cannot be shared, such as inline data, delalloc reservations, holes or
+ * unwritten extents.
+ *
+ * Note that we use srcmap directly instead of iomap_iter_srcmap as unsharing
+ * requires providing a separate source map, and the presence of one is a good
+ * indicator that unsharing is needed, unlike IOMAP_F_SHARED which can be set
+ * for any data that goes into the COW fork for XFS.
+ */
+static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter)
+{
+ return (iter->iomap.flags & IOMAP_F_SHARED) &&
+ iter->srcmap.type == IOMAP_MAPPED;
+}
+
ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
const struct iomap_ops *ops, void *private);
int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length,
struct iomap *iomap);
-void iomap_file_buffered_write_punch_delalloc(struct inode *inode, loff_t pos,
- loff_t length, ssize_t written, unsigned flag,
- struct iomap *iomap, iomap_punch_t punch);
+void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
+ loff_t end_byte, unsigned flags, struct iomap *iomap,
+ iomap_punch_t punch);
int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len, const struct iomap_ops *ops);