such reservations
because writeback will not consume the reservation.
-The ``iomap_file_buffered_write_punch_delalloc`` can be called from a
+The ``iomap_write_delalloc_release`` function can be called from a
``->iomap_end`` function to find all the clean areas of the folios
caching a fresh (``IOMAP_F_NEW``) delalloc mapping.
It takes the ``invalidate_lock``.
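As a minimal sketch of a caller (the ``myfs_*`` names are hypothetical; only
the ``iomap_write_delalloc_release`` signature and the iomap flags come from
the declarations later in this section), a buffered-write ``->iomap_end``
handler could release the unused part of a fresh delalloc reservation roughly
as follows::

	static void myfs_punch_delalloc(struct inode *inode, loff_t offset,
			loff_t length, struct iomap *iomap)
	{
		/* filesystem-specific removal of the delalloc reservation */
	}

	static int myfs_buffered_write_iomap_end(struct inode *inode, loff_t pos,
			loff_t length, ssize_t written, unsigned flags,
			struct iomap *iomap)
	{
		loff_t start_byte, end_byte;

		/* Only freshly allocated delalloc reservations may be punched out. */
		if (iomap->type != IOMAP_DELALLOC || !(iomap->flags & IOMAP_F_NEW))
			return 0;

		/* Nothing to release if every block in the range was written. */
		start_byte = iomap_last_written_block(inode, pos, written);
		end_byte = round_up(pos + length, i_blocksize(inode));
		if (start_byte >= end_byte)
			return 0;

		iomap_write_delalloc_release(inode, start_byte, end_byte, flags,
				iomap, myfs_punch_delalloc);
		return 0;
	}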
if the mapping is unwritten and the filesystem cannot handle zeroing
the unaligned regions without exposing stale contents.
+ * ``IOMAP_ATOMIC``: This write is being issued with torn-write
+ protection.
+ Only a single bio can be created for the write, and the write must
+ not be split into multiple I/O requests, i.e. flag REQ_ATOMIC must be
+ set.
+ The file range to write must be aligned to satisfy the requirements
+ of both the filesystem and the underlying block device's atomic
+ commit capabilities.
+ If filesystem metadata updates are required (e.g. unwritten extent
+ conversion or copy on write), all updates for the entire file range
+ must be committed atomically as well.
+ Only one space mapping is allowed per untorn write.
+ Untorn writes must be aligned to, and must not be longer than, a
+ single file block.
+
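For illustration, a userspace sketch of issuing such an untorn write (the
path and the 4 KiB size are hypothetical; ``RWF_ATOMIC`` requires direct I/O
and recent headers, so its value from ``include/uapi/linux/fs.h`` is mirrored
here as a fallback)::

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/uio.h>

	#ifndef RWF_ATOMIC
	#define RWF_ATOMIC	0x00000040	/* mirrors include/uapi/linux/fs.h */
	#endif

	int main(void)
	{
		struct iovec iov;
		void *buf;
		int fd;

		fd = open("/mnt/test/file", O_RDWR | O_DIRECT);
		if (fd < 0 || posix_memalign(&buf, 4096, 4096))
			return 1;
		memset(buf, 0xab, 4096);

		iov.iov_base = buf;
		iov.iov_len = 4096;	/* exactly one aligned file block */

		/* The kernel submits this as a single REQ_ATOMIC bio. */
		if (pwritev2(fd, &iov, 1, 0, RWF_ATOMIC) < 0)
			perror("pwritev2(RWF_ATOMIC)");
		return 0;
	}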
Callers commonly hold ``i_rwsem`` in shared or exclusive mode before
calling this function.
return 0;
}
+ /*
+ * ext4_atomic_write_init: Initializes filesystem min & max atomic write units.
+ * @sb: super block
+ * TODO: Later add support for bigalloc
+ */
+ static void ext4_atomic_write_init(struct super_block *sb)
+ {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct block_device *bdev = sb->s_bdev;
+
+ if (!bdev_can_atomic_write(bdev))
+ return;
+
+ if (!ext4_has_feature_extents(sb))
+ return;
+
+ sbi->s_awu_min = max(sb->s_blocksize,
+ bdev_atomic_write_unit_min_bytes(bdev));
+ sbi->s_awu_max = min(sb->s_blocksize,
+ bdev_atomic_write_unit_max_bytes(bdev));
+ if (sbi->s_awu_min && sbi->s_awu_max &&
+ sbi->s_awu_min <= sbi->s_awu_max) {
+ ext4_msg(sb, KERN_NOTICE, "Supports (experimental) DIO atomic writes awu_min: %u, awu_max: %u",
+ sbi->s_awu_min, sbi->s_awu_max);
+ } else {
+ sbi->s_awu_min = 0;
+ sbi->s_awu_max = 0;
+ }
+ }
+
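For example (illustrative device limits), with 4 KiB filesystem blocks on a
bdev advertising an atomic write unit range of 4 KiB to 64 KiB, both fields
collapse to one block: s_awu_min = max(4096, 4096) = 4096 and
s_awu_max = min(4096, 65536) = 4096, so ext4 currently advertises untorn
writes of exactly one filesystem block.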
static void ext4_fast_commit_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
spin_lock_init(&sbi->s_bdev_wb_lock);
+ ext4_atomic_write_init(sb);
ext4_fast_commit_init(sb);
sb->s_root = NULL;
.init_fs_context = ext4_init_fs_context,
.parameters = ext4_param_specs,
.kill_sb = ext4_kill_sb,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
};
MODULE_ALIAS_FS("ext4");
return ret;
}
+/*
+ * Take care of zeroing post-EOF blocks when they might exist.
+ *
+ * Returns 0 if successful, a negative error on failure, or 1 if this
+ * function dropped the iolock and reacquired it exclusively and the caller
+ * needs to restart the write sanity checks.
+ */
+static ssize_t
+xfs_file_write_zero_eof(
+ struct kiocb *iocb,
+ struct iov_iter *from,
+ unsigned int *iolock,
+ size_t count,
+ bool *drained_dio)
+{
+ struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
+ loff_t isize;
+ int error;
+
+ /*
+ * We need to serialise against EOF updates that occur in IO completions
+ * here. We want to make sure that nobody is changing the size while
+ * we do this check until we have placed an IO barrier (i.e. hold
+ * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
+ * spinlock effectively forms a memory barrier once we have
+ * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
+ * hence be able to correctly determine if we need to run zeroing.
+ */
+ spin_lock(&ip->i_flags_lock);
+ isize = i_size_read(VFS_I(ip));
+ if (iocb->ki_pos <= isize) {
+ spin_unlock(&ip->i_flags_lock);
+ return 0;
+ }
+ spin_unlock(&ip->i_flags_lock);
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+
+ if (!*drained_dio) {
+ /*
+ * If zeroing is needed and we are currently holding the iolock
+ * shared, we need to update it to exclusive which implies
+ * having to redo all checks before.
+ */
+ if (*iolock == XFS_IOLOCK_SHARED) {
+ xfs_iunlock(ip, *iolock);
+ *iolock = XFS_IOLOCK_EXCL;
+ xfs_ilock(ip, *iolock);
+ iov_iter_reexpand(from, count);
+ }
+
+ /*
+ * We now have an IO submission barrier in place, but AIO can do
+ * EOF updates during IO completion and hence we now need to
+ * wait for all of them to drain. Non-AIO DIO will have drained
+ * before we are given the XFS_IOLOCK_EXCL, and so for most
+ * cases this wait is a no-op.
+ */
+ inode_dio_wait(VFS_I(ip));
+ *drained_dio = true;
+ return 1;
+ }
+
+ trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
+
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
+ xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+
+ return error;
+}
+
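Note the three-way return contract: 0 means no zeroing was needed (or it
completed), a negative value is an error, and 1 means the iolock is now held
exclusively and in-flight AIO has drained, so the caller below jumps back to
its restart label and redoes the write sanity checks.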
/*
* Common pre-write limit and setup checks.
*
- * Called with the iolocked held either shared and exclusive according to
+ * Called with the iolock held either shared or exclusive according to
* @iolock, and returns with it held. Might upgrade the iolock to exclusive
* if called for a direct write beyond i_size.
*/
struct iov_iter *from,
unsigned int *iolock)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- ssize_t error = 0;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
size_t count = iov_iter_count(from);
bool drained_dio = false;
- loff_t isize;
+ ssize_t error;
restart:
error = generic_write_checks(iocb, from);
* exclusively.
*/
if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
- xfs_iunlock(ip, *iolock);
+ xfs_iunlock(XFS_I(inode), *iolock);
*iolock = XFS_IOLOCK_EXCL;
error = xfs_ilock_iocb(iocb, *iolock);
if (error) {
}
/*
- * If the offset is beyond the size of the file, we need to zero any
+ * If the offset is beyond the size of the file, we need to zero all
* blocks that fall between the existing EOF and the start of this
- * write. If zeroing is needed and we are currently holding the iolock
- * shared, we need to update it to exclusive which implies having to
- * redo all checks before.
- *
- * We need to serialise against EOF updates that occur in IO completions
- * here. We want to make sure that nobody is changing the size while we
- * do this check until we have placed an IO barrier (i.e. hold the
- * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
- * spinlock effectively forms a memory barrier once we have the
- * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
- * hence be able to correctly determine if we need to run zeroing.
+ * write.
*
- * We can do an unlocked check here safely as IO completion can only
- * extend EOF. Truncate is locked out at this point, so the EOF can
- * not move backwards, only forwards. Hence we only need to take the
- * slow path and spin locks when we are at or beyond the current EOF.
+ * We can do an unlocked check for i_size here safely as I/O completion
+ * can only extend EOF. Truncate is locked out at this point, so the
+ * EOF can not move backwards, only forwards. Hence we only need to take
+ * the slow path when we are at or beyond the current EOF.
*/
- if (iocb->ki_pos <= i_size_read(inode))
- goto out;
-
- spin_lock(&ip->i_flags_lock);
- isize = i_size_read(inode);
- if (iocb->ki_pos > isize) {
- spin_unlock(&ip->i_flags_lock);
-
- if (iocb->ki_flags & IOCB_NOWAIT)
- return -EAGAIN;
-
- if (!drained_dio) {
- if (*iolock == XFS_IOLOCK_SHARED) {
- xfs_iunlock(ip, *iolock);
- *iolock = XFS_IOLOCK_EXCL;
- xfs_ilock(ip, *iolock);
- iov_iter_reexpand(from, count);
- }
- /*
- * We now have an IO submission barrier in place, but
- * AIO can do EOF updates during IO completion and hence
- * we now need to wait for all of them to drain. Non-AIO
- * DIO will have drained before we are given the
- * XFS_IOLOCK_EXCL, and so for most cases this wait is a
- * no-op.
- */
- inode_dio_wait(inode);
- drained_dio = true;
+ if (iocb->ki_pos > i_size_read(inode)) {
+ error = xfs_file_write_zero_eof(iocb, from, iolock, count,
+ &drained_dio);
+ if (error == 1)
goto restart;
- }
-
- trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
- error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
if (error)
return error;
- } else
- spin_unlock(&ip->i_flags_lock);
+ }
-out:
return kiocb_modified(iocb);
}
if (IS_DAX(inode))
return xfs_file_dax_write(iocb, from);
+ if (iocb->ki_flags & IOCB_ATOMIC) {
+ /*
+ * Currently only atomic writing of a single FS block is
+ * supported. It would be possible to support atomic writes smaller
+ * than a FS block, but there is no requirement to do so.
+ * Note that iomap also does not support this yet.
+ */
+ if (ocount != ip->i_mount->m_sb.sb_blocksize)
+ return -EINVAL;
+ ret = generic_atomic_write_valid(iocb, from);
+ if (ret)
+ return ret;
+ }
+
if (iocb->ki_flags & IOCB_DIRECT) {
/*
* Allow a directio write to fall back to a buffered
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
+ if (xfs_inode_can_atomicwrite(XFS_I(inode)))
+ file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
return generic_file_open(inode, file);
}
return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
}
+static inline bool xfs_inode_has_filedata(const struct xfs_inode *ip)
+{
+ return ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0;
+}
+
/*
* Check if an inode has any data in the COW fork. This might be often false
* even for inodes with the reflink flag when there is no pending COW operation.
(XFS_IS_REALTIME_INODE(ip) ? \
(ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp)
+ static inline bool
+ xfs_inode_can_atomicwrite(
+ struct xfs_inode *ip)
+ {
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+
+ if (mp->m_sb.sb_blocksize < target->bt_bdev_awu_min)
+ return false;
+ if (mp->m_sb.sb_blocksize > target->bt_bdev_awu_max)
+ return false;
+
+ return true;
+ }
+
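In other words, untorn writes are only advertised when the filesystem block
size falls inside the device's [awu_min, awu_max] window: with illustrative
values, a 4 KiB-block filesystem on a device reporting 4 KiB to 64 KiB units
qualifies, while a device without atomic write support (presumably leaving
both limits at zero) fails the second comparison.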
/*
* In-core inode flags.
*/
return max_t(uint32_t, PAGE_SIZE, mp->m_sb.sb_blocksize);
}
+ static void
+ xfs_get_atomic_write_attr(
+ struct xfs_inode *ip,
+ unsigned int *unit_min,
+ unsigned int *unit_max)
+ {
+ if (!xfs_inode_can_atomicwrite(ip)) {
+ *unit_min = *unit_max = 0;
+ return;
+ }
+
+ *unit_min = *unit_max = ip->i_mount->m_sb.sb_blocksize;
+ }
+
STATIC int
xfs_vn_getattr(
struct mnt_idmap *idmap,
stat->gid = vfsgid_into_kgid(vfsgid);
stat->ino = ip->i_ino;
stat->atime = inode_get_atime(inode);
- stat->mtime = inode_get_mtime(inode);
- stat->ctime = inode_get_ctime(inode);
+
+ fill_mg_cmtime(stat, request_mask, inode);
+
stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks);
if (xfs_has_v3inodes(mp)) {
}
}
- if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) {
- stat->change_cookie = inode_query_iversion(inode);
- stat->result_mask |= STATX_CHANGE_COOKIE;
- }
-
/*
* Note: If you add another clause to set an attribute flag, please
* update attributes_mask below.
stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
stat->dio_offset_align = bdev_logical_block_size(bdev);
}
+ if (request_mask & STATX_WRITE_ATOMIC) {
+ unsigned int unit_min, unit_max;
+
+ xfs_get_atomic_write_attr(ip, &unit_min,
+ &unit_max);
+ generic_fill_statx_atomic_writes(stat,
+ unit_min, unit_max);
+ }
fallthrough;
default:
stat->blksize = xfs_stat_blksize(ip);
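A userspace sketch of how these values surface through statx(2) (path is
hypothetical; needs glibc and uapi headers new enough to provide
STATX_WRITE_ATOMIC and the stx_atomic_write_* fields):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <sys/stat.h>
	#include <stdio.h>

	int main(void)
	{
		struct statx stx;

		if (statx(AT_FDCWD, "/mnt/test/file", 0, STATX_WRITE_ATOMIC, &stx)) {
			perror("statx");
			return 1;
		}
		printf("awu_min=%u awu_max=%u max_segments=%u\n",
		       stx.stx_atomic_write_unit_min,
		       stx.stx_atomic_write_unit_max,
		       stx.stx_atomic_write_segments_max);
		return 0;
	}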
#include <linux/slab.h>
#include <linux/maple_tree.h>
#include <linux/rw_hint.h>
+#include <linux/file_ref.h>
+#include <linux/unicode.h>
#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
#define IOP_NOFOLLOW 0x0004
#define IOP_XATTR 0x0008
#define IOP_DEFAULT_READLINK 0x0010
+#define IOP_MGTIME 0x0020
/*
* Keep mostly read-only and often accessed (especially for
/**
* struct file - Represents a file
- * @f_count: reference count
+ * @f_ref: reference count
* @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context.
* @f_mode: FMODE_* flags often used in hotpaths
* @f_op: file operations
* @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.)
*/
struct file {
- atomic_long_t f_count;
+ file_ref_t f_ref;
spinlock_t f_lock;
fmode_t f_mode;
const struct file_operations *f_op;
static inline struct file *get_file(struct file *f)
{
- long prior = atomic_long_fetch_inc_relaxed(&f->f_count);
- WARN_ONCE(!prior, "struct file::f_count incremented from zero; use-after-free condition present!\n");
+ file_ref_inc(&f->f_ref);
return f;
}
struct file *get_file_rcu(struct file __rcu **f);
struct file *get_file_active(struct file **f);
-#define file_count(x) atomic_long_read(&(x)->f_count)
+#define file_count(f) file_ref_read(&(f)->f_ref)
#define MAX_NON_LFS ((1UL<<31) - 1)
struct timespec64 current_time(struct inode *inode);
struct timespec64 inode_set_ctime_current(struct inode *inode);
+struct timespec64 inode_set_ctime_deleg(struct inode *inode,
+ struct timespec64 update);
static inline time64_t inode_get_atime_sec(const struct inode *inode)
{
return inode_set_mtime_to_ts(inode, ts);
}
+/*
+ * Multigrain timestamps
+ *
+ * Conditionally use fine-grained ctime and mtime timestamps when there
+ * are users actively observing them via getattr. The primary use-case
+ * for this is NFS clients that use the ctime to distinguish between
+ * different states of the file, and that are often fooled by multiple
+ * operations that occur in the same coarse-grained timer tick.
+ */
+#define I_CTIME_QUERIED ((u32)BIT(31))
+
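An editorial illustration of the encoding (not kernel code): valid nanosecond
values are always below 10^9, which is less than 2^31, so bit 31 of
i_ctime_nsec is free to carry the queried flag alongside the timestamp:

	u32 raw = inode->i_ctime_nsec;		/* may carry I_CTIME_QUERIED */
	bool queried = raw & I_CTIME_QUERIED;	/* was the ctime observed via getattr? */
	u32 nsec = raw & ~I_CTIME_QUERIED;	/* real nanoseconds, < NSEC_PER_SEC */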
static inline time64_t inode_get_ctime_sec(const struct inode *inode)
{
return inode->i_ctime_sec;
static inline long inode_get_ctime_nsec(const struct inode *inode)
{
- return inode->i_ctime_nsec;
+ return inode->i_ctime_nsec & ~I_CTIME_QUERIED;
}
static inline struct timespec64 inode_get_ctime(const struct inode *inode)
return ts;
}
-static inline struct timespec64 inode_set_ctime_to_ts(struct inode *inode,
- struct timespec64 ts)
-{
- inode->i_ctime_sec = ts.tv_sec;
- inode->i_ctime_nsec = ts.tv_nsec;
- return ts;
-}
+struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts);
/**
* inode_set_ctime - set the ctime in the inode
#define FOP_HUGE_PAGES ((__force fop_flags_t)(1 << 4))
/* Treat loff_t as unsigned (e.g., /dev/mem) */
#define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5))
+/* Supports asynchronous lock callbacks */
+#define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6))
/* Wrap a directory iterator that needs exclusive inode access */
int wrap_directory_iterator(struct file *, struct dir_context *,
#define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */
#define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */
#define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */
+#define FS_MGTIME 64 /* FS uses multigrain timestamps */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
int (*init_fs_context)(struct fs_context *);
const struct fs_parameter_spec *parameters;
#define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME)
+/**
+ * is_mgtime: is this inode using multigrain timestamps
+ * @inode: inode to test for multigrain timestamps
+ *
+ * Return true if the inode uses multigrain timestamps, false otherwise.
+ */
+static inline bool is_mgtime(const struct inode *inode)
+{
+ return inode->i_opflags & IOP_MGTIME;
+}
+
extern struct dentry *mount_bdev(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data,
int (*fill_super)(struct super_block *, void *, int));
extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence);
-extern int inode_init_always(struct super_block *, struct inode *);
+extern int inode_init_always_gfp(struct super_block *, struct inode *, gfp_t);
+static inline int inode_init_always(struct super_block *sb, struct inode *inode)
+{
+ return inode_init_always_gfp(sb, inode, GFP_NOFS);
+}
+
extern void inode_init_once(struct inode *);
extern void address_space_init_once(struct address_space *mapping);
extern struct inode * igrab(struct inode *);
extern int page_symlink(struct inode *inode, const char *symname, int len);
extern const struct inode_operations page_symlink_inode_operations;
extern void kfree_link(void *);
+void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode);
void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *);
void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
void generic_fill_statx_atomic_writes(struct kstat *stat,
const struct qstr *folded_name,
const u8 *de_name, u32 de_name_len);
+#if IS_ENABLED(CONFIG_UNICODE)
+int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str);
+int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
+ const char *str, const struct qstr *name);
+
+/**
+ * generic_ci_validate_strict_name - Check if a given name is suitable
+ * for a directory
+ *
+ * This function checks if the proposed filename is valid for the
+ * parent directory. That means that only valid UTF-8 filenames will be
+ * accepted for casefold directories from filesystems created with the
+ * strict encoding flag. That also means that any name will be
+ * accepted for directories that don't have casefold enabled, or
+ * aren't being strict with the encoding.
+ *
+ * @dir: inode of the directory where the new file will be created
+ * @name: name of the new file
+ *
+ * Return:
+ * * True: if the filename is suitable for this directory. It can be
+ * true even for a name that would be rejected by a strict-encoding
+ * directory, as long as the directory being used isn't strict.
+ * * False: if the filename isn't suitable for this directory. This only
+ * happens when a directory is casefolded and the filesystem is strict
+ * about its encoding.
+ */
+static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name)
+{
+ if (!IS_CASEFOLDED(dir) || !sb_has_strict_encoding(dir->i_sb))
+ return true;
+
+ /*
+ * A casefold dir must have an encoding set, unless the filesystem
+ * is corrupted
+ */
+ if (WARN_ON_ONCE(!dir->i_sb->s_encoding))
+ return true;
+
+ return !utf8_validate(dir->i_sb->s_encoding, name);
+}
+#else
+static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name)
+{
+ return true;
+}
+#endif
+
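A hedged sketch of the intended call site in a filesystem's create or rename
path (the surrounding code is hypothetical):

	/* Reject invalid UTF-8 names on strictly encoded casefold directories. */
	if (!generic_ci_validate_strict_name(dir, &dentry->d_name))
		return -EINVAL;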
static inline bool sb_has_encoding(const struct super_block *sb)
{
#if IS_ENABLED(CONFIG_UNICODE)
return !c;
}
- bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos);
+ int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter);
#endif /* _LINUX_FS_H */
#else
#define IOMAP_DAX 0
#endif /* CONFIG_FS_DAX */
+ #define IOMAP_ATOMIC (1 << 9)
struct iomap_ops {
/*
return &i->iomap;
}
+/*
+ * Return the file offset for the first unchanged block after a short write.
+ *
+ * If nothing was written, round @pos down to point at the first block in
+ * the range, else round up to include the partially written block.
+ */
+static inline loff_t iomap_last_written_block(struct inode *inode, loff_t pos,
+ ssize_t written)
+{
+ if (unlikely(!written))
+ return round_down(pos, i_blocksize(inode));
+ return round_up(pos + written, i_blocksize(inode));
+}
+
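For example, with 4 KiB blocks and a write starting at pos 6144: if nothing
was written the result is round_down(6144, 4096) = 4096, while a short write
of 100 bytes gives round_up(6244, 4096) = 8192, keeping the block that did
receive data out of any subsequent punch range.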
+/*
+ * Check if the range needs to be unshared for a FALLOC_FL_UNSHARE_RANGE
+ * operation.
+ *
+ * Don't bother with blocks that are not shared to start with; or mappings that
+ * cannot be shared, such as inline data, delalloc reservations, holes or
+ * unwritten extents.
+ *
+ * Note that we use srcmap directly instead of iomap_iter_srcmap as unsharing
+ * requires providing a separate source map, and the presence of one is a good
+ * indicator that unsharing is needed, unlike IOMAP_F_SHARED which can be set
+ * for any data that goes into the COW fork for XFS.
+ */
+static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter)
+{
+ return (iter->iomap.flags & IOMAP_F_SHARED) &&
+ iter->srcmap.type == IOMAP_MAPPED;
+}
+
ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
const struct iomap_ops *ops, void *private);
int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length,
struct iomap *iomap);
-void iomap_file_buffered_write_punch_delalloc(struct inode *inode, loff_t pos,
- loff_t length, ssize_t written, unsigned flag,
- struct iomap *iomap, iomap_punch_t punch);
+void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
+ loff_t end_byte, unsigned flags, struct iomap *iomap,
+ iomap_punch_t punch);
int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len, const struct iomap_ops *ops);