Git Repo - linux.git/commitdiff
ext4: partial zero eof block on unaligned inode size extension
author Brian Foster <[email protected]>
Thu, 19 Sep 2024 16:07:40 +0000 (12:07 -0400)
committer Theodore Ts'o <[email protected]>
Wed, 13 Nov 2024 04:54:14 +0000 (23:54 -0500)
Using mapped writes, it's technically possible to expose stale
post-eof data on a truncate up operation. Consider the following
example:

$ xfs_io -fc "pwrite 0 2k" -c "mmap 0 4k" -c "mwrite 2k 2k" \
-c "truncate 8k" -c "pread -v 2k 16" <file>
...
00000800:  58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58  XXXXXXXXXXXXXXXX
...

This shows that the post-eof data written via mwrite lands within
EOF after a truncate up. While this is deliberate in the test case,
the behavior is somewhat unpredictable because writeback does post-eof
zeroing, and writeback can occur at any time in the background. For
example, an fsync inserted between the mwrite and truncate causes
the subsequent read to instead return zeroes. This basically means
that there is a race window in this situation between any subsequent
extending operation and writeback that dictates whether post-eof
data is exposed to the file or zeroed.

To prevent this problem, perform partial block zeroing as part of
the various inode size extending operations that are susceptible to
it. For truncate extension, zero around the original eof similar to
how truncate down does partial zeroing of the new eof. For extension
via writes and fallocate related operations, zero the newly exposed
range of the file to cover any partial zeroing that must occur at
the original and new eof blocks.

Signed-off-by: Brian Foster <[email protected]>
Link: https://patch.msgid.link/[email protected]
Signed-off-by: Theodore Ts'o <[email protected]>
fs/ext4/extents.c
fs/ext4/inode.c

index 1c8123866d81a140cd914a6f65af6ed866a21e77..a07a98a4b97a5fd243f824ef1ab7689310e00d38 100644 (file)
@@ -4482,7 +4482,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
        int depth = 0;
        struct ext4_map_blocks map;
        unsigned int credits;
-       loff_t epos;
+       loff_t epos, old_size = i_size_read(inode);
 
        BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
        map.m_lblk = offset;
@@ -4541,6 +4541,11 @@ retry:
                        if (ext4_update_inode_size(inode, epos) & 0x1)
                                inode_set_mtime_to_ts(inode,
                                                      inode_get_ctime(inode));
+                       if (epos > old_size) {
+                               pagecache_isize_extended(inode, old_size, epos);
+                               ext4_zero_partial_blocks(handle, inode,
+                                                    old_size, epos - old_size);
+                       }
                }
                ret2 = ext4_mark_inode_dirty(handle, inode);
                ext4_update_inode_fsync_trans(handle, inode, 1);
index 569887741bf14642e269ccc36368980f58efe01e..aadd8bb3c679fe0f04f88514cc1965c75425a8ef 100644 (file)
@@ -1314,8 +1314,10 @@ static int ext4_write_end(struct file *file,
        folio_unlock(folio);
        folio_put(folio);
 
-       if (old_size < pos && !verity)
+       if (old_size < pos && !verity) {
                pagecache_isize_extended(inode, old_size, pos);
+               ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size);
+       }
        /*
         * Don't mark the inode dirty under folio lock. First, it unnecessarily
         * makes the holding time of folio lock longer. Second, it forces lock
@@ -1430,8 +1432,10 @@ static int ext4_journalled_write_end(struct file *file,
        folio_unlock(folio);
        folio_put(folio);
 
-       if (old_size < pos && !verity)
+       if (old_size < pos && !verity) {
                pagecache_isize_extended(inode, old_size, pos);
+               ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size);
+       }
 
        if (size_changed) {
                ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -2992,7 +2996,8 @@ static int ext4_da_do_write_end(struct address_space *mapping,
        struct inode *inode = mapping->host;
        loff_t old_size = inode->i_size;
        bool disksize_changed = false;
-       loff_t new_i_size;
+       loff_t new_i_size, zero_len = 0;
+       handle_t *handle;
 
        if (unlikely(!folio_buffers(folio))) {
                folio_unlock(folio);
@@ -3036,18 +3041,21 @@ static int ext4_da_do_write_end(struct address_space *mapping,
        folio_unlock(folio);
        folio_put(folio);
 
-       if (old_size < pos)
+       if (pos > old_size) {
                pagecache_isize_extended(inode, old_size, pos);
+               zero_len = pos - old_size;
+       }
 
-       if (disksize_changed) {
-               handle_t *handle;
+       if (!disksize_changed && !zero_len)
+               return copied;
 
-               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-               if (IS_ERR(handle))
-                       return PTR_ERR(handle);
-               ext4_mark_inode_dirty(handle, inode);
-               ext4_journal_stop(handle);
-       }
+       handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+       if (zero_len)
+               ext4_zero_partial_blocks(handle, inode, old_size, zero_len);
+       ext4_mark_inode_dirty(handle, inode);
+       ext4_journal_stop(handle);
 
        return copied;
 }
@@ -5433,6 +5441,14 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                }
 
                if (attr->ia_size != inode->i_size) {
+                       /* attach jbd2 jinode for EOF folio tail zeroing */
+                       if (attr->ia_size & (inode->i_sb->s_blocksize - 1) ||
+                           oldsize & (inode->i_sb->s_blocksize - 1)) {
+                               error = ext4_inode_attach_jinode(inode);
+                               if (error)
+                                       goto err_out;
+                       }
+
                        handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
                        if (IS_ERR(handle)) {
                                error = PTR_ERR(handle);
@@ -5443,12 +5459,17 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                                orphan = 1;
                        }
                        /*
-                        * Update c/mtime on truncate up, ext4_truncate() will
-                        * update c/mtime in shrink case below
+                        * Update c/mtime and tail zero the EOF folio on
+                        * truncate up. ext4_truncate() handles the shrink case
+                        * below.
                         */
-                       if (!shrink)
+                       if (!shrink) {
                                inode_set_mtime_to_ts(inode,
                                                      inode_set_ctime_current(inode));
+                               if (oldsize & (inode->i_sb->s_blocksize - 1))
+                                       ext4_block_truncate_page(handle,
+                                                       inode->i_mapping, oldsize);
+                       }
 
                        if (shrink)
                                ext4_fc_track_range(handle, inode,
This page took 0.077685 seconds and 4 git commands to generate.