#include "blk.h"
#include "blk-rq-qos.h"
+struct bio_alloc_cache {
+ struct bio_list free_list;
+ unsigned int nr;
+};
+
static struct biovec_slab {
int nr_vecs;
char *name;
void bio_init(struct bio *bio, struct bio_vec *table,
unsigned short max_vecs)
{
- memset(bio, 0, sizeof(*bio));
+ bio->bi_next = NULL;
+ bio->bi_bdev = NULL;
+ bio->bi_opf = 0;
+ bio->bi_flags = 0;
+ bio->bi_ioprio = 0;
+ bio->bi_write_hint = 0;
+ bio->bi_status = 0;
+ bio->bi_iter.bi_sector = 0;
+ bio->bi_iter.bi_size = 0;
+ bio->bi_iter.bi_idx = 0;
+ bio->bi_iter.bi_bvec_done = 0;
+ bio->bi_end_io = NULL;
+ bio->bi_private = NULL;
+#ifdef CONFIG_BLK_CGROUP
+ bio->bi_blkg = NULL;
+ bio->bi_issue.value = 0;
+#ifdef CONFIG_BLK_CGROUP_IOCOST
+ bio->bi_iocost_cost = 0;
+#endif
+#endif
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+ bio->bi_crypt_context = NULL;
+#endif
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+ bio->bi_integrity = NULL;
+#endif
+ bio->bi_vcnt = 0;
+
atomic_set(&bio->__bi_remaining, 1);
atomic_set(&bio->__bi_cnt, 1);
- bio->bi_io_vec = table;
bio->bi_max_vecs = max_vecs;
+ bio->bi_io_vec = table;
+ bio->bi_pool = NULL;
}
EXPORT_SYMBOL(bio_init);
void zero_fill_bio(struct bio *bio)
{
- unsigned long flags;
struct bio_vec bv;
struct bvec_iter iter;
- bio_for_each_segment(bv, bio, iter) {
- char *data = bvec_kmap_irq(&bv, &flags);
- memset(data, 0, bv.bv_len);
- flush_dcache_page(bv.bv_page);
- bvec_kunmap_irq(data, &flags);
- }
+ bio_for_each_segment(bv, bio, iter)
+ memzero_bvec(&bv);
}
EXPORT_SYMBOL(zero_fill_bio);
bio_truncate(bio, maxsector << 9);
}
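+/*
+ * Per-cpu cache sizing: once a CPU's free list grows past
+ * ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK entries, bio_put() prunes it by
+ * ALLOC_CACHE_SLACK bios.
+ */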
+#define ALLOC_CACHE_MAX 512
+#define ALLOC_CACHE_SLACK 64
+
+static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
+ unsigned int nr)
+{
+ unsigned int i = 0;
+ struct bio *bio;
+
+ while ((bio = bio_list_pop(&cache->free_list)) != NULL) {
+ cache->nr--;
+ bio_free(bio);
+ if (++i == nr)
+ break;
+ }
+}
+
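+/*
+ * CPU hotplug callback: when a CPU goes away, free every bio still sitting
+ * in its per-cpu cache so they are not leaked.
+ */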
+static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node)
+{
+ struct bio_set *bs;
+
+ bs = hlist_entry_safe(node, struct bio_set, cpuhp_dead);
+ if (bs->cache) {
+ struct bio_alloc_cache *cache = per_cpu_ptr(bs->cache, cpu);
+
+ bio_alloc_cache_prune(cache, -1U);
+ }
+ return 0;
+}
+
+static void bio_alloc_cache_destroy(struct bio_set *bs)
+{
+ int cpu;
+
+ if (!bs->cache)
+ return;
+
+ cpuhp_state_remove_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead);
+ for_each_possible_cpu(cpu) {
+ struct bio_alloc_cache *cache;
+
+ cache = per_cpu_ptr(bs->cache, cpu);
+ bio_alloc_cache_prune(cache, -1U);
+ }
+ free_percpu(bs->cache);
+}
+
/**
* bio_put - release a reference to a bio
* @bio: bio to release reference to
**/
void bio_put(struct bio *bio)
{
- if (!bio_flagged(bio, BIO_REFFED))
- bio_free(bio);
- else {
+ if (unlikely(bio_flagged(bio, BIO_REFFED))) {
BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
+ if (!atomic_dec_and_test(&bio->__bi_cnt))
+ return;
+ }
- /*
- * last put frees it
- */
- if (atomic_dec_and_test(&bio->__bi_cnt))
- bio_free(bio);
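+	/*
+	 * Cached bios are returned to the local free list instead of being
+	 * freed. The final put of a BIO_PERCPU_CACHE bio must come from
+	 * process context (see bio_alloc_kiocb()), so get_cpu()/put_cpu()
+	 * is sufficient protection here.
+	 */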
+ if (bio_flagged(bio, BIO_PERCPU_CACHE)) {
+ struct bio_alloc_cache *cache;
+
+ bio_uninit(bio);
+ cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
+ bio_list_add_head(&cache->free_list, bio);
+ if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK)
+ bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK);
+ put_cpu();
+ } else {
+ bio_free(bio);
}
}
EXPORT_SYMBOL(bio_put);
return 0;
}
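+/*
+ * Release the page references still held on pages that could not be added
+ * to the bio, so an error partway through does not leak them.
+ */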
+static void bio_put_pages(struct page **pages, size_t size, size_t off)
+{
+ size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
+
+ for (i = 0; i < nr; i++)
+ put_page(pages[i]);
+}
+
#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *))
/**
if (same_page)
put_page(page);
} else {
- if (WARN_ON_ONCE(bio_full(bio, len)))
- return -EINVAL;
+ if (WARN_ON_ONCE(bio_full(bio, len))) {
+ bio_put_pages(pages + i, left, offset);
+ return -EINVAL;
+ }
__bio_add_page(bio, page, len, offset);
}
offset = 0;
len = min_t(size_t, PAGE_SIZE - offset, left);
if (bio_add_hw_page(q, bio, page, len, offset,
max_append_sectors, &same_page) != len) {
+ bio_put_pages(pages + i, left, offset);
ret = -EINVAL;
break;
}
void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
struct bio *src, struct bvec_iter *src_iter)
{
- struct bio_vec src_bv, dst_bv;
- void *src_p, *dst_p;
- unsigned bytes;
-
while (src_iter->bi_size && dst_iter->bi_size) {
- src_bv = bio_iter_iovec(src, *src_iter);
- dst_bv = bio_iter_iovec(dst, *dst_iter);
-
- bytes = min(src_bv.bv_len, dst_bv.bv_len);
+ struct bio_vec src_bv = bio_iter_iovec(src, *src_iter);
+ struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter);
+ unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len);
+ void *src_buf;
- src_p = kmap_atomic(src_bv.bv_page);
- dst_p = kmap_atomic(dst_bv.bv_page);
-
- memcpy(dst_p + dst_bv.bv_offset,
- src_p + src_bv.bv_offset,
- bytes);
-
- kunmap_atomic(dst_p);
- kunmap_atomic(src_p);
-
- flush_dcache_page(dst_bv.bv_page);
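+		/*
+		 * Only the source needs an explicit mapping here;
+		 * memcpy_to_bvec() maps the destination page itself and
+		 * takes care of the dcache flush.
+		 */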
+ src_buf = bvec_kmap_local(&src_bv);
+ memcpy_to_bvec(&dst_bv, src_buf);
+ kunmap_local(src_buf);
bio_advance_iter_single(src, src_iter, bytes);
bio_advance_iter_single(dst, dst_iter, bytes);
* @bio: bio to trim
* @offset: number of sectors to trim from the front of @bio
* @size: size we want to trim @bio to, in sectors
+ *
+ * This function is typically used for bios that are cloned and submitted
+ * to the underlying device in parts.
*/
- void bio_trim(struct bio *bio, int offset, int size)
+ void bio_trim(struct bio *bio, sector_t offset, sector_t size)
{
- /* 'bio' is a cloned bio which we need to trim to match
- * the given offset and size.
- */
+ if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS ||
+ offset + size > bio_sectors(bio)))
+ return;
size <<= 9;
if (offset == 0 && size == bio->bi_iter.bi_size)
if (bio_integrity(bio))
bio_integrity_trim(bio);
-
}
EXPORT_SYMBOL_GPL(bio_trim);
*/
void bioset_exit(struct bio_set *bs)
{
+ bio_alloc_cache_destroy(bs);
if (bs->rescue_workqueue)
destroy_workqueue(bs->rescue_workqueue);
bs->rescue_workqueue = NULL;
biovec_init_pool(&bs->bvec_pool, pool_size))
goto bad;
- if (!(flags & BIOSET_NEED_RESCUER))
- return 0;
-
- bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
- if (!bs->rescue_workqueue)
- goto bad;
+ if (flags & BIOSET_NEED_RESCUER) {
+ bs->rescue_workqueue = alloc_workqueue("bioset",
+ WQ_MEM_RECLAIM, 0);
+ if (!bs->rescue_workqueue)
+ goto bad;
+ }
+ if (flags & BIOSET_PERCPU_CACHE) {
+ bs->cache = alloc_percpu(struct bio_alloc_cache);
+ if (!bs->cache)
+ goto bad;
+ cpuhp_state_add_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead);
+ }
return 0;
bad:
}
EXPORT_SYMBOL(bioset_init_from_src);
+/**
+ * bio_alloc_kiocb - Allocate a bio from bio_set based on kiocb
+ * @kiocb: kiocb describing the IO
+ * @nr_vecs: number of iovecs to pre-allocate
+ * @bs: bio_set to allocate from
+ *
+ * Description:
+ * Like bio_alloc_bioset(), but pass in the kiocb. The kiocb is only
+ * used to check if we should dip into the per-cpu bio_set allocation
+ * cache. The allocation uses GFP_KERNEL internally. On return, the
+ * bio is marked BIO_PERCPU_CACHE, and the final put of the bio
+ * MUST be done from process context, not hard/soft IRQ.
+ *
+ */
+struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs,
+ struct bio_set *bs)
+{
+ struct bio_alloc_cache *cache;
+ struct bio *bio;
+
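+	/*
+	 * Only kiocbs flagged IOCB_ALLOC_CACHE and needing no more than the
+	 * inline vectors can use the cache; anything else goes through the
+	 * regular bio_set allocation path.
+	 */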
+ if (!(kiocb->ki_flags & IOCB_ALLOC_CACHE) || nr_vecs > BIO_INLINE_VECS)
+ return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);
+
+ cache = per_cpu_ptr(bs->cache, get_cpu());
+ bio = bio_list_pop(&cache->free_list);
+ if (bio) {
+ cache->nr--;
+ put_cpu();
+ bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs);
+ bio->bi_pool = bs;
+ bio_set_flag(bio, BIO_PERCPU_CACHE);
+ return bio;
+ }
+ put_cpu();
+ bio = bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);
+ bio_set_flag(bio, BIO_PERCPU_CACHE);
+ return bio;
+}
+EXPORT_SYMBOL_GPL(bio_alloc_kiocb);
+
static int __init init_bio(void)
{
int i;
SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
}
+ cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL,
+ bio_cpu_dead);
+
if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS))
panic("bio: can't allocate bios\n");
p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
- /* No mandatory locks */
- if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
- return -ENOLCK;
-
if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
filemap_write_and_wait(inode->i_mapping);
invalidate_mapping_pages(&inode->i_data, 0, -1);
p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
filp, cmd, fl, filp);
- /* No mandatory locks */
- if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
- goto out_err;
-
if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
filemap_write_and_wait(inode->i_mapping);
invalidate_mapping_pages(&inode->i_data, 0, -1);
ret = v9fs_file_getlock(filp, fl);
else
ret = -EINVAL;
-out_err:
return ret;
}
p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
filp, cmd, fl, filp);
- /* No mandatory locks */
- if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
- goto out_err;
-
if (!(fl->fl_flags & FL_FLOCK))
goto out_err;
p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma);
inode = file_inode(vma->vm_file);
-
- if (!mapping_can_writeback(inode->i_mapping))
- wbc.nr_to_write = 0;
-
- might_sleep();
- sync_inode(inode, &wbc);
+ filemap_fdatawrite_wbc(inode->i_mapping, &wbc);
}
#include <linux/sched/mm.h>
#include <linux/iomap.h>
#include <asm/unaligned.h>
+ #include <linux/fsverity.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
cur_size = min_t(unsigned long, compressed_size,
PAGE_SIZE);
- kaddr = kmap_atomic(cpage);
+ kaddr = page_address(cpage);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
- kunmap_atomic(kaddr);
i++;
ptr += cur_size;
*/
static inline bool inode_can_compress(struct btrfs_inode *inode)
{
+ /* Subpage doesn't support compression yet */
+ if (inode->root->fs_info->sectorsize < PAGE_SIZE)
+ return false;
if (inode->flags & BTRFS_INODE_NODATACOW ||
inode->flags & BTRFS_INODE_NODATASUM)
return false;
* inode has not been flagged as nocompress. This flag can
* change at any time if we discover bad compression ratios.
*/
- if (nr_pages > 1 && inode_need_compress(BTRFS_I(inode), start, end)) {
+ if (inode_need_compress(BTRFS_I(inode), start, end)) {
WARN_ON(pages);
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
}
}
cont:
- if (start == 0) {
+ /*
+ * Check cow_file_range() for why we don't even try to create inline
+ * extent for subpage case.
+ */
+ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
/* lets try to make an inline extent */
if (ret || total_in < actual_end) {
/* we didn't compress the entire range, try
p->mapping = inode->vfs_inode.i_mapping;
btrfs_writepage_endio_finish_ordered(inode, p, start,
- end, 0);
+ end, false);
p->mapping = NULL;
extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
- if (start == 0) {
+ /*
+ * Due to the page size limit, for subpage we can only trigger the
+ * writeback for the dirty sectors of page, that means data writeback
+ * is doing more writeback than what we want.
+ *
+ * This is especially unexpected for some call sites like fallocate,
+ * where we only increase i_size after everything is done.
+ * This means we can trigger inline extent even if we didn't want to.
+ * So here we skip inline extent creation completely.
+ */
+ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
/* lets try to make an inline extent */
ret = cow_file_range_inline(inode, start, end, 0,
BTRFS_COMPRESS_NONE, NULL);
nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
PAGE_SHIFT;
- /* atomic_sub_return implies a barrier */
- if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
- 5 * SZ_1M)
- cond_wake_up_nomb(&fs_info->async_submit_wait);
-
/*
* ->inode could be NULL if async_chunk_start has failed to compress,
* in which case we don't have anything to submit, yet we need to
*/
if (async_chunk->inode)
submit_compressed_extents(async_chunk);
+
+ /* atomic_sub_return implies a barrier */
+ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
+ 5 * SZ_1M)
+ cond_wake_up_nomb(&fs_info->async_submit_wait);
}
static noinline void async_cow_free(struct btrfs_work *work)
ret = cow_file_range_async(inode, wbc, locked_page, start, end,
page_started, nr_written);
}
+ ASSERT(ret <= 0);
if (ret)
btrfs_cleanup_ordered_extents(inode, locked_page, start,
end - start + 1);
struct extent_map *split_mid = NULL;
struct extent_map *split_post = NULL;
int ret = 0;
- int modified;
unsigned long flags;
/* Sanity check */
ASSERT(em->len == len);
ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
+ ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+ ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
+ ASSERT(!list_empty(&em->list));
flags = em->flags;
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
- clear_bit(EXTENT_FLAG_LOGGING, &flags);
- modified = !list_empty(&em->list);
/* First, replace the em with a new extent_map starting from * em->start */
split_pre->start = em->start;
split_pre->compress_type = em->compress_type;
split_pre->generation = em->generation;
- replace_extent_mapping(em_tree, em, split_pre, modified);
+ replace_extent_mapping(em_tree, em, split_pre, 1);
/*
* Now we only have an extent_map at:
split_mid->flags = flags;
split_mid->compress_type = em->compress_type;
split_mid->generation = em->generation;
- add_extent_mapping(em_tree, split_mid, modified);
+ add_extent_mapping(em_tree, split_mid, 1);
}
if (post) {
split_post->flags = flags;
split_post->compress_type = em->compress_type;
split_post->generation = em->generation;
- add_extent_mapping(em_tree, split_post, modified);
+ add_extent_mapping(em_tree, split_post, 1);
}
/* Once for us */
* to fix it up. The async helper will wait for ordered extents, set
* the delalloc bit and make it safe to write the page.
*/
- int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
+ int btrfs_writepage_cow_fixup(struct page *page)
{
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
- u64 end, int uptodate)
+ u64 end, bool uptodate)
{
trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
return 0;
}
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+ /*
+ * For subpage case, above PageChecked is not safe as it's not subpage
+ * compatible.
+ * But for now only cow fixup and compressed read utilize PageChecked
+ * flag, while in this context we can easily use io_bio->csum to
+ * determine if we really need to do csum verification.
+ *
+ * So for now, just exit if io_bio->csum is NULL, as it means it's
+ * compressed read, and its compressed data csum has already been
+ * verified.
+ */
+ if (io_bio->csum == NULL)
return 0;
- if (!root->fs_info->csum_root)
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
return 0;
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
- test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
- clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
+ if (!root->fs_info->csum_root)
return 0;
- }
ASSERT(page_offset(page) <= start &&
end <= page_offset(page) + PAGE_SIZE - 1);
for (pg_off = offset_in_page(start);
pg_off < offset_in_page(end);
pg_off += sectorsize, bio_offset += sectorsize) {
+ u64 file_offset = pg_off + page_offset(page);
int ret;
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ test_range_bit(io_tree, file_offset,
+ file_offset + sectorsize - 1,
+ EXTENT_NODATASUM, 1, NULL)) {
+ /* Skip the range without csum for data reloc inode */
+ clear_extent_bits(io_tree, file_offset,
+ file_offset + sectorsize - 1,
+ EXTENT_NODATASUM);
+ continue;
+ }
ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
page_offset(page) + pg_off);
if (ret < 0) {
/*
* If we have an inode with links, there are a couple of
- * possibilities. Old kernels (before v3.12) used to create an
+ * possibilities:
+ *
+ * 1. We were halfway through creating fsverity metadata for the
+ * file. In that case, the orphan item represents incomplete
+ * fsverity metadata which must be cleaned up with
+ * btrfs_drop_verity_items and deleting the orphan item.
+ *
+ * 2. Old kernels (before v3.12) used to create an
* orphan item for truncate indicating that there were possibly
* extent items past i_size that needed to be deleted. In v3.12,
* truncate was changed to update i_size in sync with the extent
* but either way, we can delete the orphan item.
*/
if (ret == -ENOENT || inode->i_nlink) {
- if (!ret)
+ if (!ret) {
+ ret = btrfs_drop_verity_items(BTRFS_I(inode));
iput(inode);
+ if (ret)
+ goto out;
+ }
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
rdev = btrfs_inode_rdev(leaf, inode_item);
BTRFS_I(inode)->index_cnt = (u64)-1;
- BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+ btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
+ &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
cache_index:
/*
struct inode *inode)
{
struct btrfs_map_token token;
+ u64 flags;
btrfs_init_map_token(&token, leaf);
btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
btrfs_set_token_inode_transid(&token, item, trans->transid);
btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
- btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+ flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+ BTRFS_I(inode)->ro_flags);
+ btrfs_set_token_inode_flags(&token, item, flags);
btrfs_set_token_inode_block_group(&token, item, 0);
}
int ret;
/*
- * Still need to make sure the inode looks like it's been updated so
- * that any holes get logged if we fsync.
+ * If NO_HOLES is enabled, we don't need to do anything.
+ * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
+ * or btrfs_update_inode() will be called, which guarantees that the next
+ * fsync will know this inode was changed and needs to be logged.
*/
- if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
- inode->last_trans = fs_info->generation;
- inode->last_sub_trans = root->log_transid;
- inode->last_log_commit = root->last_log_commit;
+ if (btrfs_fs_incompat(fs_info, NO_HOLES))
return 0;
- }
/*
* 1 - for the one we're dropping
if (btrfs_root_readonly(root))
return -EROFS;
- err = setattr_prepare(&init_user_ns, dentry, attr);
+ err = setattr_prepare(mnt_userns, dentry, attr);
if (err)
return err;
}
if (attr->ia_valid) {
- setattr_copy(&init_user_ns, inode, attr);
+ setattr_copy(mnt_userns, inode, attr);
inode_inc_iversion(inode);
err = btrfs_dirty_inode(inode);
if (!err && attr->ia_valid & ATTR_MODE)
- err = posix_acl_chmod(&init_user_ns, inode,
- inode->i_mode);
+ err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
}
return err;
trace_btrfs_inode_evict(inode);
if (!root) {
+ fsverity_cleanup_inode(inode);
clear_inode(inode);
return;
}
* to retry these periodically in the future.
*/
btrfs_remove_delayed_node(BTRFS_I(inode));
+ fsverity_cleanup_inode(inode);
clear_inode(inode);
}
static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
+ struct user_namespace *mnt_userns,
struct inode *dir,
const char *name, int name_len,
u64 ref_objectid, u64 objectid,
if (ret != 0)
goto fail_unlock;
- inode_init_owner(&init_user_ns, inode, dir, mode);
+ inode_init_owner(mnt_userns, inode, dir, mode);
inode_set_bytes(inode, 0);
inode->i_mtime = current_time(inode);
if (err)
goto out_unlock;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
- mode, &index);
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
inode = NULL;
if (err)
goto out_unlock;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
- mode, &index);
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
inode = NULL;
if (err)
goto out_fail;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid,
S_IFDIR | mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
u64 start_sector;
int async_submit = 0;
u64 submit_len;
- int clone_offset = 0;
- int clone_len;
+ u64 clone_offset = 0;
+ u64 clone_len;
u64 logical;
int ret;
blk_status_t status;
status = errno_to_blk_status(ret);
goto out_err_em;
}
- ASSERT(geom.len <= INT_MAX);
- clone_len = min_t(int, submit_len, geom.len);
+ clone_len = min(submit_len, geom.len);
+ ASSERT(clone_len <= UINT_MAX);
/*
* This will never fail as it's passing GPF_NOFS and
extent_readahead(rac);
}
+ /*
+ * For releasepage() and invalidatepage() we have a race window where
+ * end_page_writeback() is called but the subpage spinlock is not yet released.
+ * If we continue to release/invalidate the page, we could cause a
+ * use-after-free on the subpage spinlock. So this function spins and waits
+ * until the subpage spinlock is released.
+ */
+ static void wait_subpage_spinlock(struct page *page)
+ {
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct btrfs_subpage *subpage;
+
+ if (fs_info->sectorsize == PAGE_SIZE)
+ return;
+
+ ASSERT(PagePrivate(page) && page->private);
+ subpage = (struct btrfs_subpage *)page->private;
+
+ /*
+ * This may look insane as we just acquire the spinlock and release it,
+ * without doing anything. But we just want to make sure no one is
+ * still holding the subpage spinlock.
+ * And since the page is neither dirty nor under writeback, and we hold the
+ * page lock, the only possible holder of the spinlock is the endio function
+ * clearing page writeback.
+ *
+ * Here we just acquire the spinlock so that all existing callers
+ * should exit and we're safe to release/invalidate the page.
+ */
+ spin_lock_irq(&subpage->lock);
+ spin_unlock_irq(&subpage->lock);
+ }
+
static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
int ret = try_release_extent_mapping(page, gfp_flags);
- if (ret == 1)
+
+ if (ret == 1) {
+ wait_subpage_spinlock(page);
clear_page_extent_mapped(page);
+ }
return ret;
}
* do double ordered extent accounting on the same page.
*/
wait_on_page_writeback(page);
+ wait_subpage_spinlock(page);
/*
* For subpage case, we have call sites like
spin_unlock_irq(&inode->ordered_tree.lock);
if (btrfs_dec_test_ordered_pending(inode, &ordered,
- cur, range_end + 1 - cur, 1)) {
+ cur, range_end + 1 - cur)) {
btrfs_finish_ordered_io(ordered);
/*
* The ordered extent has finished, now we're again
*/
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
- struct btrfs_root *parent_root)
+ struct btrfs_root *parent_root,
+ struct user_namespace *mnt_userns)
{
struct inode *inode;
int err;
if (err < 0)
return err;
- inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, ino, ino,
+ inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2,
+ ino, ino,
S_IFDIR | (~current_umask() & S_IRWXUGO),
&index);
if (IS_ERR(inode))
ei->defrag_bytes = 0;
ei->disk_i_size = 0;
ei->flags = 0;
+ ei->ro_flags = 0;
ei->csum_bytes = 0;
ei->index_cnt = (u64)-1;
ei->dir_index = 0;
struct inode *inode = d_inode(path->dentry);
u32 blocksize = inode->i_sb->s_blocksize;
u32 bi_flags = BTRFS_I(inode)->flags;
+ u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
stat->result_mask |= STATX_BTIME;
stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
stat->attributes |= STATX_ATTR_IMMUTABLE;
if (bi_flags & BTRFS_INODE_NODUMP)
stat->attributes |= STATX_ATTR_NODUMP;
+ if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
+ stat->attributes |= STATX_ATTR_VERITY;
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_COMPRESSED |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
- generic_fillattr(&init_user_ns, inode, stat);
+ generic_fillattr(mnt_userns, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
spin_lock(&BTRFS_I(inode)->lock);
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(root);
- root_log_pinned = true;
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(dest);
- dest_log_pinned = true;
ret = btrfs_insert_inode_ref(trans, root,
old_dentry->d_name.name,
old_dentry->d_name.len,
BTRFS_I(new_inode), 1);
}
+ /*
+ * Now pin the logs of the roots. We do it to ensure that no other task
+ * can sync the logs while we are in progress with the rename, because
+ * that could result in an inconsistency in case any of the inodes that
+ * are part of this rename operation were logged before.
+ *
+ * We pin the logs even if at this precise moment none of the inodes was
+ * logged before. This is because right after we checked for that, some
+ * other task fsyncing some other inode not involved with this rename
+ * operation could log that one of our inodes exists.
+ *
+ * We don't need to pin the logs before the above calls to
+ * btrfs_insert_inode_ref(), since those don't ever need to change a log.
+ */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_pin_log_trans(root);
+ root_log_pinned = true;
+ }
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_pin_log_trans(dest);
+ dest_log_pinned = true;
+ }
+
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
- (new_inode &&
- btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
+ btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))
btrfs_set_log_full_commit(trans);
if (root_log_pinned) {
static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
+ struct user_namespace *mnt_userns,
struct inode *dir,
struct dentry *dentry)
{
if (ret)
return ret;
- inode = btrfs_new_inode(trans, root, dir,
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
dentry->d_name.name,
dentry->d_name.len,
btrfs_ino(BTRFS_I(dir)),
return ret;
}
- static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+ static int btrfs_rename(struct user_namespace *mnt_userns,
+ struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
struct btrfs_trans_handle *trans;
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(root);
- log_pinned = true;
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else {
+ /*
+ * Now pin the log. We do it to ensure that no other task can
+ * sync the log while we are in progress with the rename, as
+ * that could result in an inconsistency in case any of the
+ * inodes that are part of this rename operation were logged
+ * before.
+ *
+ * We pin the log even if at this precise moment none of the
+ * inodes was logged before. This is because right after we
+ * checked for that, some other task fsyncing some other inode
+ * not involved with this rename operation could log that one of
+ * our inodes exists.
+ *
+ * We don't need to pin the logs before the above call to
+ * btrfs_insert_inode_ref(), since that does not need to change
+ * a log.
+ */
+ btrfs_pin_log_trans(root);
+ log_pinned = true;
ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
old_dentry->d_name.name,
}
if (flags & RENAME_WHITEOUT) {
- ret = btrfs_whiteout_for_rename(trans, root, old_dir,
- old_dentry);
+ ret = btrfs_whiteout_for_rename(trans, root, mnt_userns,
+ old_dir, old_dentry);
if (ret) {
btrfs_abort_transaction(trans, ret);
return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
new_dentry);
- return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
+ return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
+ new_dentry, flags);
}
struct btrfs_delalloc_work {
btrfs_queue_work(root->fs_info->flush_workers,
&work->work);
} else {
- ret = sync_inode(inode, wbc);
- if (!ret &&
- test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- ret = sync_inode(inode, wbc);
+ ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
btrfs_add_delayed_iput(inode);
if (ret || wbc->nr_to_write <= 0)
goto out;
if (err)
goto out_unlock;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)),
- objectid, S_IFLNK|S_IRWXUGO, &index);
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid,
+ S_IFLNK | S_IRWXUGO, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
inode = NULL;
if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
return -EACCES;
}
- return generic_permission(&init_user_ns, inode, mask);
+ return generic_permission(mnt_userns, inode, mask);
}
static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
if (ret)
goto out;
- inode = btrfs_new_inode(trans, root, dir, NULL, 0,
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0,
btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
return result;
}
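+/*
+ * Variant of getname() for callers that accept AT_EMPTY_PATH: the flag is
+ * translated into LOOKUP_EMPTY so that empty pathnames are allowed.
+ */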
+struct filename *
+getname_uflags(const char __user *filename, int uflags)
+{
+ int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
+
+ return getname_flags(filename, flags, NULL);
+}
+
struct filename *
getname(const char __user * filename)
{
void putname(struct filename *name)
{
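+	/* A failed getname() result may be passed in directly; just ignore it. */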
+ if (IS_ERR_OR_NULL(name))
+ return;
+
BUG_ON(name->refcnt <= 0);
if (--name->refcnt > 0)
return err;
}
-int filename_lookup(int dfd, struct filename *name, unsigned flags,
+static int __filename_lookup(int dfd, struct filename *name, unsigned flags,
struct path *path, struct path *root)
{
int retval;
audit_inode(name, path->dentry,
flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0);
restore_nameidata();
+ return retval;
+}
+
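+/* Like __filename_lookup(), but always consumes the filename reference. */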
+int filename_lookup(int dfd, struct filename *name, unsigned flags,
+ struct path *path, struct path *root)
+{
+ int retval = __filename_lookup(dfd, name, flags, path, root);
+
putname(name);
return retval;
}
return err;
}
-static struct filename *filename_parentat(int dfd, struct filename *name,
+static int __filename_parentat(int dfd, struct filename *name,
unsigned int flags, struct path *parent,
struct qstr *last, int *type)
{
struct nameidata nd;
if (IS_ERR(name))
- return name;
+ return PTR_ERR(name);
set_nameidata(&nd, dfd, name, NULL);
retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
if (unlikely(retval == -ECHILD))
*last = nd.last;
*type = nd.last_type;
audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
- } else {
- putname(name);
- name = ERR_PTR(retval);
}
restore_nameidata();
- return name;
+ return retval;
+}
+
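+/* Like __filename_parentat(), but always consumes the filename reference. */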
+static int filename_parentat(int dfd, struct filename *name,
+ unsigned int flags, struct path *parent,
+ struct qstr *last, int *type)
+{
+ int retval = __filename_parentat(dfd, name, flags, parent, last, type);
+
+ putname(name);
+ return retval;
}
/* does lookup, returns the object with parent locked */
struct dentry *kern_path_locked(const char *name, struct path *path)
{
- struct filename *filename;
struct dentry *d;
struct qstr last;
- int type;
+ int type, error;
- filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
+ error = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
&last, &type);
- if (IS_ERR(filename))
- return ERR_CAST(filename);
+ if (error)
+ return ERR_PTR(error);
if (unlikely(type != LAST_NORM)) {
path_put(path);
- putname(filename);
return ERR_PTR(-EINVAL);
}
inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
inode_unlock(path->dentry->d_inode);
path_put(path);
}
- putname(filename);
return d;
}
}
EXPORT_SYMBOL(vfs_path_lookup);
- static int lookup_one_len_common(const char *name, struct dentry *base,
- int len, struct qstr *this)
+ static int lookup_one_common(struct user_namespace *mnt_userns,
+ const char *name, struct dentry *base, int len,
+ struct qstr *this)
{
this->name = name;
this->len = len;
return err;
}
- return inode_permission(&init_user_ns, base->d_inode, MAY_EXEC);
+ return inode_permission(mnt_userns, base->d_inode, MAY_EXEC);
}
/**
WARN_ON_ONCE(!inode_is_locked(base->d_inode));
- err = lookup_one_len_common(name, base, len, &this);
+ err = lookup_one_common(&init_user_ns, name, base, len, &this);
if (err)
return ERR_PTR(err);
WARN_ON_ONCE(!inode_is_locked(base->d_inode));
- err = lookup_one_len_common(name, base, len, &this);
+ err = lookup_one_common(&init_user_ns, name, base, len, &this);
if (err)
return ERR_PTR(err);
}
EXPORT_SYMBOL(lookup_one_len);
+ /**
+ * lookup_one - filesystem helper to lookup single pathname component
+ * @mnt_userns: user namespace of the mount the lookup is performed from
+ * @name: pathname component to lookup
+ * @base: base directory to lookup from
+ * @len: maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
+ */
+ struct dentry *lookup_one(struct user_namespace *mnt_userns, const char *name,
+ struct dentry *base, int len)
+ {
+ struct dentry *dentry;
+ struct qstr this;
+ int err;
+
+ WARN_ON_ONCE(!inode_is_locked(base->d_inode));
+
+ err = lookup_one_common(mnt_userns, name, base, len, &this);
+ if (err)
+ return ERR_PTR(err);
+
+ dentry = lookup_dcache(&this, base, 0);
+ return dentry ? dentry : __lookup_slow(&this, base, 0);
+ }
+ EXPORT_SYMBOL(lookup_one);
+
/**
* lookup_one_len_unlocked - filesystem helper to lookup single pathname component
* @name: pathname component to lookup
int err;
struct dentry *ret;
- err = lookup_one_len_common(name, base, len, &this);
+ err = lookup_one_common(&init_user_ns, name, base, len, &this);
if (err)
return ERR_PTR(err);
- /*
- * Refuse to truncate files with mandatory locks held on them.
- */
- error = locks_verify_locked(filp);
- if (!error)
- error = security_path_truncate(path);
+ error = security_path_truncate(path);
if (!error) {
error = do_truncate(mnt_userns, path->dentry, 0,
ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
return file;
}
-static struct dentry *filename_create(int dfd, struct filename *name,
+static struct dentry *__filename_create(int dfd, struct filename *name,
struct path *path, unsigned int lookup_flags)
{
struct dentry *dentry = ERR_PTR(-EEXIST);
*/
lookup_flags &= LOOKUP_REVAL;
- name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
- if (IS_ERR(name))
- return ERR_CAST(name);
+ error = __filename_parentat(dfd, name, lookup_flags, path, &last, &type);
+ if (error)
+ return ERR_PTR(error);
/*
* Yucky last component or no last component at all?
error = err2;
goto fail;
}
- putname(name);
return dentry;
fail:
dput(dentry);
mnt_drop_write(path->mnt);
out:
path_put(path);
- putname(name);
return dentry;
}
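+/* Like __filename_create(), but always consumes the filename reference. */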
+static inline struct dentry *filename_create(int dfd, struct filename *name,
+ struct path *path, unsigned int lookup_flags)
+{
+ struct dentry *res = __filename_create(dfd, name, path, lookup_flags);
+
+ putname(name);
+ return res;
+}
+
struct dentry *kern_path_create(int dfd, const char *pathname,
struct path *path, unsigned int lookup_flags)
{
}
}
-static long do_mknodat(int dfd, const char __user *filename, umode_t mode,
+static int do_mknodat(int dfd, struct filename *name, umode_t mode,
unsigned int dev)
{
struct user_namespace *mnt_userns;
error = may_mknod(mode);
if (error)
- return error;
+ goto out1;
retry:
- dentry = user_path_create(dfd, filename, &path, lookup_flags);
+ dentry = __filename_create(dfd, name, &path, lookup_flags);
+ error = PTR_ERR(dentry);
if (IS_ERR(dentry))
- return PTR_ERR(dentry);
+ goto out1;
if (!IS_POSIXACL(path.dentry->d_inode))
mode &= ~current_umask();
error = security_path_mknod(&path, dentry, mode, dev);
if (error)
- goto out;
+ goto out2;
mnt_userns = mnt_user_ns(path.mnt);
switch (mode & S_IFMT) {
dentry, mode, 0);
break;
}
-out:
+out2:
done_path_create(&path, dentry);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
+out1:
+ putname(name);
return error;
}
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
unsigned int, dev)
{
- return do_mknodat(dfd, filename, mode, dev);
+ return do_mknodat(dfd, getname(filename), mode, dev);
}
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
- return do_mknodat(AT_FDCWD, filename, mode, dev);
+ return do_mknodat(AT_FDCWD, getname(filename), mode, dev);
}
/**
}
EXPORT_SYMBOL(vfs_mkdir);
-static long do_mkdirat(int dfd, const char __user *pathname, umode_t mode)
+int do_mkdirat(int dfd, struct filename *name, umode_t mode)
{
struct dentry *dentry;
struct path path;
unsigned int lookup_flags = LOOKUP_DIRECTORY;
retry:
- dentry = user_path_create(dfd, pathname, &path, lookup_flags);
+ dentry = __filename_create(dfd, name, &path, lookup_flags);
+ error = PTR_ERR(dentry);
if (IS_ERR(dentry))
- return PTR_ERR(dentry);
+ goto out_putname;
if (!IS_POSIXACL(path.dentry->d_inode))
mode &= ~current_umask();
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
+out_putname:
+ putname(name);
return error;
}
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
- return do_mkdirat(dfd, pathname, mode);
+ return do_mkdirat(dfd, getname(pathname), mode);
}
SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
- return do_mkdirat(AT_FDCWD, pathname, mode);
+ return do_mkdirat(AT_FDCWD, getname(pathname), mode);
}
/**
}
EXPORT_SYMBOL(vfs_rmdir);
-long do_rmdir(int dfd, struct filename *name)
+int do_rmdir(int dfd, struct filename *name)
{
struct user_namespace *mnt_userns;
- int error = 0;
+ int error;
struct dentry *dentry;
struct path path;
struct qstr last;
int type;
unsigned int lookup_flags = 0;
retry:
- name = filename_parentat(dfd, name, lookup_flags,
- &path, &last, &type);
- if (IS_ERR(name))
- return PTR_ERR(name);
+ error = __filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
+ if (error)
+ goto exit1;
switch (type) {
case LAST_DOTDOT:
error = -ENOTEMPTY;
- goto exit1;
+ goto exit2;
case LAST_DOT:
error = -EINVAL;
- goto exit1;
+ goto exit2;
case LAST_ROOT:
error = -EBUSY;
- goto exit1;
+ goto exit2;
}
error = mnt_want_write(path.mnt);
if (error)
- goto exit1;
+ goto exit2;
inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
dentry = __lookup_hash(&last, path.dentry, lookup_flags);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
- goto exit2;
+ goto exit3;
if (!dentry->d_inode) {
error = -ENOENT;
- goto exit3;
+ goto exit4;
}
error = security_path_rmdir(&path, dentry);
if (error)
- goto exit3;
+ goto exit4;
mnt_userns = mnt_user_ns(path.mnt);
error = vfs_rmdir(mnt_userns, path.dentry->d_inode, dentry);
-exit3:
+exit4:
dput(dentry);
-exit2:
+exit3:
inode_unlock(path.dentry->d_inode);
mnt_drop_write(path.mnt);
-exit1:
+exit2:
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
+exit1:
putname(name);
return error;
}
* writeout happening, and we don't want to prevent access to the directory
* while waiting on the I/O.
*/
-long do_unlinkat(int dfd, struct filename *name)
+int do_unlinkat(int dfd, struct filename *name)
{
int error;
struct dentry *dentry;
struct inode *delegated_inode = NULL;
unsigned int lookup_flags = 0;
retry:
- name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
- if (IS_ERR(name))
- return PTR_ERR(name);
+ error = __filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
+ if (error)
+ goto exit1;
error = -EISDIR;
if (type != LAST_NORM)
- goto exit1;
+ goto exit2;
error = mnt_want_write(path.mnt);
if (error)
- goto exit1;
+ goto exit2;
retry_deleg:
inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
dentry = __lookup_hash(&last, path.dentry, lookup_flags);
ihold(inode);
error = security_path_unlink(&path, dentry);
if (error)
- goto exit2;
+ goto exit3;
mnt_userns = mnt_user_ns(path.mnt);
error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry,
&delegated_inode);
-exit2:
+exit3:
dput(dentry);
}
inode_unlock(path.dentry->d_inode);
goto retry_deleg;
}
mnt_drop_write(path.mnt);
-exit1:
+exit2:
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
inode = NULL;
goto retry;
}
+exit1:
putname(name);
return error;
error = -EISDIR;
else
error = -ENOTDIR;
- goto exit2;
+ goto exit3;
}
SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
}
EXPORT_SYMBOL(vfs_symlink);
-static long do_symlinkat(const char __user *oldname, int newdfd,
- const char __user *newname)
+int do_symlinkat(struct filename *from, int newdfd, struct filename *to)
{
int error;
- struct filename *from;
struct dentry *dentry;
struct path path;
unsigned int lookup_flags = 0;
- from = getname(oldname);
- if (IS_ERR(from))
- return PTR_ERR(from);
+ if (IS_ERR(from)) {
+ error = PTR_ERR(from);
+ goto out_putnames;
+ }
retry:
- dentry = user_path_create(newdfd, newname, &path, lookup_flags);
+ dentry = __filename_create(newdfd, to, &path, lookup_flags);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
- goto out_putname;
+ goto out_putnames;
error = security_path_symlink(&path, dentry, from->name);
if (!error) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
-out_putname:
+out_putnames:
+ putname(to);
putname(from);
return error;
}
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
int, newdfd, const char __user *, newname)
{
- return do_symlinkat(oldname, newdfd, newname);
+ return do_symlinkat(getname(oldname), newdfd, getname(newname));
}
SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
{
- return do_symlinkat(oldname, AT_FDCWD, newname);
+ return do_symlinkat(getname(oldname), AT_FDCWD, getname(newname));
}
/**
* with linux 2.0, and to avoid hard-linking to directories
* and other special files. --ADM
*/
-static int do_linkat(int olddfd, const char __user *oldname, int newdfd,
- const char __user *newname, int flags)
+int do_linkat(int olddfd, struct filename *old, int newdfd,
+ struct filename *new, int flags)
{
struct user_namespace *mnt_userns;
struct dentry *new_dentry;
int how = 0;
int error;
- if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
- return -EINVAL;
+ if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) {
+ error = -EINVAL;
+ goto out_putnames;
+ }
/*
* To use null names we require CAP_DAC_READ_SEARCH
* This ensures that not everyone will be able to create
* hard links using the passed file descriptor.
*/
- if (flags & AT_EMPTY_PATH) {
- if (!capable(CAP_DAC_READ_SEARCH))
- return -ENOENT;
- how = LOOKUP_EMPTY;
+ if (flags & AT_EMPTY_PATH && !capable(CAP_DAC_READ_SEARCH)) {
+ error = -ENOENT;
+ goto out_putnames;
}
if (flags & AT_SYMLINK_FOLLOW)
how |= LOOKUP_FOLLOW;
retry:
- error = user_path_at(olddfd, oldname, how, &old_path);
+ error = __filename_lookup(olddfd, old, how, &old_path, NULL);
if (error)
- return error;
+ goto out_putnames;
- new_dentry = user_path_create(newdfd, newname, &new_path,
+ new_dentry = __filename_create(newdfd, new, &new_path,
(how & LOOKUP_REVAL));
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry))
- goto out;
+ goto out_putpath;
error = -EXDEV;
if (old_path.mnt != new_path.mnt)
how |= LOOKUP_REVAL;
goto retry;
}
-out:
+out_putpath:
path_put(&old_path);
+out_putnames:
+ putname(old);
+ putname(new);
return error;
}
SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
int, newdfd, const char __user *, newname, int, flags)
{
- return do_linkat(olddfd, oldname, newdfd, newname, flags);
+ return do_linkat(olddfd, getname_uflags(oldname, flags),
+ newdfd, getname(newname), flags);
}
SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
{
- return do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
+ return do_linkat(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0);
}
/**
int error = -EINVAL;
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
- goto put_both;
+ goto put_names;
if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
(flags & RENAME_EXCHANGE))
- goto put_both;
+ goto put_names;
if (flags & RENAME_EXCHANGE)
target_flags = 0;
retry:
- from = filename_parentat(olddfd, from, lookup_flags, &old_path,
+ error = __filename_parentat(olddfd, from, lookup_flags, &old_path,
&old_last, &old_type);
- if (IS_ERR(from)) {
- error = PTR_ERR(from);
- goto put_new;
- }
+ if (error)
+ goto put_names;
- to = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
+ error = __filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
&new_type);
- if (IS_ERR(to)) {
- error = PTR_ERR(to);
+ if (error)
goto exit1;
- }
error = -EXDEV;
if (old_path.mnt != new_path.mnt)
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
-put_both:
- if (!IS_ERR(from))
- putname(from);
-put_new:
- if (!IS_ERR(to))
- putname(to);
+put_names:
+ putname(from);
+ putname(to);
return error;
}
#ifndef __LINUX_BIO_H
#define __LINUX_BIO_H
-#include <linux/highmem.h>
#include <linux/mempool.h>
#include <linux/ioprio.h>
/* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */
#endif /* CONFIG_BLK_DEV_INTEGRITY */
- extern void bio_trim(struct bio *bio, int offset, int size);
+ void bio_trim(struct bio *bio, sector_t offset, sector_t size);
extern struct bio *bio_split(struct bio *bio, int sectors,
gfp_t gfp, struct bio_set *bs);
enum {
BIOSET_NEED_BVECS = BIT(0),
BIOSET_NEED_RESCUER = BIT(1),
+ BIOSET_PERCPU_CACHE = BIT(2),
};
extern int bioset_init(struct bio_set *, unsigned int, unsigned int, int flags);
extern void bioset_exit(struct bio_set *);
struct bio *bio_alloc_bioset(gfp_t gfp, unsigned short nr_iovecs,
struct bio_set *bs);
+struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs,
+ struct bio_set *bs);
struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs);
extern void bio_put(struct bio *);
struct bio *src) { }
#endif /* CONFIG_BLK_CGROUP */
-#ifdef CONFIG_HIGHMEM
-/*
- * remember never ever reenable interrupts between a bvec_kmap_irq and
- * bvec_kunmap_irq!
- */
-static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
-{
- unsigned long addr;
-
- /*
- * might not be a highmem page, but the preempt/irq count
- * balancing is a lot nicer this way
- */
- local_irq_save(*flags);
- addr = (unsigned long) kmap_atomic(bvec->bv_page);
-
- BUG_ON(addr & ~PAGE_MASK);
-
- return (char *) addr + bvec->bv_offset;
-}
-
-static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
-{
- unsigned long ptr = (unsigned long) buffer & PAGE_MASK;
-
- kunmap_atomic((void *) ptr);
- local_irq_restore(*flags);
-}
-
-#else
-static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
-{
- return page_address(bvec->bv_page) + bvec->bv_offset;
-}
-
-static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
-{
- *flags = 0;
-}
-#endif
-
/*
* BIO list management for use by remapping drivers (e.g. DM or MD) and loop.
*
struct kmem_cache *bio_slab;
unsigned int front_pad;
+ /*
+ * per-cpu bio alloc cache
+ */
+ struct bio_alloc_cache __percpu *cache;
+
mempool_t bio_pool;
mempool_t bvec_pool;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
struct bio_list rescue_list;
struct work_struct rescue_work;
struct workqueue_struct *rescue_workqueue;
+
+ /*
+ * Hot un-plug notifier for the per-cpu cache, if used
+ */
+ struct hlist_node cpuhp_dead;
};
static inline bool bioset_initialized(struct bio_set *bs)
void * bd_holder;
int bd_holders;
bool bd_write_holder;
-#ifdef CONFIG_SYSFS
- struct list_head bd_holder_disks;
-#endif
struct kobject *bd_holder_dir;
u8 bd_partno;
spinlock_t bd_size_lock; /* for bd_inode->i_size updates */
struct gendisk * bd_disk;
- struct backing_dev_info *bd_bdi;
/* The counter of freeze processes */
int bd_fsfreeze_count;
};
#define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs)
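+ /* Used to sanity check the sector-granular arguments of bio_trim() */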
+ #define BIO_MAX_SECTORS (UINT_MAX >> SECTOR_SHIFT)
/*
* bio flags
BIO_TRACKED, /* set if bio goes through the rq_qos path */
BIO_REMAPPED,
BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */
+ BIO_PERCPU_CACHE, /* can participate in per-cpu alloc cache */
BIO_FLAG_LAST
};
/* iocb->ki_waitq is valid */
#define IOCB_WAITQ (1 << 19)
#define IOCB_NOIO (1 << 20)
+/* can use bio alloc cache */
+#define IOCB_ALLOC_CACHE (1 << 21)
struct kiocb {
struct file *ki_filp;
* struct address_space - Contents of a cacheable, mappable object.
* @host: Owner, either the inode or the block_device.
* @i_pages: Cached pages.
+ * @invalidate_lock: Guards coherency between page cache contents and
+ * file offset->disk block mappings in the filesystem during invalidates.
+ * It is also used to block modification of page cache contents through
+ * memory mappings.
* @gfp_mask: Memory allocation flags to use for allocating pages.
* @i_mmap_writable: Number of VM_SHARED mappings.
* @nr_thps: Number of THPs in the pagecache (non-shmem only).
struct address_space {
struct inode *host;
struct xarray i_pages;
+ struct rw_semaphore invalidate_lock;
gfp_t gfp_mask;
atomic_t i_mmap_writable;
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
down_read_nested(&inode->i_rwsem, subclass);
}
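+/*
+ * Helpers for mapping->invalidate_lock. The lock nests inside i_rwsem and is
+ * held for writing while invalidating page cache (truncate, hole punching)
+ * and for reading while instantiating pages in the affected range.
+ */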
+static inline void filemap_invalidate_lock(struct address_space *mapping)
+{
+ down_write(&mapping->invalidate_lock);
+}
+
+static inline void filemap_invalidate_unlock(struct address_space *mapping)
+{
+ up_write(&mapping->invalidate_lock);
+}
+
+static inline void filemap_invalidate_lock_shared(struct address_space *mapping)
+{
+ down_read(&mapping->invalidate_lock);
+}
+
+static inline int filemap_invalidate_trylock_shared(
+ struct address_space *mapping)
+{
+ return down_read_trylock(&mapping->invalidate_lock);
+}
+
+static inline void filemap_invalidate_unlock_shared(
+ struct address_space *mapping)
+{
+ up_read(&mapping->invalidate_lock);
+}
+
void lock_two_nondirectories(struct inode *, struct inode*);
void unlock_two_nondirectories(struct inode *, struct inode*);
+void filemap_invalidate_lock_two(struct address_space *mapping1,
+ struct address_space *mapping2);
+void filemap_invalidate_unlock_two(struct address_space *mapping1,
+ struct address_space *mapping2);
+
+
/*
* NOTE: in a 32bit arch with a preemptable kernel and
* an UP compile the i_size_read/write must be atomic
/* Number of inodes with nlink == 0 but still referenced */
atomic_long_t s_remove_count;
- /* Pending fsnotify inode refs */
- atomic_long_t s_fsnotify_inode_refs;
+ /*
+ * Number of inode/mount/sb objects that are being watched, note that
+ * inode objects are currently double-accounted.
+ */
+ atomic_long_t s_fsnotify_connectors;
/* Being remounted read-only */
int s_readonly_remount;
extern int file_modified(struct file *file);
- int sync_inode(struct inode *inode, struct writeback_control *wbc);
int sync_inode_metadata(struct inode *inode, int wait);
struct file_system_type {
struct lock_class_key i_lock_key;
struct lock_class_key i_mutex_key;
+ struct lock_class_key invalidate_lock_key;
struct lock_class_key i_mutex_dir_key;
};
#define MAX_RW_COUNT (INT_MAX & PAGE_MASK)
-#ifdef CONFIG_MANDATORY_FILE_LOCKING
-extern int locks_mandatory_locked(struct file *);
-extern int locks_mandatory_area(struct inode *, struct file *, loff_t, loff_t, unsigned char);
-
-/*
- * Candidates for mandatory locking have the setgid bit set
- * but no group execute bit - an otherwise meaningless combination.
- */
-
-static inline int __mandatory_lock(struct inode *ino)
-{
- return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID;
-}
-
-/*
- * ... and these candidates should be on SB_MANDLOCK mounted fs,
- * otherwise these will be advisory locks
- */
-
-static inline int mandatory_lock(struct inode *ino)
-{
- return IS_MANDLOCK(ino) && __mandatory_lock(ino);
-}
-
-static inline int locks_verify_locked(struct file *file)
-{
- if (mandatory_lock(locks_inode(file)))
- return locks_mandatory_locked(file);
- return 0;
-}
-
-static inline int locks_verify_truncate(struct inode *inode,
- struct file *f,
- loff_t size)
-{
- if (!inode->i_flctx || !mandatory_lock(inode))
- return 0;
-
- if (size < inode->i_size) {
- return locks_mandatory_area(inode, f, size, inode->i_size - 1,
- F_WRLCK);
- } else {
- return locks_mandatory_area(inode, f, inode->i_size, size - 1,
- F_WRLCK);
- }
-}
-
-#else /* !CONFIG_MANDATORY_FILE_LOCKING */
-
-static inline int locks_mandatory_locked(struct file *file)
-{
- return 0;
-}
-
-static inline int locks_mandatory_area(struct inode *inode, struct file *filp,
- loff_t start, loff_t end, unsigned char type)
-{
- return 0;
-}
-
-static inline int __mandatory_lock(struct inode *inode)
-{
- return 0;
-}
-
-static inline int mandatory_lock(struct inode *inode)
-{
- return 0;
-}
-
-static inline int locks_verify_locked(struct file *file)
-{
- return 0;
-}
-
-static inline int locks_verify_truncate(struct inode *inode, struct file *filp,
- size_t size)
-{
- return 0;
-}
-
-#endif /* CONFIG_MANDATORY_FILE_LOCKING */
-
-
#ifdef CONFIG_FILE_LOCKING
static inline int break_lease(struct inode *inode, unsigned int mode)
{
extern int filp_close(struct file *, fl_owner_t id);
extern struct filename *getname_flags(const char __user *, int, int *);
+extern struct filename *getname_uflags(const char __user *, int);
extern struct filename *getname(const char __user *);
extern struct filename *getname_kernel(const char *);
extern void putname(struct filename *name);
loff_t start, loff_t end);
extern int filemap_check_errors(struct address_space *mapping);
extern void __filemap_set_wb_err(struct address_space *mapping, int err);
+ int filemap_fdatawrite_wbc(struct address_space *mapping,
+ struct writeback_control *wbc);
static inline int filemap_write_and_wait(struct address_space *mapping)
{
ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
struct iov_iter *iter);
-/* fs/block_dev.c */
-extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
- int datasync);
-
/* fs/splice.c */
extern ssize_t generic_file_splice_read(struct file *, loff_t *,
struct pipe_inode_info *, size_t, unsigned int);
* ->swap_lock (exclusive_swap_page, others)
* ->i_pages lock
*
- * ->i_mutex
- * ->i_mmap_rwsem (truncate->unmap_mapping_range)
+ * ->i_rwsem
+ * ->invalidate_lock (acquired by fs in truncate path)
+ * ->i_mmap_rwsem (truncate->unmap_mapping_range)
*
* ->mmap_lock
* ->i_mmap_rwsem
* ->i_pages lock (arch-dependent flush_dcache_mmap_lock)
*
* ->mmap_lock
- * ->lock_page (access_process_vm)
+ * ->invalidate_lock (filemap_fault)
+ * ->lock_page (filemap_fault, access_process_vm)
*
- * ->i_mutex (generic_perform_write)
+ * ->i_rwsem (generic_perform_write)
* ->mmap_lock (fault_in_pages_readable->do_page_fault)
*
* bdi->wb.list_lock
return 0;
}
+ /**
+ * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
+ * @mapping: address space structure to write
+ * @wbc: the writeback_control controlling the writeout
+ *
+ * Call writepages on the mapping using the provided wbc to control the
+ * writeout.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+ int filemap_fdatawrite_wbc(struct address_space *mapping,
+ struct writeback_control *wbc)
+ {
+ int ret;
+
+ if (!mapping_can_writeback(mapping) ||
+ !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+ return 0;
+
+ wbc_attach_fdatawrite_inode(wbc, mapping->host);
+ ret = do_writepages(mapping, wbc);
+ wbc_detach_inode(wbc);
+ return ret;
+ }
+ EXPORT_SYMBOL(filemap_fdatawrite_wbc);
+
/**
* __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
* @mapping: address space structure to write
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
loff_t end, int sync_mode)
{
- int ret;
struct writeback_control wbc = {
.sync_mode = sync_mode,
.nr_to_write = LONG_MAX,
.range_end = end,
};
- if (!mapping_can_writeback(mapping) ||
- !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
- return 0;
-
- wbc_attach_fdatawrite_inode(&wbc, mapping->host);
- ret = do_writepages(mapping, &wbc);
- wbc_detach_inode(&wbc);
- return ret;
+ return filemap_fdatawrite_wbc(mapping, &wbc);
}
static inline int __filemap_fdatawrite(struct address_space *mapping,
EXPORT_SYMBOL(__page_cache_alloc);
#endif
+/**
+ * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
+ * @mapping1: the first mapping to lock
+ * @mapping2: the second mapping to lock
+ *
+ * Exclusively lock the invalidate_lock of any passed mapping that is not
+ * NULL.
+ */
+void filemap_invalidate_lock_two(struct address_space *mapping1,
+ struct address_space *mapping2)
+{
+ if (mapping1 > mapping2)
+ swap(mapping1, mapping2);
+ if (mapping1)
+ down_write(&mapping1->invalidate_lock);
+ if (mapping2 && mapping1 != mapping2)
+ down_write_nested(&mapping2->invalidate_lock, 1);
+}
+EXPORT_SYMBOL(filemap_invalidate_lock_two);
+
+/**
+ * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
+ * @mapping1: the first mapping to unlock
+ * @mapping2: the second mapping to unlock
+ *
+ * Unlock the exclusively-held invalidate_lock of any passed mapping that is
+ * not NULL.
+ */
+void filemap_invalidate_unlock_two(struct address_space *mapping1,
+ struct address_space *mapping2)
+{
+ if (mapping1)
+ up_write(&mapping1->invalidate_lock);
+ if (mapping2 && mapping1 != mapping2)
+ up_write(&mapping2->invalidate_lock);
+}
+EXPORT_SYMBOL(filemap_invalidate_unlock_two);
+
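A minimal sketch of how a two-inode operation might use the new pair. example_remap_blocks() and its body are hypothetical; only the filemap_invalidate_{lock,unlock}_two() calls come from the code above.

#include <linux/fs.h>
#include <linux/pagemap.h>

/* Hypothetical cross-file operation (e.g. block remap/dedupe) that must
 * keep page faults and readahead out of both mappings while it runs. */
static int example_remap_blocks(struct inode *src, struct inode *dst)
{
	int ret = 0;

	filemap_invalidate_lock_two(src->i_mapping, dst->i_mapping);
	/* ... move or share blocks between src and dst ... */
	filemap_invalidate_unlock_two(src->i_mapping, dst->i_mapping);
	return ret;
}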
/*
* In order to wait for pages to become available there must be
* waitqueues associated with pages. By using a hash table of
{
int error;
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!filemap_invalidate_trylock_shared(mapping))
+ return -EAGAIN;
+ } else {
+ filemap_invalidate_lock_shared(mapping);
+ }
+
if (!trylock_page(page)) {
+ error = -EAGAIN;
if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
- return -EAGAIN;
+ goto unlock_mapping;
if (!(iocb->ki_flags & IOCB_WAITQ)) {
+ filemap_invalidate_unlock_shared(mapping);
put_and_wait_on_page_locked(page, TASK_KILLABLE);
return AOP_TRUNCATED_PAGE;
}
error = __lock_page_async(page, iocb->ki_waitq);
if (error)
- return error;
+ goto unlock_mapping;
}
+ error = AOP_TRUNCATED_PAGE;
if (!page->mapping)
- goto truncated;
+ goto unlock;
error = 0;
if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, page))
goto unlock;
error = filemap_read_page(iocb->ki_filp, mapping, page);
- if (error == AOP_TRUNCATED_PAGE)
- put_page(page);
- return error;
-truncated:
- unlock_page(page);
- put_page(page);
- return AOP_TRUNCATED_PAGE;
+ goto unlock_mapping;
unlock:
unlock_page(page);
+unlock_mapping:
+ filemap_invalidate_unlock_shared(mapping);
+ if (error == AOP_TRUNCATED_PAGE)
+ put_page(page);
return error;
}
if (!page)
return -ENOMEM;
+ /*
+ * Protect against truncate / hole punch. Grabbing invalidate_lock here
+ * ensures we cannot instantiate new pagecache pages and bring them
+ * uptodate after the page cache has been evicted during truncate but
+ * before the blocks are actually freed. Note that we could release
+ * invalidate_lock after inserting the page into the page cache, as the
+ * locked page would then be enough to synchronize with hole punching.
+ * But there are code paths, such as filemap_update_page() filling in
+ * partially uptodate pages or ->readpages() mapping blocks for IO,
+ * that need to hold invalidate_lock while doing so, so hold the lock
+ * here as well to keep the locking rules simple.
+ */
+ filemap_invalidate_lock_shared(mapping);
error = add_to_page_cache_lru(page, mapping, index,
mapping_gfp_constraint(mapping, GFP_KERNEL));
if (error == -EEXIST)
if (error)
goto error;
+ filemap_invalidate_unlock_shared(mapping);
pagevec_add(pvec, page);
return 0;
error:
+ filemap_invalidate_unlock_shared(mapping);
put_page(page);
return error;
}
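The exclusive side of the comment above, as a hedged sketch: example_fs_punch_hole() is hypothetical, and it assumes the exclusive filemap_invalidate_lock()/unlock() helpers from this series. It shows the lock mode a filesystem's hole-punch path would take so that the shared acquisitions in filemap_create_page() and filemap_update_page() cannot repopulate the cache between eviction and block freeing.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/* Hypothetical hole punch: exclusive invalidate_lock keeps the shared
 * lockers above from instantiating pages until the blocks are freed. */
static int example_fs_punch_hole(struct inode *inode, loff_t start, loff_t len)
{
	struct address_space *mapping = inode->i_mapping;

	inode_lock(inode);
	filemap_invalidate_lock(mapping);
	truncate_pagecache_range(inode, start, start + len - 1);
	/* ... free the underlying blocks here ... */
	filemap_invalidate_unlock(mapping);
	inode_unlock(inode);
	return 0;
}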
pgoff_t max_off;
struct page *page;
vm_fault_t ret = 0;
+ bool mapping_locked = false;
max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (unlikely(offset >= max_off))
* Do we have something in the page cache already?
*/
page = find_get_page(mapping, offset);
- if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
+ if (likely(page)) {
/*
- * We found the page, so try async readahead before
- * waiting for the lock.
+ * We found the page, so try async readahead before waiting for
+ * the lock.
*/
- fpin = do_async_mmap_readahead(vmf, page);
- } else if (!page) {
+ if (!(vmf->flags & FAULT_FLAG_TRIED))
+ fpin = do_async_mmap_readahead(vmf, page);
+ if (unlikely(!PageUptodate(page))) {
+ filemap_invalidate_lock_shared(mapping);
+ mapping_locked = true;
+ }
+ } else {
/* No page in the page cache at all */
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
fpin = do_sync_mmap_readahead(vmf);
retry_find:
+ /*
+ * See the comment in filemap_create_page() for why we need
+ * invalidate_lock here.
+ */
+ if (!mapping_locked) {
+ filemap_invalidate_lock_shared(mapping);
+ mapping_locked = true;
+ }
page = pagecache_get_page(mapping, offset,
FGP_CREAT|FGP_FOR_MMAP,
vmf->gfp_mask);
if (!page) {
if (fpin)
goto out_retry;
+ filemap_invalidate_unlock_shared(mapping);
return VM_FAULT_OOM;
}
}
* We have a locked page in the page cache, now we need to check
* that it's up-to-date. If not, it is going to be due to an error.
*/
- if (unlikely(!PageUptodate(page)))
+ if (unlikely(!PageUptodate(page))) {
+ /*
+ * The page was in cache and uptodate and now it is not.
+ * Strange but possible since we didn't hold the page lock all
+ * the time. Let's drop everything, get the invalidate lock and
+ * try again.
+ */
+ if (!mapping_locked) {
+ unlock_page(page);
+ put_page(page);
+ goto retry_find;
+ }
goto page_not_uptodate;
+ }
/*
* We've made it this far and we had to drop our mmap_lock, now is the
unlock_page(page);
goto out_retry;
}
+ if (mapping_locked)
+ filemap_invalidate_unlock_shared(mapping);
/*
* Found the page and have a reference on it.
if (!error || error == AOP_TRUNCATED_PAGE)
goto retry_find;
+ filemap_invalidate_unlock_shared(mapping);
return VM_FAULT_SIGBUS;
*/
if (page)
put_page(page);
+ if (mapping_locked)
+ filemap_invalidate_unlock_shared(mapping);
if (fpin)
fput(fpin);
return ret | VM_FAULT_RETRY;
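For completeness, a hedged sketch of the consumer side: a filesystem that wires filemap_fault() into its vm_operations gets the shared invalidate_lock handling above automatically, provided its truncate and hole-punch paths take the lock exclusively. The example_fs_* names are invented; filemap_fault(), filemap_map_pages() and filemap_page_mkwrite() are the existing generic helpers.

#include <linux/fs.h>
#include <linux/mm.h>

static const struct vm_operations_struct example_fs_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= filemap_page_mkwrite,
};

/* Hypothetical ->mmap, modelled on generic_file_mmap(). */
static int example_fs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);
	vma->vm_ops = &example_fs_file_vm_ops;
	return 0;
}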
*
* If the page does not get brought uptodate, return -EIO.
*
+ * The function expects mapping->invalidate_lock to be already held.
+ *
* Return: up to date page on success, ERR_PTR() on failure.
*/
struct page *read_cache_page(struct address_space *mapping,
*
* If the page does not get brought uptodate, return -EIO.
*
+ * The function expects mapping->invalidate_lock to be already held.
+ *
* Return: up to date page on success, ERR_PTR() on failure.
*/
struct page *read_cache_page_gfp(struct address_space *mapping,
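A hedged sketch of a caller honouring the new locking note in the two kernel-doc comments above. example_read_page() is hypothetical, and it assumes the shared lock mode is sufficient for a cache-filling reader, as it is for filemap_create_page() earlier in this patch.

#include <linux/fs.h>
#include <linux/pagemap.h>

/* Hypothetical reader: hold invalidate_lock (shared) across the cache
 * fill, as the updated kernel-doc above now requires. */
static struct page *example_read_page(struct address_space *mapping,
				      pgoff_t index)
{
	struct page *page;

	filemap_invalidate_lock_shared(mapping);
	page = read_cache_page(mapping, index, NULL, NULL);
	filemap_invalidate_unlock_shared(mapping);
	return page;	/* ERR_PTR() on failure, caller must put_page() */
}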
* modification times and calls proper subroutines depending on whether we
* do direct IO or a standard buffered write.
*
- * It expects i_mutex to be grabbed unless we work on a block device or similar
+ * It expects i_rwsem to be grabbed unless we work on a block device or similar
* object which does not need locking at all.
*
* This function does *not* take care of syncing data in case of O_SYNC write.
* A caller has to handle it. This is mainly due to the fact that we want to
- * avoid syncing under i_mutex.
+ * avoid syncing under i_rwsem.
*
* Return:
* * number of bytes written, even for truncated writes
*
* This is a wrapper around __generic_file_write_iter() to be used by most
* filesystems. It takes care of syncing the file in case of O_SYNC file
- * and acquires i_mutex as needed.
+ * and acquires i_rwsem as needed.
* Return:
* * negative error code if no data has been written at all of
* vfs_fsync_range() failed for a synchronous write
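Finally, a hedged sketch of the calling convention these comments describe: example_fs_write_iter() is invented and modelled on generic_file_write_iter(), taking i_rwsem around __generic_file_write_iter() and doing the O_SYNC handling outside the lock.

#include <linux/fs.h>
#include <linux/uio.h>

/* Hypothetical ->write_iter: i_rwsem held for the write itself, syncing
 * done afterwards so we never sync under i_rwsem. */
static ssize_t example_fs_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = __generic_file_write_iter(iocb, from);
	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}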