#include "blk.h"
#include "blk-rq-qos.h"
+struct bio_alloc_cache {
+ struct bio_list free_list;
+ unsigned int nr;
+};
+
static struct biovec_slab {
int nr_vecs;
char *name;
void bio_init(struct bio *bio, struct bio_vec *table,
unsigned short max_vecs)
{
- memset(bio, 0, sizeof(*bio));
+ bio->bi_next = NULL;
+ bio->bi_bdev = NULL;
+ bio->bi_opf = 0;
+ bio->bi_flags = 0;
+ bio->bi_ioprio = 0;
+ bio->bi_write_hint = 0;
+ bio->bi_status = 0;
+ bio->bi_iter.bi_sector = 0;
+ bio->bi_iter.bi_size = 0;
+ bio->bi_iter.bi_idx = 0;
+ bio->bi_iter.bi_bvec_done = 0;
+ bio->bi_end_io = NULL;
+ bio->bi_private = NULL;
+#ifdef CONFIG_BLK_CGROUP
+ bio->bi_blkg = NULL;
+ bio->bi_issue.value = 0;
+#ifdef CONFIG_BLK_CGROUP_IOCOST
+ bio->bi_iocost_cost = 0;
+#endif
+#endif
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+ bio->bi_crypt_context = NULL;
+#endif
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+ bio->bi_integrity = NULL;
+#endif
+ bio->bi_vcnt = 0;
+
atomic_set(&bio->__bi_remaining, 1);
atomic_set(&bio->__bi_cnt, 1);
- bio->bi_io_vec = table;
bio->bi_max_vecs = max_vecs;
+ bio->bi_io_vec = table;
+ bio->bi_pool = NULL;
}
EXPORT_SYMBOL(bio_init);
void zero_fill_bio(struct bio *bio)
{
- unsigned long flags;
struct bio_vec bv;
struct bvec_iter iter;
- bio_for_each_segment(bv, bio, iter) {
- char *data = bvec_kmap_irq(&bv, &flags);
- memset(data, 0, bv.bv_len);
- flush_dcache_page(bv.bv_page);
- bvec_kunmap_irq(data, &flags);
- }
+ bio_for_each_segment(bv, bio, iter)
+ memzero_bvec(&bv);
}
EXPORT_SYMBOL(zero_fill_bio);
bio_truncate(bio, maxsector << 9);
}
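+/*
+ * Per-cpu cache sizing: once a CPU's free list grows past
+ * ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK entries, bio_put() prunes it by
+ * ALLOC_CACHE_SLACK bios.
+ */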
+#define ALLOC_CACHE_MAX 512
+#define ALLOC_CACHE_SLACK 64
+
+static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
+ unsigned int nr)
+{
+ unsigned int i = 0;
+ struct bio *bio;
+
+ while ((bio = bio_list_pop(&cache->free_list)) != NULL) {
+ cache->nr--;
+ bio_free(bio);
+ if (++i == nr)
+ break;
+ }
+}
+
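+/*
+ * CPU hotplug callback: when a CPU goes away, free every bio still sitting
+ * in its per-cpu cache so they are not leaked.
+ */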
+static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node)
+{
+ struct bio_set *bs;
+
+ bs = hlist_entry_safe(node, struct bio_set, cpuhp_dead);
+ if (bs->cache) {
+ struct bio_alloc_cache *cache = per_cpu_ptr(bs->cache, cpu);
+
+ bio_alloc_cache_prune(cache, -1U);
+ }
+ return 0;
+}
+
+static void bio_alloc_cache_destroy(struct bio_set *bs)
+{
+ int cpu;
+
+ if (!bs->cache)
+ return;
+
+ cpuhp_state_remove_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead);
+ for_each_possible_cpu(cpu) {
+ struct bio_alloc_cache *cache;
+
+ cache = per_cpu_ptr(bs->cache, cpu);
+ bio_alloc_cache_prune(cache, -1U);
+ }
+ free_percpu(bs->cache);
+}
+
/**
* bio_put - release a reference to a bio
* @bio: bio to release reference to
**/
void bio_put(struct bio *bio)
{
- if (!bio_flagged(bio, BIO_REFFED))
- bio_free(bio);
- else {
+ if (unlikely(bio_flagged(bio, BIO_REFFED))) {
BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
+ if (!atomic_dec_and_test(&bio->__bi_cnt))
+ return;
+ }
- /*
- * last put frees it
- */
- if (atomic_dec_and_test(&bio->__bi_cnt))
- bio_free(bio);
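+	/*
+	 * Cached bios are returned to the local free list instead of being
+	 * freed. The final put of a BIO_PERCPU_CACHE bio must come from
+	 * process context (see bio_alloc_kiocb()), so get_cpu()/put_cpu()
+	 * is sufficient protection here.
+	 */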
+ if (bio_flagged(bio, BIO_PERCPU_CACHE)) {
+ struct bio_alloc_cache *cache;
+
+ bio_uninit(bio);
+ cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
+ bio_list_add_head(&cache->free_list, bio);
+ if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK)
+ bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK);
+ put_cpu();
+ } else {
+ bio_free(bio);
}
}
EXPORT_SYMBOL(bio_put);
return 0;
}
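+/*
+ * Release the page references still held on pages that could not be added
+ * to the bio, so an error partway through does not leak them.
+ */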
+static void bio_put_pages(struct page **pages, size_t size, size_t off)
+{
+ size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
+
+ for (i = 0; i < nr; i++)
+ put_page(pages[i]);
+}
+
#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *))
/**
if (same_page)
put_page(page);
} else {
- if (WARN_ON_ONCE(bio_full(bio, len)))
- return -EINVAL;
+ if (WARN_ON_ONCE(bio_full(bio, len))) {
+ bio_put_pages(pages + i, left, offset);
+ return -EINVAL;
+ }
__bio_add_page(bio, page, len, offset);
}
offset = 0;
len = min_t(size_t, PAGE_SIZE - offset, left);
if (bio_add_hw_page(q, bio, page, len, offset,
max_append_sectors, &same_page) != len) {
+ bio_put_pages(pages + i, left, offset);
ret = -EINVAL;
break;
}
void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
struct bio *src, struct bvec_iter *src_iter)
{
- struct bio_vec src_bv, dst_bv;
- void *src_p, *dst_p;
- unsigned bytes;
-
while (src_iter->bi_size && dst_iter->bi_size) {
- src_bv = bio_iter_iovec(src, *src_iter);
- dst_bv = bio_iter_iovec(dst, *dst_iter);
-
- bytes = min(src_bv.bv_len, dst_bv.bv_len);
+ struct bio_vec src_bv = bio_iter_iovec(src, *src_iter);
+ struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter);
+ unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len);
+ void *src_buf;
- src_p = kmap_atomic(src_bv.bv_page);
- dst_p = kmap_atomic(dst_bv.bv_page);
-
- memcpy(dst_p + dst_bv.bv_offset,
- src_p + src_bv.bv_offset,
- bytes);
-
- kunmap_atomic(dst_p);
- kunmap_atomic(src_p);
-
- flush_dcache_page(dst_bv.bv_page);
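+		/*
+		 * Only the source needs an explicit mapping here;
+		 * memcpy_to_bvec() maps the destination page itself and
+		 * takes care of the dcache flush.
+		 */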
+ src_buf = bvec_kmap_local(&src_bv);
+ memcpy_to_bvec(&dst_bv, src_buf);
+ kunmap_local(src_buf);
bio_advance_iter_single(src, src_iter, bytes);
bio_advance_iter_single(dst, dst_iter, bytes);
* @bio: bio to trim
* @offset: number of sectors to trim from the front of @bio
* @size: size we want to trim @bio to, in sectors
+ *
+ * This function is typically used for bios that are cloned and submitted
+ * to the underlying device in parts.
*/
- void bio_trim(struct bio *bio, int offset, int size)
+ void bio_trim(struct bio *bio, sector_t offset, sector_t size)
{
- /* 'bio' is a cloned bio which we need to trim to match
- * the given offset and size.
- */
+ if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS ||
+ offset + size > bio_sectors(bio)))
+ return;
size <<= 9;
if (offset == 0 && size == bio->bi_iter.bi_size)
if (bio_integrity(bio))
bio_integrity_trim(bio);
-
}
EXPORT_SYMBOL_GPL(bio_trim);
*/
void bioset_exit(struct bio_set *bs)
{
+ bio_alloc_cache_destroy(bs);
if (bs->rescue_workqueue)
destroy_workqueue(bs->rescue_workqueue);
bs->rescue_workqueue = NULL;
biovec_init_pool(&bs->bvec_pool, pool_size))
goto bad;
- if (!(flags & BIOSET_NEED_RESCUER))
- return 0;
-
- bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
- if (!bs->rescue_workqueue)
- goto bad;
+ if (flags & BIOSET_NEED_RESCUER) {
+ bs->rescue_workqueue = alloc_workqueue("bioset",
+ WQ_MEM_RECLAIM, 0);
+ if (!bs->rescue_workqueue)
+ goto bad;
+ }
+ if (flags & BIOSET_PERCPU_CACHE) {
+ bs->cache = alloc_percpu(struct bio_alloc_cache);
+ if (!bs->cache)
+ goto bad;
+ cpuhp_state_add_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead);
+ }
return 0;
bad:
}
EXPORT_SYMBOL(bioset_init_from_src);
+/**
+ * bio_alloc_kiocb - Allocate a bio from bio_set based on kiocb
+ * @kiocb: kiocb describing the IO
+ * @nr_vecs: number of iovecs to pre-allocate
+ * @bs: bio_set to allocate from
+ *
+ * Description:
+ * Like bio_alloc_bioset(), but pass in the kiocb. The kiocb is only
+ * used to check if we should dip into the per-cpu bio_set allocation
+ * cache. The allocation uses GFP_KERNEL internally. On return, the
+ * bio is marked BIO_PERCPU_CACHE, and the final put of the bio
+ * MUST be done from process context, not hard/soft IRQ.
+ *
+ */
+struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs,
+ struct bio_set *bs)
+{
+ struct bio_alloc_cache *cache;
+ struct bio *bio;
+
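+	/*
+	 * Only kiocbs flagged IOCB_ALLOC_CACHE and needing no more than the
+	 * inline vectors can use the cache; anything else goes through the
+	 * regular bio_set allocation path.
+	 */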
+ if (!(kiocb->ki_flags & IOCB_ALLOC_CACHE) || nr_vecs > BIO_INLINE_VECS)
+ return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);
+
+ cache = per_cpu_ptr(bs->cache, get_cpu());
+ bio = bio_list_pop(&cache->free_list);
+ if (bio) {
+ cache->nr--;
+ put_cpu();
+ bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs);
+ bio->bi_pool = bs;
+ bio_set_flag(bio, BIO_PERCPU_CACHE);
+ return bio;
+ }
+ put_cpu();
+ bio = bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);
+ bio_set_flag(bio, BIO_PERCPU_CACHE);
+ return bio;
+}
+EXPORT_SYMBOL_GPL(bio_alloc_kiocb);
+
static int __init init_bio(void)
{
int i;
SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
}
+ cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL,
+ bio_cpu_dead);
+
if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS))
panic("bio: can't allocate bios\n");
p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
- /* No mandatory locks */
- if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
- return -ENOLCK;
-
if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
filemap_write_and_wait(inode->i_mapping);
invalidate_mapping_pages(&inode->i_data, 0, -1);
p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
filp, cmd, fl, filp);
- /* No mandatory locks */
- if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
- goto out_err;
-
if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
filemap_write_and_wait(inode->i_mapping);
invalidate_mapping_pages(&inode->i_data, 0, -1);
ret = v9fs_file_getlock(filp, fl);
else
ret = -EINVAL;
-out_err:
return ret;
}
p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
filp, cmd, fl, filp);
- /* No mandatory locks */
- if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
- goto out_err;
-
if (!(fl->fl_flags & FL_FLOCK))
goto out_err;
p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma);
inode = file_inode(vma->vm_file);
-
- if (!mapping_can_writeback(inode->i_mapping))
- wbc.nr_to_write = 0;
-
- might_sleep();
- sync_inode(inode, &wbc);
+ filemap_fdatawrite_wbc(inode->i_mapping, &wbc);
}
#include <linux/sched/mm.h>
#include <linux/iomap.h>
#include <asm/unaligned.h>
+ #include <linux/fsverity.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
cur_size = min_t(unsigned long, compressed_size,
PAGE_SIZE);
- kaddr = kmap_atomic(cpage);
+ kaddr = page_address(cpage);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
- kunmap_atomic(kaddr);
i++;
ptr += cur_size;
*/
static inline bool inode_can_compress(struct btrfs_inode *inode)
{
+ /* Subpage doesn't support compression yet */
+ if (inode->root->fs_info->sectorsize < PAGE_SIZE)
+ return false;
if (inode->flags & BTRFS_INODE_NODATACOW ||
inode->flags & BTRFS_INODE_NODATASUM)
return false;
* inode has not been flagged as nocompress. This flag can
* change at any time if we discover bad compression ratios.
*/
- if (nr_pages > 1 && inode_need_compress(BTRFS_I(inode), start, end)) {
+ if (inode_need_compress(BTRFS_I(inode), start, end)) {
WARN_ON(pages);
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
}
}
cont:
- if (start == 0) {
+ /*
+ * Check cow_file_range() for why we don't even try to create inline
+ * extent for subpage case.
+ */
+ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
/* lets try to make an inline extent */
if (ret || total_in < actual_end) {
/* we didn't compress the entire range, try
p->mapping = inode->vfs_inode.i_mapping;
btrfs_writepage_endio_finish_ordered(inode, p, start,
- end, 0);
+ end, false);
p->mapping = NULL;
extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
- if (start == 0) {
+ /*
+ * Due to the page size limit, for subpage we can only trigger the
+ * writeback for the dirty sectors of page, that means data writeback
+ * is doing more writeback than what we want.
+ *
+ * This is especially unexpected for some call sites like fallocate,
+ * where we only increase i_size after everything is done.
+ * This means we can trigger inline extent even if we didn't want to.
+ * So here we skip inline extent creation completely.
+ */
+ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
/* lets try to make an inline extent */
ret = cow_file_range_inline(inode, start, end, 0,
BTRFS_COMPRESS_NONE, NULL);
nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
PAGE_SHIFT;
- /* atomic_sub_return implies a barrier */
- if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
- 5 * SZ_1M)
- cond_wake_up_nomb(&fs_info->async_submit_wait);
-
/*
* ->inode could be NULL if async_chunk_start has failed to compress,
* in which case we don't have anything to submit, yet we need to
*/
if (async_chunk->inode)
submit_compressed_extents(async_chunk);
+
+ /* atomic_sub_return implies a barrier */
+ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
+ 5 * SZ_1M)
+ cond_wake_up_nomb(&fs_info->async_submit_wait);
}
static noinline void async_cow_free(struct btrfs_work *work)
ret = cow_file_range_async(inode, wbc, locked_page, start, end,
page_started, nr_written);
}
+ ASSERT(ret <= 0);
if (ret)
btrfs_cleanup_ordered_extents(inode, locked_page, start,
end - start + 1);
struct extent_map *split_mid = NULL;
struct extent_map *split_post = NULL;
int ret = 0;
- int modified;
unsigned long flags;
/* Sanity check */
ASSERT(em->len == len);
ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
+ ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+ ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
+ ASSERT(!list_empty(&em->list));
flags = em->flags;
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
- clear_bit(EXTENT_FLAG_LOGGING, &flags);
- modified = !list_empty(&em->list);
/* First, replace the em with a new extent_map starting from * em->start */
split_pre->start = em->start;
split_pre->compress_type = em->compress_type;
split_pre->generation = em->generation;
- replace_extent_mapping(em_tree, em, split_pre, modified);
+ replace_extent_mapping(em_tree, em, split_pre, 1);
/*
* Now we only have an extent_map at:
split_mid->flags = flags;
split_mid->compress_type = em->compress_type;
split_mid->generation = em->generation;
- add_extent_mapping(em_tree, split_mid, modified);
+ add_extent_mapping(em_tree, split_mid, 1);
}
if (post) {
split_post->flags = flags;
split_post->compress_type = em->compress_type;
split_post->generation = em->generation;
- add_extent_mapping(em_tree, split_post, modified);
+ add_extent_mapping(em_tree, split_post, 1);
}
/* Once for us */
* to fix it up. The async helper will wait for ordered extents, set
* the delalloc bit and make it safe to write the page.
*/
- int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
+ int btrfs_writepage_cow_fixup(struct page *page)
{
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
- u64 end, int uptodate)
+ u64 end, bool uptodate)
{
trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
return 0;
}
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+ /*
+ * For subpage case, above PageChecked is not safe as it's not subpage
+ * compatible.
+ * But for now only cow fixup and compressed read utilize PageChecked
+ * flag, while in this context we can easily use io_bio->csum to
+ * determine if we really need to do csum verification.
+ *
+ * So for now, just exit if io_bio->csum is NULL, as it means it's
+ * compressed read, and its compressed data csum has already been
+ * verified.
+ */
+ if (io_bio->csum == NULL)
return 0;
- if (!root->fs_info->csum_root)
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
return 0;
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
- test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
- clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
+ if (!root->fs_info->csum_root)
return 0;
- }
ASSERT(page_offset(page) <= start &&
end <= page_offset(page) + PAGE_SIZE - 1);
for (pg_off = offset_in_page(start);
pg_off < offset_in_page(end);
pg_off += sectorsize, bio_offset += sectorsize) {
+ u64 file_offset = pg_off + page_offset(page);
int ret;
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ test_range_bit(io_tree, file_offset,
+ file_offset + sectorsize - 1,
+ EXTENT_NODATASUM, 1, NULL)) {
+ /* Skip the range without csum for data reloc inode */
+ clear_extent_bits(io_tree, file_offset,
+ file_offset + sectorsize - 1,
+ EXTENT_NODATASUM);
+ continue;
+ }
ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
page_offset(page) + pg_off);
if (ret < 0) {
/*
* If we have an inode with links, there are a couple of
- * possibilities. Old kernels (before v3.12) used to create an
+ * possibilities:
+ *
+ * 1. We were halfway through creating fsverity metadata for the
+ * file. In that case, the orphan item represents incomplete
+ * fsverity metadata which must be cleaned up with
+ * btrfs_drop_verity_items and deleting the orphan item.
+ *
+ * 2. Old kernels (before v3.12) used to create an
* orphan item for truncate indicating that there were possibly
* extent items past i_size that needed to be deleted. In v3.12,
* truncate was changed to update i_size in sync with the extent
* but either way, we can delete the orphan item.
*/
if (ret == -ENOENT || inode->i_nlink) {
- if (!ret)
+ if (!ret) {
+ ret = btrfs_drop_verity_items(BTRFS_I(inode));
iput(inode);
+ if (ret)
+ goto out;
+ }
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
rdev = btrfs_inode_rdev(leaf, inode_item);
BTRFS_I(inode)->index_cnt = (u64)-1;
- BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+ btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
+ &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
cache_index:
/*
struct inode *inode)
{
struct btrfs_map_token token;
+ u64 flags;
btrfs_init_map_token(&token, leaf);
btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
btrfs_set_token_inode_transid(&token, item, trans->transid);
btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
- btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+ flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+ BTRFS_I(inode)->ro_flags);
+ btrfs_set_token_inode_flags(&token, item, flags);
btrfs_set_token_inode_block_group(&token, item, 0);
}
int ret;
/*
- * Still need to make sure the inode looks like it's been updated so
- * that any holes get logged if we fsync.
+ * If NO_HOLES is enabled, we don't need to do anything.
+ * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
+ * or btrfs_update_inode() will be called, which guarantees that the next
+ * fsync will know this inode was changed and needs to be logged.
*/
- if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
- inode->last_trans = fs_info->generation;
- inode->last_sub_trans = root->log_transid;
- inode->last_log_commit = root->last_log_commit;
+ if (btrfs_fs_incompat(fs_info, NO_HOLES))
return 0;
- }
/*
* 1 - for the one we're dropping
if (btrfs_root_readonly(root))
return -EROFS;
- err = setattr_prepare(&init_user_ns, dentry, attr);
+ err = setattr_prepare(mnt_userns, dentry, attr);
if (err)
return err;
}
if (attr->ia_valid) {
- setattr_copy(&init_user_ns, inode, attr);
+ setattr_copy(mnt_userns, inode, attr);
inode_inc_iversion(inode);
err = btrfs_dirty_inode(inode);
if (!err && attr->ia_valid & ATTR_MODE)
- err = posix_acl_chmod(&init_user_ns, inode,
- inode->i_mode);
+ err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
}
return err;
trace_btrfs_inode_evict(inode);
if (!root) {
+ fsverity_cleanup_inode(inode);
clear_inode(inode);
return;
}
* to retry these periodically in the future.
*/
btrfs_remove_delayed_node(BTRFS_I(inode));
+ fsverity_cleanup_inode(inode);
clear_inode(inode);
}
static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
+ struct user_namespace *mnt_userns,
struct inode *dir,
const char *name, int name_len,
u64 ref_objectid, u64 objectid,
if (ret != 0)
goto fail_unlock;
- inode_init_owner(&init_user_ns, inode, dir, mode);
+ inode_init_owner(mnt_userns, inode, dir, mode);
inode_set_bytes(inode, 0);
inode->i_mtime = current_time(inode);
if (err)
goto out_unlock;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
- mode, &index);
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
inode = NULL;
if (err)
goto out_unlock;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
- mode, &index);
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
inode = NULL;
if (err)
goto out_fail;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid,
S_IFDIR | mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
u64 start_sector;
int async_submit = 0;
u64 submit_len;
- int clone_offset = 0;
- int clone_len;
+ u64 clone_offset = 0;
+ u64 clone_len;
u64 logical;
int ret;
blk_status_t status;
status = errno_to_blk_status(ret);
goto out_err_em;
}
- ASSERT(geom.len <= INT_MAX);
- clone_len = min_t(int, submit_len, geom.len);
+ clone_len = min(submit_len, geom.len);
+ ASSERT(clone_len <= UINT_MAX);
/*
* This will never fail as it's passing GPF_NOFS and
extent_readahead(rac);
}
+ /*
+ * For releasepage() and invalidatepage() we have a race window where
+ * end_page_writeback() is called but the subpage spinlock is not yet released.
+ * If we continue to release/invalidate the page, we could cause a
+ * use-after-free on the subpage spinlock. So this function spins and waits
+ * until the subpage spinlock is released.
+ */
+ static void wait_subpage_spinlock(struct page *page)
+ {
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct btrfs_subpage *subpage;
+
+ if (fs_info->sectorsize == PAGE_SIZE)
+ return;
+
+ ASSERT(PagePrivate(page) && page->private);
+ subpage = (struct btrfs_subpage *)page->private;
+
+ /*
+ * This may look insane as we just acquire the spinlock and release it,
+ * without doing anything. But we just want to make sure no one is
+ * still holding the subpage spinlock.
+ * And since the page is neither dirty nor under writeback, and we hold the
+ * page lock, the only possible holder of the spinlock is the endio function
+ * clearing page writeback.
+ *
+ * Here we just acquire the spinlock so that all existing callers
+ * should exit and we're safe to release/invalidate the page.
+ */
+ spin_lock_irq(&subpage->lock);
+ spin_unlock_irq(&subpage->lock);
+ }
+
static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
int ret = try_release_extent_mapping(page, gfp_flags);
- if (ret == 1)
+
+ if (ret == 1) {
+ wait_subpage_spinlock(page);
clear_page_extent_mapped(page);
+ }
return ret;
}
* do double ordered extent accounting on the same page.
*/
wait_on_page_writeback(page);
+ wait_subpage_spinlock(page);
/*
* For subpage case, we have call sites like
spin_unlock_irq(&inode->ordered_tree.lock);
if (btrfs_dec_test_ordered_pending(inode, &ordered,
- cur, range_end + 1 - cur, 1)) {
+ cur, range_end + 1 - cur)) {
btrfs_finish_ordered_io(ordered);
/*
* The ordered extent has finished, now we're again
*/
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
- struct btrfs_root *parent_root)
+ struct btrfs_root *parent_root,
+ struct user_namespace *mnt_userns)
{
struct inode *inode;
int err;
if (err < 0)
return err;
- inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, ino, ino,
+ inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2,
+ ino, ino,
S_IFDIR | (~current_umask() & S_IRWXUGO),
&index);
if (IS_ERR(inode))
ei->defrag_bytes = 0;
ei->disk_i_size = 0;
ei->flags = 0;
+ ei->ro_flags = 0;
ei->csum_bytes = 0;
ei->index_cnt = (u64)-1;
ei->dir_index = 0;
struct inode *inode = d_inode(path->dentry);
u32 blocksize = inode->i_sb->s_blocksize;
u32 bi_flags = BTRFS_I(inode)->flags;
+ u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
stat->result_mask |= STATX_BTIME;
stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
stat->attributes |= STATX_ATTR_IMMUTABLE;
if (bi_flags & BTRFS_INODE_NODUMP)
stat->attributes |= STATX_ATTR_NODUMP;
+ if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
+ stat->attributes |= STATX_ATTR_VERITY;
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_COMPRESSED |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
- generic_fillattr(&init_user_ns, inode, stat);
+ generic_fillattr(mnt_userns, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
spin_lock(&BTRFS_I(inode)->lock);
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(root);
- root_log_pinned = true;
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(dest);
- dest_log_pinned = true;
ret = btrfs_insert_inode_ref(trans, root,
old_dentry->d_name.name,
old_dentry->d_name.len,
BTRFS_I(new_inode), 1);
}
+ /*
+ * Now pin the logs of the roots. We do it to ensure that no other task
+ * can sync the logs while we are in progress with the rename, because
+ * that could result in an inconsistency in case any of the inodes that
+ * are part of this rename operation were logged before.
+ *
+ * We pin the logs even if at this precise moment none of the inodes was
+ * logged before. This is because right after we checked for that, some
+ * other task fsyncing some other inode not involved with this rename
+ * operation could log that one of our inodes exists.
+ *
+ * We don't need to pin the logs before the above calls to
+ * btrfs_insert_inode_ref(), since those don't ever need to change a log.
+ */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_pin_log_trans(root);
+ root_log_pinned = true;
+ }
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_pin_log_trans(dest);
+ dest_log_pinned = true;
+ }
+
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
- (new_inode &&
- btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
+ btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))
btrfs_set_log_full_commit(trans);
if (root_log_pinned) {
static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
+ struct user_namespace *mnt_userns,
struct inode *dir,
struct dentry *dentry)
{
if (ret)
return ret;
- inode = btrfs_new_inode(trans, root, dir,
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
dentry->d_name.name,
dentry->d_name.len,
btrfs_ino(BTRFS_I(dir)),
return ret;
}
- static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+ static int btrfs_rename(struct user_namespace *mnt_userns,
+ struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
struct btrfs_trans_handle *trans;
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(root);
- log_pinned = true;
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else {
+ /*
+ * Now pin the log. We do it to ensure that no other task can
+ * sync the log while we are in progress with the rename, as
+ * that could result in an inconsistency in case any of the
+ * inodes that are part of this rename operation were logged
+ * before.
+ *
+ * We pin the log even if at this precise moment none of the
+ * inodes was logged before. This is because right after we
+ * checked for that, some other task fsyncing some other inode
+ * not involved with this rename operation could log that one of
+ * our inodes exists.
+ *
+ * We don't need to pin the logs before the above call to
+ * btrfs_insert_inode_ref(), since that does not need to change
+ * a log.
+ */
+ btrfs_pin_log_trans(root);
+ log_pinned = true;
ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
old_dentry->d_name.name,
}
if (flags & RENAME_WHITEOUT) {
- ret = btrfs_whiteout_for_rename(trans, root, old_dir,
- old_dentry);
+ ret = btrfs_whiteout_for_rename(trans, root, mnt_userns,
+ old_dir, old_dentry);
if (ret) {
btrfs_abort_transaction(trans, ret);
return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
new_dentry);
- return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
+ return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
+ new_dentry, flags);
}
struct btrfs_delalloc_work {
btrfs_queue_work(root->fs_info->flush_workers,
&work->work);
} else {
- ret = sync_inode(inode, wbc);
- if (!ret &&
- test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- ret = sync_inode(inode, wbc);
+ ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
btrfs_add_delayed_iput(inode);
if (ret || wbc->nr_to_write <= 0)
goto out;
if (err)
goto out_unlock;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)),
- objectid, S_IFLNK|S_IRWXUGO, &index);
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid,
+ S_IFLNK | S_IRWXUGO, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
inode = NULL;
if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
return -EACCES;
}
- return generic_permission(&init_user_ns, inode, mask);
+ return generic_permission(mnt_userns, inode, mask);
}
static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
if (ret)
goto out;
- inode = btrfs_new_inode(trans, root, dir, NULL, 0,
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0,
btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
return result;
}
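+/*
+ * Variant of getname() for callers that accept AT_EMPTY_PATH: the flag is
+ * translated into LOOKUP_EMPTY so that empty pathnames are allowed.
+ */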
+struct filename *
+getname_uflags(const char __user *filename, int uflags)
+{
+ int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
+
+ return getname_flags(filename, flags, NULL);
+}
+
struct filename *
getname(const char __user * filename)
{
void putname(struct filename *name)
{
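+	/* A failed getname() result may be passed in directly; just ignore it. */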
+ if (IS_ERR_OR_NULL(name))
+ return;
+
BUG_ON(name->refcnt <= 0);
if (--name->refcnt > 0)
return err;
}
-int filename_lookup(int dfd, struct filename *name, unsigned flags,
+static int __filename_lookup(int dfd, struct filename *name, unsigned flags,
struct path *path, struct path *root)
{
int retval;
audit_inode(name, path->dentry,
flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0);
restore_nameidata();
+ return retval;
+}
+
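+/* Like __filename_lookup(), but always consumes the filename reference. */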
+int filename_lookup(int dfd, struct filename *name, unsigned flags,
+ struct path *path, struct path *root)
+{
+ int retval = __filename_lookup(dfd, name, flags, path, root);
+
putname(name);
return retval;
}
return err;
}
-static struct filename *filename_parentat(int dfd, struct filename *name,
+static int __filename_parentat(int dfd, struct filename *name,
unsigned int flags, struct path *parent,
struct qstr *last, int *type)
{
struct nameidata nd;
if (IS_ERR(name))
- return name;
+ return PTR_ERR(name);
set_nameidata(&nd, dfd, name, NULL);
retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
if (unlikely(retval == -ECHILD))
*last = nd.last;
*type = nd.last_type;
audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
- } else {
- putname(name);
- name = ERR_PTR(retval);
}
restore_nameidata();
- return name;
+ return retval;
+}
+
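+/* Like __filename_parentat(), but always consumes the filename reference. */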
+static int filename_parentat(int dfd, struct filename *name,
+ unsigned int flags, struct path *parent,
+ struct qstr *last, int *type)
+{
+ int retval = __filename_parentat(dfd, name, flags, parent, last, type);
+
+ putname(name);
+ return retval;
}
/* does lookup, returns the object with parent locked */
struct dentry *kern_path_locked(const char *name, struct path *path)
{
- struct filename *filename;
struct dentry *d;
struct qstr last;
- int type;
+ int type, error;
- filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
+ error = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
&last, &type);
- if (IS_ERR(filename))
- return ERR_CAST(filename);
+ if (error)
+ return ERR_PTR(error);
if (unlikely(type != LAST_NORM)) {
path_put(path);
- putname(filename);
return ERR_PTR(-EINVAL);
}
inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
inode_unlock(path->dentry->d_inode);
path_put(path);
}
- putname(filename);
return d;
}
}
EXPORT_SYMBOL(vfs_path_lookup);
- static int lookup_one_len_common(const char *name, struct dentry *base,
- int len, struct qstr *this)
+ static int lookup_one_common(struct user_namespace *mnt_userns,
+ const char *name, struct dentry *base, int len,
+ struct qstr *this)
{
this->name = name;
this->len = len;
return err;
}
- return inode_permission(&init_user_ns, base->d_inode, MAY_EXEC);
+ return inode_permission(mnt_userns, base->d_inode, MAY_EXEC);
}
/**
WARN_ON_ONCE(!inode_is_locked(base->d_inode));
- err = lookup_one_len_common(name, base, len, &this);
+ err = lookup_one_common(&init_user_ns, name, base, len, &this);
if (err)
return ERR_PTR(err);
WARN_ON_ONCE(!inode_is_locked(base->d_inode));
- err = lookup_one_len_common(name, base, len, &this);
+ err = lookup_one_common(&init_user_ns, name, base, len, &this);
if (err)
return ERR_PTR(err);
}
EXPORT_SYMBOL(lookup_one_len);
+ /**
+ * lookup_one - filesystem helper to lookup single pathname component
+ * @mnt_userns: user namespace of the mount the lookup is performed from
+ * @name: pathname component to lookup
+ * @base: base directory to lookup from
+ * @len: maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
+ */
+ struct dentry *lookup_one(struct user_namespace *mnt_userns, const char *name,
+ struct dentry *base, int len)
+ {
+ struct dentry *dentry;
+ struct qstr this;
+ int err;
+
+ WARN_ON_ONCE(!inode_is_locked(base->d_inode));
+
+ err = lookup_one_common(mnt_userns, name, base, len, &this);
+ if (err)
+ return ERR_PTR(err);
+
+ dentry = lookup_dcache(&this, base, 0);
+ return dentry ? dentry : __lookup_slow(&this, base, 0);
+ }
+ EXPORT_SYMBOL(lookup_one);
+
/**
* lookup_one_len_unlocked - filesystem helper to lookup single pathname component
* @name: pathname component to lookup
int err;
struct dentry *ret;
- err = lookup_one_len_common(name, base, len, &this);
+ err = lookup_one_common(&init_user_ns, name, base, len, &this);
if (err)
return ERR_PTR(err);
- /*
- * Refuse to truncate files with mandatory locks held on them.
- */
- error = locks_verify_locked(filp);
- if (!error)
- error = security_path_truncate(path);
+ error = security_path_truncate(path);
if (!error) {
error = do_truncate(mnt_userns, path->dentry, 0,
ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
return file;
}
-static struct dentry *filename_create(int dfd, struct filename *name,
+static struct dentry *__filename_create(int dfd, struct filename *name,
struct path *path, unsigned int lookup_flags)
{
struct dentry *dentry = ERR_PTR(-EEXIST);
*/
lookup_flags &= LOOKUP_REVAL;
- name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
- if (IS_ERR(name))
- return ERR_CAST(name);
+ error = __filename_parentat(dfd, name, lookup_flags, path, &last, &type);
+ if (error)
+ return ERR_PTR(error);
/*
* Yucky last component or no last component at all?
error = err2;
goto fail;
}
- putname(name);
return dentry;
fail:
dput(dentry);
mnt_drop_write(path->mnt);
out:
path_put(path);
- putname(name);
return dentry;
}
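+/* Like __filename_create(), but always consumes the filename reference. */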
+static inline struct dentry *filename_create(int dfd, struct filename *name,
+ struct path *path, unsigned int lookup_flags)
+{
+ struct dentry *res = __filename_create(dfd, name, path, lookup_flags);
+
+ putname(name);
+ return res;
+}
+
struct dentry *kern_path_create(int dfd, const char *pathname,
struct path *path, unsigned int lookup_flags)
{
}
}
-static long do_mknodat(int dfd, const char __user *filename, umode_t mode,
+static int do_mknodat(int dfd, struct filename *name, umode_t mode,
unsigned int dev)
{
struct user_namespace *mnt_userns;
error = may_mknod(mode);
if (error)
- return error;
+ goto out1;
retry:
- dentry = user_path_create(dfd, filename, &path, lookup_flags);
+ dentry = __filename_create(dfd, name, &path, lookup_flags);
+ error = PTR_ERR(dentry);
if (IS_ERR(dentry))
- return PTR_ERR(dentry);
+ goto out1;
if (!IS_POSIXACL(path.dentry->d_inode))
mode &= ~current_umask();
error = security_path_mknod(&path, dentry, mode, dev);
if (error)
- goto out;
+ goto out2;
mnt_userns = mnt_user_ns(path.mnt);
switch (mode & S_IFMT) {
dentry, mode, 0);
break;
}
-out:
+out2:
done_path_create(&path, dentry);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
+out1:
+ putname(name);
return error;
}
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
unsigned int, dev)
{
- return do_mknodat(dfd, filename, mode, dev);
+ return do_mknodat(dfd, getname(filename), mode, dev);
}
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
- return do_mknodat(AT_FDCWD, filename, mode, dev);
+ return do_mknodat(AT_FDCWD, getname(filename), mode, dev);
}
/**
}
EXPORT_SYMBOL(vfs_mkdir);
-static long do_mkdirat(int dfd, const char __user *pathname, umode_t mode)
+int do_mkdirat(int dfd, struct filename *name, umode_t mode)
{
struct dentry *dentry;
struct path path;
unsigned int lookup_flags = LOOKUP_DIRECTORY;
retry:
- dentry = user_path_create(dfd, pathname, &path, lookup_flags);
+ dentry = __filename_create(dfd, name, &path, lookup_flags);
+ error = PTR_ERR(dentry);
if (IS_ERR(dentry))
- return PTR_ERR(dentry);
+ goto out_putname;
if (!IS_POSIXACL(path.dentry->d_inode))
mode &= ~current_umask();
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
+out_putname:
+ putname(name);
return error;
}
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
- return do_mkdirat(dfd, pathname, mode);
+ return do_mkdirat(dfd, getname(pathname), mode);
}
SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
- return do_mkdirat(AT_FDCWD, pathname, mode);
+ return do_mkdirat(AT_FDCWD, getname(pathname), mode);
}
/**
}
EXPORT_SYMBOL(vfs_rmdir);
-long do_rmdir(int dfd, struct filename *name)
+int do_rmdir(int dfd, struct filename *name)
{
struct user_namespace *mnt_userns;
- int error = 0;
+ int error;
struct dentry *dentry;
struct path path;
struct qstr last;
int type;
unsigned int lookup_flags = 0;
retry:
- name = filename_parentat(dfd, name, lookup_flags,
- &path, &last, &type);
- if (IS_ERR(name))
- return PTR_ERR(name);
+ error = __filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
+ if (error)
+ goto exit1;
switch (type) {
case LAST_DOTDOT:
error = -ENOTEMPTY;
- goto exit1;
+ goto exit2;
case LAST_DOT:
error = -EINVAL;
- goto exit1;
+ goto exit2;
case LAST_ROOT:
error = -EBUSY;
- goto exit1;
+ goto exit2;
}
error = mnt_want_write(path.mnt);
if (error)
- goto exit1;
+ goto exit2;
inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
dentry = __lookup_hash(&last, path.dentry, lookup_flags);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
- goto exit2;
+ goto exit3;
if (!dentry->d_inode) {
error = -ENOENT;
- goto exit3;
+ goto exit4;
}
error = security_path_rmdir(&path, dentry);
if (error)
- goto exit3;
+ goto exit4;
mnt_userns = mnt_user_ns(path.mnt);
error = vfs_rmdir(mnt_userns, path.dentry->d_inode, dentry);
-exit3:
+exit4:
dput(dentry);
-exit2:
+exit3:
inode_unlock(path.dentry->d_inode);
mnt_drop_write(path.mnt);
-exit1:
+exit2:
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
+exit1:
putname(name);
return error;
}
* writeout happening, and we don't want to prevent access to the directory
* while waiting on the I/O.
*/
-long do_unlinkat(int dfd, struct filename *name)
+int do_unlinkat(int dfd, struct filename *name)
{
int error;
struct dentry *dentry;
struct inode *delegated_inode = NULL;
unsigned int lookup_flags = 0;
retry:
- name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
- if (IS_ERR(name))
- return PTR_ERR(name);
+ error = __filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
+ if (error)
+ goto exit1;
error = -EISDIR;
if (type != LAST_NORM)
- goto exit1;
+ goto exit2;
error = mnt_want_write(path.mnt);
if (error)
- goto exit1;
+ goto exit2;
retry_deleg:
inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
dentry = __lookup_hash(&last, path.dentry, lookup_flags);
ihold(inode);
error = security_path_unlink(&path, dentry);
if (error)
- goto exit2;
+ goto exit3;
mnt_userns = mnt_user_ns(path.mnt);
error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry,
&delegated_inode);
-exit2:
+exit3:
dput(dentry);
}
inode_unlock(path.dentry->d_inode);
goto retry_deleg;
}
mnt_drop_write(path.mnt);
-exit1:
+exit2:
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
inode = NULL;
goto retry;
}
+exit1:
putname(name);
return error;
error = -EISDIR;
else
error = -ENOTDIR;
- goto exit2;
+ goto exit3;
}
SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
}
EXPORT_SYMBOL(vfs_symlink);
-static long do_symlinkat(const char __user *oldname, int newdfd,
- const char __user *newname)
+int do_symlinkat(struct filename *from, int newdfd, struct filename *to)
{
int error;
- struct filename *from;
struct dentry *dentry;
struct path path;
unsigned int lookup_flags = 0;
- from = getname(oldname);
- if (IS_ERR(from))
- return PTR_ERR(from);
+ if (IS_ERR(from)) {
+ error = PTR_ERR(from);
+ goto out_putnames;
+ }
retry:
- dentry = user_path_create(newdfd, newname, &path, lookup_flags);
+ dentry = __filename_create(newdfd, to, &path, lookup_flags);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
- goto out_putname;
+ goto out_putnames;
error = security_path_symlink(&path, dentry, from->name);
if (!error) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
-out_putname:
+out_putnames:
+ putname(to);
putname(from);
return error;
}
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
int, newdfd, const char __user *, newname)
{
- return do_symlinkat(oldname, newdfd, newname);
+ return do_symlinkat(getname(oldname), newdfd, getname(newname));
}
SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
{
- return do_symlinkat(oldname, AT_FDCWD, newname);
+ return do_symlinkat(getname(oldname), AT_FDCWD, getname(newname));
}
/**
* with linux 2.0, and to avoid hard-linking to directories
* and other special files. --ADM
*/
-static int do_linkat(int olddfd, const char __user *oldname, int newdfd,
- const char __user *newname, int flags)
+int do_linkat(int olddfd, struct filename *old, int newdfd,
+ struct filename *new, int flags)
{
struct user_namespace *mnt_userns;
struct dentry *new_dentry;
int how = 0;
int error;
- if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
- return -EINVAL;
+ if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) {
+ error = -EINVAL;
+ goto out_putnames;
+ }
/*
* To use null names we require CAP_DAC_READ_SEARCH
* This ensures that not everyone will be able to create
* hard links using the passed file descriptor.
*/
- if (flags & AT_EMPTY_PATH) {
- if (!capable(CAP_DAC_READ_SEARCH))
- return -ENOENT;
- how = LOOKUP_EMPTY;
+ if (flags & AT_EMPTY_PATH && !capable(CAP_DAC_READ_SEARCH)) {
+ error = -ENOENT;
+ goto out_putnames;
}
if (flags & AT_SYMLINK_FOLLOW)
how |= LOOKUP_FOLLOW;
retry:
- error = user_path_at(olddfd, oldname, how, &old_path);
+ error = __filename_lookup(olddfd, old, how, &old_path, NULL);
if (error)
- return error;
+ goto out_putnames;
- new_dentry = user_path_create(newdfd, newname, &new_path,
+ new_dentry = __filename_create(newdfd, new, &new_path,
(how & LOOKUP_REVAL));
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry))
- goto out;
+ goto out_putpath;
error = -EXDEV;
if (old_path.mnt != new_path.mnt)
how |= LOOKUP_REVAL;
goto retry;
}
-out:
+out_putpath:
path_put(&old_path);
+out_putnames:
+ putname(old);
+ putname(new);
return error;
}
SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
int, newdfd, const char __user *, newname, int, flags)
{
- return do_linkat(olddfd, oldname, newdfd, newname, flags);
+ return do_linkat(olddfd, getname_uflags(oldname, flags),
+ newdfd, getname(newname), flags);
}
SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
{
- return do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
+ return do_linkat(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0);
}
/**
int error = -EINVAL;
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
- goto put_both;
+ goto put_names;
if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
(flags & RENAME_EXCHANGE))
- goto put_both;
+ goto put_names;
if (flags & RENAME_EXCHANGE)
target_flags = 0;
retry:
- from = filename_parentat(olddfd, from, lookup_flags, &old_path,
+ error = __filename_parentat(olddfd, from, lookup_flags, &old_path,
&old_last, &old_type);
- if (IS_ERR(from)) {
- error = PTR_ERR(from);
- goto put_new;
- }
+ if (error)
+ goto put_names;
- to = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
+ error = __filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
&new_type);
- if (IS_ERR(to)) {
- error = PTR_ERR(to);
+ if (error)
goto exit1;
- }
error = -EXDEV;
if (old_path.mnt != new_path.mnt)
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
-put_both:
- if (!IS_ERR(from))
- putname(from);
-put_new:
- if (!IS_ERR(to))
- putname(to);
+put_names:
+ putname(from);
+ putname(to);
return error;
}
#ifndef __LINUX_BIO_H
#define __LINUX_BIO_H
-#include <linux/highmem.h>
#include <linux/mempool.h>
#include <linux/ioprio.h>
/* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */
#endif /* CONFIG_BLK_DEV_INTEGRITY */
- extern void bio_trim(struct bio *bio, int offset, int size);
+ void bio_trim(struct bio *bio, sector_t offset, sector_t size);
extern struct bio *bio_split(struct bio *bio, int sectors,
gfp_t gfp, struct bio_set *bs);
enum {
BIOSET_NEED_BVECS = BIT(0),
BIOSET_NEED_RESCUER = BIT(1),
+ BIOSET_PERCPU_CACHE = BIT(2),
};
extern int bioset_init(struct bio_set *, unsigned int, unsigned int, int flags);
extern void bioset_exit(struct bio_set *);
struct bio *bio_alloc_bioset(gfp_t gfp, unsigned short nr_iovecs,
struct bio_set *bs);
+struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs,
+ struct bio_set *bs);
struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs);
extern void bio_put(struct bio *);
struct bio *src) { }
#endif /* CONFIG_BLK_CGROUP */
-#ifdef CONFIG_HIGHMEM
-/*
- * remember never ever reenable interrupts between a bvec_kmap_irq and
- * bvec_kunmap_irq!
- */
-static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
-{
- unsigned long addr;
-
- /*
- * might not be a highmem page, but the preempt/irq count
- * balancing is a lot nicer this way
- */
- local_irq_save(*flags);
- addr = (unsigned long) kmap_atomic(bvec->bv_page);
-
- BUG_ON(addr & ~PAGE_MASK);
-
- return (char *) addr + bvec->bv_offset;
-}
-
-static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
-{
- unsigned long ptr = (unsigned long) buffer & PAGE_MASK;
-
- kunmap_atomic((void *) ptr);
- local_irq_restore(*flags);
-}
-
-#else
-static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
-{
- return page_address(bvec->bv_page) + bvec->bv_offset;
-}
-
-static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
-{
- *flags = 0;
-}
-#endif
-
/*
* BIO list management for use by remapping drivers (e.g. DM or MD) and loop.
*
struct kmem_cache *bio_slab;
unsigned int front_pad;
+ /*
+ * per-cpu bio alloc cache
+ */
+ struct bio_alloc_cache __percpu *cache;
+
mempool_t bio_pool;
mempool_t bvec_pool;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
struct bio_list rescue_list;
struct work_struct rescue_work;
struct workqueue_struct *rescue_workqueue;
+
+ /*
+ * Hot un-plug notifier for the per-cpu cache, if used
+ */
+ struct hlist_node cpuhp_dead;
};
static inline bool bioset_initialized(struct bio_set *bs)
void * bd_holder;
int bd_holders;
bool bd_write_holder;
-#ifdef CONFIG_SYSFS
- struct list_head bd_holder_disks;
-#endif
struct kobject *bd_holder_dir;
u8 bd_partno;
spinlock_t bd_size_lock; /* for bd_inode->i_size updates */
struct gendisk * bd_disk;
- struct backing_dev_info *bd_bdi;
/* The counter of freeze processes */
int bd_fsfreeze_count;
};
#define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs)
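+ /* Used to sanity check the sector-granular arguments of bio_trim() */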
+ #define BIO_MAX_SECTORS (UINT_MAX >> SECTOR_SHIFT)
/*
* bio flags
BIO_TRACKED, /* set if bio goes through the rq_qos path */
BIO_REMAPPED,
BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */
+ BIO_PERCPU_CACHE, /* can participate in per-cpu alloc cache */
BIO_FLAG_LAST
};
/* iocb->ki_waitq is valid */
#define IOCB_WAITQ (1 << 19)
#define IOCB_NOIO (1 << 20)
+/* can use bio alloc cache */
+#define IOCB_ALLOC_CACHE (1 << 21)
struct kiocb {
struct file *ki_filp;
* struct address_space - Contents of a cacheable, mappable object.
* @host: Owner, either the inode or the block_device.
* @i_pages: Cached pages.
+ * @invalidate_lock: Guards coherency between page cache contents and
+ * file offset->disk block mappings in the filesystem during invalidates.
+ * It is also used to block modification of page cache contents through
+ * memory mappings.
* @gfp_mask: Memory allocation flags to use for allocating pages.
* @i_mmap_writable: Number of VM_SHARED mappings.
* @nr_thps: Number of THPs in the pagecache (non-shmem only).
struct address_space {
struct inode *host;
struct xarray i_pages;
+ struct rw_semaphore invalidate_lock;
gfp_t gfp_mask;
atomic_t i_mmap_writable;
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
down_read_nested(&inode->i_rwsem, subclass);
}
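+/*
+ * Helpers for mapping->invalidate_lock. The lock nests inside i_rwsem and is
+ * held for writing while invalidating page cache (truncate, hole punching)
+ * and for reading while instantiating pages in the affected range.
+ */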
+static inline void filemap_invalidate_lock(struct address_space *mapping)
+{
+ down_write(&mapping->invalidate_lock);
+}
+
+static inline void filemap_invalidate_unlock(struct address_space *mapping)
+{
+ up_write(&mapping->invalidate_lock);
+}
+
+static inline void filemap_invalidate_lock_shared(struct address_space *mapping)
+{
+ down_read(&mapping->invalidate_lock);
+}
+
+static inline int filemap_invalidate_trylock_shared(
+ struct address_space *mapping)
+{
+ return down_read_trylock(&mapping->invalidate_lock);
+}
+
+static inline void filemap_invalidate_unlock_shared(
+ struct address_space *mapping)
+{
+ up_read(&mapping->invalidate_lock);
+}
+
void lock_two_nondirectories(struct inode *, struct inode*);
void unlock_two_nondirectories(struct inode *, struct inode*);
+void filemap_invalidate_lock_two(struct address_space *mapping1,
+ struct address_space *mapping2);
+void filemap_invalidate_unlock_two(struct address_space *mapping1,
+ struct address_space *mapping2);
+
+
/*
* NOTE: in a 32bit arch with a preemptable kernel and
* an UP compile the i_size_read/write must be atomic
/* Number of inodes with nlink == 0 but still referenced */
atomic_long_t s_remove_count;
- /* Pending fsnotify inode refs */
- atomic_long_t s_fsnotify_inode_refs;
+ /*
+ * Number of inode/mount/sb objects that are being watched, note that
+ * inode objects are currently double-accounted.
+ */
+ atomic_long_t s_fsnotify_connectors;
/* Being remounted read-only */
int s_readonly_remount;
extern int file_modified(struct file *file);
- int sync_inode(struct inode *inode, struct writeback_control *wbc);
int sync_inode_metadata(struct inode *inode, int wait);
struct file_system_type {
struct lock_class_key i_lock_key;
struct lock_class_key i_mutex_key;
+ struct lock_class_key invalidate_lock_key;
struct lock_class_key i_mutex_dir_key;
};
#define MAX_RW_COUNT (INT_MAX & PAGE_MASK)
-#ifdef CONFIG_MANDATORY_FILE_LOCKING
-extern int locks_mandatory_locked(struct file *);
-extern int locks_mandatory_area(struct inode *, struct file *, loff_t, loff_t, unsigned char);
-
-/*
- * Candidates for mandatory locking have the setgid bit set
- * but no group execute bit - an otherwise meaningless combination.
- */
-
-static inline int __mandatory_lock(struct inode *ino)
-{
- return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID;
-}
-
-/*
- * ... and these candidates should be on SB_MANDLOCK mounted fs,
- * otherwise these will be advisory locks
- */
-
-static inline int mandatory_lock(struct inode *ino)
-{
- return IS_MANDLOCK(ino) && __mandatory_lock(ino);
-}
-
-static inline int locks_verify_locked(struct file *file)
-{
- if (mandatory_lock(locks_inode(file)))
- return locks_mandatory_locked(file);
- return 0;
-}
-
-static inline int locks_verify_truncate(struct inode *inode,
- struct file *f,
- loff_t size)
-{
- if (!inode->i_flctx || !mandatory_lock(inode))
- return 0;
-
- if (size < inode->i_size) {
- return locks_mandatory_area(inode, f, size, inode->i_size - 1,
- F_WRLCK);
- } else {
- return locks_mandatory_area(inode, f, inode->i_size, size - 1,
- F_WRLCK);
- }
-}
-
-#else /* !CONFIG_MANDATORY_FILE_LOCKING */
-
-static inline int locks_mandatory_locked(struct file *file)
-{
- return 0;
-}
-
-static inline int locks_mandatory_area(struct inode *inode, struct file *filp,
- loff_t start, loff_t end, unsigned char type)
-{
- return 0;
-}
-
-static inline int __mandatory_lock(struct inode *inode)
-{
- return 0;
-}
-
-static inline int mandatory_lock(struct inode *inode)
-{
- return 0;
-}
-
-static inline int locks_verify_locked(struct file *file)
-{
- return 0;
-}
-
-static inline int locks_verify_truncate(struct inode *inode, struct file *filp,
- size_t size)
-{
- return 0;
-}
-
-#endif /* CONFIG_MANDATORY_FILE_LOCKING */
-
-
#ifdef CONFIG_FILE_LOCKING
static inline int break_lease(struct inode *inode, unsigned int mode)
{
extern int filp_close(struct file *, fl_owner_t id);
extern struct filename *getname_flags(const char __user *, int, int *);
+extern struct filename *getname_uflags(const char __user *, int);
extern struct filename *getname(const char __user *);
extern struct filename *getname_kernel(const char *);
extern void putname(struct filename *name);
loff_t start, loff_t end);
extern int filemap_check_errors(struct address_space *mapping);
extern void __filemap_set_wb_err(struct address_space *mapping, int err);
+ int filemap_fdatawrite_wbc(struct address_space *mapping,
+ struct writeback_control *wbc);
static inline int filemap_write_and_wait(struct address_space *mapping)
{
ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
struct iov_iter *iter);
-/* fs/block_dev.c */
-extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
- int datasync);
-
/* fs/splice.c */
extern ssize_t generic_file_splice_read(struct file *, loff_t *,
struct pipe_inode_info *, size_t, unsigned int);
* ->swap_lock (exclusive_swap_page, others)
* ->i_pages lock
*
- * ->i_mutex
- * ->i_mmap_rwsem (truncate->unmap_mapping_range)
+ * ->i_rwsem
+ * ->invalidate_lock (acquired by fs in truncate path)
+ * ->i_mmap_rwsem (truncate->unmap_mapping_range)
*
* ->mmap_lock
* ->i_mmap_rwsem
* ->i_pages lock (arch-dependent flush_dcache_mmap_lock)
*
* ->mmap_lock
- * ->lock_page (access_process_vm)
+ * ->invalidate_lock (filemap_fault)
+ * ->lock_page (filemap_fault, access_process_vm)
*
- * ->i_mutex (generic_perform_write)
+ * ->i_rwsem (generic_perform_write)
* ->mmap_lock (fault_in_pages_readable->do_page_fault)
*
* bdi->wb.list_lock
return 0;
}
+ /**
+ * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
+ * @mapping: address space structure to write
+ * @wbc: the writeback_control controlling the writeout
+ *
+ * Call writepages on the mapping using the provided wbc to control the
+ * writeout.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+ int filemap_fdatawrite_wbc(struct address_space *mapping,
+ struct writeback_control *wbc)
+ {
+ int ret;
+
+ if (!mapping_can_writeback(mapping) ||
+ !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+ return 0;
+
+ wbc_attach_fdatawrite_inode(wbc, mapping->host);
+ ret = do_writepages(mapping, wbc);
+ wbc_detach_inode(wbc);
+ return ret;
+ }
+ EXPORT_SYMBOL(filemap_fdatawrite_wbc);
+
/**
* __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
* @mapping: address space structure to write
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
loff_t end, int sync_mode)
{
- int ret;
struct writeback_control wbc = {
.sync_mode = sync_mode,
.nr_to_write = LONG_MAX,
.range_end = end,
};
- if (!mapping_can_writeback(mapping) ||
- !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
- return 0;
-
- wbc_attach_fdatawrite_inode(&wbc, mapping->host);
- ret = do_writepages(mapping, &wbc);
- wbc_detach_inode(&wbc);
- return ret;
+ return filemap_fdatawrite_wbc(mapping, &wbc);
}
static inline int __filemap_fdatawrite(struct address_space *mapping,
EXPORT_SYMBOL(__page_cache_alloc);
#endif
+/**
+ * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
+ * @mapping1: the first mapping to lock
+ * @mapping2: the second mapping to lock
+ *
+ * Exclusively lock the invalidate_lock of any passed mapping that is not
+ * NULL.
+ */
+void filemap_invalidate_lock_two(struct address_space *mapping1,
+ struct address_space *mapping2)
+{
+ if (mapping1 > mapping2)
+ swap(mapping1, mapping2);
+ if (mapping1)
+ down_write(&mapping1->invalidate_lock);
+ if (mapping2 && mapping1 != mapping2)
+ down_write_nested(&mapping2->invalidate_lock, 1);
+}
+EXPORT_SYMBOL(filemap_invalidate_lock_two);
+
+/**
+ * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
+ * @mapping1: the first mapping to unlock
+ * @mapping2: the second mapping to unlock
+ *
+ * Unlock the exclusively-held invalidate_lock of any passed mapping that is
+ * not NULL.
+ */
+void filemap_invalidate_unlock_two(struct address_space *mapping1,
+ struct address_space *mapping2)
+{
+ if (mapping1)
+ up_write(&mapping1->invalidate_lock);
+ if (mapping2 && mapping1 != mapping2)
+ up_write(&mapping2->invalidate_lock);
+}
+EXPORT_SYMBOL(filemap_invalidate_unlock_two);
+
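A minimal sketch of how a two-inode operation might use the new pair. example_remap_blocks() and its body are hypothetical; only the filemap_invalidate_{lock,unlock}_two() calls come from the code above.

#include <linux/fs.h>
#include <linux/pagemap.h>

/* Hypothetical cross-file operation (e.g. block remap/dedupe) that must
 * keep page faults and readahead out of both mappings while it runs. */
static int example_remap_blocks(struct inode *src, struct inode *dst)
{
	int ret = 0;

	filemap_invalidate_lock_two(src->i_mapping, dst->i_mapping);
	/* ... move or share blocks between src and dst ... */
	filemap_invalidate_unlock_two(src->i_mapping, dst->i_mapping);
	return ret;
}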
/*
* In order to wait for pages to become available there must be
* waitqueues associated with pages. By using a hash table of
{
int error;
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!filemap_invalidate_trylock_shared(mapping))
+ return -EAGAIN;
+ } else {
+ filemap_invalidate_lock_shared(mapping);
+ }
+
if (!trylock_page(page)) {
+ error = -EAGAIN;
if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
- return -EAGAIN;
+ goto unlock_mapping;
if (!(iocb->ki_flags & IOCB_WAITQ)) {
+ filemap_invalidate_unlock_shared(mapping);
put_and_wait_on_page_locked(page, TASK_KILLABLE);
return AOP_TRUNCATED_PAGE;
}
error = __lock_page_async(page, iocb->ki_waitq);
if (error)
- return error;
+ goto unlock_mapping;
}
+ error = AOP_TRUNCATED_PAGE;
if (!page->mapping)
- goto truncated;
+ goto unlock;
error = 0;
if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, page))
goto unlock;
error = filemap_read_page(iocb->ki_filp, mapping, page);
- if (error == AOP_TRUNCATED_PAGE)
- put_page(page);
- return error;
-truncated:
- unlock_page(page);
- put_page(page);
- return AOP_TRUNCATED_PAGE;
+ goto unlock_mapping;
unlock:
unlock_page(page);
+unlock_mapping:
+ filemap_invalidate_unlock_shared(mapping);
+ if (error == AOP_TRUNCATED_PAGE)
+ put_page(page);
return error;
}
if (!page)
return -ENOMEM;
+ /*
+ * Protect against truncate / hole punch. Grabbing invalidate_lock here
+ * ensures we cannot instantiate new pagecache pages and bring them
+ * uptodate after the page cache has been evicted during truncate but
+ * before the blocks are actually freed. Note that we could release
+ * invalidate_lock after inserting the page into the page cache, as the
+ * locked page would then be enough to synchronize with hole punching.
+ * But there are code paths, such as filemap_update_page() filling in
+ * partially uptodate pages or ->readpages() mapping blocks for IO,
+ * that need to hold invalidate_lock while doing so, so hold the lock
+ * here as well to keep the locking rules simple.
+ */
+ filemap_invalidate_lock_shared(mapping);
error = add_to_page_cache_lru(page, mapping, index,
mapping_gfp_constraint(mapping, GFP_KERNEL));
if (error == -EEXIST)
if (error)
goto error;
+ filemap_invalidate_unlock_shared(mapping);
pagevec_add(pvec, page);
return 0;
error:
+ filemap_invalidate_unlock_shared(mapping);
put_page(page);
return error;
}
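The exclusive side of the comment above, as a hedged sketch: example_fs_punch_hole() is hypothetical, and it assumes the exclusive filemap_invalidate_lock()/unlock() helpers from this series. It shows the lock mode a filesystem's hole-punch path would take so that the shared acquisitions in filemap_create_page() and filemap_update_page() cannot repopulate the cache between eviction and block freeing.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/* Hypothetical hole punch: exclusive invalidate_lock keeps the shared
 * lockers above from instantiating pages until the blocks are freed. */
static int example_fs_punch_hole(struct inode *inode, loff_t start, loff_t len)
{
	struct address_space *mapping = inode->i_mapping;

	inode_lock(inode);
	filemap_invalidate_lock(mapping);
	truncate_pagecache_range(inode, start, start + len - 1);
	/* ... free the underlying blocks here ... */
	filemap_invalidate_unlock(mapping);
	inode_unlock(inode);
	return 0;
}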
pgoff_t max_off;
struct page *page;
vm_fault_t ret = 0;
+ bool mapping_locked = false;
max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (unlikely(offset >= max_off))
* Do we have something in the page cache already?
*/
page = find_get_page(mapping, offset);
- if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
+ if (likely(page)) {
/*
- * We found the page, so try async readahead before
- * waiting for the lock.
+ * We found the page, so try async readahead before waiting for
+ * the lock.
*/
- fpin = do_async_mmap_readahead(vmf, page);
- } else if (!page) {
+ if (!(vmf->flags & FAULT_FLAG_TRIED))
+ fpin = do_async_mmap_readahead(vmf, page);
+ if (unlikely(!PageUptodate(page))) {
+ filemap_invalidate_lock_shared(mapping);
+ mapping_locked = true;
+ }
+ } else {
/* No page in the page cache at all */
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
fpin = do_sync_mmap_readahead(vmf);
retry_find:
+ /*
+ * See the comment in filemap_create_page() for why we need
+ * invalidate_lock here.
+ */
+ if (!mapping_locked) {
+ filemap_invalidate_lock_shared(mapping);
+ mapping_locked = true;
+ }
page = pagecache_get_page(mapping, offset,
FGP_CREAT|FGP_FOR_MMAP,
vmf->gfp_mask);
if (!page) {
if (fpin)
goto out_retry;
+ filemap_invalidate_unlock_shared(mapping);
return VM_FAULT_OOM;
}
}
* We have a locked page in the page cache, now we need to check
* that it's up-to-date. If not, it is going to be due to an error.
*/
- if (unlikely(!PageUptodate(page)))
+ if (unlikely(!PageUptodate(page))) {
+ /*
+ * The page was in cache and uptodate and now it is not.
+ * Strange but possible since we didn't hold the page lock all
+ * the time. Let's drop everything, get the invalidate lock and
+ * try again.
+ */
+ if (!mapping_locked) {
+ unlock_page(page);
+ put_page(page);
+ goto retry_find;
+ }
goto page_not_uptodate;
+ }
/*
* We've made it this far and we had to drop our mmap_lock, now is the
unlock_page(page);
goto out_retry;
}
+ if (mapping_locked)
+ filemap_invalidate_unlock_shared(mapping);
/*
* Found the page and have a reference on it.
if (!error || error == AOP_TRUNCATED_PAGE)
goto retry_find;
+ filemap_invalidate_unlock_shared(mapping);
return VM_FAULT_SIGBUS;
*/
if (page)
put_page(page);
+ if (mapping_locked)
+ filemap_invalidate_unlock_shared(mapping);
if (fpin)
fput(fpin);
return ret | VM_FAULT_RETRY;
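For completeness, a hedged sketch of the consumer side: a filesystem that wires filemap_fault() into its vm_operations gets the shared invalidate_lock handling above automatically, provided its truncate and hole-punch paths take the lock exclusively. The example_fs_* names are invented; filemap_fault(), filemap_map_pages() and filemap_page_mkwrite() are the existing generic helpers.

#include <linux/fs.h>
#include <linux/mm.h>

static const struct vm_operations_struct example_fs_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= filemap_page_mkwrite,
};

/* Hypothetical ->mmap, modelled on generic_file_mmap(). */
static int example_fs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);
	vma->vm_ops = &example_fs_file_vm_ops;
	return 0;
}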
*
* If the page does not get brought uptodate, return -EIO.
*
+ * The function expects mapping->invalidate_lock to be already held.
+ *
* Return: up to date page on success, ERR_PTR() on failure.
*/
struct page *read_cache_page(struct address_space *mapping,
*
* If the page does not get brought uptodate, return -EIO.
*
+ * The function expects mapping->invalidate_lock to be already held.
+ *
* Return: up to date page on success, ERR_PTR() on failure.
*/
struct page *read_cache_page_gfp(struct address_space *mapping,
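A hedged sketch of a caller honouring the new locking note in the two kernel-doc comments above. example_read_page() is hypothetical, and it assumes the shared lock mode is sufficient for a cache-filling reader, as it is for filemap_create_page() earlier in this patch.

#include <linux/fs.h>
#include <linux/pagemap.h>

/* Hypothetical reader: hold invalidate_lock (shared) across the cache
 * fill, as the updated kernel-doc above now requires. */
static struct page *example_read_page(struct address_space *mapping,
				      pgoff_t index)
{
	struct page *page;

	filemap_invalidate_lock_shared(mapping);
	page = read_cache_page(mapping, index, NULL, NULL);
	filemap_invalidate_unlock_shared(mapping);
	return page;	/* ERR_PTR() on failure, caller must put_page() */
}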
* modification times and calls proper subroutines depending on whether we
* do direct IO or a standard buffered write.
*
- * It expects i_mutex to be grabbed unless we work on a block device or similar
+ * It expects i_rwsem to be grabbed unless we work on a block device or similar
* object which does not need locking at all.
*
* This function does *not* take care of syncing data in case of O_SYNC write.
* A caller has to handle it. This is mainly due to the fact that we want to
- * avoid syncing under i_mutex.
+ * avoid syncing under i_rwsem.
*
* Return:
* * number of bytes written, even for truncated writes
*
* This is a wrapper around __generic_file_write_iter() to be used by most
* filesystems. It takes care of syncing the file in case of O_SYNC file
- * and acquires i_mutex as needed.
+ * and acquires i_rwsem as needed.
* Return:
* * negative error code if no data has been written at all of
* vfs_fsync_range() failed for a synchronous write
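Finally, a hedged sketch of the calling convention these comments describe: example_fs_write_iter() is invented and modelled on generic_file_write_iter(), taking i_rwsem around __generic_file_write_iter() and doing the O_SYNC handling outside the lock.

#include <linux/fs.h>
#include <linux/uio.h>

/* Hypothetical ->write_iter: i_rwsem held for the write itself, syncing
 * done afterwards so we never sync under i_rwsem. */
static ssize_t example_fs_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = __generic_file_write_iter(iocb, from);
	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}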