mutex_unlock(&bdev->bd_holder_lock);
bd_clear_claiming(whole, holder);
mutex_unlock(&bdev_lock);
-
- if (hops && hops->get_holder)
- hops->get_holder(holder);
}
/**
static void bd_end_claim(struct block_device *bdev, void *holder)
{
struct block_device *whole = bdev_whole(bdev);
- const struct blk_holder_ops *hops = bdev->bd_holder_ops;
bool unblock = false;
/*
whole->bd_holder = NULL;
mutex_unlock(&bdev_lock);
- if (hops && hops->put_holder)
- hops->put_holder(holder);
-
/*
* If this was the last claim, remove holder link and unblock evpoll if
* it was a write holder.
bdev_write_inode(bdev);
}
+static void blkdev_put_whole(struct block_device *bdev)
+{
+ if (atomic_dec_and_test(&bdev->bd_openers))
+ blkdev_flush_mapping(bdev);
+ if (bdev->bd_disk->fops->release)
+ bdev->bd_disk->fops->release(bdev->bd_disk);
+}
+
static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode)
{
struct gendisk *disk = bdev->bd_disk;
if (!atomic_read(&bdev->bd_openers))
set_init_blocksize(bdev);
- if (test_bit(GD_NEED_PART_SCAN, &disk->state))
- bdev_disk_changed(disk, false);
atomic_inc(&bdev->bd_openers);
+ if (test_bit(GD_NEED_PART_SCAN, &disk->state)) {
+ /*
+ * Only return scanning errors if we are called from contexts
+ * that explicitly want them, e.g. the BLKRRPART ioctl.
+ */
+ ret = bdev_disk_changed(disk, false);
+ if (ret && (mode & BLK_OPEN_STRICT_SCAN)) {
+ blkdev_put_whole(bdev);
+ return ret;
+ }
+ }
return 0;
}
-static void blkdev_put_whole(struct block_device *bdev)
-{
- if (atomic_dec_and_test(&bdev->bd_openers))
- blkdev_flush_mapping(bdev);
- if (bdev->bd_disk->fops->release)
- bdev->bd_disk->fops->release(bdev->bd_disk);
-}
-
static int blkdev_get_part(struct block_device *part, blk_mode_t mode)
{
struct gendisk *disk = part->bd_disk;
static bool bdev_writes_blocked(struct block_device *bdev)
{
- return bdev->bd_writers == -1;
+ return bdev->bd_writers < 0;
}
static void bdev_block_writes(struct block_device *bdev)
{
- bdev->bd_writers = -1;
+ bdev->bd_writers--;
}
static void bdev_unblock_writes(struct block_device *bdev)
{
- bdev->bd_writers = 0;
+ bdev->bd_writers++;
}
static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode)
bdev->bd_writers++;
}
+static inline bool bdev_unclaimed(const struct file *bdev_file)
+{
+ return bdev_file->private_data == BDEV_I(bdev_file->f_mapping->host);
+}
+
static void bdev_yield_write_access(struct file *bdev_file)
{
struct block_device *bdev;
if (bdev_allow_write_mounted)
return;
+ if (bdev_unclaimed(bdev_file))
+ return;
+
bdev = file_bdev(bdev_file);
- /* Yield exclusive or shared write access. */
- if (bdev_file->f_mode & FMODE_WRITE) {
- if (bdev_writes_blocked(bdev))
- bdev_unblock_writes(bdev);
- else
- bdev->bd_writers--;
- }
+
+ if (bdev_file->f_mode & FMODE_WRITE_RESTRICTED)
+ bdev_unblock_writes(bdev);
+ else if (bdev_file->f_mode & FMODE_WRITE)
+ bdev->bd_writers--;
}
/**
goto abort_claiming;
ret = -EBUSY;
if (!bdev_may_open(bdev, mode))
- goto abort_claiming;
+ goto put_module;
if (bdev_is_partition(bdev))
ret = blkdev_get_part(bdev, mode);
else
disk_unblock_events(disk);
bdev_file->f_flags |= O_LARGEFILE;
- bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT;
+ bdev_file->f_mode |= FMODE_CAN_ODIRECT;
if (bdev_nowait(bdev))
bdev_file->f_mode |= FMODE_NOWAIT;
+ if (mode & BLK_OPEN_RESTRICT_WRITES)
+ bdev_file->f_mode |= FMODE_WRITE_RESTRICTED;
bdev_file->f_mapping = bdev->bd_inode->i_mapping;
bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping);
bdev_file->private_data = holder;
}
EXPORT_SYMBOL(bdev_file_open_by_path);
+static inline void bd_yield_claim(struct file *bdev_file)
+{
+ struct block_device *bdev = file_bdev(bdev_file);
+ void *holder = bdev_file->private_data;
+
+ lockdep_assert_held(&bdev->bd_disk->open_mutex);
+
+ if (WARN_ON_ONCE(IS_ERR_OR_NULL(holder)))
+ return;
+
+ if (!bdev_unclaimed(bdev_file))
+ bd_end_claim(bdev, holder);
+}
+
void bdev_release(struct file *bdev_file)
{
struct block_device *bdev = file_bdev(bdev_file);
bdev_yield_write_access(bdev_file);
if (holder)
- bd_end_claim(bdev, holder);
+ bd_yield_claim(bdev_file);
/*
* Trigger event checking and tell drivers to flush MEDIA_CHANGE
blkdev_put_no_open(bdev);
}
+/**
+ * bdev_fput - yield claim to the block device and put the file
+ * @bdev_file: open block device
+ *
+ * Yield claim on the block device and put the file. Ensure that the
+ * block device can be reclaimed before the file is closed which is a
+ * deferred operation.
+ */
+void bdev_fput(struct file *bdev_file)
+{
+ if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops))
+ return;
+
+ if (bdev_file->private_data) {
+ struct block_device *bdev = file_bdev(bdev_file);
+ struct gendisk *disk = bdev->bd_disk;
+
+ mutex_lock(&disk->open_mutex);
+ bdev_yield_write_access(bdev_file);
+ bd_yield_claim(bdev_file);
+ /*
+ * Tell release we already gave up our hold on the
+ * device and if write restrictions are available that
+ * we already gave up write access to the device.
+ */
+ bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host);
+ mutex_unlock(&disk->open_mutex);
+ }
+
+ fput(bdev_file);
+}
+EXPORT_SYMBOL(bdev_fput);
+
/**
* lookup_bdev() - Look up a struct block_device by name.
* @pathname: Name of the block device in the filesystem.
unsigned long mmap_base;
unsigned long mmap_size;
- struct page **ring_pages;
+ struct folio **ring_folios;
long nr_pages;
struct rcu_work free_rwork; /* see free_ioctx() */
spinlock_t completion_lock;
} ____cacheline_aligned_in_smp;
- struct page *internal_pages[AIO_RING_PAGES];
+ struct folio *internal_folios[AIO_RING_PAGES];
struct file *aio_ring_file;
unsigned id;
put_aio_ring_file(ctx);
for (i = 0; i < ctx->nr_pages; i++) {
- struct page *page;
- pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
- page_count(ctx->ring_pages[i]));
- page = ctx->ring_pages[i];
- if (!page)
+ struct folio *folio = ctx->ring_folios[i];
+
+ if (!folio)
continue;
- ctx->ring_pages[i] = NULL;
- put_page(page);
+
+ pr_debug("pid(%d) [%d] folio->count=%d\n", current->pid, i,
+ folio_ref_count(folio));
+ ctx->ring_folios[i] = NULL;
+ folio_put(folio);
}
- if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
- kfree(ctx->ring_pages);
- ctx->ring_pages = NULL;
+ if (ctx->ring_folios && ctx->ring_folios != ctx->internal_folios) {
+ kfree(ctx->ring_folios);
+ ctx->ring_folios = NULL;
}
}
idx = src->index;
if (idx < (pgoff_t)ctx->nr_pages) {
/* Make sure the old folio hasn't already been changed */
- if (ctx->ring_pages[idx] != &src->page)
+ if (ctx->ring_folios[idx] != src)
rc = -EAGAIN;
} else
rc = -EINVAL;
*/
spin_lock_irqsave(&ctx->completion_lock, flags);
folio_migrate_copy(dst, src);
- BUG_ON(ctx->ring_pages[idx] != &src->page);
- ctx->ring_pages[idx] = &dst->page;
+ BUG_ON(ctx->ring_folios[idx] != src);
+ ctx->ring_folios[idx] = dst;
spin_unlock_irqrestore(&ctx->completion_lock, flags);
/* The old folio is no longer accessible. */
nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
/ sizeof(struct io_event);
- ctx->ring_pages = ctx->internal_pages;
+ ctx->ring_folios = ctx->internal_folios;
if (nr_pages > AIO_RING_PAGES) {
- ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
- GFP_KERNEL);
- if (!ctx->ring_pages) {
+ ctx->ring_folios = kcalloc(nr_pages, sizeof(struct folio *),
+ GFP_KERNEL);
+ if (!ctx->ring_folios) {
put_aio_ring_file(ctx);
return -ENOMEM;
}
}
for (i = 0; i < nr_pages; i++) {
- struct page *page;
- page = find_or_create_page(file->f_mapping,
- i, GFP_USER | __GFP_ZERO);
- if (!page)
+ struct folio *folio;
+
+ folio = __filemap_get_folio(file->f_mapping, i,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ GFP_USER | __GFP_ZERO);
+ if (IS_ERR(folio))
break;
- pr_debug("pid(%d) page[%d]->count=%d\n",
- current->pid, i, page_count(page));
- SetPageUptodate(page);
- unlock_page(page);
- ctx->ring_pages[i] = page;
+ pr_debug("pid(%d) [%d] folio->count=%d\n", current->pid, i,
+ folio_ref_count(folio));
+ folio_end_read(folio, true);
+
+ ctx->ring_folios[i] = folio;
}
ctx->nr_pages = i;
ctx->user_id = ctx->mmap_base;
ctx->nr_events = nr_events; /* trusted copy */
- ring = page_address(ctx->ring_pages[0]);
+ ring = folio_address(ctx->ring_folios[0]);
ring->nr = nr_events; /* user copy */
ring->id = ~0U;
ring->head = ring->tail = 0;
ring->compat_features = AIO_RING_COMPAT_FEATURES;
ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
ring->header_length = sizeof(struct aio_ring);
- flush_dcache_page(ctx->ring_pages[0]);
+ flush_dcache_folio(ctx->ring_folios[0]);
return 0;
}
/* While kioctx setup is in progress,
* we are protected from page migration
- * changes ring_pages by ->ring_lock.
+ * changes ring_folios by ->ring_lock.
*/
- ring = page_address(ctx->ring_pages[0]);
+ ring = folio_address(ctx->ring_folios[0]);
ring->id = ctx->id;
return 0;
}
* against ctx->completed_events below will make sure we do the
* safe/right thing.
*/
- ring = page_address(ctx->ring_pages[0]);
+ ring = folio_address(ctx->ring_folios[0]);
head = ring->head;
refill_reqs_available(ctx, head, ctx->tail);
if (++tail >= ctx->nr_events)
tail = 0;
- ev_page = page_address(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+ ev_page = folio_address(ctx->ring_folios[pos / AIO_EVENTS_PER_PAGE]);
event = ev_page + pos % AIO_EVENTS_PER_PAGE;
*event = iocb->ki_res;
- flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+ flush_dcache_folio(ctx->ring_folios[pos / AIO_EVENTS_PER_PAGE]);
pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb,
(void __user *)(unsigned long)iocb->ki_res.obj,
ctx->tail = tail;
- ring = page_address(ctx->ring_pages[0]);
+ ring = folio_address(ctx->ring_folios[0]);
head = ring->head;
ring->tail = tail;
- flush_dcache_page(ctx->ring_pages[0]);
+ flush_dcache_folio(ctx->ring_folios[0]);
ctx->completed_events++;
if (ctx->completed_events > 1)
spin_lock_irqsave(&ctx->wait.lock, flags);
list_for_each_entry_safe(curr, next, &ctx->wait.head, w.entry)
if (avail >= curr->min_nr) {
- list_del_init_careful(&curr->w.entry);
wake_up_process(curr->w.private);
+ list_del_init_careful(&curr->w.entry);
}
spin_unlock_irqrestore(&ctx->wait.lock, flags);
}
sched_annotate_sleep();
mutex_lock(&ctx->ring_lock);
- /* Access to ->ring_pages here is protected by ctx->ring_lock. */
- ring = page_address(ctx->ring_pages[0]);
+ /* Access to ->ring_folios here is protected by ctx->ring_lock. */
+ ring = folio_address(ctx->ring_folios[0]);
head = ring->head;
tail = ring->tail;
while (ret < nr) {
long avail;
struct io_event *ev;
- struct page *page;
+ struct folio *folio;
avail = (head <= tail ? tail : ctx->nr_events) - head;
if (head == tail)
break;
pos = head + AIO_EVENTS_OFFSET;
- page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
+ folio = ctx->ring_folios[pos / AIO_EVENTS_PER_PAGE];
pos %= AIO_EVENTS_PER_PAGE;
avail = min(avail, nr - ret);
avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos);
- ev = page_address(page);
+ ev = folio_address(folio);
copy_ret = copy_to_user(event + ret, ev + pos,
sizeof(*ev) * avail);
head %= ctx->nr_events;
}
- ring = page_address(ctx->ring_pages[0]);
+ ring = folio_address(ctx->ring_folios[0]);
ring->head = head;
- flush_dcache_page(ctx->ring_pages[0]);
+ flush_dcache_folio(ctx->ring_folios[0]);
pr_debug("%li h%u t%u\n", ret, head, tail);
out:
BUG_ON(!old);
if (unlikely(old != inode)) {
- discard_new_inode(&inode->v);
+ __destroy_inode(&inode->v);
+ kmem_cache_free(bch2_inode_cache, inode);
inode = old;
} else {
mutex_lock(&c->vfs_inodes_lock);
if (unlikely(!inode)) {
int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM);
- if (ret && inode)
- discard_new_inode(&inode->v);
+ if (ret && inode) {
+ __destroy_inode(&inode->v);
+ kmem_cache_free(bch2_inode_cache, inode);
+ }
if (ret)
return ERR_PTR(ret);
}
stat->blksize = block_bytes(c);
stat->blocks = inode->v.i_blocks;
+ stat->subvol = inode->ei_subvol;
+ stat->result_mask |= STATX_SUBVOL;
+
if (request_mask & STATX_BTIME) {
stat->result_mask |= STATX_BTIME;
stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_buf cur, prev;
- struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
unsigned offset_into_extent, sectors;
bool have_extent = false;
u32 snapshot;
if (ret)
return ret;
+ struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
if (start + len < start)
return -EINVAL;
return dget(sb->s_root);
err_put_super:
+ __bch2_fs_stop(c);
deactivate_locked_super(sb);
return ERR_PTR(bch2_err_class(ret));
}
0, *alloc_hint, &ins, 1, 1);
if (ret) {
/*
- * Here we used to try again by going back to non-compressed
- * path for ENOSPC. But we can't reserve space even for
- * compressed size, how could it work for uncompressed size
- * which requires larger size? So here we directly go error
- * path.
+ * We can't reserve contiguous space for the compressed size.
+ * Unlikely, but it's possible that we could have enough
+ * non-contiguous space for the uncompressed size instead. So
+ * fall back to uncompressed.
*/
- goto out_free;
+ submit_uncompressed_range(inode, async_extent, locked_page);
+ goto done;
}
/* Here we're doing allocation and writeback of the compressed pages */
out_free_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
-out_free:
mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
extent_clear_unlock_delalloc(inode, start, end,
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
*/
if (bits & EXTENT_CLEAR_META_RESV &&
root != fs_info->tree_root)
- btrfs_delalloc_release_metadata(inode, len, false);
+ btrfs_delalloc_release_metadata(inode, len, true);
/* For sanity tests. */
if (btrfs_is_testing(fs_info))
struct btrfs_trans_handle *trans;
struct btrfs_block_rsv block_rsv;
u64 root_flags;
+ u64 qgroup_reserved = 0;
int ret;
down_write(&fs_info->subvol_sem);
ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
if (ret)
goto out_undead;
+ qgroup_reserved = block_rsv.qgroup_rsv_reserved;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_release;
}
+ ret = btrfs_record_root_in_trans(trans, root);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_end_trans;
+ }
+ btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
+ qgroup_reserved = 0;
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
ret = btrfs_end_transaction(trans);
inode->i_flags |= S_DEAD;
out_release:
- btrfs_subvolume_release_metadata(root, &block_rsv);
+ btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
+ if (qgroup_reserved)
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
out_undead:
if (ret) {
spin_lock(&dest->root_item_lock);
generic_fillattr(idmap, request_mask, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
+ stat->subvol = BTRFS_I(inode)->root->root_key.objectid;
+ stat->result_mask |= STATX_SUBVOL;
+
spin_lock(&BTRFS_I(inode)->lock);
delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
inode_bytes = inode_get_bytes(inode);
{}
};
- /* String parameter that allows empty argument */
- #define fsparam_string_empty(NAME, OPT) \
- __fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL)
-
/*
* Mount option specification
* We don't use fsparam_flag_no because of the way we set the
brelse(sbi->s_sbh);
if (sbi->s_journal_bdev_file) {
invalidate_bdev(file_bdev(sbi->s_journal_bdev_file));
- fput(sbi->s_journal_bdev_file);
+ bdev_fput(sbi->s_journal_bdev_file);
}
out_fail:
invalidate_bdev(sb->s_bdev);
out_bh:
brelse(bh);
out_bdev:
- fput(bdev_file);
+ bdev_fput(bdev_file);
return ERR_PTR(errno);
}
out_journal:
jbd2_journal_destroy(journal);
out_bdev:
- fput(bdev_file);
+ bdev_fput(bdev_file);
return ERR_PTR(errno);
}
kill_block_super(sb);
if (bdev_file)
- fput(bdev_file);
+ bdev_fput(bdev_file);
}
static struct file_system_type ext4_fs_type = {
if (!f.file)
return ERR_PTR(-EBADF);
+ if (flags & LOOKUP_LINKAT_EMPTY) {
+ if (f.file->f_cred != current_cred() &&
+ !ns_capable(f.file->f_cred->user_ns, CAP_DAC_READ_SEARCH)) {
+ fdput(f);
+ return ERR_PTR(-ENOENT);
+ }
+ }
+
dentry = f.file->f_path.dentry;
if (*s && unlikely(!d_can_lookup(dentry))) {
case 0: case S_IFREG:
error = vfs_create(idmap, path.dentry->d_inode,
dentry, mode, true);
+ if (!error)
+ security_path_post_mknod(idmap, dentry);
break;
case S_IFCHR: case S_IFBLK:
error = vfs_mknod(idmap, path.dentry->d_inode,
dentry, mode, 0);
break;
}
-
- if (error)
- goto out2;
-
- security_path_post_mknod(idmap, dentry);
out2:
done_path_create(&path, dentry);
if (retry_estale(error, lookup_flags)) {
goto out_putnames;
}
/*
- * To use null names we require CAP_DAC_READ_SEARCH
+ * To use null names we require CAP_DAC_READ_SEARCH or
+ * that the open-time creds of the dfd matches current.
* This ensures that not everyone will be able to create
- * handlink using the passed filedescriptor.
+ * a hardlink using the passed file descriptor.
*/
- if (flags & AT_EMPTY_PATH && !capable(CAP_DAC_READ_SEARCH)) {
- error = -ENOENT;
- goto out_putnames;
- }
+ if (flags & AT_EMPTY_PATH)
+ how |= LOOKUP_LINKAT_EMPTY;
if (flags & AT_SYMLINK_FOLLOW)
how |= LOOKUP_FOLLOW;
*/
/* file is open for reading */
- #define FMODE_READ ((__force fmode_t)0x1)
+ #define FMODE_READ ((__force fmode_t)(1 << 0))
/* file is open for writing */
- #define FMODE_WRITE ((__force fmode_t)0x2)
+ #define FMODE_WRITE ((__force fmode_t)(1 << 1))
/* file is seekable */
- #define FMODE_LSEEK ((__force fmode_t)0x4)
+ #define FMODE_LSEEK ((__force fmode_t)(1 << 2))
/* file can be accessed using pread */
- #define FMODE_PREAD ((__force fmode_t)0x8)
+ #define FMODE_PREAD ((__force fmode_t)(1 << 3))
/* file can be accessed using pwrite */
- #define FMODE_PWRITE ((__force fmode_t)0x10)
+ #define FMODE_PWRITE ((__force fmode_t)(1 << 4))
/* File is opened for execution with sys_execve / sys_uselib */
- #define FMODE_EXEC ((__force fmode_t)0x20)
+ #define FMODE_EXEC ((__force fmode_t)(1 << 5))
+/* File writes are restricted (block device specific) */
- #define FMODE_WRITE_RESTRICTED ((__force fmode_t)0x40)
++#define FMODE_WRITE_RESTRICTED ((__force fmode_t)(1 << 6))
+
-/* FMODE_* bits 6 to 8 */
++/* FMODE_* bits 7 to 8 */
+
/* 32bit hashes as llseek() offset (for directories) */
- #define FMODE_32BITHASH ((__force fmode_t)0x200)
+ #define FMODE_32BITHASH ((__force fmode_t)(1 << 9))
/* 64bit hashes as llseek() offset (for directories) */
- #define FMODE_64BITHASH ((__force fmode_t)0x400)
+ #define FMODE_64BITHASH ((__force fmode_t)(1 << 10))
/*
* Don't update ctime and mtime.
* Currently a special hack for the XFS open_by_handle ioctl, but we'll
* hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon.
*/
- #define FMODE_NOCMTIME ((__force fmode_t)0x800)
+ #define FMODE_NOCMTIME ((__force fmode_t)(1 << 11))
/* Expect random access pattern */
- #define FMODE_RANDOM ((__force fmode_t)0x1000)
+ #define FMODE_RANDOM ((__force fmode_t)(1 << 12))
/* File is huge (eg. /dev/mem): treat loff_t as unsigned */
- #define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000)
+ #define FMODE_UNSIGNED_OFFSET ((__force fmode_t)(1 << 13))
/* File is opened with O_PATH; almost nothing can be done with it */
- #define FMODE_PATH ((__force fmode_t)0x4000)
+ #define FMODE_PATH ((__force fmode_t)(1 << 14))
/* File needs atomic accesses to f_pos */
- #define FMODE_ATOMIC_POS ((__force fmode_t)0x8000)
+ #define FMODE_ATOMIC_POS ((__force fmode_t)(1 << 15))
/* Write access to underlying fs */
- #define FMODE_WRITER ((__force fmode_t)0x10000)
+ #define FMODE_WRITER ((__force fmode_t)(1 << 16))
/* Has read method(s) */
- #define FMODE_CAN_READ ((__force fmode_t)0x20000)
+ #define FMODE_CAN_READ ((__force fmode_t)(1 << 17))
/* Has write method(s) */
- #define FMODE_CAN_WRITE ((__force fmode_t)0x40000)
+ #define FMODE_CAN_WRITE ((__force fmode_t)(1 << 18))
- #define FMODE_OPENED ((__force fmode_t)0x80000)
- #define FMODE_CREATED ((__force fmode_t)0x100000)
+ #define FMODE_OPENED ((__force fmode_t)(1 << 19))
+ #define FMODE_CREATED ((__force fmode_t)(1 << 20))
/* File is stream-like */
- #define FMODE_STREAM ((__force fmode_t)0x200000)
+ #define FMODE_STREAM ((__force fmode_t)(1 << 21))
/* File supports DIRECT IO */
- #define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000)
+ #define FMODE_CAN_ODIRECT ((__force fmode_t)(1 << 22))
- #define FMODE_NOREUSE ((__force fmode_t)0x800000)
+ #define FMODE_NOREUSE ((__force fmode_t)(1 << 23))
- /* File supports non-exclusive O_DIRECT writes from multiple threads */
- #define FMODE_DIO_PARALLEL_WRITE ((__force fmode_t)0x1000000)
+ /* FMODE_* bit 24 */
/* File is embedded in backing_file object */
- #define FMODE_BACKING ((__force fmode_t)0x2000000)
+ #define FMODE_BACKING ((__force fmode_t)(1 << 25))
/* File was opened by fanotify and shouldn't generate fanotify events */
- #define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
+ #define FMODE_NONOTIFY ((__force fmode_t)(1 << 26))
/* File is capable of returning -EAGAIN if I/O will block */
- #define FMODE_NOWAIT ((__force fmode_t)0x8000000)
+ #define FMODE_NOWAIT ((__force fmode_t)(1 << 27))
/* File represents mount that needs unmounting */
- #define FMODE_NEED_UNMOUNT ((__force fmode_t)0x10000000)
+ #define FMODE_NEED_UNMOUNT ((__force fmode_t)(1 << 28))
/* File does not contribute to nr_files count */
- #define FMODE_NOACCOUNT ((__force fmode_t)0x20000000)
-
- /* File supports async buffered reads */
- #define FMODE_BUF_RASYNC ((__force fmode_t)0x40000000)
-
- /* File supports async nowait buffered writes */
- #define FMODE_BUF_WASYNC ((__force fmode_t)0x80000000)
+ #define FMODE_NOACCOUNT ((__force fmode_t)(1 << 29))
/*
* Attribute flags. These should be or-ed together to figure out what
__u32 handle_bytes;
int handle_type;
/* file identifier */
- unsigned char f_handle[];
+ unsigned char f_handle[] __counted_by(handle_bytes);
};
static inline struct file *get_file(struct file *f)
{
- atomic_long_inc(&f->f_count);
+ long prior = atomic_long_fetch_inc_relaxed(&f->f_count);
+ WARN_ONCE(!prior, "struct file::f_count incremented from zero; use-after-free condition present!\n");
return f;
}
struct io_uring_cmd;
struct offset_ctx;
+ typedef unsigned int __bitwise fop_flags_t;
+
struct file_operations {
struct module *owner;
+ fop_flags_t fop_flags;
loff_t (*llseek) (struct file *, loff_t, int);
ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
- unsigned long mmap_supported_flags;
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *, fl_owner_t id);
int (*release) (struct inode *, struct file *);
unsigned int poll_flags);
} __randomize_layout;
+ /* Supports async buffered reads */
+ #define FOP_BUFFER_RASYNC ((__force fop_flags_t)(1 << 0))
+ /* Supports async buffered writes */
+ #define FOP_BUFFER_WASYNC ((__force fop_flags_t)(1 << 1))
+ /* Supports synchronous page faults for mappings */
+ #define FOP_MMAP_SYNC ((__force fop_flags_t)(1 << 2))
+ /* Supports non-exclusive O_DIRECT writes from multiple threads */
+ #define FOP_DIO_PARALLEL_WRITE ((__force fop_flags_t)(1 << 3))
+ /* Contains huge pages */
+ #define FOP_HUGE_PAGES ((__force fop_flags_t)(1 << 4))
+
/* Wrap a directory iterator that needs exclusive inode access */
int wrap_directory_iterator(struct file *, struct dir_context *,
int (*) (struct file *, struct dir_context *));
#define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD)
#define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME)
+
+ #ifdef CONFIG_SWAP
#define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE)
+ #else
+ #define IS_SWAPFILE(inode) ((void)(inode), 0U)
+ #endif
+
#define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE)
#define IS_IMA(inode) ((inode)->i_flags & S_IMA)
#define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT)
int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry);
void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry);
int simple_offset_empty(struct dentry *dentry);
+ int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry);
int simple_offset_rename_exchange(struct inode *old_dir,
struct dentry *old_dentry,
struct inode *new_dir,
static void io_queue_sqe(struct io_kiocb *req);
struct kmem_cache *req_cachep;
+static struct workqueue_struct *iou_wq __ro_after_init;
static int __read_mostly sysctl_io_uring_disabled;
static int __read_mostly sysctl_io_uring_group = -1;
err:
kfree(ctx->cancel_table.hbs);
kfree(ctx->cancel_table_locked.hbs);
- kfree(ctx->io_bl);
xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
return NULL;
/* don't serialize this request if the fs doesn't need it */
if (should_hash && (req->file->f_flags & O_DIRECT) &&
- (req->file->f_mode & FMODE_DIO_PARALLEL_WRITE))
+ (req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE))
should_hash = false;
if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL))
io_wq_hash_work(&req->work, file_inode(req->file));
err = -EBADFD;
if (!io_file_can_poll(req))
goto fail;
- err = -ECANCELED;
- if (io_arm_poll_handler(req, issue_flags) != IO_APOLL_OK)
- goto fail;
- return;
+ if (req->file->f_flags & O_NONBLOCK ||
+ req->file->f_mode & FMODE_NOWAIT) {
+ err = -ECANCELED;
+ if (io_arm_poll_handler(req, issue_flags) != IO_APOLL_OK)
+ goto fail;
+ return;
+ } else {
+ req->flags &= ~REQ_F_APOLL_MULTISHOT;
+ }
}
if (req->flags & REQ_F_FORCE_ASYNC) {
if (__io_cqring_events_user(ctx) >= min_events)
return 0;
- if (sig) {
-#ifdef CONFIG_COMPAT
- if (in_compat_syscall())
- ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
- sigsz);
- else
-#endif
- ret = set_user_sigmask(sig, sigsz);
-
- if (ret)
- return ret;
- }
-
init_waitqueue_func_entry(&iowq.wq, io_wake_function);
iowq.wq.private = current;
INIT_LIST_HEAD(&iowq.wq.entry);
io_napi_adjust_timeout(ctx, &iowq, &ts);
}
+ if (sig) {
+#ifdef CONFIG_COMPAT
+ if (in_compat_syscall())
+ ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
+ sigsz);
+ else
+#endif
+ ret = set_user_sigmask(sig, sigsz);
+
+ if (ret)
+ return ret;
+ }
+
io_napi_busy_loop(ctx, &iowq);
trace_io_uring_cqring_wait(ctx, min_events);
io_napi_free(ctx);
kfree(ctx->cancel_table.hbs);
kfree(ctx->cancel_table_locked.hbs);
- kfree(ctx->io_bl);
xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
}
* noise and overhead, there's no discernable change in runtime
* over using system_wq.
*/
- queue_work(system_unbound_wq, &ctx->exit_work);
+ queue_work(iou_wq, &ctx->exit_work);
}
static int io_uring_release(struct inode *inode, struct file *file)
ptr = ctx->sq_sqes;
break;
case IORING_OFF_PBUF_RING: {
+ struct io_buffer_list *bl;
unsigned int bgid;
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
- rcu_read_lock();
- ptr = io_pbuf_get_address(ctx, bgid);
- rcu_read_unlock();
- if (!ptr)
- return ERR_PTR(-EINVAL);
+ bl = io_pbuf_get_bl(ctx, bgid);
+ if (IS_ERR(bl))
+ return bl;
+ ptr = bl->buf_ring;
+ io_put_bl(ctx, bl);
break;
}
default:
io_buf_cachep = KMEM_CACHE(io_buffer,
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
+ iou_wq = alloc_workqueue("iou_exit", WQ_UNBOUND, 64);
+
#ifdef CONFIG_SYSCTL
register_sysctl_init("kernel", kernel_io_uring_disabled_table);
#endif
* just use poll if we can, and don't attempt if the fs doesn't
* support callback based unlocks
*/
- if (io_file_can_poll(req) || !(req->file->f_mode & FMODE_BUF_RASYNC))
+ if (io_file_can_poll(req) ||
+ !(req->file->f_op->fop_flags & FOP_BUFFER_RASYNC))
return false;
wait->wait.func = io_async_buf_func;
ret = __io_read(req, issue_flags);
+ /*
+ * If the file doesn't support proper NOWAIT, then disable multishot
+ * and stay in single shot mode.
+ */
+ if (!io_file_supports_nowait(req))
+ req->flags &= ~REQ_F_APOLL_MULTISHOT;
+
/*
* If we get -EAGAIN, recycle our buffer and just let normal poll
* handling arm it.
/*
* Any successful return value will keep the multishot read armed.
*/
- if (ret > 0) {
+ if (ret > 0 && req->flags & REQ_F_APOLL_MULTISHOT) {
/*
* Put our buffer and post a CQE. If we fail to post a CQE, then
* jump to the termination path. This request is then done.
if (unlikely(!io_file_supports_nowait(req)))
goto copy_iov;
- /* File path supports NOWAIT for non-direct_IO only for block devices. */
+ /* Check if we can support NOWAIT. */
if (!(kiocb->ki_flags & IOCB_DIRECT) &&
- !(kiocb->ki_filp->f_mode & FMODE_BUF_WASYNC) &&
- (req->flags & REQ_F_ISREG))
+ !(req->file->f_op->fop_flags & FOP_BUFFER_WASYNC) &&
+ (req->flags & REQ_F_ISREG))
goto copy_iov;
kiocb->ki_flags |= IOCB_NOWAIT;
#define shmem_huge SHMEM_HUGE_DENY
-bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
- struct mm_struct *mm, unsigned long vm_flags)
-{
- return false;
-}
-
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
struct shrink_control *sc, unsigned long nr_to_split)
{
return error;
}
- simple_offset_remove(shmem_get_offset_ctx(old_dir), old_dentry);
- error = simple_offset_add(shmem_get_offset_ctx(new_dir), old_dentry);
+ error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry);
if (error)
return error;