dio->waiter = current;
spin_unlock_irqrestore(&dio->bio_lock, flags);
if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
- !blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie))
+ !blk_mq_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie))
io_schedule();
/* wake up sets us TASK_RUNNING */
spin_lock_irqsave(&dio->bio_lock, flags);
* filesystems that don't need it and also allows us to create the workqueue
* late enough so the we can include s_id in the name of the workqueue.
*/
- static int sb_init_dio_done_wq(struct super_block *sb)
+ int sb_init_dio_done_wq(struct super_block *sb)
{
struct workqueue_struct *old;
struct workqueue_struct *wq = alloc_workqueue("dio/%s",
return ret;
}
-/*
- * Clean any dirty buffers in the blockdev mapping which alias newly-created
- * file blocks. Only called for S_ISREG files - blockdevs do not set
- * buffer_new
- */
-static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh)
-{
- unsigned i;
- unsigned nblocks;
-
- nblocks = map_bh->b_size >> dio->inode->i_blkbits;
-
- for (i = 0; i < nblocks; i++) {
- unmap_underlying_metadata(map_bh->b_bdev,
- map_bh->b_blocknr + i);
- }
-}
-
/*
* If we are not writing the entire block and get_block() allocated
* the block for us, we need to fill-in the unused portion of the
goto do_holes;
sdio->blocks_available =
- map_bh->b_size >> sdio->blkbits;
+ map_bh->b_size >> blkbits;
sdio->next_block_for_io =
map_bh->b_blocknr << sdio->blkfactor;
- if (buffer_new(map_bh))
- clean_blockdev_aliases(dio, map_bh);
+ if (buffer_new(map_bh)) {
+ clean_bdev_aliases(
+ map_bh->b_bdev,
+ map_bh->b_blocknr,
+ map_bh->b_size >> blkbits);
+ }
if (!sdio->blkfactor)
goto do_holes;
dio->inode = inode;
if (iov_iter_rw(iter) == WRITE) {
dio->op = REQ_OP_WRITE;
- dio->op_flags = WRITE_ODIRECT;
+ dio->op_flags = REQ_SYNC | REQ_IDLE;
} else {
dio->op = REQ_OP_READ;
}
#include <linux/uio.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
+ #include <linux/task_io_accounting_ops.h>
#include <linux/dax.h>
#include "internal.h"
return 0;
}
EXPORT_SYMBOL_GPL(iomap_fiemap);
- bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_ODIRECT);
+
+ /*
+ * Private flags for iomap_dio, must not overlap with the public ones in
+ * iomap.h:
+ */
+ #define IOMAP_DIO_WRITE (1 << 30)
+ #define IOMAP_DIO_DIRTY (1 << 31)
+
+ struct iomap_dio {
+ struct kiocb *iocb;
+ iomap_dio_end_io_t *end_io;
+ loff_t i_size;
+ loff_t size;
+ atomic_t ref;
+ unsigned flags;
+ int error;
+
+ union {
+ /* used during submission and for synchronous completion: */
+ struct {
+ struct iov_iter *iter;
+ struct task_struct *waiter;
+ struct request_queue *last_queue;
+ blk_qc_t cookie;
+ } submit;
+
+ /* used for aio completion: */
+ struct {
+ struct work_struct work;
+ } aio;
+ };
+ };
+
+ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
+ {
+ struct kiocb *iocb = dio->iocb;
+ ssize_t ret;
+
+ if (dio->end_io) {
+ ret = dio->end_io(iocb,
+ dio->error ? dio->error : dio->size,
+ dio->flags);
+ } else {
+ ret = dio->error;
+ }
+
+ if (likely(!ret)) {
+ ret = dio->size;
+ /* check for short read */
+ if (iocb->ki_pos + ret > dio->i_size &&
+ !(dio->flags & IOMAP_DIO_WRITE))
+ ret = dio->i_size - iocb->ki_pos;
+ iocb->ki_pos += ret;
+ }
+
+ inode_dio_end(file_inode(iocb->ki_filp));
+ kfree(dio);
+
+ return ret;
+ }
+
+ static void iomap_dio_complete_work(struct work_struct *work)
+ {
+ struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
+ struct kiocb *iocb = dio->iocb;
+ bool is_write = (dio->flags & IOMAP_DIO_WRITE);
+ ssize_t ret;
+
+ ret = iomap_dio_complete(dio);
+ if (is_write && ret > 0)
+ ret = generic_write_sync(iocb, ret);
+ iocb->ki_complete(iocb, ret, 0);
+ }
+
+ /*
+ * Set an error in the dio if none is set yet. We have to use cmpxchg
+ * as the submission context and the completion context(s) can race to
+ * update the error.
+ */
+ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
+ {
+ cmpxchg(&dio->error, 0, ret);
+ }
+
+ static void iomap_dio_bio_end_io(struct bio *bio)
+ {
+ struct iomap_dio *dio = bio->bi_private;
+ bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+
+ if (bio->bi_error)
+ iomap_dio_set_error(dio, bio->bi_error);
+
+ if (atomic_dec_and_test(&dio->ref)) {
+ if (is_sync_kiocb(dio->iocb)) {
+ struct task_struct *waiter = dio->submit.waiter;
+
+ WRITE_ONCE(dio->submit.waiter, NULL);
+ wake_up_process(waiter);
+ } else if (dio->flags & IOMAP_DIO_WRITE) {
+ struct inode *inode = file_inode(dio->iocb->ki_filp);
+
+ INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
+ queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
+ } else {
+ iomap_dio_complete_work(&dio->aio.work);
+ }
+ }
+
+ if (should_dirty) {
+ bio_check_pages_dirty(bio);
+ } else {
+ struct bio_vec *bvec;
+ int i;
+
+ bio_for_each_segment_all(bvec, bio, i)
+ put_page(bvec->bv_page);
+ bio_put(bio);
+ }
+ }
+
+ static blk_qc_t
+ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
+ unsigned len)
+ {
+ struct page *page = ZERO_PAGE(0);
+ struct bio *bio;
+
+ bio = bio_alloc(GFP_KERNEL, 1);
+ bio->bi_bdev = iomap->bdev;
+ bio->bi_iter.bi_sector =
+ iomap->blkno + ((pos - iomap->offset) >> 9);
+ bio->bi_private = dio;
+ bio->bi_end_io = iomap_dio_bio_end_io;
+
+ get_page(page);
+ if (bio_add_page(bio, page, len, 0) != len)
+ BUG();
- bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_ODIRECT);
++ bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
+
+ atomic_inc(&dio->ref);
+ return submit_bio(bio);
+ }
+
+ static loff_t
+ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
+ void *data, struct iomap *iomap)
+ {
+ struct iomap_dio *dio = data;
+ unsigned blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
+ unsigned fs_block_size = (1 << inode->i_blkbits), pad;
+ unsigned align = iov_iter_alignment(dio->submit.iter);
+ struct iov_iter iter;
+ struct bio *bio;
+ bool need_zeroout = false;
+ int nr_pages, ret;
+
+ if ((pos | length | align) & ((1 << blkbits) - 1))
+ return -EINVAL;
+
+ switch (iomap->type) {
+ case IOMAP_HOLE:
+ if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
+ return -EIO;
+ /*FALLTHRU*/
+ case IOMAP_UNWRITTEN:
+ if (!(dio->flags & IOMAP_DIO_WRITE)) {
+ iov_iter_zero(length, dio->submit.iter);
+ dio->size += length;
+ return length;
+ }
+ dio->flags |= IOMAP_DIO_UNWRITTEN;
+ need_zeroout = true;
+ break;
+ case IOMAP_MAPPED:
+ if (iomap->flags & IOMAP_F_SHARED)
+ dio->flags |= IOMAP_DIO_COW;
+ if (iomap->flags & IOMAP_F_NEW)
+ need_zeroout = true;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return -EIO;
+ }
+
+ /*
+ * Operate on a partial iter trimmed to the extent we were called for.
+ * We'll update the iter in the dio once we're done with this extent.
+ */
+ iter = *dio->submit.iter;
+ iov_iter_truncate(&iter, length);
+
+ nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
+ if (nr_pages <= 0)
+ return nr_pages;
+
+ if (need_zeroout) {
+ /* zero out from the start of the block to the write offset */
+ pad = pos & (fs_block_size - 1);
+ if (pad)
+ iomap_dio_zero(dio, iomap, pos - pad, pad);
+ }
+
+ do {
+ if (dio->error)
+ return 0;
+
+ bio = bio_alloc(GFP_KERNEL, nr_pages);
+ bio->bi_bdev = iomap->bdev;
+ bio->bi_iter.bi_sector =
+ iomap->blkno + ((pos - iomap->offset) >> 9);
+ bio->bi_private = dio;
+ bio->bi_end_io = iomap_dio_bio_end_io;
+
+ ret = bio_iov_iter_get_pages(bio, &iter);
+ if (unlikely(ret)) {
+ bio_put(bio);
+ return ret;
+ }
+
+ if (dio->flags & IOMAP_DIO_WRITE) {
- !blk_poll(dio->submit.last_queue,
- dio->submit.cookie))
++ bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
+ task_io_account_write(bio->bi_iter.bi_size);
+ } else {
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ if (dio->flags & IOMAP_DIO_DIRTY)
+ bio_set_pages_dirty(bio);
+ }
+
+ dio->size += bio->bi_iter.bi_size;
+ pos += bio->bi_iter.bi_size;
+
+ nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
+
+ atomic_inc(&dio->ref);
+
+ dio->submit.last_queue = bdev_get_queue(iomap->bdev);
+ dio->submit.cookie = submit_bio(bio);
+ } while (nr_pages);
+
+ if (need_zeroout) {
+ /* zero out from the end of the write to the end of the block */
+ pad = pos & (fs_block_size - 1);
+ if (pad)
+ iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
+ }
+
+ iov_iter_advance(dio->submit.iter, length);
+ return length;
+ }
+
+ ssize_t
+ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops,
+ iomap_dio_end_io_t end_io)
+ {
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ struct inode *inode = file_inode(iocb->ki_filp);
+ size_t count = iov_iter_count(iter);
+ loff_t pos = iocb->ki_pos, end = iocb->ki_pos + count - 1, ret = 0;
+ unsigned int flags = IOMAP_DIRECT;
+ struct blk_plug plug;
+ struct iomap_dio *dio;
+
+ lockdep_assert_held(&inode->i_rwsem);
+
+ if (!count)
+ return 0;
+
+ dio = kmalloc(sizeof(*dio), GFP_KERNEL);
+ if (!dio)
+ return -ENOMEM;
+
+ dio->iocb = iocb;
+ atomic_set(&dio->ref, 1);
+ dio->size = 0;
+ dio->i_size = i_size_read(inode);
+ dio->end_io = end_io;
+ dio->error = 0;
+ dio->flags = 0;
+
+ dio->submit.iter = iter;
+ if (is_sync_kiocb(iocb)) {
+ dio->submit.waiter = current;
+ dio->submit.cookie = BLK_QC_T_NONE;
+ dio->submit.last_queue = NULL;
+ }
+
+ if (iov_iter_rw(iter) == READ) {
+ if (pos >= dio->i_size)
+ goto out_free_dio;
+
+ if (iter->type == ITER_IOVEC)
+ dio->flags |= IOMAP_DIO_DIRTY;
+ } else {
+ dio->flags |= IOMAP_DIO_WRITE;
+ flags |= IOMAP_WRITE;
+ }
+
+ if (mapping->nrpages) {
+ ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
+ if (ret)
+ goto out_free_dio;
+
+ ret = invalidate_inode_pages2_range(mapping,
+ iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
+ WARN_ON_ONCE(ret);
+ ret = 0;
+ }
+
+ inode_dio_begin(inode);
+
+ blk_start_plug(&plug);
+ do {
+ ret = iomap_apply(inode, pos, count, flags, ops, dio,
+ iomap_dio_actor);
+ if (ret <= 0) {
+ /* magic error code to fall back to buffered I/O */
+ if (ret == -ENOTBLK)
+ ret = 0;
+ break;
+ }
+ pos += ret;
+ } while ((count = iov_iter_count(iter)) > 0);
+ blk_finish_plug(&plug);
+
+ if (ret < 0)
+ iomap_dio_set_error(dio, ret);
+
+ if (ret >= 0 && iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
+ !inode->i_sb->s_dio_done_wq) {
+ ret = sb_init_dio_done_wq(inode->i_sb);
+ if (ret < 0)
+ iomap_dio_set_error(dio, ret);
+ }
+
+ if (!atomic_dec_and_test(&dio->ref)) {
+ if (!is_sync_kiocb(iocb))
+ return -EIOCBQUEUED;
+
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!READ_ONCE(dio->submit.waiter))
+ break;
+
+ if (!(iocb->ki_flags & IOCB_HIPRI) ||
+ !dio->submit.last_queue ||
++ !blk_mq_poll(dio->submit.last_queue,
++ dio->submit.cookie))
+ io_schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+ }
+
+ /*
+ * Try again to invalidate clean pages which might have been cached by
+ * non-direct readahead, or faulted in by get_user_pages() if the source
+ * of the write was an mmap'ed region of the file we're writing. Either
+ * one is a pretty crazy thing to do, so we don't support it 100%. If
+ * this invalidation fails, tough, the write still worked...
+ */
+ if (iov_iter_rw(iter) == WRITE && mapping->nrpages) {
+ ret = invalidate_inode_pages2_range(mapping,
+ iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
+ WARN_ON_ONCE(ret);
+ }
+
+ return iomap_dio_complete(dio);
+
+ out_free_dio:
+ kfree(dio);
+ return ret;
+ }
+ EXPORT_SYMBOL_GPL(iomap_dio_rw);
#include <linux/pagevec.h>
#include <linux/writeback.h>
- /* flags for direct write completions */
- #define XFS_DIO_FLAG_UNWRITTEN (1 << 0)
- #define XFS_DIO_FLAG_APPEND (1 << 1)
- #define XFS_DIO_FLAG_COW (1 << 2)
-
/*
* structure owned by writepages passed to individual writepage calls
*/
ioend->io_bio->bi_private = ioend;
ioend->io_bio->bi_end_io = xfs_end_bio;
- bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
- (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
+ ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
+
/*
* If we are failing the IO now, just mark the ioend with an
* error and finish it. This will run IO completion immediately
bio_chain(ioend->io_bio, new);
bio_get(ioend->io_bio); /* for xfs_destroy_ioend */
- bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
- (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
+ ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
submit_bio(ioend->io_bio);
ioend->io_bio = new;
}
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_bmbt_irec imap;
- bool is_cow = false, need_alloc = false;
+ bool is_cow = false;
int error;
/*
* Else we need to check if there is a COW mapping at this offset.
*/
xfs_ilock(ip, XFS_ILOCK_SHARED);
- is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, &need_alloc);
+ is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (!is_cow)
* And if the COW mapping has a delayed extent here we need to
* allocate real space for it now.
*/
- if (need_alloc) {
+ if (isnullstartblock(imap.br_startblock)) {
error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset,
&imap);
if (error)
return try_to_free_buffers(page);
}
- /*
- * When we map a DIO buffer, we may need to pass flags to
- * xfs_end_io_direct_write to tell it what kind of write IO we are doing.
- *
- * Note that for DIO, an IO to the highest supported file block offset (i.e.
- * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
- * bit variable. Hence if we see this overflow, we have to assume that the IO is
- * extending the file size. We won't know for sure until IO completion is run
- * and the actual max write offset is communicated to the IO completion
- * routine.
- */
- static void
- xfs_map_direct(
- struct inode *inode,
- struct buffer_head *bh_result,
- struct xfs_bmbt_irec *imap,
- xfs_off_t offset,
- bool is_cow)
- {
- uintptr_t *flags = (uintptr_t *)&bh_result->b_private;
- xfs_off_t size = bh_result->b_size;
-
- trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
- ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW :
- XFS_IO_OVERWRITE, imap);
-
- if (ISUNWRITTEN(imap)) {
- *flags |= XFS_DIO_FLAG_UNWRITTEN;
- set_buffer_defer_completion(bh_result);
- } else if (is_cow) {
- *flags |= XFS_DIO_FLAG_COW;
- set_buffer_defer_completion(bh_result);
- }
- if (offset + size > i_size_read(inode) || offset + size < 0) {
- *flags |= XFS_DIO_FLAG_APPEND;
- set_buffer_defer_completion(bh_result);
- }
- }
-
/*
* If this is O_DIRECT or the mpage code calling tell them how large the mapping
* is, so that we can avoid repeated get_blocks calls.
bh_result->b_size = mapping_size;
}
- /* Bounce unaligned directio writes to the page cache. */
static int
- xfs_bounce_unaligned_dio_write(
- struct xfs_inode *ip,
- xfs_fileoff_t offset_fsb,
- struct xfs_bmbt_irec *imap)
- {
- struct xfs_bmbt_irec irec;
- xfs_fileoff_t delta;
- bool shared;
- bool x;
- int error;
-
- irec = *imap;
- if (offset_fsb > irec.br_startoff) {
- delta = offset_fsb - irec.br_startoff;
- irec.br_blockcount -= delta;
- irec.br_startblock += delta;
- irec.br_startoff = offset_fsb;
- }
- error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x);
- if (error)
- return error;
-
- /*
- * We're here because we're trying to do a directio write to a
- * region that isn't aligned to a filesystem block. If any part
- * of the extent is shared, fall back to buffered mode to handle
- * the RMW. This is done by returning -EREMCHG ("remote addr
- * changed"), which is caught further up the call stack.
- */
- if (shared) {
- trace_xfs_reflink_bounce_dio_write(ip, imap);
- return -EREMCHG;
- }
- return 0;
- }
-
- STATIC int
- __xfs_get_blocks(
+ xfs_get_blocks(
struct inode *inode,
sector_t iblock,
struct buffer_head *bh_result,
- int create,
- bool direct)
+ int create)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
int nimaps = 1;
xfs_off_t offset;
ssize_t size;
- int new = 0;
- bool is_cow = false;
- bool need_alloc = false;
- BUG_ON(create && !direct);
+ BUG_ON(create);
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
size = bh_result->b_size;
- if (!create && offset >= i_size_read(inode))
+ if (offset >= i_size_read(inode))
return 0;
/*
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
- if (create && direct && xfs_is_reflink_inode(ip))
- is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap,
- &need_alloc);
- if (!is_cow) {
- error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
- &imap, &nimaps, XFS_BMAPI_ENTIRE);
- /*
- * Truncate an overwrite extent if there's a pending CoW
- * reservation before the end of this extent. This
- * forces us to come back to get_blocks to take care of
- * the CoW.
- */
- if (create && direct && nimaps &&
- imap.br_startblock != HOLESTARTBLOCK &&
- imap.br_startblock != DELAYSTARTBLOCK &&
- !ISUNWRITTEN(&imap))
- xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb,
- &imap);
- }
- ASSERT(!need_alloc);
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+ &imap, &nimaps, XFS_BMAPI_ENTIRE);
if (error)
goto out_unlock;
- /* for DAX, we convert unwritten extents directly */
- if (create &&
- (!nimaps ||
- (imap.br_startblock == HOLESTARTBLOCK ||
- imap.br_startblock == DELAYSTARTBLOCK) ||
- (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
- /*
- * xfs_iomap_write_direct() expects the shared lock. It
- * is unlocked on return.
- */
- if (lockmode == XFS_ILOCK_EXCL)
- xfs_ilock_demote(ip, lockmode);
-
- error = xfs_iomap_write_direct(ip, offset, size,
- &imap, nimaps);
- if (error)
- return error;
- new = 1;
-
- trace_xfs_get_blocks_alloc(ip, offset, size,
- ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
- : XFS_IO_DELALLOC, &imap);
- } else if (nimaps) {
+ if (nimaps) {
trace_xfs_get_blocks_found(ip, offset, size,
ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
: XFS_IO_OVERWRITE, &imap);
goto out_unlock;
}
- if (IS_DAX(inode) && create) {
- ASSERT(!ISUNWRITTEN(&imap));
- /* zeroing is not needed at a higher layer */
- new = 0;
- }
-
/* trim mapping down to size requested */
xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
*/
if (imap.br_startblock != HOLESTARTBLOCK &&
imap.br_startblock != DELAYSTARTBLOCK &&
- (create || !ISUNWRITTEN(&imap))) {
- if (create && direct && !is_cow) {
- error = xfs_bounce_unaligned_dio_write(ip, offset_fsb,
- &imap);
- if (error)
- return error;
- }
-
+ !ISUNWRITTEN(&imap))
xfs_map_buffer(inode, bh_result, &imap, offset);
- if (ISUNWRITTEN(&imap))
- set_buffer_unwritten(bh_result);
- /* direct IO needs special help */
- if (create)
- xfs_map_direct(inode, bh_result, &imap, offset, is_cow);
- }
/*
* If this is a realtime file, data may be on a different device.
* to that pointed to from the buffer_head b_bdev currently.
*/
bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
-
- /*
- * If we previously allocated a block out beyond eof and we are now
- * coming back to use it then we will need to flag it as new even if it
- * has a disk address.
- *
- * With sub-block writes into unwritten extents we also need to mark
- * the buffer as new so that the unwritten parts of the buffer gets
- * correctly zeroed.
- */
- if (create &&
- ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
- (offset >= i_size_read(inode)) ||
- (new || ISUNWRITTEN(&imap))))
- set_buffer_new(bh_result);
-
- BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK);
-
return 0;
out_unlock:
return error;
}
- int
- xfs_get_blocks(
- struct inode *inode,
- sector_t iblock,
- struct buffer_head *bh_result,
- int create)
- {
- return __xfs_get_blocks(inode, iblock, bh_result, create, false);
- }
-
- int
- xfs_get_blocks_direct(
- struct inode *inode,
- sector_t iblock,
- struct buffer_head *bh_result,
- int create)
- {
- return __xfs_get_blocks(inode, iblock, bh_result, create, true);
- }
-
- /*
- * Complete a direct I/O write request.
- *
- * xfs_map_direct passes us some flags in the private data to tell us what to
- * do. If no flags are set, then the write IO is an overwrite wholly within
- * the existing allocated file size and so there is nothing for us to do.
- *
- * Note that in this case the completion can be called in interrupt context,
- * whereas if we have flags set we will always be called in task context
- * (i.e. from a workqueue).
- */
- int
- xfs_end_io_direct_write(
- struct kiocb *iocb,
- loff_t offset,
- ssize_t size,
- void *private)
- {
- struct inode *inode = file_inode(iocb->ki_filp);
- struct xfs_inode *ip = XFS_I(inode);
- uintptr_t flags = (uintptr_t)private;
- int error = 0;
-
- trace_xfs_end_io_direct_write(ip, offset, size);
-
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return -EIO;
-
- if (size <= 0)
- return size;
-
- /*
- * The flags tell us whether we are doing unwritten extent conversions
- * or an append transaction that updates the on-disk file size. These
- * cases are the only cases where we should *potentially* be needing
- * to update the VFS inode size.
- */
- if (flags == 0) {
- ASSERT(offset + size <= i_size_read(inode));
- return 0;
- }
-
- /*
- * We need to update the in-core inode size here so that we don't end up
- * with the on-disk inode size being outside the in-core inode size. We
- * have no other method of updating EOF for AIO, so always do it here
- * if necessary.
- *
- * We need to lock the test/set EOF update as we can be racing with
- * other IO completions here to update the EOF. Failing to serialise
- * here can result in EOF moving backwards and Bad Things Happen when
- * that occurs.
- */
- spin_lock(&ip->i_flags_lock);
- if (offset + size > i_size_read(inode))
- i_size_write(inode, offset + size);
- spin_unlock(&ip->i_flags_lock);
-
- if (flags & XFS_DIO_FLAG_COW)
- error = xfs_reflink_end_cow(ip, offset, size);
- if (flags & XFS_DIO_FLAG_UNWRITTEN) {
- trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
-
- error = xfs_iomap_write_unwritten(ip, offset, size);
- }
- if (flags & XFS_DIO_FLAG_APPEND) {
- trace_xfs_end_io_direct_write_append(ip, offset, size);
-
- error = xfs_setfilesize(ip, offset, size);
- }
-
- return error;
- }
-
STATIC ssize_t
xfs_vm_direct_IO(
struct kiocb *iocb,
struct xfs_inode *ip = XFS_I(inode);
trace_xfs_vm_bmap(XFS_I(inode));
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
/*
* The swap code (ab-)uses ->bmap to get a block mapping and then
* that on reflinks inodes, so we have to skip out here. And yes,
* 0 is the magic code for a bmap error..
*/
- if (xfs_is_reflink_inode(ip)) {
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ if (xfs_is_reflink_inode(ip))
return 0;
- }
+
filemap_write_and_wait(mapping);
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return generic_block_bmap(mapping, block, xfs_get_blocks);
}
init_completion(&bp->b_iowait);
INIT_LIST_HEAD(&bp->b_lru);
INIT_LIST_HEAD(&bp->b_list);
- RB_CLEAR_NODE(&bp->b_rbnode);
sema_init(&bp->b_sema, 0); /* held, no waiters */
spin_lock_init(&bp->b_lock);
XB_SET_OWNER(bp);
/*
* Finding and Reading Buffers
*/
+ static int
+ _xfs_buf_obj_cmp(
+ struct rhashtable_compare_arg *arg,
+ const void *obj)
+ {
+ const struct xfs_buf_map *map = arg->key;
+ const struct xfs_buf *bp = obj;
+
+ /*
+ * The key hashing in the lookup path depends on the key being the
+ * first element of the compare_arg, make sure to assert this.
+ */
+ BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);
+
+ if (bp->b_bn != map->bm_bn)
+ return 1;
+
+ if (unlikely(bp->b_length != map->bm_len)) {
+ /*
+ * found a block number match. If the range doesn't
+ * match, the only way this is allowed is if the buffer
+ * in the cache is stale and the transaction that made
+ * it stale has not yet committed. i.e. we are
+ * reallocating a busy extent. Skip this buffer and
+ * continue searching for an exact match.
+ */
+ ASSERT(bp->b_flags & XBF_STALE);
+ return 1;
+ }
+ return 0;
+ }
+
+ static const struct rhashtable_params xfs_buf_hash_params = {
+ .min_size = 32, /* empty AGs have minimal footprint */
+ .nelem_hint = 16,
+ .key_len = sizeof(xfs_daddr_t),
+ .key_offset = offsetof(struct xfs_buf, b_bn),
+ .head_offset = offsetof(struct xfs_buf, b_rhash_head),
+ .automatic_shrinking = true,
+ .obj_cmpfn = _xfs_buf_obj_cmp,
+ };
+
+ int
+ xfs_buf_hash_init(
+ struct xfs_perag *pag)
+ {
+ spin_lock_init(&pag->pag_buf_lock);
+ return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
+ }
+
+ void
+ xfs_buf_hash_destroy(
+ struct xfs_perag *pag)
+ {
+ rhashtable_destroy(&pag->pag_buf_hash);
+ }
/*
* Look up, and creates if absent, a lockable buffer for
xfs_buf_t *new_bp)
{
struct xfs_perag *pag;
- struct rb_node **rbp;
- struct rb_node *parent;
xfs_buf_t *bp;
- xfs_daddr_t blkno = map[0].bm_bn;
+ struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
xfs_daddr_t eofs;
- int numblks = 0;
int i;
for (i = 0; i < nmaps; i++)
- numblks += map[i].bm_len;
+ cmap.bm_len += map[i].bm_len;
/* Check for IOs smaller than the sector size / not sector aligned */
- ASSERT(!(BBTOB(numblks) < btp->bt_meta_sectorsize));
- ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask));
+ ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize));
+ ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
/*
* Corrupted block numbers can get through to here, unfortunately, so we
* have to check that the buffer falls within the filesystem bounds.
*/
eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
- if (blkno < 0 || blkno >= eofs) {
+ if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) {
/*
* XXX (dgc): we should really be returning -EFSCORRUPTED here,
* but none of the higher level infrastructure supports
*/
xfs_alert(btp->bt_mount,
"%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
- __func__, blkno, eofs);
+ __func__, cmap.bm_bn, eofs);
WARN_ON(1);
return NULL;
}
- /* get tree root */
pag = xfs_perag_get(btp->bt_mount,
- xfs_daddr_to_agno(btp->bt_mount, blkno));
+ xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
- /* walk tree */
spin_lock(&pag->pag_buf_lock);
- rbp = &pag->pag_buf_tree.rb_node;
- parent = NULL;
- bp = NULL;
- while (*rbp) {
- parent = *rbp;
- bp = rb_entry(parent, struct xfs_buf, b_rbnode);
-
- if (blkno < bp->b_bn)
- rbp = &(*rbp)->rb_left;
- else if (blkno > bp->b_bn)
- rbp = &(*rbp)->rb_right;
- else {
- /*
- * found a block number match. If the range doesn't
- * match, the only way this is allowed is if the buffer
- * in the cache is stale and the transaction that made
- * it stale has not yet committed. i.e. we are
- * reallocating a busy extent. Skip this buffer and
- * continue searching to the right for an exact match.
- */
- if (bp->b_length != numblks) {
- ASSERT(bp->b_flags & XBF_STALE);
- rbp = &(*rbp)->rb_right;
- continue;
- }
- atomic_inc(&bp->b_hold);
- goto found;
- }
+ bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
+ xfs_buf_hash_params);
+ if (bp) {
+ atomic_inc(&bp->b_hold);
+ goto found;
}
/* No match found */
if (new_bp) {
- rb_link_node(&new_bp->b_rbnode, parent, rbp);
- rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
/* the buffer keeps the perag reference until it is freed */
new_bp->b_pag = pag;
+ rhashtable_insert_fast(&pag->pag_buf_hash,
+ &new_bp->b_rhash_head,
+ xfs_buf_hash_params);
spin_unlock(&pag->pag_buf_lock);
} else {
XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
if (!pag) {
ASSERT(list_empty(&bp->b_lru));
- ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
if (atomic_dec_and_test(&bp->b_hold)) {
xfs_buf_ioacct_dec(bp);
xfs_buf_free(bp);
return;
}
- ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
-
ASSERT(atomic_read(&bp->b_hold) > 0);
release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
}
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
- rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+ rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
+ xfs_buf_hash_params);
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
freebuf = true;
if (bp->b_flags & XBF_WRITE) {
op = REQ_OP_WRITE;
if (bp->b_flags & XBF_SYNCIO)
- op_flags = WRITE_SYNC;
+ op_flags = REQ_SYNC;
if (bp->b_flags & XBF_FUA)
op_flags |= REQ_FUA;
if (bp->b_flags & XBF_FLUSH)
percpu_counter_destroy(&btp->bt_io_count);
list_lru_destroy(&btp->bt_lru);
- if (mp->m_flags & XFS_MOUNT_BARRIER)
- xfs_blkdev_issue_flush(btp);
+ xfs_blkdev_issue_flush(btp);
kmem_free(btp);
}
name = class->name;
if (!name) {
name = __get_key_name(class->key, str);
- printk("%s", name);
+ printk(KERN_CONT "%s", name);
} else {
- printk("%s", name);
+ printk(KERN_CONT "%s", name);
if (class->name_version > 1)
- printk("#%d", class->name_version);
+ printk(KERN_CONT "#%d", class->name_version);
if (class->subclass)
- printk("/%d", class->subclass);
+ printk(KERN_CONT "/%d", class->subclass);
}
}
get_usage_chars(class, usage);
- printk(" (");
+ printk(KERN_CONT " (");
__print_lock_name(class);
- printk("){%s}", usage);
+ printk(KERN_CONT "){%s}", usage);
}
static void print_lockdep_cache(struct lockdep_map *lock)
if (!name)
name = __get_key_name(lock->key->subkeys, str);
- printk("%s", name);
+ printk(KERN_CONT "%s", name);
}
static void print_lock(struct held_lock *hlock)
barrier();
if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) {
- printk("<RELEASED>\n");
+ printk(KERN_CONT "<RELEASED>\n");
return;
}
print_lock_name(lock_classes + class_idx - 1);
- printk(", at: ");
- print_ip_sym(hlock->acquire_ip);
+ printk(KERN_CONT ", at: [<%p>] %pS\n",
+ (void *)hlock->acquire_ip, (void *)hlock->acquire_ip);
}
static void lockdep_print_held_locks(struct task_struct *curr)
printk("\nnew class %p: %s", class->key, class->name);
if (class->name_version > 1)
- printk("#%d", class->name_version);
- printk("\n");
+ printk(KERN_CONT "#%d", class->name_version);
+ printk(KERN_CONT "\n");
dump_stack();
if (!graph_lock()) {
/*
* Add a new dependency to the head of the list:
*/
-static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
- struct list_head *head, unsigned long ip,
- int distance, struct stack_trace *trace)
+static int add_lock_to_list(struct lock_class *this, struct list_head *head,
+ unsigned long ip, int distance,
+ struct stack_trace *trace)
{
struct lock_list *entry;
/*
return 0;
printk("\n-> #%u", depth);
print_lock_name(target->class);
- printk(":\n");
+ printk(KERN_CONT ":\n");
print_stack_trace(&target->trace, 6);
return 0;
if (parent != source) {
printk("Chain exists of:\n ");
__print_lock_name(source);
- printk(" --> ");
+ printk(KERN_CONT " --> ");
__print_lock_name(parent);
- printk(" --> ");
+ printk(KERN_CONT " --> ");
__print_lock_name(target);
- printk("\n\n");
+ printk(KERN_CONT "\n\n");
}
printk(" Possible unsafe locking scenario:\n\n");
printk(" ---- ----\n");
printk(" lock(");
__print_lock_name(target);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk(" lock(");
__print_lock_name(parent);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk(" lock(");
__print_lock_name(target);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk(" lock(");
__print_lock_name(source);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
printk("%*s->", depth, "");
print_lock_name(class);
- printk(" ops: %lu", class->ops);
- printk(" {\n");
+ printk(KERN_CONT " ops: %lu", class->ops);
+ printk(KERN_CONT " {\n");
for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
if (class->usage_mask & (1 << bit)) {
int len = depth;
len += printk("%*s %s", depth, "", usage_str[bit]);
- len += printk(" at:\n");
+ len += printk(KERN_CONT " at:\n");
print_stack_trace(class->usage_traces + bit, len);
}
}
printk("%*s }\n", depth, "");
- printk("%*s ... key at: ",depth,"");
- print_ip_sym((unsigned long)class->key);
+ printk("%*s ... key at: [<%p>] %pS\n",
+ depth, "", class->key, class->key);
}
/*
if (middle_class != unsafe_class) {
printk("Chain exists of:\n ");
__print_lock_name(safe_class);
- printk(" --> ");
+ printk(KERN_CONT " --> ");
__print_lock_name(middle_class);
- printk(" --> ");
+ printk(KERN_CONT " --> ");
__print_lock_name(unsafe_class);
- printk("\n\n");
+ printk(KERN_CONT "\n\n");
}
printk(" Possible interrupt unsafe locking scenario:\n\n");
printk(" ---- ----\n");
printk(" lock(");
__print_lock_name(unsafe_class);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk(" local_irq_disable();\n");
printk(" lock(");
__print_lock_name(safe_class);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk(" lock(");
__print_lock_name(middle_class);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk(" <Interrupt>\n");
printk(" lock(");
__print_lock_name(safe_class);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
print_lock(prev);
printk("which would create a new lock dependency:\n");
print_lock_name(hlock_class(prev));
- printk(" ->");
+ printk(KERN_CONT " ->");
print_lock_name(hlock_class(next));
- printk("\n");
+ printk(KERN_CONT "\n");
printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
irqclass);
lockdep_print_held_locks(curr);
- printk("\nthe dependencies between %s-irq-safe lock", irqclass);
- printk(" and the holding lock:\n");
+ printk("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
if (!save_trace(&prev_root->trace))
return 0;
print_shortest_lock_dependencies(backwards_entry, prev_root);
printk(" ----\n");
printk(" lock(");
__print_lock_name(prev);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk(" lock(");
__print_lock_name(next);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
printk(" May be due to missing lock nesting notation\n\n");
}
* Ok, all validations passed, add the new lock
* to the previous lock's dependency list:
*/
- ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
+ ret = add_lock_to_list(hlock_class(next),
&hlock_class(prev)->locks_after,
next->acquire_ip, distance, &trace);
if (!ret)
return 0;
- ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
+ ret = add_lock_to_list(hlock_class(prev),
&hlock_class(next)->locks_before,
next->acquire_ip, distance, &trace);
if (!ret)
graph_unlock();
printk("\n new dependency: ");
print_lock_name(hlock_class(prev));
- printk(" => ");
+ printk(KERN_CONT " => ");
print_lock_name(hlock_class(next));
- printk("\n");
+ printk(KERN_CONT "\n");
dump_stack();
return graph_lock();
}
printk(" ----\n");
printk(" lock(");
__print_lock_name(class);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk(" <Interrupt>\n");
printk(" lock(");
__print_lock_name(class);
- printk(");\n");
+ printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
void print_irqtrace_events(struct task_struct *curr)
{
printk("irq event stamp: %u\n", curr->irq_events);
- printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event);
- print_ip_sym(curr->hardirq_enable_ip);
- printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event);
- print_ip_sym(curr->hardirq_disable_ip);
- printk("softirqs last enabled at (%u): ", curr->softirq_enable_event);
- print_ip_sym(curr->softirq_enable_ip);
- printk("softirqs last disabled at (%u): ", curr->softirq_disable_event);
- print_ip_sym(curr->softirq_disable_ip);
+ printk("hardirqs last enabled at (%u): [<%p>] %pS\n",
+ curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip,
+ (void *)curr->hardirq_enable_ip);
+ printk("hardirqs last disabled at (%u): [<%p>] %pS\n",
+ curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip,
+ (void *)curr->hardirq_disable_ip);
+ printk("softirqs last enabled at (%u): [<%p>] %pS\n",
+ curr->softirq_enable_event, (void *)curr->softirq_enable_ip,
+ (void *)curr->softirq_enable_ip);
+ printk("softirqs last disabled at (%u): [<%p>] %pS\n",
+ curr->softirq_disable_event, (void *)curr->softirq_disable_ip,
+ (void *)curr->softirq_disable_ip);
}
static int HARDIRQ_verbose(struct lock_class *class)
return 0;
}
- static int __lock_is_held(struct lockdep_map *lock);
+ static int __lock_is_held(struct lockdep_map *lock, int read);
/*
* This gets called for every mutex_lock*()/spin_lock*() operation.
if (very_verbose(class)) {
printk("\nacquire class [%p] %s", class->key, class->name);
if (class->name_version > 1)
- printk("#%d", class->name_version);
- printk("\n");
+ printk(KERN_CONT "#%d", class->name_version);
+ printk(KERN_CONT "\n");
dump_stack();
}
}
chain_key = iterate_chain_key(chain_key, class_idx);
- if (nest_lock && !__lock_is_held(nest_lock))
+ if (nest_lock && !__lock_is_held(nest_lock, -1))
return print_lock_nested_lock_not_held(curr, hlock, ip);
if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
printk("%s/%d is trying to release lock (",
curr->comm, task_pid_nr(curr));
print_lockdep_cache(lock);
- printk(") at:\n");
+ printk(KERN_CONT ") at:\n");
print_ip_sym(ip);
printk("but there are no more locks to release!\n");
printk("\nother info that might help us debug this:\n");
return 1;
}
- static int __lock_is_held(struct lockdep_map *lock)
+ static int __lock_is_held(struct lockdep_map *lock, int read)
{
struct task_struct *curr = current;
int i;
for (i = 0; i < curr->lockdep_depth; i++) {
struct held_lock *hlock = curr->held_locks + i;
- if (match_held_lock(hlock, lock))
- return 1;
+ if (match_held_lock(hlock, lock)) {
+ if (read == -1 || hlock->read == read)
+ return 1;
+
+ return 0;
+ }
}
return 0;
}
EXPORT_SYMBOL_GPL(lock_release);
- int lock_is_held(struct lockdep_map *lock)
+ int lock_is_held_type(struct lockdep_map *lock, int read)
{
unsigned long flags;
int ret = 0;
check_flags(flags);
current->lockdep_recursion = 1;
- ret = __lock_is_held(lock);
+ ret = __lock_is_held(lock, read);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
return ret;
}
- EXPORT_SYMBOL_GPL(lock_is_held);
+ EXPORT_SYMBOL_GPL(lock_is_held_type);
struct pin_cookie lock_pin_lock(struct lockdep_map *lock)
{
printk("%s/%d is trying to contend lock (",
curr->comm, task_pid_nr(curr));
print_lockdep_cache(lock);
- printk(") at:\n");
+ printk(KERN_CONT ") at:\n");
print_ip_sym(ip);
printk("but there are no locks held!\n");
printk("\nother info that might help us debug this:\n");