}
if (likely(!data->hctx))
data->hctx = blk_mq_map_queue(q, data->cmd_flags,
- data->ctx->cpu);
+ data->ctx);
if (data->cmd_flags & REQ_NOWAIT)
data->flags |= BLK_MQ_REQ_NOWAIT;
spin_unlock_irq(&q->requeue_lock);
list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
- if (!(rq->rq_flags & RQF_SOFTBARRIER))
+ if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP)))
continue;
rq->rq_flags &= ~RQF_SOFTBARRIER;
list_del_init(&rq->queuelist);
- blk_mq_sched_insert_request(rq, true, false, false);
+ /*
+ * If RQF_DONTPREP, rq has contained some driver specific
+ * data, so insert it to hctx dispatch list to avoid any
+ * merge.
+ */
+ if (rq->rq_flags & RQF_DONTPREP)
+ blk_mq_request_bypass_insert(rq, false);
+ else
+ blk_mq_sched_insert_request(rq, true, false, false);
}
while (!list_empty(&rq_list)) {
struct blk_mq_tags *tags;
int node;
- node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;
size_t rq_size, left;
int node;
- node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;
* If the cpu isn't present, the cpu is mapped to first hctx.
*/
for_each_possible_cpu(i) {
- hctx_idx = set->map[0].mq_map[i];
+ hctx_idx = set->map[HCTX_TYPE_DEFAULT].mq_map[i];
/* unmapped hw queue can be remapped after CPU topo changed */
if (!set->tags[hctx_idx] &&
!__blk_mq_alloc_rq_map(set, hctx_idx)) {
* case, remap the current ctx to hctx[0] which
* is guaranteed to always have tags allocated
*/
- set->map[0].mq_map[i] = 0;
+ set->map[HCTX_TYPE_DEFAULT].mq_map[i] = 0;
}
ctx = per_cpu_ptr(q->queue_ctx, i);
for (j = 0; j < set->nr_maps; j++) {
- if (!set->map[j].nr_queues)
+ if (!set->map[j].nr_queues) {
+ ctx->hctxs[j] = blk_mq_map_queue_type(q,
+ HCTX_TYPE_DEFAULT, i);
continue;
+ }
hctx = blk_mq_map_queue_type(q, j, i);
-
+ ctx->hctxs[j] = hctx;
/*
* If the CPU is already set in the mask, then we've
* mapped this one already. This can happen if
*/
BUG_ON(!hctx->nr_ctx);
}
+
+ for (; j < HCTX_MAX_TYPES; j++)
+ ctx->hctxs[j] = blk_mq_map_queue_type(q,
+ HCTX_TYPE_DEFAULT, i);
}
mutex_unlock(&q->sysfs_lock);
int node;
struct blk_mq_hw_ctx *hctx;
- node = blk_mq_hw_queue_to_node(&set->map[0], i);
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
/*
* If the hw queue has been mapped to another numa node,
* we need to realloc the hctx. If allocation fails, fallback
set->map[HCTX_TYPE_POLL].nr_queues)
blk_queue_flag_set(QUEUE_FLAG_POLL, q);
- if (!(set->flags & BLK_MQ_F_SG_MERGE))
- blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
-
q->sg_reserved_size = INT_MAX;
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
return set->ops->map_queues(set);
} else {
BUG_ON(set->nr_maps > 1);
- return blk_mq_map_queues(&set->map[0]);
+ return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
}
}
if (!set)
return -EINVAL;
+ if (q->nr_requests == nr)
+ return 0;
+
blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
nr_hw_queues, prev_nr_hw_queues);
set->nr_hw_queues = prev_nr_hw_queues;
- blk_mq_map_queues(&set->map[0]);
+ blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
goto fallback;
}
blk_mq_map_swqueue(q);
scsi_change_queue_depth(sdev, depth);
}
- blk_queue_flush_queueable(q, false);
-
if (dev->flags & ATA_DFLAG_TRUSTED)
sdev->security_supported = 1;
* This inconsistency confuses several controllers which
* perform PIO using DMA such as Intel AHCIs and sil3124/32.
* These controllers use actual number of transferred bytes to
- * update DMA poitner and transfer of 4n+2 bytes make those
+ * update DMA pointer and transfer of 4n+2 bytes make those
* controller push DMA pointer by 4n+4 bytes because SATA data
* FISes are aligned to 4 bytes. This causes data corruption
* and buffer overrun.
static void request_done(int uptodate)
{
struct request *req = current_req;
- struct request_queue *q;
int block;
char msg[sizeof("request done ") + sizeof(int) * 3];
return;
}
- q = req->q;
-
if (uptodate) {
/* maintain values for invalidation on geometry
* change */
if (time_after(jiffies, UDRS->last_checked + UDP->checkfreq)) {
if (lock_fdc(drive))
- return -EINTR;
+ return 0;
poll_drive(false, 0);
process_fd_request();
}
#include <linux/export.h>
#include <linux/debugfs.h>
#include <linux/prefetch.h>
+#include <linux/numa.h>
#include "mtip32xx.h"
#define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32)
WARN_ON(sizeof(struct mtip_trim) > ATA_SECT_SIZE);
/* Allocate a DMA buffer for the trim structure */
- buf = dmam_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr,
+ buf = dma_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr,
GFP_KERNEL);
if (!buf)
return BLK_STS_RESOURCE;
MTIP_TRIM_TIMEOUT_MS) < 0)
ret = BLK_STS_IOERR;
- dmam_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr);
+ dma_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr);
return ret;
}
if (!user_buffer)
return -EFAULT;
- buf = dmam_alloc_coherent(&port->dd->pdev->dev,
+ buf = dma_alloc_coherent(&port->dd->pdev->dev,
ATA_SECT_SIZE * xfer_sz,
&dma_addr,
GFP_KERNEL);
}
exit_drive_command:
if (buf)
- dmam_free_coherent(&port->dd->pdev->dev,
+ dma_free_coherent(&port->dd->pdev->dev,
ATA_SECT_SIZE * xfer_sz, buf, dma_addr);
return rv;
}
struct mtip_port *port = dd->port;
if (port->block1)
- dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
+ dma_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
port->block1, port->block1_dma);
if (port->command_list) {
- dmam_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
+ dma_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
port->command_list, port->command_list_dma);
}
}
/* Allocate dma memory for RX Fis, Identify, and Sector Bufffer */
port->block1 =
- dmam_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
+ dma_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
&port->block1_dma, GFP_KERNEL);
if (!port->block1)
return -ENOMEM;
/* Allocate dma memory for command list */
port->command_list =
- dmam_alloc_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
+ dma_alloc_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
&port->command_list_dma, GFP_KERNEL);
if (!port->command_list) {
- dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
+ dma_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
port->block1, port->block1_dma);
port->block1 = NULL;
port->block1_dma = 0;
mtip_start_port(dd->port);
/* Setup the ISR and enable interrupts. */
- rv = devm_request_irq(&dd->pdev->dev,
- dd->pdev->irq,
- mtip_irq_handler,
- IRQF_SHARED,
- dev_driver_string(&dd->pdev->dev),
- dd);
-
+ rv = request_irq(dd->pdev->irq, mtip_irq_handler, IRQF_SHARED,
+ dev_driver_string(&dd->pdev->dev), dd);
if (rv) {
dev_err(&dd->pdev->dev,
"Unable to allocate IRQ %d\n", dd->pdev->irq);
/* Release the IRQ. */
irq_set_affinity_hint(dd->pdev->irq, NULL);
- devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
+ free_irq(dd->pdev->irq, dd);
out2:
mtip_deinit_port(dd->port);
/* Release the IRQ. */
irq_set_affinity_hint(dd->pdev->irq, NULL);
- devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
+ free_irq(dd->pdev->irq, dd);
msleep(1000);
/* Free dma regions */
if (!cmd->command)
return;
- dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
- cmd->command, cmd->command_dma);
+ dma_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, cmd->command,
+ cmd->command_dma);
}
static int mtip_init_cmd(struct blk_mq_tag_set *set, struct request *rq,
struct driver_data *dd = set->driver_data;
struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
- cmd->command = dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
+ cmd->command = dma_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
&cmd->command_dma, GFP_KERNEL);
if (!cmd->command)
return -ENOMEM;
/* Helper for selecting a node in round robin mode */
static inline int mtip_get_next_rr_node(void)
{
- static int next_node = -1;
+ static int next_node = NUMA_NO_NODE;
- if (next_node == -1) {
+ if (next_node == NUMA_NO_NODE) {
next_node = first_online_node;
return next_node;
}
module_param(single_major, bool, 0444);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
-static ssize_t rbd_add(struct bus_type *bus, const char *buf,
- size_t count);
-static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
- size_t count);
-static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
- size_t count);
-static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
- size_t count);
+static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
+static ssize_t remove_store(struct bus_type *bus, const char *buf,
+ size_t count);
+static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
+ size_t count);
+static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
+ size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
static int rbd_dev_id_to_minor(int dev_id)
return is_lock_owner;
}
-static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
+static ssize_t supported_features_show(struct bus_type *bus, char *buf)
{
return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}
-static BUS_ATTR(add, 0200, NULL, rbd_add);
-static BUS_ATTR(remove, 0200, NULL, rbd_remove);
-static BUS_ATTR(add_single_major, 0200, NULL, rbd_add_single_major);
-static BUS_ATTR(remove_single_major, 0200, NULL, rbd_remove_single_major);
-static BUS_ATTR(supported_features, 0444, rbd_supported_features_show, NULL);
+static BUS_ATTR_WO(add);
+static BUS_ATTR_WO(remove);
+static BUS_ATTR_WO(add_single_major);
+static BUS_ATTR_WO(remove_single_major);
+static BUS_ATTR_RO(supported_features);
static struct attribute *rbd_bus_attrs[] = {
&bus_attr_add.attr,
rbd_dev->tag_set.ops = &rbd_mq_ops;
rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
- rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+ rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
rbd_dev->tag_set.nr_hw_queues = 1;
rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
goto out;
}
-static ssize_t rbd_add(struct bus_type *bus,
- const char *buf,
- size_t count)
+static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
{
if (single_major)
return -EINVAL;
return do_rbd_add(bus, buf, count);
}
-static ssize_t rbd_add_single_major(struct bus_type *bus,
- const char *buf,
- size_t count)
+static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
+ size_t count)
{
return do_rbd_add(bus, buf, count);
}
return count;
}
-static ssize_t rbd_remove(struct bus_type *bus,
- const char *buf,
- size_t count)
+static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
{
if (single_major)
return -EINVAL;
return do_rbd_remove(bus, buf, count);
}
-static ssize_t rbd_remove_single_major(struct bus_type *bus,
- const char *buf,
- size_t count)
+static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
+ size_t count)
{
return do_rbd_remove(bus, buf, count);
}
if (IS_ERR(bip))
return PTR_ERR(bip);
- tag_len = io->cc->on_disk_tag_size * bio_sectors(bio);
+ tag_len = io->cc->on_disk_tag_size * (bio_sectors(bio) >> io->cc->sector_shift);
bip->bip_iter.bi_size = tag_len;
bip->bip_iter.bi_sector = io->cc->start + io->sector;
{
unsigned int i;
struct bio_vec *bv;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bv, clone, i) {
+ bio_for_each_segment_all(bv, clone, i, iter_all) {
BUG_ON(!bv->bv_page);
mempool_free(bv->bv_page, &cc->page_pool);
}
return;
}
set_bit(Blocked, &rdev->flags);
- if (test_and_clear_bit(In_sync, &rdev->flags)) {
+ if (test_and_clear_bit(In_sync, &rdev->flags))
mddev->degraded++;
- set_bit(Faulty, &rdev->flags);
- } else
- set_bit(Faulty, &rdev->flags);
+ set_bit(Faulty, &rdev->flags);
spin_unlock_irqrestore(&conf->device_lock, flags);
/*
* if recovery is running, make sure it aborts.
reschedule_retry(r1_bio);
}
+static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
+{
+ sector_t sync_blocks = 0;
+ sector_t s = r1_bio->sector;
+ long sectors_to_go = r1_bio->sectors;
+
+ /* make sure these bits don't get cleared. */
+ do {
+ md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1);
+ s += sync_blocks;
+ sectors_to_go -= sync_blocks;
+ } while (sectors_to_go > 0);
+}
+
static void end_sync_write(struct bio *bio)
{
int uptodate = !bio->bi_status;
struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
if (!uptodate) {
- sector_t sync_blocks = 0;
- sector_t s = r1_bio->sector;
- long sectors_to_go = r1_bio->sectors;
- /* make sure these bits doesn't get cleared. */
- do {
- md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1);
- s += sync_blocks;
- sectors_to_go -= sync_blocks;
- } while (sectors_to_go > 0);
+ abort_sync_write(mddev, r1_bio);
set_bit(WriteErrorSeen, &rdev->flags);
if (!test_and_set_bit(WantReplacement, &rdev->flags))
set_bit(MD_RECOVERY_NEEDED, &
struct page **spages = get_resync_pages(sbio)->pages;
struct bio_vec *bi;
int page_len[RESYNC_PAGES] = { 0 };
+ struct bvec_iter_all iter_all;
if (sbio->bi_end_io != end_sync_read)
continue;
/* Now we can 'fixup' the error value */
sbio->bi_status = 0;
- bio_for_each_segment_all(bi, sbio, j)
+ bio_for_each_segment_all(bi, sbio, j, iter_all)
page_len[j] = bi->bv_len;
if (!status) {
(i == r1_bio->read_disk ||
!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
continue;
- if (test_bit(Faulty, &conf->mirrors[i].rdev->flags))
+ if (test_bit(Faulty, &conf->mirrors[i].rdev->flags)) {
+ abort_sync_write(mddev, r1_bio);
continue;
+ }
bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
{
struct mmc_host *host = card->host;
u64 limit = BLK_BOUNCE_HIGH;
+ unsigned block_size = 512;
if (mmc_dev(host)->dma_mask && *mmc_dev(host)->dma_mask)
limit = (u64)dma_max_pfn(mmc_dev(host)) << PAGE_SHIFT;
blk_queue_max_hw_sectors(mq->queue,
min(host->max_blk_count, host->max_req_size / 512));
blk_queue_max_segments(mq->queue, host->max_segs);
- blk_queue_max_segment_size(mq->queue, host->max_seg_size);
+
+ if (mmc_card_mmc(card))
+ block_size = card->ext_csd.data_sector_size;
+
+ blk_queue_logical_block_size(mq->queue, block_size);
+ blk_queue_max_segment_size(mq->queue,
+ round_down(host->max_seg_size, block_size));
INIT_WORK(&mq->recovery_work, mmc_mq_recovery_handler);
INIT_WORK(&mq->complete_work, mmc_blk_mq_complete_work);
else
mq->tag_set.queue_depth = MMC_QUEUE_DEPTH;
mq->tag_set.numa_node = NUMA_NO_NODE;
- mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE |
- BLK_MQ_F_BLOCKING;
+ mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
mq->tag_set.nr_hw_queues = 1;
mq->tag_set.cmd_size = sizeof(struct mmc_queue_req);
mq->tag_set.driver_data = mq;
+ // SPDX-License-Identifier: GPL-2.0
/*
* NVM Express device driver
* Copyright (c) 2011-2014, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
*/
#include <linux/aer.h>
int n = 0, ret;
ret = kstrtoint(val, 10, &n);
+ if (ret)
+ return ret;
if (n > num_possible_cpus())
n = num_possible_cpus();
return ret;
}
-/* irq_queues covers admin queue */
-static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int irq_queues)
+/*
+ * nirqs is the number of interrupts available for write and read
+ * queues. The core already reserved an interrupt for the admin queue.
+ */
+static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
{
- unsigned int this_w_queues = write_queues;
-
- WARN_ON(!irq_queues);
-
- /*
- * Setup read/write queue split, assign admin queue one independent
- * irq vector if irq_queues is > 1.
- */
- if (irq_queues <= 2) {
- dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
- dev->io_queues[HCTX_TYPE_READ] = 0;
- return;
- }
+ struct nvme_dev *dev = affd->priv;
+ unsigned int nr_read_queues;
/*
- * If 'write_queues' is set, ensure it leaves room for at least
- * one read queue and one admin queue
+ * If there is no interupt available for queues, ensure that
+ * the default queue is set to 1. The affinity set size is
+ * also set to one, but the irq core ignores it for this case.
+ *
+ * If only one interrupt is available or 'write_queue' == 0, combine
+ * write and read queues.
+ *
+ * If 'write_queues' > 0, ensure it leaves room for at least one read
+ * queue.
*/
- if (this_w_queues >= irq_queues)
- this_w_queues = irq_queues - 2;
-
- /*
- * If 'write_queues' is set to zero, reads and writes will share
- * a queue set.
- */
- if (!this_w_queues) {
- dev->io_queues[HCTX_TYPE_DEFAULT] = irq_queues - 1;
- dev->io_queues[HCTX_TYPE_READ] = 0;
+ if (!nrirqs) {
+ nrirqs = 1;
+ nr_read_queues = 0;
+ } else if (nrirqs == 1 || !write_queues) {
+ nr_read_queues = 0;
+ } else if (write_queues >= nrirqs) {
+ nr_read_queues = 1;
} else {
- dev->io_queues[HCTX_TYPE_DEFAULT] = this_w_queues;
- dev->io_queues[HCTX_TYPE_READ] = irq_queues - this_w_queues - 1;
+ nr_read_queues = nrirqs - write_queues;
}
+
+ dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
+ affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
+ dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
+ affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
+ affd->nr_sets = nr_read_queues ? 2 : 1;
}
static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
{
struct pci_dev *pdev = to_pci_dev(dev->dev);
- int irq_sets[2];
struct irq_affinity affd = {
- .pre_vectors = 1,
- .nr_sets = ARRAY_SIZE(irq_sets),
- .sets = irq_sets,
+ .pre_vectors = 1,
+ .calc_sets = nvme_calc_irq_sets,
+ .priv = dev,
};
- int result = 0;
unsigned int irq_queues, this_p_queues;
/*
}
dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
- /*
- * For irq sets, we have to ask for minvec == maxvec. This passes
- * any reduction back to us, so we can adjust our queue counts and
- * IRQ vector needs.
- */
- do {
- nvme_calc_io_queues(dev, irq_queues);
- irq_sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT];
- irq_sets[1] = dev->io_queues[HCTX_TYPE_READ];
- if (!irq_sets[1])
- affd.nr_sets = 1;
+ /* Initialize for the single interrupt case */
+ dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
+ dev->io_queues[HCTX_TYPE_READ] = 0;
- /*
- * If we got a failure and we're down to asking for just
- * 1 + 1 queues, just ask for a single vector. We'll share
- * that between the single IO queue and the admin queue.
- * Otherwise, we assign one independent vector to admin queue.
- */
- if (irq_queues > 1)
- irq_queues = irq_sets[0] + irq_sets[1] + 1;
-
- result = pci_alloc_irq_vectors_affinity(pdev, irq_queues,
- irq_queues,
- PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
-
- /*
- * Need to reduce our vec counts. If we get ENOSPC, the
- * platform should support mulitple vecs, we just need
- * to decrease our ask. If we get EINVAL, the platform
- * likely does not. Back down to ask for just one vector.
- */
- if (result == -ENOSPC) {
- irq_queues--;
- if (!irq_queues)
- return result;
- continue;
- } else if (result == -EINVAL) {
- irq_queues = 1;
- continue;
- } else if (result <= 0)
- return -EIO;
- break;
- } while (1);
-
- return result;
+ return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
+ PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
}
static void nvme_disable_io_queues(struct nvme_dev *dev)
mutex_lock(&dev->shutdown_lock);
result = nvme_pci_enable(dev);
if (result)
- goto out;
+ goto out_unlock;
result = nvme_pci_configure_admin_queue(dev);
if (result)
- goto out;
+ goto out_unlock;
result = nvme_alloc_admin_tags(dev);
if (result)
- goto out;
+ goto out_unlock;
/*
* Limit the max command size to prevent iod->sg allocations going
nvme_start_ctrl(&dev->ctrl);
return;
+ out_unlock:
+ mutex_unlock(&dev->shutdown_lock);
out:
nvme_remove_dead_ctrl(dev, result);
}
static int __init nvme_init(void)
{
+ BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
return pci_register_driver(&nvme_driver);
}
set_host_byte(cmd, DID_OK);
return BLK_STS_TARGET;
case DID_NEXUS_FAILURE:
+ set_host_byte(cmd, DID_OK);
return BLK_STS_NEXUS;
case DID_ALLOC_FAILURE:
set_host_byte(cmd, DID_OK);
shost->tag_set.queue_depth = shost->can_queue;
shost->tag_set.cmd_size = cmd_size;
shost->tag_set.numa_node = NUMA_NO_NODE;
- shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+ shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
shost->tag_set.flags |=
BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy);
shost->tag_set.driver_data = shost;
* device deleted during suspend)
*/
mutex_lock(&sdev->state_mutex);
- WARN_ON_ONCE(!sdev->quiesced_by);
sdev->quiesced_by = NULL;
blk_clear_pm_only(sdev->request_queue);
if (sdev->sdev_state == SDEV_QUIESCE)
int i;
struct bio_vec *bvec;
const blk_status_t err = bio->bi_status;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
/* page is already locked */
return err;
}
-#ifdef CONFIG_EROFS_FS_ZIP
-extern int z_erofs_map_blocks_iter(struct inode *,
- struct erofs_map_blocks *,
- struct page **, int);
-#endif
-
-int erofs_map_blocks_iter(struct inode *inode,
- struct erofs_map_blocks *map,
- struct page **mpage_ret, int flags)
-{
- /* by default, reading raw data never use erofs_map_blocks_iter */
- if (unlikely(!is_inode_layout_compression(inode))) {
- if (*mpage_ret)
- put_page(*mpage_ret);
- *mpage_ret = NULL;
-
- return erofs_map_blocks(inode, map, flags);
- }
-
-#ifdef CONFIG_EROFS_FS_ZIP
- return z_erofs_map_blocks_iter(inode, map, mpage_ret, flags);
-#else
- /* data compression is not available */
- return -ENOTSUPP;
-#endif
-}
-
int erofs_map_blocks(struct inode *inode,
struct erofs_map_blocks *map, int flags)
{
if (unlikely(is_inode_layout_compression(inode))) {
- struct page *mpage = NULL;
- int err;
+ int err = z_erofs_map_blocks_iter(inode, map, flags);
- err = erofs_map_blocks_iter(inode, map, &mpage, flags);
- if (mpage)
- put_page(mpage);
+ if (map->mpage) {
+ put_page(map->mpage);
+ map->mpage = NULL;
+ }
return err;
}
return erofs_map_blocks_flatmode(inode, map, flags);
Z_EROFS_VLE_WORK_SECONDARY,
Z_EROFS_VLE_WORK_PRIMARY,
/*
- * The current work has at least been linked with the following
- * processed chained works, which means if the processing page
- * is the tail partial page of the work, the current work can
- * safely use the whole page, as illustrated below:
- * +--------------+-------------------------------------------+
- * | tail page | head page (of the previous work) |
- * +--------------+-------------------------------------------+
- * /\ which belongs to the current work
- * [ (*) this page can be used for the current work itself. ]
+ * The current work was the tail of an exist chain, and the previous
+ * processed chained works are all decided to be hooked up to it.
+ * A new chain should be created for the remaining unprocessed works,
+ * therefore different from Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED,
+ * the next work cannot reuse the whole page in the following scenario:
+ * ________________________________________________________________
+ * | tail (partial) page | head (partial) page |
+ * | (belongs to the next work) | (belongs to the current work) |
+ * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________|
+ */
+ Z_EROFS_VLE_WORK_PRIMARY_HOOKED,
+ /*
+ * The current work has been linked with the processed chained works,
+ * and could be also linked with the potential remaining works, which
+ * means if the processing page is the tail partial page of the work,
+ * the current work can safely use the whole page (since the next work
+ * is under control) for in-place decompression, as illustrated below:
+ * ________________________________________________________________
+ * | tail (partial) page | head (partial) page |
+ * | (of the current work) | (of the previous work) |
+ * | PRIMARY_FOLLOWED or | |
+ * |_____PRIMARY_HOOKED____|____________PRIMARY_FOLLOWED____________|
+ *
+ * [ (*) the above page can be used for the current work itself. ]
*/
Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED,
Z_EROFS_VLE_WORK_MAX
{
struct erofs_sb_info *const sbi = EROFS_SB(mapping->host->i_sb);
const unsigned int clusterpages = erofs_clusterpages(sbi);
-
- struct z_erofs_vle_workgroup *grp;
+ struct z_erofs_vle_workgroup *const grp = (void *)page_private(page);
int ret = 0; /* 0 - busy */
- /* prevent the workgroup from being freed */
- rcu_read_lock();
- grp = (void *)page_private(page);
-
if (erofs_workgroup_try_to_freeze(&grp->obj, 1)) {
unsigned int i;
}
}
erofs_workgroup_unfreeze(&grp->obj, 1);
- }
- rcu_read_unlock();
- if (ret) {
- ClearPagePrivate(page);
- put_page(page);
+ if (ret) {
+ ClearPagePrivate(page);
+ put_page(page);
+ }
}
return ret;
}
return ret ? 0 : -EAGAIN;
}
-static inline bool try_to_claim_workgroup(
- struct z_erofs_vle_workgroup *grp,
- z_erofs_vle_owned_workgrp_t *owned_head,
- bool *hosted)
+static enum z_erofs_vle_work_role
+try_to_claim_workgroup(struct z_erofs_vle_workgroup *grp,
+ z_erofs_vle_owned_workgrp_t *owned_head,
+ bool *hosted)
{
DBG_BUGON(*hosted == true);
*owned_head = &grp->next;
*hosted = true;
+ /* lucky, I am the followee :) */
+ return Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED;
+
} else if (grp->next == Z_EROFS_VLE_WORKGRP_TAIL) {
/*
* type 2, link to the end of a existing open chain,
if (cmpxchg(&grp->next, Z_EROFS_VLE_WORKGRP_TAIL,
*owned_head) != Z_EROFS_VLE_WORKGRP_TAIL)
goto retry;
-
*owned_head = Z_EROFS_VLE_WORKGRP_TAIL;
- } else
- return false; /* :( better luck next time */
+ return Z_EROFS_VLE_WORK_PRIMARY_HOOKED;
+ }
- return true; /* lucky, I am the followee :) */
+ return Z_EROFS_VLE_WORK_PRIMARY; /* :( better luck next time */
}
struct z_erofs_vle_work_finder {
*f->hosted = false;
if (!primary)
*f->role = Z_EROFS_VLE_WORK_SECONDARY;
- /* claim the workgroup if possible */
- else if (try_to_claim_workgroup(grp, f->owned_head, f->hosted))
- *f->role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED;
- else
- *f->role = Z_EROFS_VLE_WORK_PRIMARY;
-
+ else /* claim the workgroup if possible */
+ *f->role = try_to_claim_workgroup(grp, f->owned_head,
+ f->hosted);
return work;
}
return work;
}
+#define builder_is_hooked(builder) \
+ ((builder)->role >= Z_EROFS_VLE_WORK_PRIMARY_HOOKED)
+
#define builder_is_followed(builder) \
((builder)->role >= Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED)
if (unlikely(work == ERR_PTR(-EAGAIN)))
goto repeat;
- if (unlikely(IS_ERR(work)))
+ if (IS_ERR(work))
return PTR_ERR(work);
got_it:
z_erofs_pagevec_ctor_init(&builder->vector,
erofs_workgroup_put(&grp->obj);
}
-void z_erofs_vle_work_release(struct z_erofs_vle_work *work)
+static void z_erofs_vle_work_release(struct z_erofs_vle_work *work)
{
struct z_erofs_vle_workgroup *grp =
z_erofs_vle_work_workgroup(work, true);
struct inode *const inode;
struct z_erofs_vle_work_builder builder;
- struct erofs_map_blocks_iter m_iter;
+ struct erofs_map_blocks map;
z_erofs_vle_owned_workgrp_t owned_head;
#define VLE_FRONTEND_INIT(__i) { \
.inode = __i, \
- .m_iter = { \
- { .m_llen = 0, .m_plen = 0 }, \
+ .map = { \
+ .m_llen = 0, \
+ .m_plen = 0, \
.mpage = NULL \
}, \
.builder = VLE_WORK_BUILDER_INIT(), \
{
struct super_block *const sb = fe->inode->i_sb;
struct erofs_sb_info *const sbi __maybe_unused = EROFS_SB(sb);
- struct erofs_map_blocks_iter *const m = &fe->m_iter;
- struct erofs_map_blocks *const map = &m->map;
+ struct erofs_map_blocks *const map = &fe->map;
struct z_erofs_vle_work_builder *const builder = &fe->builder;
const loff_t offset = page_offset(page);
- bool tight = builder_is_followed(builder);
+ bool tight = builder_is_hooked(builder);
struct z_erofs_vle_work *work = builder->work;
enum z_erofs_cache_alloctype cache_strategy;
/* lucky, within the range of the current map_blocks */
if (offset + cur >= map->m_la &&
- offset + cur < map->m_la + map->m_llen)
+ offset + cur < map->m_la + map->m_llen) {
+ /* didn't get a valid unzip work previously (very rare) */
+ if (!builder->work)
+ goto restart_now;
goto hitted;
+ }
/* go ahead the next map_blocks */
debugln("%s: [out-of-range] pos %llu", __func__, offset + cur);
map->m_la = offset + cur;
map->m_llen = 0;
- err = erofs_map_blocks_iter(fe->inode, map, &m->mpage, 0);
+ err = z_erofs_map_blocks_iter(fe->inode, map, 0);
if (unlikely(err))
goto err_out;
+restart_now:
if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED)))
goto hitted;
map->m_plen / PAGE_SIZE,
cache_strategy, page_pool, GFP_KERNEL);
- tight &= builder_is_followed(builder);
+ tight &= builder_is_hooked(builder);
work = builder->work;
hitted:
cur = end - min_t(unsigned int, offset + end - map->m_la, end);
(tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
+ if (cur)
+ tight &= builder_is_followed(builder);
+
retry:
err = z_erofs_vle_work_add_page(builder, page, page_type);
/* should allocate an additional staging page for pagevec */
#ifdef EROFS_FS_HAS_MANAGED_CACHE
struct address_space *mc = NULL;
#endif
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
bool cachemngd = false;
if (llen > grp->llen)
llen = grp->llen;
- err = z_erofs_vle_unzip_fast_percpu(compressed_pages,
- clusterpages, pages, llen, work->pageofs,
- z_erofs_onlinepage_endio);
+ err = z_erofs_vle_unzip_fast_percpu(compressed_pages, clusterpages,
+ pages, llen, work->pageofs);
if (err != -ENOTSUPP)
- goto out_percpu;
+ goto out;
if (sparsemem_pages >= nr_pages)
goto skip_allocpage;
erofs_vunmap(vout, nr_pages);
out:
+ /* must handle all compressed pages before endding pages */
+ for (i = 0; i < clusterpages; ++i) {
+ page = compressed_pages[i];
+
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+ if (page->mapping == MNGD_MAPPING(sbi))
+ continue;
+#endif
+ /* recycle all individual staging pages */
+ (void)z_erofs_gather_if_stagingpage(page_pool, page);
+
+ WRITE_ONCE(compressed_pages[i], NULL);
+ }
+
for (i = 0; i < nr_pages; ++i) {
page = pages[i];
+ if (!page)
+ continue;
+
DBG_BUGON(!page->mapping);
/* recycle all individual staging pages */
z_erofs_onlinepage_endio(page);
}
-out_percpu:
- for (i = 0; i < clusterpages; ++i) {
- page = compressed_pages[i];
-
-#ifdef EROFS_FS_HAS_MANAGED_CACHE
- if (page->mapping == MNGD_MAPPING(sbi))
- continue;
-#endif
- /* recycle all individual staging pages */
- (void)z_erofs_gather_if_stagingpage(page_pool, page);
-
- WRITE_ONCE(compressed_pages[i], NULL);
- }
-
if (pages == z_pagemap_global)
mutex_unlock(&z_pagemap_global_lock);
else if (unlikely(pages != pages_onstack))
z_erofs_submit_and_unzip(&f, &pagepool, true);
out:
- if (f.m_iter.mpage)
- put_page(f.m_iter.mpage);
+ if (f.map.mpage)
+ put_page(f.map.mpage);
/* clean up the remaining free pages */
put_pages_list(&pagepool);
z_erofs_submit_and_unzip(&f, &pagepool, sync);
- if (f.m_iter.mpage)
- put_page(f.m_iter.mpage);
+ if (f.map.mpage)
+ put_page(f.map.mpage);
/* clean up the remaining free pages */
put_pages_list(&pagepool);
int z_erofs_map_blocks_iter(struct inode *inode,
struct erofs_map_blocks *map,
- struct page **mpage_ret, int flags)
+ int flags)
{
void *kaddr;
const struct vle_map_blocks_iter_ctx ctx = {
.inode = inode,
.sb = inode->i_sb,
.clusterbits = EROFS_I_SB(inode)->clusterbits,
- .mpage_ret = mpage_ret,
+ .mpage_ret = &map->mpage,
.kaddr_ret = &kaddr
};
const unsigned int clustersize = 1 << ctx.clusterbits;
/* initialize `pblk' to keep gcc from printing foolish warnings */
erofs_blk_t mblk, pblk = 0;
- struct page *mpage = *mpage_ret;
+ struct page *mpage = map->mpage;
struct z_erofs_vle_decompressed_index *di;
unsigned int cluster_type, logical_cluster_ofs;
int err = 0;
err = PTR_ERR(mpage);
goto out;
}
- *mpage_ret = mpage;
+ map->mpage = mpage;
} else {
lock_page(mpage);
DBG_BUGON(!PageUptodate(mpage));
/* get the correspoinding first chunk */
err = vle_get_logical_extent_head(&ctx, lcn, &ofs,
&pblk, &map->m_flags);
- mpage = *mpage_ret;
+ mpage = map->mpage;
if (unlikely(err)) {
if (mpage)
} else {
int i;
struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
/*
* we have verified the checksum already, set page
* checked so the end_io handlers know about it
*/
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, cb->orig_bio, i)
+ bio_for_each_segment_all(bvec, cb->orig_bio, i, iter_all)
SetPageChecked(bvec->bv_page);
bio_endio(cb->orig_bio);
struct list_head list;
};
+static struct workspace_manager heuristic_wsm;
+
+static void heuristic_init_workspace_manager(void)
+{
+ btrfs_init_workspace_manager(&heuristic_wsm, &btrfs_heuristic_compress);
+}
+
+static void heuristic_cleanup_workspace_manager(void)
+{
+ btrfs_cleanup_workspace_manager(&heuristic_wsm);
+}
+
+static struct list_head *heuristic_get_workspace(unsigned int level)
+{
+ return btrfs_get_workspace(&heuristic_wsm, level);
+}
+
+static void heuristic_put_workspace(struct list_head *ws)
+{
+ btrfs_put_workspace(&heuristic_wsm, ws);
+}
+
static void free_heuristic_ws(struct list_head *ws)
{
struct heuristic_ws *workspace;
kfree(workspace);
}
-static struct list_head *alloc_heuristic_ws(void)
+static struct list_head *alloc_heuristic_ws(unsigned int level)
{
struct heuristic_ws *ws;
return ERR_PTR(-ENOMEM);
}
-struct workspaces_list {
- struct list_head idle_ws;
- spinlock_t ws_lock;
- /* Number of free workspaces */
- int free_ws;
- /* Total number of allocated workspaces */
- atomic_t total_ws;
- /* Waiters for a free workspace */
- wait_queue_head_t ws_wait;
+const struct btrfs_compress_op btrfs_heuristic_compress = {
+ .init_workspace_manager = heuristic_init_workspace_manager,
+ .cleanup_workspace_manager = heuristic_cleanup_workspace_manager,
+ .get_workspace = heuristic_get_workspace,
+ .put_workspace = heuristic_put_workspace,
+ .alloc_workspace = alloc_heuristic_ws,
+ .free_workspace = free_heuristic_ws,
};
-static struct workspaces_list btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
-
-static struct workspaces_list btrfs_heuristic_ws;
-
static const struct btrfs_compress_op * const btrfs_compress_op[] = {
+ /* The heuristic is represented as compression type 0 */
+ &btrfs_heuristic_compress,
&btrfs_zlib_compress,
&btrfs_lzo_compress,
&btrfs_zstd_compress,
};
-void __init btrfs_init_compress(void)
+void btrfs_init_workspace_manager(struct workspace_manager *wsm,
+ const struct btrfs_compress_op *ops)
{
struct list_head *workspace;
- int i;
- INIT_LIST_HEAD(&btrfs_heuristic_ws.idle_ws);
- spin_lock_init(&btrfs_heuristic_ws.ws_lock);
- atomic_set(&btrfs_heuristic_ws.total_ws, 0);
- init_waitqueue_head(&btrfs_heuristic_ws.ws_wait);
+ wsm->ops = ops;
- workspace = alloc_heuristic_ws();
+ INIT_LIST_HEAD(&wsm->idle_ws);
+ spin_lock_init(&wsm->ws_lock);
+ atomic_set(&wsm->total_ws, 0);
+ init_waitqueue_head(&wsm->ws_wait);
+
+ /*
+ * Preallocate one workspace for each compression type so we can
+ * guarantee forward progress in the worst case
+ */
+ workspace = wsm->ops->alloc_workspace(0);
if (IS_ERR(workspace)) {
pr_warn(
- "BTRFS: cannot preallocate heuristic workspace, will try later\n");
+ "BTRFS: cannot preallocate compression workspace, will try later\n");
} else {
- atomic_set(&btrfs_heuristic_ws.total_ws, 1);
- btrfs_heuristic_ws.free_ws = 1;
- list_add(workspace, &btrfs_heuristic_ws.idle_ws);
+ atomic_set(&wsm->total_ws, 1);
+ wsm->free_ws = 1;
+ list_add(workspace, &wsm->idle_ws);
}
+}
- for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
- INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
- spin_lock_init(&btrfs_comp_ws[i].ws_lock);
- atomic_set(&btrfs_comp_ws[i].total_ws, 0);
- init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
+void btrfs_cleanup_workspace_manager(struct workspace_manager *wsman)
+{
+ struct list_head *ws;
- /*
- * Preallocate one workspace for each compression type so
- * we can guarantee forward progress in the worst case
- */
- workspace = btrfs_compress_op[i]->alloc_workspace();
- if (IS_ERR(workspace)) {
- pr_warn("BTRFS: cannot preallocate compression workspace, will try later\n");
- } else {
- atomic_set(&btrfs_comp_ws[i].total_ws, 1);
- btrfs_comp_ws[i].free_ws = 1;
- list_add(workspace, &btrfs_comp_ws[i].idle_ws);
- }
+ while (!list_empty(&wsman->idle_ws)) {
+ ws = wsman->idle_ws.next;
+ list_del(ws);
+ wsman->ops->free_workspace(ws);
+ atomic_dec(&wsman->total_ws);
}
}
* Preallocation makes a forward progress guarantees and we do not return
* errors.
*/
-static struct list_head *__find_workspace(int type, bool heuristic)
+struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
+ unsigned int level)
{
struct list_head *workspace;
int cpus = num_online_cpus();
- int idx = type - 1;
unsigned nofs_flag;
struct list_head *idle_ws;
spinlock_t *ws_lock;
wait_queue_head_t *ws_wait;
int *free_ws;
- if (heuristic) {
- idle_ws = &btrfs_heuristic_ws.idle_ws;
- ws_lock = &btrfs_heuristic_ws.ws_lock;
- total_ws = &btrfs_heuristic_ws.total_ws;
- ws_wait = &btrfs_heuristic_ws.ws_wait;
- free_ws = &btrfs_heuristic_ws.free_ws;
- } else {
- idle_ws = &btrfs_comp_ws[idx].idle_ws;
- ws_lock = &btrfs_comp_ws[idx].ws_lock;
- total_ws = &btrfs_comp_ws[idx].total_ws;
- ws_wait = &btrfs_comp_ws[idx].ws_wait;
- free_ws = &btrfs_comp_ws[idx].free_ws;
- }
+ idle_ws = &wsm->idle_ws;
+ ws_lock = &wsm->ws_lock;
+ total_ws = &wsm->total_ws;
+ ws_wait = &wsm->ws_wait;
+ free_ws = &wsm->free_ws;
again:
spin_lock(ws_lock);
* context of btrfs_compress_bio/btrfs_compress_pages
*/
nofs_flag = memalloc_nofs_save();
- if (heuristic)
- workspace = alloc_heuristic_ws();
- else
- workspace = btrfs_compress_op[idx]->alloc_workspace();
+ workspace = wsm->ops->alloc_workspace(level);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(workspace)) {
return workspace;
}
-static struct list_head *find_workspace(int type)
+static struct list_head *get_workspace(int type, int level)
{
- return __find_workspace(type, false);
+ return btrfs_compress_op[type]->get_workspace(level);
}
/*
* put a workspace struct back on the list or free it if we have enough
* idle ones sitting around
*/
-static void __free_workspace(int type, struct list_head *workspace,
- bool heuristic)
+void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws)
{
- int idx = type - 1;
struct list_head *idle_ws;
spinlock_t *ws_lock;
atomic_t *total_ws;
wait_queue_head_t *ws_wait;
int *free_ws;
- if (heuristic) {
- idle_ws = &btrfs_heuristic_ws.idle_ws;
- ws_lock = &btrfs_heuristic_ws.ws_lock;
- total_ws = &btrfs_heuristic_ws.total_ws;
- ws_wait = &btrfs_heuristic_ws.ws_wait;
- free_ws = &btrfs_heuristic_ws.free_ws;
- } else {
- idle_ws = &btrfs_comp_ws[idx].idle_ws;
- ws_lock = &btrfs_comp_ws[idx].ws_lock;
- total_ws = &btrfs_comp_ws[idx].total_ws;
- ws_wait = &btrfs_comp_ws[idx].ws_wait;
- free_ws = &btrfs_comp_ws[idx].free_ws;
- }
+ idle_ws = &wsm->idle_ws;
+ ws_lock = &wsm->ws_lock;
+ total_ws = &wsm->total_ws;
+ ws_wait = &wsm->ws_wait;
+ free_ws = &wsm->free_ws;
spin_lock(ws_lock);
if (*free_ws <= num_online_cpus()) {
- list_add(workspace, idle_ws);
+ list_add(ws, idle_ws);
(*free_ws)++;
spin_unlock(ws_lock);
goto wake;
}
spin_unlock(ws_lock);
- if (heuristic)
- free_heuristic_ws(workspace);
- else
- btrfs_compress_op[idx]->free_workspace(workspace);
+ wsm->ops->free_workspace(ws);
atomic_dec(total_ws);
wake:
cond_wake_up(ws_wait);
}
-static void free_workspace(int type, struct list_head *ws)
+static void put_workspace(int type, struct list_head *ws)
{
- return __free_workspace(type, ws, false);
-}
-
-/*
- * cleanup function for module exit
- */
-static void free_workspaces(void)
-{
- struct list_head *workspace;
- int i;
-
- while (!list_empty(&btrfs_heuristic_ws.idle_ws)) {
- workspace = btrfs_heuristic_ws.idle_ws.next;
- list_del(workspace);
- free_heuristic_ws(workspace);
- atomic_dec(&btrfs_heuristic_ws.total_ws);
- }
-
- for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
- while (!list_empty(&btrfs_comp_ws[i].idle_ws)) {
- workspace = btrfs_comp_ws[i].idle_ws.next;
- list_del(workspace);
- btrfs_compress_op[i]->free_workspace(workspace);
- atomic_dec(&btrfs_comp_ws[i].total_ws);
- }
- }
+ return btrfs_compress_op[type]->put_workspace(ws);
}
/*
unsigned long *total_in,
unsigned long *total_out)
{
+ int type = btrfs_compress_type(type_level);
+ int level = btrfs_compress_level(type_level);
struct list_head *workspace;
int ret;
- int type = type_level & 0xF;
-
- workspace = find_workspace(type);
- btrfs_compress_op[type - 1]->set_level(workspace, type_level);
- ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
+ workspace = get_workspace(type, level);
+ ret = btrfs_compress_op[type]->compress_pages(workspace, mapping,
start, pages,
out_pages,
total_in, total_out);
- free_workspace(type, workspace);
+ put_workspace(type, workspace);
return ret;
}
int ret;
int type = cb->compress_type;
- workspace = find_workspace(type);
- ret = btrfs_compress_op[type - 1]->decompress_bio(workspace, cb);
- free_workspace(type, workspace);
+ workspace = get_workspace(type, 0);
+ ret = btrfs_compress_op[type]->decompress_bio(workspace, cb);
+ put_workspace(type, workspace);
return ret;
}
struct list_head *workspace;
int ret;
- workspace = find_workspace(type);
-
- ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
+ workspace = get_workspace(type, 0);
+ ret = btrfs_compress_op[type]->decompress(workspace, data_in,
dest_page, start_byte,
srclen, destlen);
+ put_workspace(type, workspace);
- free_workspace(type, workspace);
return ret;
}
+void __init btrfs_init_compress(void)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
+ btrfs_compress_op[i]->init_workspace_manager();
+}
+
void __cold btrfs_exit_compress(void)
{
- free_workspaces();
+ int i;
+
+ for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
+ btrfs_compress_op[i]->cleanup_workspace_manager();
}
/*
*/
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
{
- struct list_head *ws_list = __find_workspace(0, true);
+ struct list_head *ws_list = get_workspace(0, 0);
struct heuristic_ws *ws;
u32 i;
u8 byte;
}
out:
- __free_workspace(0, ws_list, true);
+ put_workspace(0, ws_list);
return ret;
}
-unsigned int btrfs_compress_str2level(const char *str)
+/*
+ * Convert the compression suffix (eg. after "zlib" starting with ":") to
+ * level, unrecognized string will set the default level
+ */
+unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
{
- if (strncmp(str, "zlib", 4) != 0)
+ unsigned int level = 0;
+ int ret;
+
+ if (!type)
return 0;
- /* Accepted form: zlib:1 up to zlib:9 and nothing left after the number */
- if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0)
- return str[5] - '0';
+ if (str[0] == ':') {
+ ret = kstrtouint(str + 1, 10, &level);
+ if (ret)
+ level = 0;
+ }
+
+ level = btrfs_compress_op[type]->set_level(level);
- return BTRFS_ZLIB_DEFAULT_LEVEL;
+ return level;
}
#include <linux/semaphore.h>
#include <linux/error-injection.h>
#include <linux/crc32c.h>
+#include <linux/sched/mm.h>
#include <asm/unaligned.h>
#include "ctree.h"
#include "disk-io.h"
if (need_lock) {
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
}
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
struct bio_vec *bvec;
struct btrfs_root *root;
int i, ret = 0;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
root = BTRFS_I(bvec->bv_page->mapping->host)->root;
ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
if (ret)
-buf->len,
fs_info->dirty_metadata_batch);
/* ugh, clear_extent_buffer_dirty needs to lock the page */
- btrfs_set_lock_blocking(buf);
+ btrfs_set_lock_blocking_write(buf);
clear_extent_buffer_dirty(buf);
}
}
INIT_LIST_HEAD(&root->delalloc_root);
INIT_LIST_HEAD(&root->ordered_extents);
INIT_LIST_HEAD(&root->ordered_root);
+ INIT_LIST_HEAD(&root->reloc_dirty_list);
INIT_LIST_HEAD(&root->logged_list[0]);
INIT_LIST_HEAD(&root->logged_list[1]);
spin_lock_init(&root->inode_lock);
root->anon_dev = 0;
spin_lock_init(&root->root_item_lock);
+ btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
}
static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root *root;
struct btrfs_key key;
+ unsigned int nofs_flag;
int ret = 0;
uuid_le uuid = NULL_UUID_LE;
+ /*
+ * We're holding a transaction handle, so use a NOFS memory allocation
+ * context to avoid deadlock if reclaim happens.
+ */
+ nofs_flag = memalloc_nofs_save();
root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!root)
return ERR_PTR(-ENOMEM);
goto sleep;
}
- mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
btrfs_run_delayed_iputs(fs_info);
- mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
again = btrfs_clean_one_deleted_snapshot(root);
mutex_unlock(&fs_info->cleaner_mutex);
atomic_set(&fs_info->scrubs_paused, 0);
atomic_set(&fs_info->scrub_cancel_req, 0);
init_waitqueue_head(&fs_info->scrub_pause_wait);
- fs_info->scrub_workers_refcnt = 0;
+ refcount_set(&fs_info->scrub_workers_refcnt, 0);
}
static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
mutex_init(&fs_info->delete_unused_bgs_mutex);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
- mutex_init(&fs_info->cleaner_delayed_iput_mutex);
seqlock_init(&fs_info->profiles_lock);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
atomic_set(&fs_info->defrag_running, 0);
atomic_set(&fs_info->qgroup_op_seq, 0);
atomic_set(&fs_info->reada_works_cnt, 0);
+ atomic_set(&fs_info->nr_delayed_iputs, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
fs_info->sb = sb;
fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
init_waitqueue_head(&fs_info->transaction_wait);
init_waitqueue_head(&fs_info->transaction_blocked_wait);
init_waitqueue_head(&fs_info->async_submit_wait);
+ init_waitqueue_head(&fs_info->delayed_iputs_wait);
INIT_LIST_HEAD(&fs_info->pinned_chunks);
head = rb_entry(node, struct btrfs_delayed_ref_head,
href_node);
- if (!mutex_trylock(&head->mutex)) {
- refcount_inc(&head->refs);
- spin_unlock(&delayed_refs->lock);
-
- mutex_lock(&head->mutex);
- mutex_unlock(&head->mutex);
- btrfs_put_delayed_ref_head(head);
- spin_lock(&delayed_refs->lock);
+ if (btrfs_delayed_ref_lock(delayed_refs, head))
continue;
- }
+
spin_lock(&head->lock);
while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
ref = rb_entry(n, struct btrfs_delayed_ref_node,
if (head->must_insert_reserved)
pin_bytes = true;
btrfs_free_delayed_extent_op(head->extent_op);
- delayed_refs->num_heads--;
- if (head->processing == 0)
- delayed_refs->num_heads_ready--;
- atomic_dec(&delayed_refs->num_entries);
- rb_erase_cached(&head->href_node, &delayed_refs->href_root);
- RB_CLEAR_NODE(&head->href_node);
+ btrfs_delete_ref_head(delayed_refs, head);
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
mutex_unlock(&head->mutex);
return ret;
}
-static void flush_write_bio(struct extent_page_data *epd);
+static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
+ unsigned long bio_flags)
+{
+ blk_status_t ret = 0;
+ struct bio_vec *bvec = bio_last_bvec_all(bio);
- struct page *page = bvec->bv_page;
++ struct bio_vec bv;
+ struct extent_io_tree *tree = bio->bi_private;
+ u64 start;
+
- start = page_offset(page) + bvec->bv_offset;
++ mp_bvec_last_segment(bvec, &bv);
++ start = page_offset(bv.bv_page) + bv.bv_offset;
+
+ bio->bi_private = NULL;
+
+ if (tree->ops)
+ ret = tree->ops->submit_bio_hook(tree->private_data, bio,
+ mirror_num, bio_flags, start);
+ else
+ btrfsic_submit_bio(bio);
+
+ return blk_status_to_errno(ret);
+}
+
+static void flush_write_bio(struct extent_page_data *epd)
+{
+ if (epd->bio) {
+ int ret;
+
+ ret = submit_one_bio(epd->bio, 0, 0);
+ BUG_ON(ret < 0); /* -ENOMEM */
+ epd->bio = NULL;
+ }
+}
int __init extent_io_init(void)
{
}
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
- struct rb_node **prev_ret,
struct rb_node **next_ret,
+ struct rb_node **prev_ret,
struct rb_node ***p_ret,
struct rb_node **parent_ret)
{
if (parent_ret)
*parent_ret = prev;
- if (prev_ret) {
+ if (next_ret) {
orig_prev = prev;
while (prev && offset > prev_entry->end) {
prev = rb_next(prev);
prev_entry = rb_entry(prev, struct tree_entry, rb_node);
}
- *prev_ret = prev;
+ *next_ret = prev;
prev = orig_prev;
}
- if (next_ret) {
+ if (prev_ret) {
prev_entry = rb_entry(prev, struct tree_entry, rb_node);
while (prev && offset < prev_entry->start) {
prev = rb_prev(prev);
prev_entry = rb_entry(prev, struct tree_entry, rb_node);
}
- *next_ret = prev;
+ *prev_ret = prev;
}
return NULL;
}
struct rb_node ***p_ret,
struct rb_node **parent_ret)
{
- struct rb_node *prev = NULL;
+ struct rb_node *next= NULL;
struct rb_node *ret;
- ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
+ ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
if (!ret)
- return prev;
+ return next;
return ret;
}
if (delete)
bits |= ~EXTENT_CTLBITS;
- bits |= EXTENT_FIRST_DELALLOC;
if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
clear = 1;
btrfs_debug_check_extent_io_range(tree, start, end);
- bits |= EXTENT_FIRST_DELALLOC;
again:
if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
int read_mode = 0;
blk_status_t status;
int ret;
- unsigned failed_bio_pages = bio_pages_all(failed_bio);
+ unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
u64 start;
u64 end;
int i;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int mirror;
int ret;
int i;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
return bio;
}
-static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
- unsigned long bio_flags)
-{
- blk_status_t ret = 0;
- struct bio_vec *bvec = bio_last_bvec_all(bio);
- struct bio_vec bv;
- struct extent_io_tree *tree = bio->bi_private;
- u64 start;
-
- mp_bvec_last_segment(bvec, &bv);
- start = page_offset(bv.bv_page) + bv.bv_offset;
-
- bio->bi_private = NULL;
-
- if (tree->ops)
- ret = tree->ops->submit_bio_hook(tree->private_data, bio,
- mirror_num, bio_flags, start);
- else
- btrfsic_submit_bio(bio);
-
- return blk_status_to_errno(ret);
-}
-
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
* @tree: tree so we can call our merge_bio hook
struct bio_vec *bvec;
struct extent_buffer *eb;
int i, done;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
struct page *page = bvec->bv_page;
eb = (struct extent_buffer *)page->private;
return ret;
}
-static void flush_write_bio(struct extent_page_data *epd)
-{
- if (epd->bio) {
- int ret;
-
- ret = submit_one_bio(epd->bio, 0, 0);
- BUG_ON(ret < 0); /* -ENOMEM */
- epd->bio = NULL;
- }
-}
-
int extent_write_full_page(struct page *page, struct writeback_control *wbc)
{
int ret;
if (len == 0)
break;
len = ALIGN(len, sectorsize);
- em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, offset,
- len, 0);
+ em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len);
if (IS_ERR_OR_NULL(em))
return em;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 blocksize = fs_info->sectorsize;
u64 actual_end;
- u64 isize = i_size_read(inode);
int ret = 0;
struct page **pages = NULL;
unsigned long nr_pages;
inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
SZ_16K);
- actual_end = min_t(u64, isize, end + 1);
+ actual_end = min_t(u64, i_size_read(inode), end + 1);
again:
will_compress = 0;
nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
* queued. We walk all the async extents created by compress_file_range
* and send them down to the disk.
*/
-static noinline void submit_compressed_extents(struct inode *inode,
- struct async_cow *async_cow)
+static noinline void submit_compressed_extents(struct async_cow *async_cow)
{
+ struct inode *inode = async_cow->inode;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct async_extent *async_extent;
u64 alloc_hint = 0;
5 * SZ_1M)
cond_wake_up_nomb(&fs_info->async_submit_wait);
+ /*
+ * ->inode could be NULL if async_cow_start has failed to compress,
+ * in which case we don't have anything to submit, yet we need to
+ * always adjust ->async_delalloc_pages as its paired with the init
+ * happening in cow_file_range_async
+ */
if (async_cow->inode)
- submit_compressed_extents(async_cow->inode, async_cow);
+ submit_compressed_extents(async_cow);
}
static noinline void async_cow_free(struct btrfs_work *work)
while (start < end) {
async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
BUG_ON(!async_cow); /* -ENOMEM */
- async_cow->inode = igrab(inode);
+ /*
+ * igrab is called higher up in the call chain, take only the
+ * lightweight reference for the callback lifetime
+ */
+ ihold(inode);
+ async_cow->inode = inode;
async_cow->fs_info = fs_info;
async_cow->locked_page = locked_page;
async_cow->start = start;
* Function to process delayed allocation (create CoW) for ranges which are
* being touched for the first time.
*/
-int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
+int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc)
{
- struct inode *inode = private_data;
int ret;
int force_cow = need_force_cow(inode, start, end);
unsigned int write_flags = wbc_to_write_flags(wbc);
if (atomic_add_unless(&inode->i_count, -1, 1))
return;
+ atomic_inc(&fs_info->nr_delayed_iputs);
spin_lock(&fs_info->delayed_iput_lock);
ASSERT(list_empty(&binode->delayed_iput));
list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
list_del_init(&inode->delayed_iput);
spin_unlock(&fs_info->delayed_iput_lock);
iput(&inode->vfs_inode);
+ if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
+ wake_up(&fs_info->delayed_iputs_wait);
spin_lock(&fs_info->delayed_iput_lock);
}
spin_unlock(&fs_info->delayed_iput_lock);
}
+/**
+ * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running
+ * @fs_info - the fs_info for this fs
+ * @return - EINTR if we were killed, 0 if nothing's pending
+ *
+ * This will wait on any delayed iputs that are currently running with KILLABLE
+ * set. Once they are all done running we will return, unless we are killed in
+ * which case we return EINTR. This helps in user operations like fallocate etc
+ * that might get blocked on the iputs.
+ */
+int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
+{
+ int ret = wait_event_killable(fs_info->delayed_iputs_wait,
+ atomic_read(&fs_info->nr_delayed_iputs) == 0);
+ if (ret)
+ return -EINTR;
+ return 0;
+}
+
/*
* This creates an orphan entry for the given inode in case something goes wrong
* in the middle of an unlink.
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ u64 delayed_refs_extra = btrfs_calc_trans_metadata_size(fs_info, 1);
int failures = 0;
for (;;) {
struct btrfs_trans_handle *trans;
int ret;
- ret = btrfs_block_rsv_refill(root, rsv, rsv->size,
+ ret = btrfs_block_rsv_refill(root, rsv,
+ rsv->size + delayed_refs_extra,
BTRFS_RESERVE_FLUSH_LIMIT);
if (ret && ++failures > 2) {
return ERR_PTR(-ENOSPC);
}
+ /*
+ * Evict can generate a large amount of delayed refs without
+ * having a way to add space back since we exhaust our temporary
+ * block rsv. We aren't allowed to do FLUSH_ALL in this case
+ * because we could deadlock with so many things in the flushing
+ * code, so we have to try and hold some extra space to
+ * compensate for our delayed ref generation. If we can't get
+ * that space then we need see if we can steal our minimum from
+ * the global reserve. We will be ratelimited by the amount of
+ * space we have for the delayed refs rsv, so we'll end up
+ * committing and trying again.
+ */
trans = btrfs_join_transaction(root);
- if (IS_ERR(trans) || !ret)
+ if (IS_ERR(trans) || !ret) {
+ if (!IS_ERR(trans)) {
+ trans->block_rsv = &fs_info->trans_block_rsv;
+ trans->bytes_reserved = delayed_refs_extra;
+ btrfs_block_rsv_migrate(rsv, trans->block_rsv,
+ delayed_refs_extra, 1);
+ }
return trans;
+ }
/*
* Try to steal from the global reserve if there is space for
u64 extent_start = 0;
u64 extent_end = 0;
u64 objectid = btrfs_ino(inode);
- u32 found_type;
+ u8 extent_type;
struct btrfs_path *path = NULL;
struct btrfs_root *root = inode->root;
struct btrfs_file_extent_item *item;
if (ret < 0) {
err = ret;
goto out;
- }
-
- if (ret != 0) {
+ } else if (ret > 0) {
if (path->slots[0] == 0)
goto not_found;
path->slots[0]--;
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- /* are we inside the extent that was found? */
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- found_type = found_key.type;
if (found_key.objectid != objectid ||
- found_type != BTRFS_EXTENT_DATA_KEY) {
+ found_key.type != BTRFS_EXTENT_DATA_KEY) {
/*
* If we backup past the first extent we want to move forward
* and see if there is an extent in front of us, otherwise we'll
goto next;
}
- found_type = btrfs_file_extent_type(leaf, item);
+ extent_type = btrfs_file_extent_type(leaf, item);
extent_start = found_key.offset;
- if (found_type == BTRFS_FILE_EXTENT_REG ||
- found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
extent_end = extent_start +
btrfs_file_extent_num_bytes(leaf, item);
trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
extent_start);
- } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
size_t size;
size = btrfs_file_extent_ram_bytes(leaf, item);
if (ret < 0) {
err = ret;
goto out;
- }
- if (ret > 0)
+ } else if (ret > 0) {
goto not_found;
+ }
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
goto not_found;
if (start > found_key.offset)
goto next;
+
+ /* New extent overlaps with existing one */
em->start = start;
em->orig_start = start;
em->len = found_key.offset - start;
- goto not_found_em;
+ em->block_start = EXTENT_MAP_HOLE;
+ goto insert;
}
btrfs_extent_item_to_extent_map(inode, path, item,
new_inline, em);
- if (found_type == BTRFS_FILE_EXTENT_REG ||
- found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
goto insert;
- } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
unsigned long ptr;
char *map;
size_t size;
em->start = start;
em->orig_start = start;
em->len = len;
-not_found_em:
em->block_start = EXTENT_MAP_HOLE;
insert:
btrfs_release_path(path);
}
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
- struct page *page,
- size_t pg_offset, u64 start, u64 len,
- int create)
+ u64 start, u64 len)
{
struct extent_map *em;
struct extent_map *hole_em = NULL;
- u64 range_start = start;
+ u64 delalloc_start = start;
u64 end;
- u64 found;
- u64 found_end;
+ u64 delalloc_len;
+ u64 delalloc_end;
int err = 0;
- em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
+ em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
if (IS_ERR(em))
return em;
/*
em = NULL;
/* ok, we didn't find anything, lets look for delalloc */
- found = count_range_bits(&inode->io_tree, &range_start,
+ delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
end, len, EXTENT_DELALLOC, 1);
- found_end = range_start + found;
- if (found_end < range_start)
- found_end = (u64)-1;
+ delalloc_end = delalloc_start + delalloc_len;
+ if (delalloc_end < delalloc_start)
+ delalloc_end = (u64)-1;
/*
- * we didn't find anything useful, return
- * the original results from get_extent()
+ * We didn't find anything useful, return the original results from
+ * get_extent()
*/
- if (range_start > end || found_end <= start) {
+ if (delalloc_start > end || delalloc_end <= start) {
em = hole_em;
hole_em = NULL;
goto out;
}
- /* adjust the range_start to make sure it doesn't
- * go backwards from the start they passed in
+ /*
+ * Adjust the delalloc_start to make sure it doesn't go backwards from
+ * the start they passed in
*/
- range_start = max(start, range_start);
- found = found_end - range_start;
+ delalloc_start = max(start, delalloc_start);
+ delalloc_len = delalloc_end - delalloc_start;
- if (found > 0) {
- u64 hole_start = start;
- u64 hole_len = len;
+ if (delalloc_len > 0) {
+ u64 hole_start;
+ u64 hole_len;
+ const u64 hole_end = extent_map_end(hole_em);
em = alloc_extent_map();
if (!em) {
err = -ENOMEM;
goto out;
}
+ em->bdev = NULL;
+
+ ASSERT(hole_em);
/*
- * when btrfs_get_extent can't find anything it
- * returns one huge hole
+ * When btrfs_get_extent can't find anything it returns one
+ * huge hole
*
- * make sure what it found really fits our range, and
- * adjust to make sure it is based on the start from
- * the caller
+ * Make sure what it found really fits our range, and adjust to
+ * make sure it is based on the start from the caller
*/
- if (hole_em) {
- u64 calc_end = extent_map_end(hole_em);
-
- if (calc_end <= start || (hole_em->start > end)) {
- free_extent_map(hole_em);
- hole_em = NULL;
- } else {
- hole_start = max(hole_em->start, start);
- hole_len = calc_end - hole_start;
- }
+ if (hole_end <= start || hole_em->start > end) {
+ free_extent_map(hole_em);
+ hole_em = NULL;
+ } else {
+ hole_start = max(hole_em->start, start);
+ hole_len = hole_end - hole_start;
}
- em->bdev = NULL;
- if (hole_em && range_start > hole_start) {
- /* our hole starts before our delalloc, so we
- * have to return just the parts of the hole
- * that go until the delalloc starts
+
+ if (hole_em && delalloc_start > hole_start) {
+ /*
+ * Our hole starts before our delalloc, so we have to
+ * return just the parts of the hole that go until the
+ * delalloc starts
*/
- em->len = min(hole_len,
- range_start - hole_start);
+ em->len = min(hole_len, delalloc_start - hole_start);
em->start = hole_start;
em->orig_start = hole_start;
/*
- * don't adjust block start at all,
- * it is fixed at EXTENT_MAP_HOLE
+ * Don't adjust block start at all, it is fixed at
+ * EXTENT_MAP_HOLE
*/
em->block_start = hole_em->block_start;
em->block_len = hole_len;
if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
} else {
- em->start = range_start;
- em->len = found;
- em->orig_start = range_start;
+ /*
+ * Hole is out of passed range or it starts after
+ * delalloc range
+ */
+ em->start = delalloc_start;
+ em->len = delalloc_len;
+ em->orig_start = delalloc_start;
em->block_start = EXTENT_MAP_DELALLOC;
- em->block_len = found;
+ em->block_len = delalloc_len;
}
} else {
return hole_em;
struct bio_vec *bvec;
struct extent_io_tree *io_tree, *failure_tree;
int i;
+ struct bvec_iter_all iter_all;
if (bio->bi_status)
goto end;
done->uptodate = 1;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
io_tree, done->start, bvec->bv_page,
btrfs_ino(BTRFS_I(inode)), 0);
int uptodate;
int ret;
int i;
+ struct bvec_iter_all iter_all;
if (bio->bi_status)
goto end;
failure_tree = &BTRFS_I(inode)->io_failure_tree;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
bvec->bv_offset, done->start,
bvec->bv_len);
init_completion(&work->completion);
INIT_LIST_HEAD(&work->list);
work->inode = inode;
- WARN_ON_ONCE(!inode);
btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
btrfs_run_delalloc_work, NULL, NULL);
#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/list_sort.h>
-#include <linux/blkdev.h>
-#include "bmap.h"
#include "dir.h"
#include "gfs2.h"
#include "incore.h"
* that is pinned in the pagecache.
*/
- static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec,
+ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp,
+ struct bio_vec *bvec,
blk_status_t error)
{
struct buffer_head *bh, *next;
/**
* gfs2_end_log_write - end of i/o to the log
* @bio: The bio
+ * @error: Status of i/o request
*
* Each bio_vec contains either data from the pagecache or data
* relating to the log itself. Here we iterate over the bio_vec
struct bio_vec *bvec;
struct page *page;
int i;
+ struct bvec_iter_all iter_all;
if (bio->bi_status) {
fs_err(sdp, "Error %d writing to journal, jid=%u\n",
wake_up(&sdp->sd_logd_waitq);
}
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
page = bvec->bv_page;
if (page_has_buffers(page))
gfs2_end_log_write_bh(sdp, bvec, bio->bi_status);
/**
* gfs2_log_submit_bio - Submit any pending log bio
* @biop: Address of the bio pointer
- * @opf: REQ_OP | op_flags
+ * @op: REQ_OP
+ * @op_flags: req_flag_bits
*
* Submit any pending part-built or full bio to the block device. If
* there is no pending bio, then this is a no-op.
*/
-void gfs2_log_submit_bio(struct bio **biop, int opf)
+void gfs2_log_submit_bio(struct bio **biop, int op, int op_flags)
{
struct bio *bio = *biop;
if (bio) {
struct gfs2_sbd *sdp = bio->bi_private;
atomic_inc(&sdp->sd_log_in_flight);
- bio->bi_opf = opf;
+ bio_set_op_attrs(bio, op, op_flags);
submit_bio(bio);
*biop = NULL;
}
nblk >>= sdp->sd_fsb2bb_shift;
if (blkno == nblk && !flush)
return bio;
- gfs2_log_submit_bio(biop, op);
+ gfs2_log_submit_bio(biop, op, 0);
}
*biop = gfs2_log_alloc_bio(sdp, blkno, end_io);
gfs2_log_bmap(sdp));
}
-/**
- * gfs2_end_log_read - end I/O callback for reads from the log
- * @bio: The bio
- *
- * Simply unlock the pages in the bio. The main thread will wait on them and
- * process them in order as necessary.
- */
-
-static void gfs2_end_log_read(struct bio *bio)
-{
- struct page *page;
- struct bio_vec *bvec;
- int i;
- struct bvec_iter_all iter_all;
-
- bio_for_each_segment_all(bvec, bio, i, iter_all) {
- page = bvec->bv_page;
- if (bio->bi_status) {
- int err = blk_status_to_errno(bio->bi_status);
-
- SetPageError(page);
- mapping_set_error(page->mapping, err);
- }
- unlock_page(page);
- }
-
- bio_put(bio);
-}
-
-/**
- * gfs2_jhead_pg_srch - Look for the journal head in a given page.
- * @jd: The journal descriptor
- * @page: The page to look in
- *
- * Returns: 1 if found, 0 otherwise.
- */
-
-static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd,
- struct gfs2_log_header_host *head,
- struct page *page)
-{
- struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
- struct gfs2_log_header_host uninitialized_var(lh);
- void *kaddr = kmap_atomic(page);
- unsigned int offset;
- bool ret = false;
-
- for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) {
- if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) {
- if (lh.lh_sequence > head->lh_sequence)
- *head = lh;
- else {
- ret = true;
- break;
- }
- }
- }
- kunmap_atomic(kaddr);
- return ret;
-}
-
-/**
- * gfs2_jhead_process_page - Search/cleanup a page
- * @jd: The journal descriptor
- * @index: Index of the page to look into
- * @done: If set, perform only cleanup, else search and set if found.
- *
- * Find the page with 'index' in the journal's mapping. Search the page for
- * the journal head if requested (cleanup == false). Release refs on the
- * page so the page cache can reclaim it (put_page() twice). We grabbed a
- * reference on this page two times, first when we did a find_or_create_page()
- * to obtain the page to add it to the bio and second when we do a
- * find_get_page() here to get the page to wait on while I/O on it is being
- * completed.
- * This function is also used to free up a page we might've grabbed but not
- * used. Maybe we added it to a bio, but not submitted it for I/O. Or we
- * submitted the I/O, but we already found the jhead so we only need to drop
- * our references to the page.
- */
-
-static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index,
- struct gfs2_log_header_host *head,
- bool *done)
-{
- struct page *page;
-
- page = find_get_page(jd->jd_inode->i_mapping, index);
- wait_on_page_locked(page);
-
- if (PageError(page))
- *done = true;
-
- if (!*done)
- *done = gfs2_jhead_pg_srch(jd, head, page);
-
- put_page(page); /* Once for find_get_page */
- put_page(page); /* Once more for find_or_create_page */
-}
-
-/**
- * gfs2_find_jhead - find the head of a log
- * @jd: The journal descriptor
- * @head: The log descriptor for the head of the log is returned here
- *
- * Do a search of a journal by reading it in large chunks using bios and find
- * the valid log entry with the highest sequence number. (i.e. the log head)
- *
- * Returns: 0 on success, errno otherwise
- */
-
-int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head)
-{
- struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
- struct address_space *mapping = jd->jd_inode->i_mapping;
- struct gfs2_journal_extent *je;
- u32 block, read_idx = 0, submit_idx = 0, index = 0;
- int shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift;
- int blocks_per_page = 1 << shift, sz, ret = 0;
- struct bio *bio = NULL;
- struct page *page;
- bool done = false;
- errseq_t since;
-
- memset(head, 0, sizeof(*head));
- if (list_empty(&jd->extent_list))
- gfs2_map_journal_extents(sdp, jd);
-
- since = filemap_sample_wb_err(mapping);
- list_for_each_entry(je, &jd->extent_list, list) {
- for (block = 0; block < je->blocks; block += blocks_per_page) {
- index = (je->lblock + block) >> shift;
-
- page = find_or_create_page(mapping, index, GFP_NOFS);
- if (!page) {
- ret = -ENOMEM;
- done = true;
- goto out;
- }
-
- if (bio) {
- sz = bio_add_page(bio, page, PAGE_SIZE, 0);
- if (sz == PAGE_SIZE)
- goto page_added;
- submit_idx = index;
- submit_bio(bio);
- bio = NULL;
- }
-
- bio = gfs2_log_alloc_bio(sdp,
- je->dblock + (index << shift),
- gfs2_end_log_read);
- bio->bi_opf = REQ_OP_READ;
- sz = bio_add_page(bio, page, PAGE_SIZE, 0);
- gfs2_assert_warn(sdp, sz == PAGE_SIZE);
-
-page_added:
- if (submit_idx <= read_idx + BIO_MAX_PAGES) {
- /* Keep at least one bio in flight */
- continue;
- }
-
- gfs2_jhead_process_page(jd, read_idx++, head, &done);
- if (done)
- goto out; /* found */
- }
- }
-
-out:
- if (bio)
- submit_bio(bio);
- while (read_idx <= index)
- gfs2_jhead_process_page(jd, read_idx++, head, &done);
-
- if (!ret)
- ret = filemap_check_wb_err(mapping, since);
-
- return ret;
-}
-
static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type,
u32 ld_length, u32 ld_data1)
{
*/
struct xfs_writepage_ctx {
struct xfs_bmbt_irec imap;
- unsigned int io_type;
+ int fork;
+ unsigned int data_seq;
unsigned int cow_seq;
struct xfs_ioend *ioend;
};
static void
xfs_finish_page_writeback(
struct inode *inode,
- struct bio_vec *bvec,
+ struct bio_vec *bvec,
int error)
{
struct iomap_page *iop = to_iomap_page(bvec->bv_page);
for (bio = &ioend->io_inline_bio; bio; bio = next) {
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
/*
* For the last bio, bi_private points to the ioend, so we
next = bio->bi_private;
/* walk each page on bio, ending page IO on them */
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
xfs_finish_page_writeback(inode, bvec, error);
bio_put(bio);
}
*/
error = blk_status_to_errno(ioend->io_bio->bi_status);
if (unlikely(error)) {
- switch (ioend->io_type) {
- case XFS_IO_COW:
+ if (ioend->io_fork == XFS_COW_FORK)
xfs_reflink_cancel_cow_range(ip, offset, size, true);
- break;
- }
-
goto done;
}
/*
- * Success: commit the COW or unwritten blocks if needed.
+ * Success: commit the COW or unwritten blocks if needed.
*/
- switch (ioend->io_type) {
- case XFS_IO_COW:
+ if (ioend->io_fork == XFS_COW_FORK)
error = xfs_reflink_end_cow(ip, offset, size);
- break;
- case XFS_IO_UNWRITTEN:
- /* writeback should never update isize */
+ else if (ioend->io_state == XFS_EXT_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
- break;
- default:
+ else
ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
- break;
- }
done:
if (ioend->io_append_trans)
struct xfs_ioend *ioend = bio->bi_private;
struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
- if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
+ if (ioend->io_fork == XFS_COW_FORK ||
+ ioend->io_state == XFS_EXT_UNWRITTEN)
queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
else if (ioend->io_append_trans)
queue_work(mp->m_data_workqueue, &ioend->io_work);
xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
}
+/*
+ * Fast revalidation of the cached writeback mapping. Return true if the current
+ * mapping is valid, false otherwise.
+ */
+static bool
+xfs_imap_valid(
+ struct xfs_writepage_ctx *wpc,
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb)
+{
+ if (offset_fsb < wpc->imap.br_startoff ||
+ offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount)
+ return false;
+ /*
+ * If this is a COW mapping, it is sufficient to check that the mapping
+ * covers the offset. Be careful to check this first because the caller
+ * can revalidate a COW mapping without updating the data seqno.
+ */
+ if (wpc->fork == XFS_COW_FORK)
+ return true;
+
+ /*
+ * This is not a COW mapping. Check the sequence number of the data fork
+ * because concurrent changes could have invalidated the extent. Check
+ * the COW fork because concurrent changes since the last time we
+ * checked (and found nothing at this offset) could have added
+ * overlapping blocks.
+ */
+ if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq))
+ return false;
+ if (xfs_inode_has_cow_data(ip) &&
+ wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
+ return false;
+ return true;
+}
+
+/*
+ * Pass in a dellalloc extent and convert it to real extents, return the real
+ * extent that maps offset_fsb in wpc->imap.
+ *
+ * The current page is held locked so nothing could have removed the block
+ * backing offset_fsb, although it could have moved from the COW to the data
+ * fork by another thread.
+ */
+static int
+xfs_convert_blocks(
+ struct xfs_writepage_ctx *wpc,
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb)
+{
+ int error;
+
+ /*
+ * Attempt to allocate whatever delalloc extent currently backs
+ * offset_fsb and put the result into wpc->imap. Allocate in a loop
+ * because it may take several attempts to allocate real blocks for a
+ * contiguous delalloc extent if free space is sufficiently fragmented.
+ */
+ do {
+ error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb,
+ &wpc->imap, wpc->fork == XFS_COW_FORK ?
+ &wpc->cow_seq : &wpc->data_seq);
+ if (error)
+ return error;
+ } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb);
+
+ return 0;
+}
+
STATIC int
xfs_map_blocks(
struct xfs_writepage_ctx *wpc,
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
ssize_t count = i_blocksize(inode);
- xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
xfs_fileoff_t cow_fsb = NULLFILEOFF;
struct xfs_bmbt_irec imap;
- int whichfork = XFS_DATA_FORK;
struct xfs_iext_cursor icur;
- bool imap_valid;
+ int retries = 0;
int error = 0;
- /*
- * We have to make sure the cached mapping is within EOF to protect
- * against eofblocks trimming on file release leaving us with a stale
- * mapping. Otherwise, a page for a subsequent file extending buffered
- * write could get picked up by this writeback cycle and written to the
- * wrong blocks.
- *
- * Note that what we really want here is a generic mapping invalidation
- * mechanism to protect us from arbitrary extent modifying contexts, not
- * just eofblocks.
- */
- xfs_trim_extent_eof(&wpc->imap, ip);
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
/*
* COW fork blocks can overlap data fork blocks even if the blocks
* against concurrent updates and provides a memory barrier on the way
* out that ensures that we always see the current value.
*/
- imap_valid = offset_fsb >= wpc->imap.br_startoff &&
- offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
- if (imap_valid &&
- (!xfs_inode_has_cow_data(ip) ||
- wpc->io_type == XFS_IO_COW ||
- wpc->cow_seq == READ_ONCE(ip->i_cowfp->if_seq)))
+ if (xfs_imap_valid(wpc, ip, offset_fsb))
return 0;
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
/*
* If we don't have a valid map, now it's time to get a new one for this
* offset. This will convert delayed allocations (including COW ones)
* into real extents. If we return without a valid map, it means we
* landed in a hole and we skip the block.
*/
+retry:
xfs_ilock(ip, XFS_ILOCK_SHARED);
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
(ip->i_df.if_flags & XFS_IFEXTENTS));
- ASSERT(offset <= mp->m_super->s_maxbytes);
-
- if (offset > mp->m_super->s_maxbytes - count)
- count = mp->m_super->s_maxbytes - offset;
- end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
/*
* Check if this is offset is covered by a COW extents, and if yes use
if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
- /*
- * Truncate can race with writeback since writeback doesn't
- * take the iolock and truncate decreases the file size before
- * it starts truncating the pages between new_size and old_size.
- * Therefore, we can end up in the situation where writeback
- * gets a CoW fork mapping but the truncate makes the mapping
- * invalid and we end up in here trying to get a new mapping.
- * bail out here so that we simply never get a valid mapping
- * and so we drop the write altogether. The page truncation
- * will kill the contents anyway.
- */
- if (offset > i_size_read(inode)) {
- wpc->io_type = XFS_IO_HOLE;
- return 0;
- }
- whichfork = XFS_COW_FORK;
- wpc->io_type = XFS_IO_COW;
+
+ wpc->fork = XFS_COW_FORK;
goto allocate_blocks;
}
/*
- * Map valid and no COW extent in the way? We're done.
+ * No COW extent overlap. Revalidate now that we may have updated
+ * ->cow_seq. If the data mapping is still valid, we're done.
*/
- if (imap_valid) {
+ if (xfs_imap_valid(wpc, ip, offset_fsb)) {
xfs_iunlock(ip, XFS_ILOCK_SHARED);
return 0;
}
*/
if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
imap.br_startoff = end_fsb; /* fake a hole past EOF */
+ wpc->data_seq = READ_ONCE(ip->i_df.if_seq);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ wpc->fork = XFS_DATA_FORK;
+
+ /* landed in a hole or beyond EOF? */
if (imap.br_startoff > offset_fsb) {
- /* landed in a hole or beyond EOF */
imap.br_blockcount = imap.br_startoff - offset_fsb;
imap.br_startoff = offset_fsb;
imap.br_startblock = HOLESTARTBLOCK;
- wpc->io_type = XFS_IO_HOLE;
- } else {
- /*
- * Truncate to the next COW extent if there is one. This is the
- * only opportunity to do this because we can skip COW fork
- * lookups for the subsequent blocks in the mapping; however,
- * the requirement to treat the COW range separately remains.
- */
- if (cow_fsb != NULLFILEOFF &&
- cow_fsb < imap.br_startoff + imap.br_blockcount)
- imap.br_blockcount = cow_fsb - imap.br_startoff;
-
- if (isnullstartblock(imap.br_startblock)) {
- /* got a delalloc extent */
- wpc->io_type = XFS_IO_DELALLOC;
- goto allocate_blocks;
- }
-
- if (imap.br_state == XFS_EXT_UNWRITTEN)
- wpc->io_type = XFS_IO_UNWRITTEN;
- else
- wpc->io_type = XFS_IO_OVERWRITE;
+ imap.br_state = XFS_EXT_NORM;
}
+ /*
+ * Truncate to the next COW extent if there is one. This is the only
+ * opportunity to do this because we can skip COW fork lookups for the
+ * subsequent blocks in the mapping; however, the requirement to treat
+ * the COW range separately remains.
+ */
+ if (cow_fsb != NULLFILEOFF &&
+ cow_fsb < imap.br_startoff + imap.br_blockcount)
+ imap.br_blockcount = cow_fsb - imap.br_startoff;
+
+ /* got a delalloc extent? */
+ if (imap.br_startblock != HOLESTARTBLOCK &&
+ isnullstartblock(imap.br_startblock))
+ goto allocate_blocks;
+
wpc->imap = imap;
- xfs_trim_extent_eof(&wpc->imap, ip);
- trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
+ trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap);
return 0;
allocate_blocks:
- error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap,
- &wpc->cow_seq);
- if (error)
+ error = xfs_convert_blocks(wpc, ip, offset_fsb);
+ if (error) {
+ /*
+ * If we failed to find the extent in the COW fork we might have
+ * raced with a COW to data fork conversion or truncate.
+ * Restart the lookup to catch the extent in the data fork for
+ * the former case, but prevent additional retries to avoid
+ * looping forever for the latter case.
+ */
+ if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++)
+ goto retry;
+ ASSERT(error != -EAGAIN);
return error;
- ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF ||
- imap.br_startoff + imap.br_blockcount <= cow_fsb);
- wpc->imap = imap;
- xfs_trim_extent_eof(&wpc->imap, ip);
- trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
+ }
+
+ /*
+ * Due to merging the return real extent might be larger than the
+ * original delalloc one. Trim the return extent to the next COW
+ * boundary again to force a re-lookup.
+ */
+ if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF &&
+ cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount)
+ wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff;
+
+ ASSERT(wpc->imap.br_startoff <= offset_fsb);
+ ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb);
+ trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap);
return 0;
}
int status)
{
/* Convert CoW extents to regular */
- if (!status && ioend->io_type == XFS_IO_COW) {
+ if (!status && ioend->io_fork == XFS_COW_FORK) {
/*
* Yuk. This can do memory allocation, but is not a
* transactional operation so everything is done in GFP_KERNEL
/* Reserve log space if we might write beyond the on-disk inode size. */
if (!status &&
- ioend->io_type != XFS_IO_UNWRITTEN &&
+ (ioend->io_fork == XFS_COW_FORK ||
+ ioend->io_state != XFS_EXT_UNWRITTEN) &&
xfs_ioend_is_append(ioend) &&
!ioend->io_append_trans)
status = xfs_setfilesize_trans_alloc(ioend);
static struct xfs_ioend *
xfs_alloc_ioend(
struct inode *inode,
- unsigned int type,
+ int fork,
+ xfs_exntst_t state,
xfs_off_t offset,
struct block_device *bdev,
sector_t sector)
ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
INIT_LIST_HEAD(&ioend->io_list);
- ioend->io_type = type;
+ ioend->io_fork = fork;
+ ioend->io_state = state;
ioend->io_inode = inode;
ioend->io_size = 0;
ioend->io_offset = offset;
sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);
- if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
+ if (!wpc->ioend ||
+ wpc->fork != wpc->ioend->io_fork ||
+ wpc->imap.br_state != wpc->ioend->io_state ||
sector != bio_end_sector(wpc->ioend->io_bio) ||
offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
if (wpc->ioend)
list_add(&wpc->ioend->io_list, iolist);
- wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
- bdev, sector);
+ wpc->ioend = xfs_alloc_ioend(inode, wpc->fork,
+ wpc->imap.br_state, offset, bdev, sector);
}
- if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
+ if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, true)) {
if (iop)
atomic_inc(&iop->write_count);
if (bio_full(wpc->ioend->io_bio))
xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
- __bio_add_page(wpc->ioend->io_bio, page, len, poff);
+ bio_add_page(wpc->ioend->io_bio, page, len, poff);
}
wpc->ioend->io_size += len;
error = xfs_map_blocks(wpc, inode, file_offset);
if (error)
break;
- if (wpc->io_type == XFS_IO_HOLE)
+ if (wpc->imap.br_startblock == HOLESTARTBLOCK)
continue;
xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
&submit_list);
struct page *page,
struct writeback_control *wbc)
{
- struct xfs_writepage_ctx wpc = {
- .io_type = XFS_IO_HOLE,
- };
+ struct xfs_writepage_ctx wpc = { };
int ret;
ret = xfs_do_writepage(page, wbc, &wpc);
struct address_space *mapping,
struct writeback_control *wbc)
{
- struct xfs_writepage_ctx wpc = {
- .io_type = XFS_IO_HOLE,
- };
+ struct xfs_writepage_ctx wpc = { };
int ret;
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
* Since we don't pass back blockdev info, we can't return bmap
* information for rt files either.
*/
- if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
+ if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
return 0;
return iomap_bmap(mapping, block, &xfs_iomap_ops);
}
* We can't properly handle unaligned direct I/O to reflink
* files yet, as we can't unshare a partial block.
*/
- if (xfs_is_reflink_inode(ip)) {
+ if (xfs_is_cow_inode(ip)) {
trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
return -EREMCHG;
}
goto out_unlock;
}
- if (mode & FALLOC_FL_ZERO_RANGE)
+ if (mode & FALLOC_FL_ZERO_RANGE) {
error = xfs_zero_file_space(ip, offset, len);
- else {
- if (mode & FALLOC_FL_UNSHARE_RANGE) {
- error = xfs_reflink_unshare(ip, offset, len);
- if (error)
- goto out_unlock;
+ } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
+ error = xfs_reflink_unshare(ip, offset, len);
+ if (error)
+ goto out_unlock;
+
+ if (!xfs_is_always_cow_inode(ip)) {
+ error = xfs_alloc_file_space(ip, offset, len,
+ XFS_BMAPI_PREALLOC);
}
+ } else {
+ /*
+ * If always_cow mode we can't use preallocations and
+ * thus should not create them.
+ */
+ if (xfs_is_always_cow_inode(ip)) {
+ error = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
error = xfs_alloc_file_space(ip, offset, len,
XFS_BMAPI_PREALLOC);
}
default:
return generic_file_llseek(file, offset, whence);
case SEEK_HOLE:
- offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops);
+ offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
break;
case SEEK_DATA:
- offset = iomap_seek_data(inode, offset, &xfs_iomap_ops);
+ offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
break;
}
.write_iter = xfs_file_write_iter,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
+ .iopoll = iomap_dio_iopoll,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = xfs_file_compat_ioctl,
#include <linux/uuid.h>
#include <linux/errseq.h>
#include <linux/ioprio.h>
+#include <linux/fs_types.h>
+#include <linux/build_bug.h>
+#include <linux/stddef.h>
#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
struct kiocb {
struct file *ki_filp;
+
+ /* The 'ki_filp' pointer is shared in a union for aio */
+ randomized_struct_fields_start
+
loff_t ki_pos;
void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
void *private;
int ki_flags;
u16 ki_hint;
u16 ki_ioprio; /* See linux/ioprio.h */
-} __randomize_layout;
+ unsigned int ki_cookie; /* for ->iopoll */
+
+ randomized_struct_fields_end
+};
static inline bool is_sync_kiocb(struct kiocb *kiocb)
{
u64 phys, u64 len, u32 flags);
int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
-/*
- * File types
- *
- * NOTE! These match bits 12..15 of stat.st_mode
- * (ie "(i_mode >> 12) & 15").
- */
-#define DT_UNKNOWN 0
-#define DT_FIFO 1
-#define DT_CHR 2
-#define DT_DIR 4
-#define DT_BLK 6
-#define DT_REG 8
-#define DT_LNK 10
-#define DT_SOCK 12
-#define DT_WHT 14
-
/*
* This is the "filldir" function type, used by readdir() to let
* the kernel specify what kind of dirent layout it wants to have.
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
+ int (*iopoll)(struct kiocb *kiocb, bool spin);
int (*iterate) (struct file *, struct dir_context *);
int (*iterate_shared) (struct file *, struct dir_context *);
__poll_t (*poll) (struct file *, struct poll_table_struct *);
* I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to
* synchronize competing switching instances and to tell
* wb stat updates to grab the i_pages lock. See
- * inode_switch_wb_work_fn() for details.
+ * inode_switch_wbs_work_fn() for details.
*
* I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper
* and work dirs among overlayfs mounts.
struct audit_names *aname;
const char iname[];
};
+static_assert(offsetof(struct filename, iname) % sizeof(long) == 0);
extern long vfs_truncate(const struct path *, loff_t);
extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,