Merge tag 'for-5.12/block-ipi-2021-02-21' of git://git.kernel.dk/linux-block

author Linus Torvalds <[email protected]>
Mon, 22 Feb 2021 18:53:05 +0000 (10:53 -0800)
committer Linus Torvalds <[email protected]>
Mon, 22 Feb 2021 18:53:05 +0000 (10:53 -0800)
diff --combined block/blk-mq.c

index f21d922ecfaf0b92e09810a2df423a5927f52648,463de2981df8ad6f600a8ac101c4ecb3f9a9490b..d4d7c1caa439666f212c63df86a2a61a81a68ca9
--- 1/block/blk-mq.c
--- 2/block/blk-mq.c
+++ b/block/blk-mq.c
@@@ -41,7 -41,7 +41,7 @@@
   #include "blk-mq-sched.h"
   #include "blk-rq-qos.h"
   
- static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
+ static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
   
   static void blk_mq_poll_stats_start(struct request_queue *q);
   static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
@@@ -567,80 -567,29 +567,29 @@@ void blk_mq_end_request(struct request 
   }
   EXPORT_SYMBOL(blk_mq_end_request);
   
- /*
-  * Softirq action handler - move entries to local list and loop over them
-  * while passing them to the queue registered handler.
-  */
- static __latent_entropy void blk_done_softirq(struct softirq_action *h)
+ static void blk_complete_reqs(struct llist_head *list)
   {
-       struct list_head *cpu_list, local_list;
- 
-       local_irq_disable();
-       cpu_list = this_cpu_ptr(&blk_cpu_done);
-       list_replace_init(cpu_list, &local_list);
-       local_irq_enable();
- 
-       while (!list_empty(&local_list)) {
-               struct request *rq;
+       struct llist_node *entry = llist_reverse_order(llist_del_all(list));
+       struct request *rq, *next;
   
-               rq = list_entry(local_list.next, struct request, ipi_list);
-               list_del_init(&rq->ipi_list);
+       llist_for_each_entry_safe(rq, next, entry, ipi_list)
                 rq->q->mq_ops->complete(rq);
-       }
   }
   
- static void blk_mq_trigger_softirq(struct request *rq)
+ static __latent_entropy void blk_done_softirq(struct softirq_action *h)
   {
-       struct list_head *list;
-       unsigned long flags;
- 
-       local_irq_save(flags);
-       list = this_cpu_ptr(&blk_cpu_done);
-       list_add_tail(&rq->ipi_list, list);
- 
-       /*
-        * If the list only contains our just added request, signal a raise of
-        * the softirq.  If there are already entries there, someone already
-        * raised the irq but it hasn't run yet.
-        */
-       if (list->next == &rq->ipi_list)
-               raise_softirq_irqoff(BLOCK_SOFTIRQ);
-       local_irq_restore(flags);
+       blk_complete_reqs(this_cpu_ptr(&blk_cpu_done));
   }
   
   static int blk_softirq_cpu_dead(unsigned int cpu)
   {
-       /*
-        * If a CPU goes away, splice its entries to the current CPU
-        * and trigger a run of the softirq
-        */
-       local_irq_disable();
-       list_splice_init(&per_cpu(blk_cpu_done, cpu),
-                        this_cpu_ptr(&blk_cpu_done));
-       raise_softirq_irqoff(BLOCK_SOFTIRQ);
-       local_irq_enable();
- 
+       blk_complete_reqs(&per_cpu(blk_cpu_done, cpu));
         return 0;
   }
   
- 
   static void __blk_mq_complete_request_remote(void *data)
   {
-       struct request *rq = data;
- 
-       /*
-        * For most of single queue controllers, there is only one irq vector
-        * for handling I/O completion, and the only irq's affinity is set
-        * to all possible CPUs.  On most of ARCHs, this affinity means the irq
-        * is handled on one specific CPU.
-        *
-        * So complete I/O requests in softirq context in case of single queue
-        * devices to avoid degrading I/O performance due to irqsoff latency.
-        */
-       if (rq->q->nr_hw_queues == 1)
-               blk_mq_trigger_softirq(rq);
-       else
-               rq->q->mq_ops->complete(rq);
+       __raise_softirq_irqoff(BLOCK_SOFTIRQ);
   }
   
   static inline bool blk_mq_complete_need_ipi(struct request *rq)
@@@ -669,6 -618,30 +618,30 @@@
         return cpu_online(rq->mq_ctx->cpu);
   }
   
+ static void blk_mq_complete_send_ipi(struct request *rq)
+ {
+       struct llist_head *list;
+       unsigned int cpu;
+ 
+       cpu = rq->mq_ctx->cpu;
+       list = &per_cpu(blk_cpu_done, cpu);
+       if (llist_add(&rq->ipi_list, list)) {
+               INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
+               smp_call_function_single_async(cpu, &rq->csd);
+       }
+ }
+ 
+ static void blk_mq_raise_softirq(struct request *rq)
+ {
+       struct llist_head *list;
+ 
+       preempt_disable();
+       list = this_cpu_ptr(&blk_cpu_done);
+       if (llist_add(&rq->ipi_list, list))
+               raise_softirq(BLOCK_SOFTIRQ);
+       preempt_enable();
+ }
+ 
   bool blk_mq_complete_request_remote(struct request *rq)
   {
         WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
@@@ -681,15 -654,15 +654,15 @@@
                 return false;
   
         if (blk_mq_complete_need_ipi(rq)) {
-               INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
-               smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd);
-       } else {
-               if (rq->q->nr_hw_queues > 1)
-                       return false;
-               blk_mq_trigger_softirq(rq);
+               blk_mq_complete_send_ipi(rq);
+               return true;
         }
   
-       return true;
+       if (rq->q->nr_hw_queues == 1) {
+               blk_mq_raise_softirq(rq);
+               return true;
+       }
+       return false;
   }
   EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
   
@@@ -1646,42 -1619,6 +1619,42 @@@ void blk_mq_run_hw_queue(struct blk_mq_
   }
   EXPORT_SYMBOL(blk_mq_run_hw_queue);
   
+ +/*
+ + * Is the request queue handled by an IO scheduler that does not respect
+ + * hardware queues when dispatching?
+ + */
+ +static bool blk_mq_has_sqsched(struct request_queue *q)
+ +{
+ +      struct elevator_queue *e = q->elevator;
+ +
+ +      if (e && e->type->ops.dispatch_request &&
+ +          !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE))
+ +              return true;
+ +      return false;
+ +}
+ +
+ +/*
+ + * Return preferred queue to dispatch from (if any) for non-mq aware IO
+ + * scheduler.
+ + */
+ +static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
+ +{
+ +      struct blk_mq_hw_ctx *hctx;
+ +
+ +      /*
+ +       * If the IO scheduler does not respect hardware queues when
+ +       * dispatching, we just don't bother with multiple HW queues and
+ +       * dispatch from hctx for the current CPU since running multiple queues
+ +       * just causes lock contention inside the scheduler and pointless cache
+ +       * bouncing.
+ +       */
+ +      hctx = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT,
+ +                                   raw_smp_processor_id());
+ +      if (!blk_mq_hctx_stopped(hctx))
+ +              return hctx;
+ +      return NULL;
+ +}
+ +
   /**
    * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
    * @q: Pointer to the request queue to run.
@@@ -1689,23 -1626,14 +1662,23 @@@
    */
   void blk_mq_run_hw_queues(struct request_queue *q, bool async)
   {
- -      struct blk_mq_hw_ctx *hctx;
+ +      struct blk_mq_hw_ctx *hctx, *sq_hctx;
         int i;
   
+ +      sq_hctx = NULL;
+ +      if (blk_mq_has_sqsched(q))
+ +              sq_hctx = blk_mq_get_sq_hctx(q);
         queue_for_each_hw_ctx(q, hctx, i) {
                 if (blk_mq_hctx_stopped(hctx))
                         continue;
- -
- -              blk_mq_run_hw_queue(hctx, async);
+ +              /*
+ +               * Dispatch from this hctx either if there's no hctx preferred
+ +               * by IO scheduler or if it has requests that bypass the
+ +               * scheduler.
+ +               */
+ +              if (!sq_hctx || sq_hctx == hctx ||
+ +                  !list_empty_careful(&hctx->dispatch))
+ +                      blk_mq_run_hw_queue(hctx, async);
         }
   }
   EXPORT_SYMBOL(blk_mq_run_hw_queues);
@@@ -1717,23 -1645,14 +1690,23 @@@
    */
   void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
   {
- -      struct blk_mq_hw_ctx *hctx;
+ +      struct blk_mq_hw_ctx *hctx, *sq_hctx;
         int i;
   
+ +      sq_hctx = NULL;
+ +      if (blk_mq_has_sqsched(q))
+ +              sq_hctx = blk_mq_get_sq_hctx(q);
         queue_for_each_hw_ctx(q, hctx, i) {
                 if (blk_mq_hctx_stopped(hctx))
                         continue;
- -
- -              blk_mq_delay_run_hw_queue(hctx, msecs);
+ +              /*
+ +               * Dispatch from this hctx either if there's no hctx preferred
+ +               * by IO scheduler or if it has requests that bypass the
+ +               * scheduler.
+ +               */
+ +              if (!sq_hctx || sq_hctx == hctx ||
+ +                  !list_empty_careful(&hctx->dispatch))
+ +                      blk_mq_delay_run_hw_queue(hctx, msecs);
         }
   }
   EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
@@@ -2182,7 -2101,7 +2155,7 @@@ static void blk_add_rq_to_plug(struct b
    */
   blk_qc_t blk_mq_submit_bio(struct bio *bio)
   {
- -      struct request_queue *q = bio->bi_disk->queue;
+ +      struct request_queue *q = bio->bi_bdev->bd_disk->queue;
         const int is_sync = op_is_sync(bio->bi_opf);
         const int is_flush_fua = op_is_flush(bio->bi_opf);
         struct blk_mq_alloc_data data = {
@@@ -2707,6 -2626,7 +2680,6 @@@ blk_mq_alloc_hctx(struct request_queue 
                 goto free_hctx;
   
         atomic_set(&hctx->nr_active, 0);
- -      atomic_set(&hctx->elevator_queued, 0);
         if (node == NUMA_NO_NODE)
                 node = set->numa_node;
         hctx->numa_node = node;
@@@ -3957,7 -3877,7 +3930,7 @@@ static int __init blk_mq_init(void
         int i;
   
         for_each_possible_cpu(i)
-               INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
+               init_llist_head(&per_cpu(blk_cpu_done, i));
         open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
   
         cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
diff --combined include/linux/blkdev.h

index 9149f4a5adb36eb2f112ec86e664ac321904e62d,89a444c5a58334c510f8624b660997a876e9ddc1..69035e9f632b3cb709e2b2b248d98c52b268d489
--- 1/include/linux/blkdev.h
--- 2/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@@ -153,7 -153,7 +153,7 @@@ struct request 
          */
         union {
                 struct hlist_node hash; /* merge hash */
-               struct list_head ipi_list;
+               struct llist_node ipi_list;
         };
   
         /*
@@@ -337,7 -337,6 +337,7 @@@ struct queue_limits 
         unsigned int            max_zone_append_sectors;
         unsigned int            discard_granularity;
         unsigned int            discard_alignment;
+ +      unsigned int            zone_write_granularity;
   
         unsigned short          max_segments;
         unsigned short          max_integrity_segments;
@@@ -949,8 -948,9 +949,8 @@@ extern int blk_rq_map_kern(struct reque
   extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
                                struct rq_map_data *, const struct iov_iter *,
                                gfp_t);
- -extern void blk_execute_rq(struct request_queue *, struct gendisk *,
- -                        struct request *, int);
- -extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
+ +extern void blk_execute_rq(struct gendisk *, struct request *, int);
+ +extern void blk_execute_rq_nowait(struct gendisk *,
                                   struct request *, int, rq_end_io_fn *);
   
   /* Helper to convert REQ_OP_XXX to its string format XXX */
@@@ -1161,8 -1161,6 +1161,8 @@@ extern void blk_queue_logical_block_siz
   extern void blk_queue_max_zone_append_sectors(struct request_queue *q,
                 unsigned int max_zone_append_sectors);
   extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
+ +void blk_queue_zone_write_granularity(struct request_queue *q,
+ +                                    unsigned int size);
   extern void blk_queue_alignment_offset(struct request_queue *q,
                                        unsigned int alignment);
   void blk_queue_update_readahead(struct request_queue *q);
@@@ -1291,7 -1289,7 +1291,7 @@@ static inline bool blk_needs_flush_plug
                  !list_empty(&plug->cb_list));
   }
   
- -int blkdev_issue_flush(struct block_device *, gfp_t);
+ +int blkdev_issue_flush(struct block_device *bdev);
   long nr_blockdev_pages(void);
   #else /* CONFIG_BLOCK */
   struct blk_plug {
@@@ -1319,7 -1317,7 +1319,7 @@@ static inline bool blk_needs_flush_plug
         return false;
   }
   
- -static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask)
+ +static inline int blkdev_issue_flush(struct block_device *bdev)
   {
         return 0;
   }
@@@ -1476,18 -1474,6 +1476,18 @@@ static inline int bdev_io_opt(struct bl
         return queue_io_opt(bdev_get_queue(bdev));
   }
   
+ +static inline unsigned int
+ +queue_zone_write_granularity(const struct request_queue *q)
+ +{
+ +      return q->limits.zone_write_granularity;
+ +}
+ +
+ +static inline unsigned int
+ +bdev_zone_write_granularity(struct block_device *bdev)
+ +{
+ +      return queue_zone_write_granularity(bdev_get_queue(bdev));
+ +}
+ +
   static inline int queue_alignment_offset(const struct request_queue *q)
   {
         if (q->limits.misaligned)
@@@ -1968,9 -1954,21 +1968,9 @@@ unsigned long disk_start_io_acct(struc
   void disk_end_io_acct(struct gendisk *disk, unsigned int op,
                 unsigned long start_time);
   
- -unsigned long part_start_io_acct(struct gendisk *disk,
- -              struct block_device **part, struct bio *bio);
- -void part_end_io_acct(struct block_device *part, struct bio *bio,
- -                    unsigned long start_time);
- -
- -/**
- - * bio_start_io_acct - start I/O accounting for bio based drivers
- - * @bio:      bio to start account for
- - *
- - * Returns the start time that should be passed back to bio_end_io_acct().
- - */
- -static inline unsigned long bio_start_io_acct(struct bio *bio)
- -{
- -      return disk_start_io_acct(bio->bi_disk, bio_sectors(bio), bio_op(bio));
- -}
+ +unsigned long bio_start_io_acct(struct bio *bio);
+ +void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
+ +              struct block_device *orig_bdev);
   
   /**
    * bio_end_io_acct - end I/O accounting for bio based drivers
@@@ -1979,7 -1977,7 +1979,7 @@@
    */
   static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time)
   {
- -      return disk_end_io_acct(bio->bi_disk, bio_op(bio), start_time);
+ +      return bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev);
   }
   
   int bdev_read_only(struct block_device *bdev);
@@@ -2014,16 -2012,21 +2014,16 @@@ void bdev_add(struct block_device *bdev
   struct block_device *I_BDEV(struct inode *inode);
   struct block_device *bdgrab(struct block_device *bdev);
   void bdput(struct block_device *);
+ +int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart,
+ +              loff_t lend);
   
   #ifdef CONFIG_BLOCK
   void invalidate_bdev(struct block_device *bdev);
- -int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart,
- -                      loff_t lend);
   int sync_blockdev(struct block_device *bdev);
   #else
   static inline void invalidate_bdev(struct block_device *bdev)
   {
   }
- -static inline int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
- -                                    loff_t lstart, loff_t lend)
- -{
- -      return 0;
- -}
   static inline int sync_blockdev(struct block_device *bdev)
   {
         return 0;
author	Linus Torvalds <[email protected]>
	Mon, 22 Feb 2021 18:53:05 +0000 (10:53 -0800)
committer	Linus Torvalds <[email protected]>
	Mon, 22 Feb 2021 18:53:05 +0000 (10:53 -0800)
		1	2
block/blk-mq.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/blkdev.h	patch \|	diff1 \|	diff2 \|	blob \| history